diff --git a/.prettierrc b/.prettierrc index 568b1174..931e821f 100644 --- a/.prettierrc +++ b/.prettierrc @@ -4,6 +4,5 @@ "printWidth": 80, "singleQuote": true, "trailingComma": "none", - "semi": false, - "overrides": [{ "files": "*.md", "options": { "printWidth": 100 } }] + "semi": false } diff --git a/CHANGELOG.md b/CHANGELOG.md index b3db2f8a..4e561688 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,25 @@ +# [v5.1.0](https://github.com/coder-hxl/x-crawl/compare/v5.0.2...v5.1.0) (2023-04-12) + +### 🚀 Features + +- The beforeSave lifecycle function of crawlFile now needs to return a Promise that resolves with a Buffer. +- The documentation's descriptions, features, and types have changed. + +### 🐞 Bug Fixes + +- Fixed incorrect type hints and type restrictions by using overloaded functions instead. + +--- + +### 🚀 特征 + +- crawlFile 的 beforeSave 生命周期函数需要返回一个 Promise 并且 resolve 是 Buffer 。 +- 文档的描述、特征和类型发生变化。 + +### 🐞 漏洞修复 + +- 错误的类型提示和类型限制,改用重载函数。 + # [v5.0.2](https://github.com/coder-hxl/x-crawl/compare/v5.0.1...v5.0.2) (2023-04-10) ### 🚀 Features @@ -6,7 +28,7 @@ --- -### 🚀 Features +### 🚀 特征 - 新增重试时,重试批次数将显示在打印中。 @@ -18,7 +40,7 @@ --- -### 🚀 Features +### 🚀 特征 - 文档新的调整。 diff --git a/README.md b/README.md index 29c66c87..d73e34cb 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,14 @@ English | [简体中文](https://github.com/coder-hxl/x-crawl/blob/main/docs/cn.md) -x-crawl is a flexible nodejs crawler library. It can crawl pages in batches, network requests in batches, download file resources in batches, polling and crawling, etc. Flexible and simple to use, friendly to JS/TS developers. +x-crawl is a flexible nodejs crawler library. It can crawl pages, crawl interfaces, crawl files, and poll crawls. It is flexible and simple to use, and friendly to JS/TS developers. > If you like x-crawl, you can give [x-crawl repository](https://github.com/coder-hxl/x-crawl) a star to support it, not only for its recognition, but also for Approved by the developer. ## Features - **🔥 Async/Sync** - Just change the mode property to toggle async/sync crawling mode. -- **⚙️ Multiple functions** - Batch crawling of pages, batch network requests, batch download of file resources, polling crawling, etc. +- **⚙️ Multiple functions** - Can crawl pages, crawl interfaces, crawl files, and poll crawls. It also supports crawling single or multiple targets. - **🖋️ Flexible writing method** - A function adapts to multiple crawling configurations and obtains crawling results. The writing method is very flexible. - **⏱️ Interval crawling** - no interval/fixed interval/random interval, can effectively use/avoid high concurrent crawling. - **🔄 Retry on failure** - It can be set for all crawling requests, for a single crawling request, and for a single request to set a failed retry.
@@ -37,6 +37,8 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [page instance](#page-instance) - [Crawl interface](#Crawl-interface) - [Crawl files](#Crawl-files) + - [life cycle](#life-cycle) + - [beforeSave](#beforeSave) - [Start polling](#Start-polling) - [Config priority](#Config-Priority) - [Interval time](#Interval-time) @@ -78,9 +80,6 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [CrawlPageConfigObject](#CrawlPageConfigObject) - [CrawlDataConfigObject](#CrawlDataConfigObject) - [CrawlFileConfigObject](#CrawlFileConfigObject) - - [CrawlPageConfig](#CrawlPageConfig) - - [CrawlDataConfig](#CrawlDataConfig) - - [CrawlFileConfig](#CrawlFileConfig) - [StartPollingConfig](#StartPollingConfig) - [API Result](#API-Result) - [XCrawlInstance](#XCrawlInstance) @@ -88,9 +87,6 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [CrawlPageSingleRes](#CrawlPageSingleRes) - [CrawlDataSingleRes](#CrawlDataSingleRes) - [CrawlFileSingleRes](#CrawlFileSingleRes) - - [CrawlPageRes](#CrawlPageRes) - - [CrawlDataRes](#CrawlDataRes) - - [CrawlFileRes](#CrawlFileRes) - [API Other](#API-Other) - [AnyObject](#AnyObject) - [More](#More) @@ -135,7 +131,7 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { // Gets the URL of the page's wheel image element const boxHandle = await page.$(elSelectorMap[id - 1]) - const urls = await boxHandle!.$$eval('picture img', (imgEls) => { + const urls = await boxHandle.$$eval('picture img', (imgEls) => { return imgEls.map((item) => item.src) }) imgUrls.push(...urls) @@ -224,7 +220,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com').then((res) => { +myXCrawl.crawlPage('https://www.example.com').then((res) => { const { browser, page } = res.data // Close the browser @@ -253,7 +249,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com').then(async (res) => { +myXCrawl.crawlPage('https://www.example.com').then(async (res) => { const { browser, page } = res.data // Get a screenshot of the rendered page @@ -275,9 +271,13 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } }) const requestConfigs = [ - 'https://xxx.com/xxxx', - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', method: 'POST', data: { name: 'coderhxl' } } + 'https://www.example.com/api-1', + 'https://www.example.com/api-2', + { + url: 'https://www.example.com/api-3', + method: 'POST', + data: { name: 'coderhxl' } + } ] myXCrawl.crawlData({ requestConfigs }).then((res) => { @@ -296,7 +296,10 @@ const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } }) myXCrawl .crawlFile({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/file-1', + 'https://www.example.com/file-2' + ], fileConfig: { storeDir: './upload' // storage folder } @@ -306,6 +309,45 @@ myXCrawl }) ``` +#### life cycle + +The crawlFile API has a lifetime function: + +- beforeSave: executed before saving the file + +##### beforeSave + +In the beforeSave 
function you can get a file of type Buffer, which you can process and return a Promise and resolve as a Buffer. + +**Resize picture** + +Use the sharp library to resize the images to be crawled: + +```js +import xCrawl from 'x-crawl' +import sharp from 'sharp' + +const testXCrawl = xCrawl() + +testXCrawl + .crawlFile({ + requestConfigs: [ + 'https://www.example.com/file-1.jpg', + 'https://www.example.com/file-2.jpg' + ], + fileConfig: { + beforeSave(info) { + return sharp(info.data).resize(200).toBuffer() + } + } + }) + .then((res) => { + res.forEach((item) => { + console.log(item.data?.data.isSuccess) + }) + }) +``` + ### Start polling Start a polling crawl with [startPolling()](#startPolling) . @@ -321,7 +363,7 @@ const myXCrawl = xCrawl({ myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => { // will be executed every two and a half hours // crawlPage/crawlData/crawlFile - const res = await myXCrawl.crawlPage('https://xxx.com') + const res = await myXCrawl.crawlPage('https://www.example.com') res.data.page.close() }) ``` @@ -356,7 +398,10 @@ const myXCrawl = xCrawl() myXCrawl .crawlData({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/api-1', + 'https://www.example.com/api-2' + ], intervalTime: { max: 2000, min: 1000 } }) .then((res) => {}) @@ -378,7 +423,9 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {}) +myXCrawl + .crawlData({ url: 'https://www.example.com/api', maxRetry: 1 }) + .then((res) => {}) ``` The maxRetry attribute determines how many times to retry. @@ -394,9 +441,9 @@ const myXCrawl = xCrawl() myXCrawl .crawlData([ - { url: 'https://xxx.com/xxxx', priority: 1 }, - { url: 'https://xxx.com/xxxx', priority: 10 }, - { url: 'https://xxx.com/xxxx', priority: 8 } + { url: 'https://www.example.com/api-1', priority: 1 }, + { url: 'https://www.example.com/api-2', priority: 10 }, + { url: 'https://www.example.com/api-3', priority: 8 } ]) .then((res) => {}) ``` @@ -425,13 +472,20 @@ Create a crawler instance via call xCrawl. The request queue is maintained by th #### Type -- [XCrawlBaseConfig](#XCrawlBaseConfig) -- [XCrawlInstance](#XCrawlInstance) +The xCrawl API is a function. ```ts function xCrawl(baseConfig?: XCrawlBaseConfig): XCrawlInstance ``` +**Parameter Type:** + +- Look at the [XCrawlBaseConfig](#XCrawlBaseConfig) type + +**Return value type:** + +- View [XCrawlInstance](#XCrawlInstance) type + #### Example ```js @@ -439,7 +493,7 @@ import xCrawl from 'x-crawl' // xCrawl API const myXCrawl = xCrawl({ - baseUrl: 'https://xxx.com', + baseUrl: 'https://www.example.com', timeout: 10000, intervalTime: { max: 2000, min: 1000 } }) @@ -453,17 +507,41 @@ crawlPage is the method of the crawler instance, usually used to crawl page. #### Type -- Look at the [CrawlPageConfig](#CrawlPageConfig) type -- Look at the [CrawlPageSingleRes](#CrawlPageSingleRes) type -- Look at the [CrawlPageRes](#CrawlPageRes) type +The crawlPage API is a function. 
A type is an [overloaded function](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) which can be called (in terms of type) with different configuration parameters. ```ts -function crawlPage: ( - config: T, - callback?: ((res: CrawlPageSingleRes) => void) | undefined -) => Promise> +type crawlPage = { + ( + config: string, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: PageRequestConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: (string | PageRequestConfig)[], + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: CrawlPageConfigObject, + callback?: (res: CrawlPageSingleRes) => void + ): Promise +} ``` +**Parameter Type:** + +- Look at the [PageRequestConfig](#PageRequestConfig) type +- Look at the [CrawlPageConfigObject](#CrawlPageConfigObject) type + +**Return value type:** + +- Look at the [CrawlPageSingleRes](#CrawlPageSingleRes) type + #### Example ```js @@ -472,7 +550,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() // crawlPage API -myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => { +myXCrawl.crawlPage('https://www.example.com').then((res) => { const { browser, page } = res.data // Close the browser @@ -498,7 +576,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {}) +myXCrawl.crawlPage('https://www.example.com').then((res) => {}) ``` The res you get will be an object. @@ -516,7 +594,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlPage({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com', proxy: 'xxx', maxRetry: 1 }) @@ -537,7 +615,10 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() myXCrawl - .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }]) + .crawlPage([ + 'https://www.example.com/page-1', + { url: 'https://www.example.com/page-2', maxRetry: 2 } + ]) .then((res) => {}) ``` @@ -549,20 +630,22 @@ For more configuration options of CrawlPageConfigObject, please refer to [CrawlP If you want to crawl multiple pages, and the request configuration (proxy, cookies, retry, etc.) does not want to be written repeatedly, if you need an interval, you can try this way of writing: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', maxRetry: 6 } - ], - intervalTime: { max: 3000, min: 1000 }, - cookies: 'xxx', - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlPage({ + requestConfigs: [ + 'https://www.example.com/page-1', + { url: 'https://www.example.com/page-2', maxRetry: 6 } + ], + intervalTime: { max: 3000, min: 1000 }, + cookies: 'xxx', + maxRetry: 1 + }) + .then((res) => {}) ``` The res you get will be an array of objects. @@ -575,17 +658,41 @@ crawlData is the method of the crawler instance, which is usually used to crawl #### Type -- Look at the [CrawlDataConfig](#CrawlDataConfig) type -- Look at the [CrawlDataSingleRes](#CrawlDataSingleRes) type -- Look at the [CrawlDataRes](#CrawlDataRes) type +The crawlData API is a function. 
A type is an [overloaded function](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) which can be called (in terms of type) with different configuration parameters. ```ts -function crawlData( - config: T, - callback?: ((res: CrawlDataSingleRes) => void) | undefined -) => Promise> +type crawlData = { + ( + config: DataRequestConfig, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: string, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: (string | DataRequestConfig)[], + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + ( + config: CrawlDataConfigObject, + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> +} ``` +**Parameter Type:** + +- See [DataRequestConfig](#DataRequestConfig) type +- Look at the [CrawlDataConfigObject](#CrawlDataConfigObject) type + +**Return value type:** + +- Look at the [CrawlDataSingleRes](#CrawlDataSingleRes) type + #### Example ```js @@ -598,7 +705,10 @@ const myXCrawl = xCrawl({ myXCrawl .crawlData({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/api-1', + 'https://www.example.com/api-2' + ], intervalTime: { max: 3000, min: 1000 }, cookies: 'xxx', maxRetry: 1 @@ -626,7 +736,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData('https://xxx.com/xxxx').then((res) => {}) +myXCrawl.crawlData('https://www.example.com/api').then((res) => {}) ``` The res you get will be an object. @@ -644,7 +754,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlData({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com/api', proxy: 'xxx', maxRetry: 1 }) @@ -665,7 +775,10 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() myXCrawl - .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }]) + .crawlData([ + 'https://www.example.com/api-1', + { url: 'https://www.example.com/api-2', maxRetry: 2 } + ]) .then((res) => {}) ``` @@ -677,20 +790,22 @@ For more configuration options of CrawlPageConfigObject, please refer to [CrawlP If you want to crawl multiple data, and the request configuration (proxy, cookies, retry, etc.) does not want to be written repeatedly, if you need an interval, you can try this writing method: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', maxRetry: 6 } - ], - intervalTime: { max: 3000, min: 1000 }, - cookies: 'xxx', - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlData({ + requestConfigs: [ + 'https://www.example.com/api-1', + { url: 'https://www.example.com/api-2', maxRetry: 6 } + ], + intervalTime: { max: 3000, min: 1000 }, + cookies: 'xxx', + maxRetry: 1 + }) + .then((res) => {}) ``` The res you get will be an array of objects. 
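Each crawlData overload above also accepts an optional callback as its second argument. The following is a minimal sketch (placeholder URLs, and it assumes the callback fires once per single result, as the `CrawlDataSingleRes` parameter in the signatures suggests):

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

// The returned Promise still resolves with the full results; the callback
// simply offers a per-result way to handle the same data.
myXCrawl.crawlData(
  ['https://www.example.com/api-1', 'https://www.example.com/api-2'],
  (res) => {
    console.log(res.id, res.data)
  }
)
```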
@@ -703,17 +818,36 @@ crawlFile is the method of the crawler instance, which is usually used to crawl #### Type -- Look at the [CrawlFileConfig](#CrawlFileConfig) type -- Look at the [CrawlFileSingleRes](#CrawlFileSingleRes) type -- Look at the [CrawlFileRes](#CrawlFileRes) type +The crawlFile API is a function. A type is an [overloaded function](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) which can be called (in terms of type) with different configuration parameters. ```ts -function crawlFile( - config: T, - callback?: ((res: CrawlFileSingleRes) => void) | undefined -) => Promise> +type crawlFile = { + ( + config: FileRequestConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: FileRequestConfig[], + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: CrawlFileConfigObject, + callback?: (res: CrawlFileSingleRes) => void + ): Promise +} ``` +**Parameter Type:** + +- See [FileRequestConfig](#FileRequestConfig) type +- Look at the [CrawlFileConfigObject](#CrawlFileConfigObject) type + +**Return value type:** + +- Look at the [CrawlFileSingleRes](#CrawlFileSingleRes) type + #### Example ```js @@ -727,7 +861,10 @@ const myXCrawl = xCrawl({ // crawlFile API myXCrawl .crawlFile({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/file-1', + 'https://www.example.com/file-2' + ], storeDir: './upload', intervalTime: { max: 3000, min: 1000 }, maxRetry: 1 @@ -757,7 +894,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlFile({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com/file', proxy: 'xxx', maxRetry: 1, storeDir: './upload', @@ -781,8 +918,8 @@ const myXCrawl = xCrawl() myXCrawl .crawlFile([ - { url: 'https://xxx.com/xxxx', storeDir: './upload' }, - { url: 'https://xxx.com/xxxx', storeDir: './upload', maxRetry: 2 } + { url: 'https://www.example.com/file-1', storeDir: './upload' }, + { url: 'https://www.example.com/file-2', storeDir: './upload', maxRetry: 2 } ]) .then((res) => {}) ``` @@ -795,20 +932,22 @@ For more configuration options of CrawlFileConfigObject, please refer to [CrawlF If you want to crawl multiple data, and the request configuration (storeDir, proxy, retry, etc.) does not want to be written repeatedly, and you need interval time, etc., you can try this way of writing: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlFile({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', storeDir: './upload/xxx' } - ], - storeDir: './upload', - intervalTime: { max: 3000, min: 1000 }, - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlFile({ + requestConfigs: [ + 'https://www.example.com/file-1', + { url: 'https://www.example.com/file-2', storeDir: './upload/xxx' } + ], + storeDir: './upload', + intervalTime: { max: 3000, min: 1000 }, + maxRetry: 1 + }) + .then((res) => {}) ``` The res you get will be an array of objects. 
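With this release, beforeSave must return a Promise that resolves with a Buffer (see the CrawlFileConfigObject type below). A minimal sketch with a placeholder URL, which keeps the downloaded file unchanged while satisfying the new contract:

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

myXCrawl
  .crawlFile({
    requestConfigs: ['https://www.example.com/file-1.jpg'],
    fileConfig: {
      storeDir: './upload',
      beforeSave(info) {
        // info.data is the downloaded Buffer; resolving it unchanged
        // fulfills the Promise<Buffer> requirement without altering the file.
        return Promise.resolve(info.data)
      }
    }
  })
  .then((res) => {})
```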
@@ -999,38 +1138,12 @@ export interface CrawlFileConfigObject { fileName: string filePath: string data: Buffer - }) => Buffer | void + }) => Promise } } ``` -##### CrawlPageConfig - -```ts -export type CrawlPageConfig = - | string - | PageRequestConfig - | (string | PageRequestConfig)[] - | CrawlPageConfigObject -``` - -##### CrawlDataConfig - -```ts -export type CrawlDataConfig = - | string - | DataRequestConfig - | (string | DataRequestConfig)[] - | CrawlDataConfigObject -``` - -##### CrawlFileConfig - -```ts -export type CrawlFileConfig = FileRequestConfig | FileRequestConfig[] | CrawlFileConfigObject -``` - -##### StartPollingConfig +##### startPollingConfig ```js export interface StartPollingConfig { @@ -1046,20 +1159,66 @@ export interface StartPollingConfig { ```ts export interface XCrawlInstance { - crawlPage: ( - config: T, - callback?: ((res: CrawlPageSingleRes) => void) | undefined - ) => Promise> + crawlPage: { + ( + config: string, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: PageRequestConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: (string | PageRequestConfig)[], + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: CrawlPageConfigObject, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + } - crawlData: ( - config: T, - callback?: ((res: CrawlDataSingleRes) => void) | undefined - ) => Promise> + crawlData: { + ( + config: DataRequestConfig, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: string, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: (string | DataRequestConfig)[], + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + ( + config: CrawlDataConfigObject, + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + } - crawlFile: ( - config: T, - callback?: ((res: CrawlFileSingleRes) => void) | undefined - ) => Promise> + crawlFile: { + ( + config: FileRequestConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: FileRequestConfig[], + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: CrawlFileConfigObject, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + } startPolling: ( config: StartPollingConfig, @@ -1124,36 +1283,6 @@ export interface CrawlFileSingleRes extends CrawlCommonRes { } ``` -#### CrawlPageRes - -```ts -export type CrawlPageRes = R extends - | (string | PageRequestConfig)[] - | CrawlPageConfigObject - ? CrawlPageSingleRes[] - : CrawlPageSingleRes -``` - -#### CrawlDataRes - -```ts -export type CrawlDataRes = R extends - | (string | DataRequestConfig)[] - | CrawlDataConfigObject - ? CrawlDataSingleRes[] - : CrawlDataSingleRes -``` - -#### CrawlFileRes - -```ts -export type CrawlFileRes = R extends - | FileRequestConfig[] - | CrawlFileConfigObject - ? CrawlFileSingleRes[] - : CrawlFileSingleRes -``` - ### API Other #### AnyObject @@ -1167,3 +1296,5 @@ export interface AnyObject extends Object { ## More If you have **problems, needs, good suggestions** please raise **Issues** in https://github.com/coder-hxl/x-crawl/issues. + +thank you for your support. 
diff --git a/docs/cn.md b/docs/cn.md index b02c859c..8ff8c4d4 100644 --- a/docs/cn.md +++ b/docs/cn.md @@ -2,14 +2,14 @@ [English](https://github.com/coder-hxl/x-crawl#x-crawl) | 简体中文 -x-crawl 是一个灵活的 nodejs 爬虫库。可批量爬取页面、批量网络请求、批量下载文件资源、轮询爬取等。用法灵活和简单,对 JS/TS 开发者友好。 +x-crawl 是一个灵活的 nodejs 爬虫库。用于爬页面、爬接口、爬文件以及轮询爬。用法灵活和简单,对 JS/TS 开发者友好。 > 如果你喜欢 x-crawl ,可以给 [x-crawl 存储库](https://github.com/coder-hxl/x-crawl) 点个 Star 支持一下,不仅是对它的认可,同时也是对开发者的认可。 ## 特征 - **🔥 异步/同步** - 只需更改一下 mode 属性即可切换 异步/同步 爬取模式。 -- **⚙️ 多种功能** - 可批量爬取页面、批量网络请求、批量下载文件资源、轮询爬取等。 +- **⚙️ 多种功能** - 可爬页面、爬接口、爬文件以及轮询爬。并且支持爬取单个或多个。 - **🖋️ 写法灵活** - 一种功能适配多种爬取配置、获取爬取结果的写法,写法非常灵活。 - **⏱️ 间隔爬取** - 无间隔/固定间隔/随机间隔,可以有效 使用/避免 高并发爬取。 - **🔄 失败重试** - 可针对所有爬取的请求设置,针对单次爬取的请求设置,针对单个请求设置进行失败重试。 @@ -37,6 +37,8 @@ crawlPage API 内部使用 [puppeteer](https://github.com/puppeteer/puppeteer) - [page 实例](#page-实例) - [爬取接口](#爬取接口) - [爬取文件](#爬取文件) + - [生命周期](#生命周期) + - [beforeSave](#beforeSave) - [启动轮询](#启动轮询) - [配置优先级](#配置优先级) - [间隔时间](#间隔时间) @@ -79,9 +81,6 @@ crawlPage API 内部使用 [puppeteer](https://github.com/puppeteer/puppeteer) - [CrawlPageConfigObject](#CrawlPageConfigObject) - [CrawlDataConfigObject](#CrawlDataConfigObject) - [CrawlFileConfigObject](#CrawlFileConfigObject) - - [CrawlPageConfig](#CrawlPageConfig) - - [CrawlDataConfig](#CrawlDataConfig) - - [CrawlFileConfig](#CrawlFileConfig) - [StartPollingConfig](#StartPollingConfig) - [API Result](#API-Result) - [XCrawlInstance](#XCrawlInstance) @@ -89,9 +88,6 @@ crawlPage API 内部使用 [puppeteer](https://github.com/puppeteer/puppeteer) - [CrawlPageSingleRes](#CrawlPageSingleRes) - [CrawlDataSingleRes](#CrawlDataSingleRes) - [CrawlFileSingleRes](#CrawlFileSingleRes) - - [CrawlPageRes](#CrawlPageRes) - - [CrawlDataRes](#CrawlDataRes) - - [CrawlFileRes](#CrawlFileRes) - [API Other](#API-Other) - [AnyObject](#AnyObject) - [更多](#更多) @@ -127,13 +123,13 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { // 存放图片 URL const imgUrls = [] - const elSelectorMap = ['.carousel-inner img', '.chief-recom-item img', '.bg-item img'] + const elSelectorMap = ['.carousel-inner', '.chief-recom-item', '.bg-item'] for (const item of res) { const { id } = item const { page } = item.data // 获取页面轮播图片元素的 URL - const urls = await page.$$eval(elSelectorMap[id - 1], (imgEls) => + const urls = await page.$$eval(`${elSelectorMap[id - 1]} img`, (imgEls) => imgEls.map((item) => item.src) ) imgUrls.push(...urls) @@ -222,7 +218,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com').then((res) => { +myXCrawl.crawlPage('https://www.example.com').then((res) => { const { browser, page } = res.data // 关闭浏览器 @@ -251,7 +247,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com').then(async (res) => { +myXCrawl.crawlPage('https://www.example.com').then(async (res) => { const { browser, page } = res.data // 获取页面渲染后的截图 @@ -273,9 +269,13 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } }) const requestConfigs = [ - 'https://xxx.com/xxxx', - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', method: 'POST', data: { name: 'coderhxl' } } + 'https://www.example.com/api-1', + 'https://www.example.com/api-2', + 
{ + url: 'https://www.example.com/api-3', + method: 'POST', + data: { name: 'coderhxl' } + } ] myXCrawl.crawlData({ requestConfigs }).then((res) => { @@ -294,7 +294,10 @@ const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } }) myXCrawl .crawlFile({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/file-1', + 'https://www.example.com/file-2' + ], fileConfig: { storeDir: './upload' // 存放文件夹 } @@ -302,6 +305,45 @@ myXCrawl .then((res) => {}) ``` +#### 生命周期 + +crawlFile API 拥有一个声明周期函数: + +- beforeSave: 在保存文件前执行 + +##### beforeSave + +在 beforeSave 函数中你可以拿到 Buffer 类型的文件,你可以对该 Buffer 进行处理,然后需要返回一个 Promise ,并且 resolve 是 Buffer 。 + +**调整图片大小** + +使用 sharp 库对需要爬取的图片进行调整大小操作: + +```js +import xCrawl from 'x-crawl' +import sharp from 'sharp' + +const testXCrawl = xCrawl() + +testXCrawl + .crawlFile({ + requestConfigs: [ + 'https://www.example.com/file-1.jpg', + 'https://www.example.com/file-2.jpg' + ], + fileConfig: { + beforeSave(info) { + return sharp(info.data).resize(200).toBuffer() + } + } + }) + .then((res) => { + res.forEach((item) => { + console.log(item.data?.data.isSuccess) + }) + }) +``` + ### 启动轮询 通过 [startPolling()](#startPolling) 启动一个轮询爬取。 @@ -316,7 +358,7 @@ const myXCrawl = xCrawl({ myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => { // 每隔两个半小时会执行一次 // crawlPage/crawlData/crawlFile - const res = await myXCrawl.crawlPage('https://xxx.com') + const res = await myXCrawl.crawlPage('https://www.example.com') res.data.page.close() }) ``` @@ -351,7 +393,10 @@ const myXCrawl = xCrawl() myXCrawl .crawlData({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/api-1', + 'https://www.example.com/api-2' + ], intervalTime: { max: 2000, min: 1000 } }) .then((res) => {}) @@ -373,7 +418,9 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {}) +myXCrawl + .crawlData({ url: 'https://www.example.com/api', maxRetry: 1 }) + .then((res) => {}) ``` maxRetry 属性决定要重试几次。 @@ -389,9 +436,9 @@ const myXCrawl = xCrawl() myXCrawl .crawlData([ - { url: 'https://xxx.com/xxxx', priority: 1 }, - { url: 'https://xxx.com/xxxx', priority: 10 }, - { url: 'https://xxx.com/xxxx', priority: 8 } + { url: 'https://www.example.com/api-1', priority: 1 }, + { url: 'https://www.example.com/api-2', priority: 10 }, + { url: 'https://www.example.com/api-3', priority: 8 } ]) .then((res) => {}) ``` @@ -420,13 +467,20 @@ x-crawl 本身就是用 TypeScript 编写的,并对 TypeScript 提供了支持 #### 类型 -- [XCrawlBaseConfig](#XCrawlBaseConfig) -- [XCrawlInstance](#XCrawlInstance) +xCrawl API 是一个函数。 ```ts function xCrawl(baseConfig?: XCrawlBaseConfig): XCrawlInstance ``` +**参数类型:** + +- 查看 [XCrawlBaseConfig](#XCrawlBaseConfig) 类型 + +**返回值类型:** + +- 查看 [XCrawlInstance](#XCrawlInstance)类型 + #### 示例 ```js @@ -434,7 +488,7 @@ import xCrawl from 'x-crawl' // xCrawl API const myXCrawl = xCrawl({ - baseUrl: 'https://xxx.com', + baseUrl: 
'https://www.example.com', timeout: 10000, intervalTime: { max: 2000, min: 1000 } }) @@ -446,17 +500,41 @@ crawlPage 是爬虫实例的方法,通常用于爬取页面。 #### 类型 -- 查看 [CrawlPageConfig](#CrawlPageConfig) 类型 -- 查看 [CrawlPageSingleRes](#CrawlPageSingleRes) 类型 -- 查看 [CrawlPageRes](#CrawlPageRes) 类型 +crawlPage API 是一个函数。类型是 [重载函数](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) 可以通过不同的配置参数调用该函数(在类型方面)。 ```ts -function crawlPage: ( - config: T, - callback?: ((res: CrawlPageSingleRes) => void) | undefined -) => Promise> +type crawlPage = { + ( + config: string, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: PageRequestConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: (string | PageRequestConfig)[], + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: CrawlPageConfigObject, + callback?: (res: CrawlPageSingleRes) => void + ): Promise +} ``` +**参数类型:** + +- 查看 [PageRequestConfig](#PageRequestConfig) 类型 +- 查看 [CrawlPageConfigObject](#CrawlPageConfigObject) 类型 + +**返回值类型:** + +- 查看 [CrawlPageSingleRes](#CrawlPageSingleRes) 类型 + #### 示例 ```js @@ -465,7 +543,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() // crawlPage API -myXCrawl.crawlPage('https://xxx.com/xxx').then((res) => { +myXCrawl.crawlPage('https://www.example.com').then((res) => { const { browser, page } = res.data // 关闭浏览器 @@ -491,7 +569,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {}) +myXCrawl.crawlPage('https://www.example.com').then((res) => {}) ``` 拿到的 res 将是一个对象。 @@ -509,7 +587,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlPage({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com', proxy: 'xxx', maxRetry: 1 }) @@ -530,7 +608,10 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() myXCrawl - .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }]) + .crawlPage([ + 'https://www.example.com/page-1', + { url: 'https://www.example.com/page-2', maxRetry: 2 } + ]) .then((res) => {}) ``` @@ -542,20 +623,22 @@ CrawlPageConfigObject 的更多配置选项可以查看 [CrawlPageConfigObject]( 如果你想爬取多个页面,并且请求配置(proxy、cookies、重试等等)不想重复写,需要间隔时间的话,可以试试这种写法: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', maxRetry: 6 } - ], - intervalTime: { max: 3000, min: 1000 }, - cookies: 'xxx', - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlPage({ + requestConfigs: [ + 'https://www.example.com/page-1', + { url: 'https://www.example.com/page-2', maxRetry: 6 } + ], + intervalTime: { max: 3000, min: 1000 }, + cookies: 'xxx', + maxRetry: 1 + }) + .then((res) => {}) ``` 拿到的 res 将是一个数组,里面是对象。 @@ -568,17 +651,41 @@ crawl 是爬虫实例的方法,通常用于爬取 API ,可获取 JSON 数据 #### 类型 -- 查看 [CrawlDataConfig](#CrawlDataConfig) 类型 -- 查看 [CrawlDataSingleRes](#CrawlDataSingleRes) 类型 -- 查看 [CrawlDataRes](#CrawlDataRes) 类型 +crawlData API 是一个函数。类型是 [重载函数](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) 可以通过不同的配置参数调用该函数(在类型方面)。 ```ts -function crawlData( - config: T, - 
callback?: ((res: CrawlDataSingleRes) => void) | undefined -) => Promise> +type crawlData = { + ( + config: DataRequestConfig, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: string, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: (string | DataRequestConfig)[], + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + ( + config: CrawlDataConfigObject, + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> +} ``` +**参数类型:** + +- 查看 [DataRequestConfig](#DataRequestConfig) 类型 +- 查看 [CrawlDataConfigObject](#CrawlDataConfigObject) 类型 + +**返回值类型:** + +- 查看 [CrawlDataSingleRes](#CrawlDataSingleRes) 类型 + #### 示例 ```js @@ -592,7 +699,10 @@ const myXCrawl = xCrawl({ // crawlData API myXCrawl .crawlData({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/api-1', + 'https://www.example.com/api-2' + ], intervalTime: { max: 3000, min: 1000 }, cookies: 'xxx', maxRetry: 1 @@ -620,7 +730,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData('https://xxx.com/xxxx').then((res) => {}) +myXCrawl.crawlData('https://www.example.com/api').then((res) => {}) ``` 拿到的 res 将是一个对象。 @@ -638,7 +748,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlData({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com/api', proxy: 'xxx', maxRetry: 1 }) @@ -659,7 +769,10 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() myXCrawl - .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }]) + .crawlPage([ + 'https://www.example.com/api-1', + { url: 'https://www.example.com/api-2', maxRetry: 2 } + ]) .then((res) => {}) ``` @@ -671,20 +784,22 @@ CrawlPageConfigObject 的更多配置选项可以查看 [CrawlPageConfigObject]( 如果你想爬取多个数据,并且请求配置(proxy、cookies、重试等等)不想重复写,需要间隔时间的话,可以试试这种写法: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', maxRetry: 6 } - ], - intervalTime: { max: 3000, min: 1000 }, - cookies: 'xxx', - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlData({ + requestConfigs: [ + 'https://www.example.com/api-1', + { url: 'https://www.example.com/api-2', maxRetry: 6 } + ], + intervalTime: { max: 3000, min: 1000 }, + cookies: 'xxx', + maxRetry: 1 + }) + .then((res) => {}) ``` 拿到的 res 将是一个数组,里面是对象。 @@ -697,17 +812,36 @@ crawlFile 是爬虫实例的方法,通常用于爬取文件,可获取图片 #### 类型 -- 查看 [CrawlFileConfig](#CrawlFileConfig) 类型 -- 查看 [CrawlFileSingleRes](#CrawlFileSingleRes) 类型 -- 查看 [CrawlFileRes](#CrawlFileRes) 类型 +crawlFile API 是一个函数。类型是 [重载函数](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) 可以通过不同的配置参数调用该函数(在类型方面)。 ```ts -function crawlFile( - config: T, - callback?: ((res: CrawlFileSingleRes) => void) | undefined -) => Promise> +type crawlFile = { + ( + config: FileRequestConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: FileRequestConfig[], + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: CrawlFileConfigObject, + callback?: (res: CrawlFileSingleRes) => void + ): Promise +} ``` +**参数类型:** 
+ +- 查看 [FileRequestConfig](#FileRequestConfig) 类型 +- 查看 [CrawlFileConfigObject](#CrawlFileConfigObject) 类型 + +**返回值类型:** + +- 查看 [CrawlFileSingleRes](#CrawlFileSingleRes) 类型 + #### 示例 ```js @@ -721,7 +855,10 @@ const myXCrawl = xCrawl({ // crawlFile API myXCrawl .crawlFile({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/file-1', + 'https://www.example.com/file-2' + ], storeDir: './upload', intervalTime: { max: 3000, min: 1000 }, maxRetry: 1 @@ -750,7 +887,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlFile({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com/file', proxy: 'xxx', maxRetry: 1, storeDir: './upload', @@ -774,8 +911,8 @@ const myXCrawl = xCrawl() myXCrawl .crawlFile([ - { url: 'https://xxx.com/xxxx', storeDir: './upload' }, - { url: 'https://xxx.com/xxxx', storeDir: './upload', maxRetry: 2 } + { url: 'https://www.example.com/file-1', storeDir: './upload' }, + { url: 'https://www.example.com/file-2', storeDir: './upload', maxRetry: 2 } ]) .then((res) => {}) ``` @@ -788,20 +925,22 @@ CrawlFileConfigObject 的更多配置选项可以查看 [CrawlFileConfigObject]( 如果你想爬取多个数据,并且请求配置(storeDir、proxy、重试等等)不想重复写,需要间隔时间等等的话,可以试试这种写法: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlFile({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', storeDir: './upload/xxx' } - ], - storeDir: './upload', - intervalTime: { max: 3000, min: 1000 }, - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlFile({ + requestConfigs: [ + 'https://www.example.com/file-1', + { url: 'https://www.example.com/file-2', storeDir: './upload/file2' } + ], + storeDir: './upload', + intervalTime: { max: 3000, min: 1000 }, + maxRetry: 1 + }) + .then((res) => {}) ``` 拿到的 res 将是一个数组,里面是对象。 @@ -992,37 +1131,11 @@ export interface CrawlFileConfigObject { fileName: string filePath: string data: Buffer - }) => Buffer | void + }) => Promise } } ``` -##### CrawlPageConfig - -```ts -export type CrawlPageConfig = - | string - | PageRequestConfig - | (string | PageRequestConfig)[] - | CrawlPageConfigObject -``` - -##### CrawlDataConfig - -```ts -export type CrawlDataConfig = - | string - | DataRequestConfig - | (string | DataRequestConfig)[] - | CrawlDataConfigObject -``` - -##### CrawlFileConfig - -```ts -export type CrawlFileConfig = FileRequestConfig | FileRequestConfig[] | CrawlFileConfigObject -``` - ##### StartPollingConfig ```js @@ -1039,20 +1152,66 @@ export interface StartPollingConfig { ```ts export interface XCrawlInstance { - crawlPage: ( - config: T, - callback?: ((res: CrawlPageSingleRes) => void) | undefined - ) => Promise> + crawlPage: { + ( + config: string, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: PageRequestConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: (string | PageRequestConfig)[], + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: CrawlPageConfigObject, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + } - crawlData: ( - config: T, - callback?: ((res: CrawlDataSingleRes) => void) | undefined - ) => Promise> + crawlData: { + ( + config: DataRequestConfig, + callback?: (res: 
CrawlDataSingleRes) => void + ): Promise> + + ( + config: string, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: (string | DataRequestConfig)[], + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + ( + config: CrawlDataConfigObject, + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + } - crawlFile: ( - config: T, - callback?: ((res: CrawlFileSingleRes) => void) | undefined - ) => Promise> + crawlFile: { + ( + config: FileRequestConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: FileRequestConfig[], + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: CrawlFileConfigObject, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + } startPolling: ( config: StartPollingConfig, @@ -1117,36 +1276,6 @@ export interface CrawlFileSingleRes extends CrawlCommonRes { } ``` -#### CrawlPageRes - -```ts -export type CrawlPageRes = R extends - | (string | PageRequestConfig)[] - | CrawlPageConfigObject - ? CrawlPageSingleRes[] - : CrawlPageSingleRes -``` - -#### CrawlDataRes - -```ts -export type CrawlDataRes = R extends - | (string | DataRequestConfig)[] - | CrawlDataConfigObject - ? CrawlDataSingleRes[] - : CrawlDataSingleRes -``` - -#### CrawlFileRes - -```ts -export type CrawlFileRes = R extends - | FileRequestConfig[] - | CrawlFileConfigObject - ? CrawlFileSingleRes[] - : CrawlFileSingleRes -``` - ### API Other #### AnyObject @@ -1160,3 +1289,5 @@ export interface AnyObject extends Object { ## 更多 如果您有 **问题 、需求、好的建议** 请在 https://github.com/coder-hxl/x-crawl/issues 中提 **Issues** 。 + +感谢你们的支持。 diff --git a/package.json b/package.json index 88e7b3ec..ad0c23a4 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "x-crawl", - "version": "5.0.2", + "version": "5.1.0", "author": "coderHXL", "description": "x-crawl is a flexible nodejs crawler library.", "license": "MIT", diff --git a/publish/README.md b/publish/README.md index 29c66c87..d73e34cb 100644 --- a/publish/README.md +++ b/publish/README.md @@ -2,14 +2,14 @@ English | [简体中文](https://github.com/coder-hxl/x-crawl/blob/main/docs/cn.md) -x-crawl is a flexible nodejs crawler library. It can crawl pages in batches, network requests in batches, download file resources in batches, polling and crawling, etc. Flexible and simple to use, friendly to JS/TS developers. +x-crawl is a flexible nodejs crawler library. Used to crawl pages, crawl interfaces, crawl files, and poll crawls. Flexible and simple to use, friendly to JS/TS developers. > If you like x-crawl, you can give [x-crawl repository](https://github.com/coder-hxl/x-crawl) a star to support it, not only for its recognition, but also for Approved by the developer. ## Features - **🔥 Async/Sync** - Just change the mode property to toggle async/sync crawling mode. -- **⚙️ Multiple functions** - Batch crawling of pages, batch network requests, batch download of file resources, polling crawling, etc. +- **⚙️Multiple functions** - Can crawl pages, crawl interfaces, crawl files and poll crawls. And it supports crawling single or multiple. - **🖋️ Flexible writing method** - A function adapts to multiple crawling configurations and obtains crawling results. The writing method is very flexible. - **⏱️ Interval crawling** - no interval/fixed interval/random interval, can effectively use/avoid high concurrent crawling. 
- **🔄 Retry on failure** - It can be set for all crawling requests, for a single crawling request, and for a single request to set a failed retry. @@ -37,6 +37,8 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [page instance](#page-instance) - [Crawl interface](#Crawl-interface) - [Crawl files](#Crawl-files) + - [life cycle](#life-cycle) + - [beforeSave](#beforeSave) - [Start polling](#Start-polling) - [Config priority](#Config-Priority) - [Interval time](#Interval-time) @@ -78,9 +80,6 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [CrawlPageConfigObject](#CrawlPageConfigObject) - [CrawlDataConfigObject](#CrawlDataConfigObject) - [CrawlFileConfigObject](#CrawlFileConfigObject) - - [CrawlPageConfig](#CrawlPageConfig) - - [CrawlDataConfig](#CrawlDataConfig) - - [CrawlFileConfig](#CrawlFileConfig) - [StartPollingConfig](#StartPollingConfig) - [API Result](#API-Result) - [XCrawlInstance](#XCrawlInstance) @@ -88,9 +87,6 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [CrawlPageSingleRes](#CrawlPageSingleRes) - [CrawlDataSingleRes](#CrawlDataSingleRes) - [CrawlFileSingleRes](#CrawlFileSingleRes) - - [CrawlPageRes](#CrawlPageRes) - - [CrawlDataRes](#CrawlDataRes) - - [CrawlFileRes](#CrawlFileRes) - [API Other](#API-Other) - [AnyObject](#AnyObject) - [More](#More) @@ -135,7 +131,7 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { // Gets the URL of the page's wheel image element const boxHandle = await page.$(elSelectorMap[id - 1]) - const urls = await boxHandle!.$$eval('picture img', (imgEls) => { + const urls = await boxHandle.$$eval('picture img', (imgEls) => { return imgEls.map((item) => item.src) }) imgUrls.push(...urls) @@ -224,7 +220,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com').then((res) => { +myXCrawl.crawlPage('https://www.example.com').then((res) => { const { browser, page } = res.data // Close the browser @@ -253,7 +249,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com').then(async (res) => { +myXCrawl.crawlPage('https://www.example.com').then(async (res) => { const { browser, page } = res.data // Get a screenshot of the rendered page @@ -275,9 +271,13 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } }) const requestConfigs = [ - 'https://xxx.com/xxxx', - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', method: 'POST', data: { name: 'coderhxl' } } + 'https://www.example.com/api-1', + 'https://www.example.com/api-2', + { + url: 'https://www.example.com/api-3', + method: 'POST', + data: { name: 'coderhxl' } + } ] myXCrawl.crawlData({ requestConfigs }).then((res) => { @@ -296,7 +296,10 @@ const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } }) myXCrawl .crawlFile({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/file-1', + 'https://www.example.com/file-2' + ], fileConfig: { storeDir: './upload' // storage folder } @@ -306,6 +309,45 @@ myXCrawl }) ``` +#### life 
cycle + +The crawlFile API has a lifetime function: + +- beforeSave: executed before saving the file + +##### beforeSave + +In the beforeSave function you can get a file of type Buffer, which you can process and return a Promise and resolve as a Buffer. + +**Resize picture** + +Use the sharp library to resize the images to be crawled: + +```js +import xCrawl from 'x-crawl' +import sharp from 'sharp' + +const testXCrawl = xCrawl() + +testXCrawl + .crawlFile({ + requestConfigs: [ + 'https://www.example.com/file-1.jpg', + 'https://www.example.com/file-2.jpg' + ], + fileConfig: { + beforeSave(info) { + return sharp(info.data).resize(200).toBuffer() + } + } + }) + .then((res) => { + res.forEach((item) => { + console.log(item.data?.data.isSuccess) + }) + }) +``` + ### Start polling Start a polling crawl with [startPolling()](#startPolling) . @@ -321,7 +363,7 @@ const myXCrawl = xCrawl({ myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => { // will be executed every two and a half hours // crawlPage/crawlData/crawlFile - const res = await myXCrawl.crawlPage('https://xxx.com') + const res = await myXCrawl.crawlPage('https://www.example.com') res.data.page.close() }) ``` @@ -356,7 +398,10 @@ const myXCrawl = xCrawl() myXCrawl .crawlData({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/api-1', + 'https://www.example.com/api-2' + ], intervalTime: { max: 2000, min: 1000 } }) .then((res) => {}) @@ -378,7 +423,9 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {}) +myXCrawl + .crawlData({ url: 'https://www.example.com/api', maxRetry: 1 }) + .then((res) => {}) ``` The maxRetry attribute determines how many times to retry. @@ -394,9 +441,9 @@ const myXCrawl = xCrawl() myXCrawl .crawlData([ - { url: 'https://xxx.com/xxxx', priority: 1 }, - { url: 'https://xxx.com/xxxx', priority: 10 }, - { url: 'https://xxx.com/xxxx', priority: 8 } + { url: 'https://www.example.com/api-1', priority: 1 }, + { url: 'https://www.example.com/api-2', priority: 10 }, + { url: 'https://www.example.com/api-3', priority: 8 } ]) .then((res) => {}) ``` @@ -425,13 +472,20 @@ Create a crawler instance via call xCrawl. The request queue is maintained by th #### Type -- [XCrawlBaseConfig](#XCrawlBaseConfig) -- [XCrawlInstance](#XCrawlInstance) +The xCrawl API is a function. ```ts function xCrawl(baseConfig?: XCrawlBaseConfig): XCrawlInstance ``` +**Parameter Type:** + +- Look at the [XCrawlBaseConfig](#XCrawlBaseConfig) type + +**Return value type:** + +- View [XCrawlInstance](#XCrawlInstance) type + #### Example ```js @@ -439,7 +493,7 @@ import xCrawl from 'x-crawl' // xCrawl API const myXCrawl = xCrawl({ - baseUrl: 'https://xxx.com', + baseUrl: 'https://www.example.com', timeout: 10000, intervalTime: { max: 2000, min: 1000 } }) @@ -453,17 +507,41 @@ crawlPage is the method of the crawler instance, usually used to crawl page. 
#### Type -- Look at the [CrawlPageConfig](#CrawlPageConfig) type -- Look at the [CrawlPageSingleRes](#CrawlPageSingleRes) type -- Look at the [CrawlPageRes](#CrawlPageRes) type +The crawlPage API is a function. A type is an [overloaded function](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) which can be called (in terms of type) with different configuration parameters. ```ts -function crawlPage: ( - config: T, - callback?: ((res: CrawlPageSingleRes) => void) | undefined -) => Promise> +type crawlPage = { + ( + config: string, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: PageRequestConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: (string | PageRequestConfig)[], + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: CrawlPageConfigObject, + callback?: (res: CrawlPageSingleRes) => void + ): Promise +} ``` +**Parameter Type:** + +- Look at the [PageRequestConfig](#PageRequestConfig) type +- Look at the [CrawlPageConfigObject](#CrawlPageConfigObject) type + +**Return value type:** + +- Look at the [CrawlPageSingleRes](#CrawlPageSingleRes) type + #### Example ```js @@ -472,7 +550,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() // crawlPage API -myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => { +myXCrawl.crawlPage('https://www.example.com').then((res) => { const { browser, page } = res.data // Close the browser @@ -498,7 +576,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {}) +myXCrawl.crawlPage('https://www.example.com').then((res) => {}) ``` The res you get will be an object. @@ -516,7 +594,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlPage({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com', proxy: 'xxx', maxRetry: 1 }) @@ -537,7 +615,10 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() myXCrawl - .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }]) + .crawlPage([ + 'https://www.example.com/page-1', + { url: 'https://www.example.com/page-2', maxRetry: 2 } + ]) .then((res) => {}) ``` @@ -549,20 +630,22 @@ For more configuration options of CrawlPageConfigObject, please refer to [CrawlP If you want to crawl multiple pages, and the request configuration (proxy, cookies, retry, etc.) does not want to be written repeatedly, if you need an interval, you can try this way of writing: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlPage({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', maxRetry: 6 } - ], - intervalTime: { max: 3000, min: 1000 }, - cookies: 'xxx', - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlPage({ + requestConfigs: [ + 'https://www.example.com/page-1', + { url: 'https://www.example.com/page-2', maxRetry: 6 } + ], + intervalTime: { max: 3000, min: 1000 }, + cookies: 'xxx', + maxRetry: 1 + }) + .then((res) => {}) ``` The res you get will be an array of objects. 
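The crawlPage overloads above also accept an optional callback as the second argument. A minimal sketch (placeholder URLs; it assumes the callback receives each single page result, as the signatures suggest):

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

myXCrawl.crawlPage(
  ['https://www.example.com/page-1', 'https://www.example.com/page-2'],
  (res) => {
    // res.data exposes the puppeteer browser and page for this target.
    const { page } = res.data
    page.close()
  }
)
```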
@@ -575,17 +658,41 @@ crawlData is the method of the crawler instance, which is usually used to crawl #### Type -- Look at the [CrawlDataConfig](#CrawlDataConfig) type -- Look at the [CrawlDataSingleRes](#CrawlDataSingleRes) type -- Look at the [CrawlDataRes](#CrawlDataRes) type +The crawlData API is a function. A type is an [overloaded function](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) which can be called (in terms of type) with different configuration parameters. ```ts -function crawlData( - config: T, - callback?: ((res: CrawlDataSingleRes) => void) | undefined -) => Promise> +type crawlData = { + ( + config: DataRequestConfig, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: string, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: (string | DataRequestConfig)[], + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + ( + config: CrawlDataConfigObject, + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> +} ``` +**Parameter Type:** + +- See [DataRequestConfig](#DataRequestConfig) type +- Look at the [CrawlDataConfigObject](#CrawlDataConfigObject) type + +**Return value type:** + +- Look at the [CrawlDataSingleRes](#CrawlDataSingleRes) type + #### Example ```js @@ -598,7 +705,10 @@ const myXCrawl = xCrawl({ myXCrawl .crawlData({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/api-1', + 'https://www.example.com/api-2' + ], intervalTime: { max: 3000, min: 1000 }, cookies: 'xxx', maxRetry: 1 @@ -626,7 +736,7 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData('https://xxx.com/xxxx').then((res) => {}) +myXCrawl.crawlData('https://www.example.com/api').then((res) => {}) ``` The res you get will be an object. @@ -644,7 +754,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlData({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com/api', proxy: 'xxx', maxRetry: 1 }) @@ -665,7 +775,10 @@ import xCrawl from 'x-crawl' const myXCrawl = xCrawl() myXCrawl - .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }]) + .crawlData([ + 'https://www.example.com/api-1', + { url: 'https://www.example.com/api-2', maxRetry: 2 } + ]) .then((res) => {}) ``` @@ -677,20 +790,22 @@ For more configuration options of CrawlPageConfigObject, please refer to [CrawlP If you want to crawl multiple data, and the request configuration (proxy, cookies, retry, etc.) 
does not want to be written repeatedly, if you need an interval, you can try this writing method: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlData({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', maxRetry: 6 } - ], - intervalTime: { max: 3000, min: 1000 }, - cookies: 'xxx', - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlData({ + requestConfigs: [ + 'https://www.example.com/api-1', + { url: 'https://www.example.com/api-2', maxRetry: 6 } + ], + intervalTime: { max: 3000, min: 1000 }, + cookies: 'xxx', + maxRetry: 1 + }) + .then((res) => {}) ``` The res you get will be an array of objects. @@ -703,17 +818,36 @@ crawlFile is the method of the crawler instance, which is usually used to crawl #### Type -- Look at the [CrawlFileConfig](#CrawlFileConfig) type -- Look at the [CrawlFileSingleRes](#CrawlFileSingleRes) type -- Look at the [CrawlFileRes](#CrawlFileRes) type +The crawlFile API is a function. A type is an [overloaded function](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) which can be called (in terms of type) with different configuration parameters. ```ts -function crawlFile( - config: T, - callback?: ((res: CrawlFileSingleRes) => void) | undefined -) => Promise> +type crawlFile = { + ( + config: FileRequestConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: FileRequestConfig[], + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: CrawlFileConfigObject, + callback?: (res: CrawlFileSingleRes) => void + ): Promise +} ``` +**Parameter Type:** + +- See [FileRequestConfig](#FileRequestConfig) type +- Look at the [CrawlFileConfigObject](#CrawlFileConfigObject) type + +**Return value type:** + +- Look at the [CrawlFileSingleRes](#CrawlFileSingleRes) type + #### Example ```js @@ -727,7 +861,10 @@ const myXCrawl = xCrawl({ // crawlFile API myXCrawl .crawlFile({ - requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'], + requestConfigs: [ + 'https://www.example.com/file-1', + 'https://www.example.com/file-2' + ], storeDir: './upload', intervalTime: { max: 3000, min: 1000 }, maxRetry: 1 @@ -757,7 +894,7 @@ const myXCrawl = xCrawl() myXCrawl .crawlFile({ - url: 'https://xxx.com/xxxx', + url: 'https://www.example.com/file', proxy: 'xxx', maxRetry: 1, storeDir: './upload', @@ -781,8 +918,8 @@ const myXCrawl = xCrawl() myXCrawl .crawlFile([ - { url: 'https://xxx.com/xxxx', storeDir: './upload' }, - { url: 'https://xxx.com/xxxx', storeDir: './upload', maxRetry: 2 } + { url: 'https://www.example.com/file-1', storeDir: './upload' }, + { url: 'https://www.example.com/file-2', storeDir: './upload', maxRetry: 2 } ]) .then((res) => {}) ``` @@ -795,20 +932,22 @@ For more configuration options of CrawlFileConfigObject, please refer to [CrawlF If you want to crawl multiple data, and the request configuration (storeDir, proxy, retry, etc.) 
does not want to be written repeatedly, and you need interval time, etc., you can try this way of writing: -``` +```js import xCrawl from 'x-crawl' const myXCrawl = xCrawl() -myXCrawl.crawlFile({ - requestConfigs: [ - 'https://xxx.com/xxxx', - { url: 'https://xxx.com/xxxx', storeDir: './upload/xxx' } - ], - storeDir: './upload', - intervalTime: { max: 3000, min: 1000 }, - maxRetry: 1 -}).then((res) => {}) +myXCrawl + .crawlFile({ + requestConfigs: [ + 'https://www.example.com/file-1', + { url: 'https://www.example.com/file-2', storeDir: './upload/xxx' } + ], + storeDir: './upload', + intervalTime: { max: 3000, min: 1000 }, + maxRetry: 1 + }) + .then((res) => {}) ``` The res you get will be an array of objects. @@ -999,38 +1138,12 @@ export interface CrawlFileConfigObject { fileName: string filePath: string data: Buffer - }) => Buffer | void + }) => Promise } } ``` -##### CrawlPageConfig - -```ts -export type CrawlPageConfig = - | string - | PageRequestConfig - | (string | PageRequestConfig)[] - | CrawlPageConfigObject -``` - -##### CrawlDataConfig - -```ts -export type CrawlDataConfig = - | string - | DataRequestConfig - | (string | DataRequestConfig)[] - | CrawlDataConfigObject -``` - -##### CrawlFileConfig - -```ts -export type CrawlFileConfig = FileRequestConfig | FileRequestConfig[] | CrawlFileConfigObject -``` - -##### StartPollingConfig +##### startPollingConfig ```js export interface StartPollingConfig { @@ -1046,20 +1159,66 @@ export interface StartPollingConfig { ```ts export interface XCrawlInstance { - crawlPage: ( - config: T, - callback?: ((res: CrawlPageSingleRes) => void) | undefined - ) => Promise> + crawlPage: { + ( + config: string, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: PageRequestConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: (string | PageRequestConfig)[], + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: CrawlPageConfigObject, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + } - crawlData: ( - config: T, - callback?: ((res: CrawlDataSingleRes) => void) | undefined - ) => Promise> + crawlData: { + ( + config: DataRequestConfig, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: string, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: (string | DataRequestConfig)[], + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + ( + config: CrawlDataConfigObject, + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + } - crawlFile: ( - config: T, - callback?: ((res: CrawlFileSingleRes) => void) | undefined - ) => Promise> + crawlFile: { + ( + config: FileRequestConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: FileRequestConfig[], + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: CrawlFileConfigObject, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + } startPolling: ( config: StartPollingConfig, @@ -1124,36 +1283,6 @@ export interface CrawlFileSingleRes extends CrawlCommonRes { } ``` -#### CrawlPageRes - -```ts -export type CrawlPageRes = R extends - | (string | PageRequestConfig)[] - | CrawlPageConfigObject - ? CrawlPageSingleRes[] - : CrawlPageSingleRes -``` - -#### CrawlDataRes - -```ts -export type CrawlDataRes = R extends - | (string | DataRequestConfig)[] - | CrawlDataConfigObject - ? 
CrawlDataSingleRes[] - : CrawlDataSingleRes -``` - -#### CrawlFileRes - -```ts -export type CrawlFileRes = R extends - | FileRequestConfig[] - | CrawlFileConfigObject - ? CrawlFileSingleRes[] - : CrawlFileSingleRes -``` - ### API Other #### AnyObject @@ -1167,3 +1296,5 @@ export interface AnyObject extends Object { ## More If you have **problems, needs, good suggestions** please raise **Issues** in https://github.com/coder-hxl/x-crawl/issues. + +thank you for your support. diff --git a/publish/package.json b/publish/package.json index aaad28ea..c7f06d9d 100644 --- a/publish/package.json +++ b/publish/package.json @@ -1,6 +1,6 @@ { "name": "x-crawl", - "version": "5.0.2", + "version": "5.1.0", "author": "coderHXL", "description": "x-crawl is a flexible nodejs crawler library.", "license": "MIT", diff --git a/rollup.config.mjs b/rollup.config.mjs index 349f28e2..67d3e423 100644 --- a/rollup.config.mjs +++ b/rollup.config.mjs @@ -19,6 +19,10 @@ console.log(outputMap) export default { input: 'src/index.ts', output: outputMap, + treeshake: { + tryCatchDeoptimization: false, + unknownGlobalSideEffects: false + }, plugins: [ tsPlugin(), getBabelOutputPlugin({ diff --git a/src/api.ts b/src/api.ts index 78d2741e..f2dcd1da 100644 --- a/src/api.ts +++ b/src/api.ts @@ -29,12 +29,9 @@ import { StartPollingConfig, LoaderCrawlPageConfig, CrawlPageConfigObject, - CrawlPageRes, LoaderCrawlDataConfig, LoaderCrawlFileConfig, CrawlDataSingleRes, - CrawlDataRes, - CrawlFileRes, CrawlFileSingleRes, CrawlDataConfigObject, LoaderPageRequestConfig, @@ -320,10 +317,30 @@ export function createCrawlPage(baseConfig: LoaderXCrawlBaseConfig) { // 通过 爬取cId 找到对应爬取, 再通过 爬取id 找到 page const errorPageContainer = new Map>() - async function crawlPage( - config: T, + function crawlPage( + config: string, callback?: (res: CrawlPageSingleRes) => void - ): Promise> { + ): Promise + + function crawlPage( + config: PageRequestConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + function crawlPage( + config: (string | PageRequestConfig)[], + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + function crawlPage( + config: CrawlPageConfigObject, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + async function crawlPage( + config: CrawlPageConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise { const cId = ++cIdCount // 创建浏览器 @@ -405,7 +422,7 @@ export function createCrawlPage(baseConfig: LoaderXCrawlBaseConfig) { ? 
crawlResArr : crawlResArr[0] - return crawlRes as CrawlPageRes + return crawlRes } async function crawlPageSingle( @@ -466,10 +483,30 @@ export function createCrawlPage(baseConfig: LoaderXCrawlBaseConfig) { } export function createCrawlData(baseConfig: LoaderXCrawlBaseConfig) { - async function crawlData( - config: T, - callback?: (res: CrawlDataSingleRes) => void - ): Promise> { + function crawlData( + config: string, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + function crawlData( + config: DataRequestConfig, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + function crawlData( + config: (string | DataRequestConfig)[], + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + function crawlData( + config: CrawlDataConfigObject, + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + async function crawlData( + config: CrawlDataConfig, + callback?: (res: CrawlDataSingleRes) => void + ): Promise | CrawlDataSingleRes[]> { const { requestConfigs, intervalTime } = loaderDataConfig( baseConfig, config @@ -484,7 +521,7 @@ export function createCrawlData(baseConfig: LoaderXCrawlBaseConfig) { crawlRequestSingle ) - const crawlResArr: CrawlDataSingleRes[] = controllerRes.map((item) => { + const crawlResArr: CrawlDataSingleRes[] = controllerRes.map((item) => { const { id, isSuccess, @@ -494,7 +531,7 @@ export function createCrawlData(baseConfig: LoaderXCrawlBaseConfig) { crawlSingleRes } = item - const crawlRes: CrawlDataSingleRes = { + const crawlRes: CrawlDataSingleRes = { id, isSuccess, maxRetry, @@ -507,7 +544,7 @@ export function createCrawlData(baseConfig: LoaderXCrawlBaseConfig) { if (isSuccess && crawlSingleRes) { const contentType = crawlSingleRes.headers['content-type'] ?? '' - const data: D = contentType.includes('text') + const data: T = contentType.includes('text') ? crawlSingleRes.data.toString() : JSON.parse(crawlSingleRes.data.toString()) @@ -527,17 +564,32 @@ export function createCrawlData(baseConfig: LoaderXCrawlBaseConfig) { ? 
crawlResArr : crawlResArr[0] - return crawlRes as CrawlDataRes + return crawlRes } return crawlData } export function createCrawlFile(baseConfig: LoaderXCrawlBaseConfig) { - async function crawlFile( - config: T, + function crawlFile( + config: FileRequestConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + function crawlFile( + config: FileRequestConfig[], + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + function crawlFile( + config: CrawlFileConfigObject, callback?: (res: CrawlFileSingleRes) => void - ): Promise> { + ): Promise + + async function crawlFile( + config: CrawlFileConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise { const { requestConfigs, intervalTime, fileConfig } = loaderFileConfig( baseConfig, config @@ -595,49 +647,47 @@ export function createCrawlFile(baseConfig: LoaderXCrawlBaseConfig) { const filePath = path.resolve(storePath, fileName + fileExtension) // 在保存前的回调 - let data = crawlSingleRes.data + const data = crawlSingleRes.data + let dataPromise = Promise.resolve(data) if (fileConfig?.beforeSave) { - const newData = fileConfig.beforeSave({ + dataPromise = fileConfig.beforeSave({ id, fileName, filePath, data }) - - if (newData) { - data = newData - } } - const saveFileItem = writeFile(filePath, data) - .catch((err) => { + const saveFileItem = dataPromise.then(async (newData) => { + let isSuccess = true + try { + await writeFile(filePath, newData) + } catch (err: any) { + isSuccess = false + const message = `File save error at id ${id}: ${err.message}` const valueOf = () => id saveFileErrorArr.push({ message, valueOf }) + } - return true - }) - .then((isError) => { - const size = crawlSingleRes.data.length - const isSuccess = !isError - - crawlRes.data = { - ...crawlSingleRes, - data: { - isSuccess, - fileName, - fileExtension, - mimeType, - size, - filePath - } + const size = newData.length + crawlRes.data = { + ...crawlSingleRes, + data: { + isSuccess, + fileName, + fileExtension, + mimeType, + size, + filePath } + } - if (callback) { - callback(crawlRes) - } - }) + if (callback) { + callback(crawlRes) + } + }) saveFileQueue.push(saveFileItem) } else { @@ -687,7 +737,7 @@ export function createCrawlFile(baseConfig: LoaderXCrawlBaseConfig) { ? 
crawlResArr : crawlResArr[0] - return crawlRes as CrawlFileRes + return crawlRes } return crawlFile diff --git a/src/types/api.ts b/src/types/api.ts index 17fed09e..09844de7 100644 --- a/src/types/api.ts +++ b/src/types/api.ts @@ -31,10 +31,28 @@ export interface LoaderCrawlFileConfig requestConfigs: LoaderFileRequestConfig[] } +// Function overloading crawl config +export type CrawlPageConfig = + | string + | PageRequestConfig + | (string | PageRequestConfig)[] + | CrawlPageConfigObject + +export type CrawlDataConfig = + | string + | DataRequestConfig + | (string | DataRequestConfig)[] + | CrawlDataConfigObject + +export type CrawlFileConfig = + | FileRequestConfig + | FileRequestConfig[] + | CrawlFileConfigObject + /* API Config */ +// API Config Other export type IntervalTime = number | { max: number; min?: number } -// RequestConfig export type Method = | 'get' | 'GET' @@ -62,6 +80,7 @@ export type PageRequestConfigCookies = | Protocol.Network.CookieParam | Protocol.Network.CookieParam[] +// API Config Request export interface PageRequestConfig { url: string headers?: AnyObject @@ -96,7 +115,7 @@ export interface FileRequestConfig { extension?: string } -// CrawlConfig +// API Config Crawl export interface CrawlPageConfigObject { requestConfigs: (string | PageRequestConfig)[] proxy?: string @@ -128,27 +147,10 @@ export interface CrawlFileConfigObject { fileName: string filePath: string data: Buffer - }) => Buffer | void + }) => Promise } } -export type CrawlPageConfig = - | string - | PageRequestConfig - | (string | PageRequestConfig)[] - | CrawlPageConfigObject - -export type CrawlDataConfig = - | string - | DataRequestConfig - | (string | DataRequestConfig)[] - | CrawlDataConfigObject - -export type CrawlFileConfig = - | FileRequestConfig - | FileRequestConfig[] - | CrawlFileConfigObject - export interface StartPollingConfig { d?: number h?: number @@ -195,21 +197,3 @@ export interface CrawlFileSingleRes extends CrawlCommonRes { } } | null } - -export type CrawlPageRes = R extends - | (string | PageRequestConfig)[] - | CrawlPageConfigObject - ? CrawlPageSingleRes[] - : CrawlPageSingleRes - -export type CrawlDataRes = R extends - | (string | DataRequestConfig)[] - | CrawlDataConfigObject - ? CrawlDataSingleRes[] - : CrawlDataSingleRes - -export type CrawlFileRes = R extends - | FileRequestConfig[] - | CrawlFileConfigObject - ? 
CrawlFileSingleRes[] - : CrawlFileSingleRes diff --git a/src/types/index.ts b/src/types/index.ts index 2c69aca2..d2f6a11f 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -1,15 +1,15 @@ import { - CrawlFileConfig, - CrawlPageConfig, StartPollingConfig, IntervalTime, - CrawlPageRes, - CrawlDataConfig, CrawlPageSingleRes, - CrawlDataRes, CrawlDataSingleRes, CrawlFileSingleRes, - CrawlFileRes + CrawlFileConfigObject, + FileRequestConfig, + DataRequestConfig, + CrawlDataConfigObject, + PageRequestConfig, + CrawlPageConfigObject } from './api' export interface XCrawlBaseConfig { @@ -28,20 +28,66 @@ export type LoaderXCrawlBaseConfig = XCrawlBaseConfig & { } export interface XCrawlInstance { - crawlPage: ( - config: T, - callback?: ((res: CrawlPageSingleRes) => void) | undefined - ) => Promise> - - crawlData: ( - config: T, - callback?: ((res: CrawlDataSingleRes) => void) | undefined - ) => Promise> - - crawlFile: ( - config: T, - callback?: ((res: CrawlFileSingleRes) => void) | undefined - ) => Promise> + crawlPage: { + ( + config: string, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: PageRequestConfig, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: (string | PageRequestConfig)[], + callback?: (res: CrawlPageSingleRes) => void + ): Promise + + ( + config: CrawlPageConfigObject, + callback?: (res: CrawlPageSingleRes) => void + ): Promise + } + + crawlData: { + ( + config: DataRequestConfig, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: string, + callback?: (res: CrawlDataSingleRes) => void + ): Promise> + + ( + config: (string | DataRequestConfig)[], + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + + ( + config: CrawlDataConfigObject, + callback?: (res: CrawlDataSingleRes) => void + ): Promise[]> + } + + crawlFile: { + ( + config: FileRequestConfig, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: FileRequestConfig[], + callback?: (res: CrawlFileSingleRes) => void + ): Promise + + ( + config: CrawlFileConfigObject, + callback?: (res: CrawlFileSingleRes) => void + ): Promise + } startPolling: ( config: StartPollingConfig, diff --git a/test/environment/crawlFile.test.ts b/test/environment/crawlFile.test.ts index c5d49259..466574ac 100644 --- a/test/environment/crawlFile.test.ts +++ b/test/environment/crawlFile.test.ts @@ -119,8 +119,9 @@ async function storeConfig() { fileConfig: { storeDir: path.resolve(__dirname, './upload'), extension: '.jpg', - beforeSave(info) { + async beforeSave(info) { record.push(info.fileName) + return info.data } } }) diff --git a/test/start/index.js b/test/start/index.js index e8320697..7f67a46a 100644 --- a/test/start/index.js +++ b/test/start/index.js @@ -1 +1 @@ -"use strict";var e=require("node:fs"),t=require("node:fs/promises"),r=require("node:path"),n=require("puppeteer"),o=require("chalk"),s=require("node:http"),i=require("node:https"),a=require("node:url"),u=require("https-proxy-agent");const c=console.log,l=o.hex("#a57fff"),f=o.green,m=o.red,p=o.yellow;function h(e){return void 0===e}function d(e){return"number"==typeof e}function g(e){return"object"==typeof e&&e&&!Array.isArray(e)}function y(e){return Array.isArray(e)}async function w(e,t,r,n){if(e&&n>1){const e=t?r:function(e,t=0){let r=Math.floor(Math.random()*e);for(;rsetTimeout(t,e)))}(e)}else c(`Id: ${l(n)} - Crawl does not need to sleep, send immediately`)}async function x(e,t,r,n){const o=!h(t),s=d(t),i=[];for(const a of e){const{id:e}=a;await 
w(o,s,t,e),a.crawlCount++;const u=n(a,r).catch((e=>(a.errorQueue.push(e),!1))).then((e=>{!1!==e&&(a.isSuccess=!0,a.crawlSingleRes=e)}));i.push(u)}await Promise.all(i)}async function C(e,t,r,n){const o=!h(t),s=d(t);for(const i of e){const{id:e}=i;await w(o,s,t,e),i.crawlCount++;try{i.crawlSingleRes=await n(i,r),i.isSuccess=!0}catch(e){i.errorQueue.push(e)}}}function S(e,t,r){const n=e[t];e[t]=e[r],e[r]=n}function v(e){if(1===e.length)return e;const t=Math.floor(e.length/2),r=v(e.slice(0,t)),n=v(e.slice(t)),o=[];let s=0,i=0;for(;s=n[i]?(o.push(r[s]),s++):(o.push(n[i]),i++);return se.priority===r[0].priority))?v(r.map((e=>({...e,valueOf:()=>e.priority})))):r).map(((e,t)=>({id:t+1,isSuccess:!1,maxRetry:e.maxRetry,crawlCount:0,errorQueue:[],requestConfig:e,crawlSingleRes:null})));c(`${f("Start crawling")} - name: ${p(e)}, mode: ${p(t)}, total: ${l(i.length)} `);const a="async"===t?x:C;let u=i;for(;u.length;)if(await a(u,n,o,s),u=u.filter((e=>e.maxRetry&&!e.isSuccess&&e.crawlCount<=e.maxRetry)),u.length){const e=u.map((e=>e.id));c(p(`Ids to retry: [ ${e.join(" - ")} ]`))}const h=[],d=[];return i.forEach((e=>{e.isSuccess?h.push(e.id):d.push(e.id)})),c("Crawl the final result:"),c(f(` Success - total: ${h.length}, ids: [ ${h.join(" - ")} ]`)),c(m(` Error - total: ${d.length}, ids: [ ${d.join(" - ")} ]`)),i}function R(e,t){let r=e?`${e}`:"?";if(t)for(const e in t){r+=`&${e}=${t[e]}`}else r=e;return r}function $(e){const{protocol:t,hostname:r,port:n,pathname:o,search:c}=new a.URL(e.url),l="http:"===t,f={agent:e.proxy?u(e.proxy):l?new s.Agent:new i.Agent,protocol:t,hostname:r,port:n,path:o,search:R(c,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const r={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(r["Content-Type"]="application/json",r["Content-Length"]=Buffer.byteLength(e.data)),r}(e,f),f}async function T(e){const{requestConfig:t}=e;return await(r=t,new Promise(((e,t)=>{const n=h(r.data);r.data=n?r.data:JSON.stringify(r.data);const o=$(r);function a(t){const{statusCode:r,headers:n}=t,o=[];t.on("data",(e=>o.push(e))),t.on("end",(()=>{const t=Buffer.concat(o);e({statusCode:r,headers:n,data:t})}))}let u;u="http:"===o.protocol?s.request(o,a):i.request(o,a),u.on("timeout",(()=>{t(new Error(`Timeout ${r.timeout}ms`))})),u.on("error",(e=>{t(e)})),"POST"!==o.method||n||u.write(r.data),u.end()})));var r}function b(e){return y(e)?e.map((e=>g(e)?e:{url:e})):[g(e)?e:{url:e}]}function O(e,t,r){r.requestConfigs=t.map((t=>{let{url:n,timeout:o,proxy:s,maxRetry:i,priority:a}=t;return h(e.baseUrl)||(n=e.baseUrl+n),h(o)&&(o=h(r.timeout)?e.timeout:r.timeout),h(s)&&(h(r.proxy)?h(e.proxy)||(s=e.proxy):s=r.proxy),h(i)&&(i=h(r.maxRetry)?e.maxRetry:r.maxRetry),h(a)&&(a=0),{...t,url:n,timeout:o,proxy:s,maxRetry:i,priority:a}})),h(r.intervalTime)&&!h(e.intervalTime)&&(r.intervalTime=e.intervalTime)}function j(e){let t=null,r=null,o=!1,s=0;const i=new Map;async function a(e,r){const{id:n,requestConfig:o}=e,s=await t.newPage();await s.setViewport({width:1280,height:1024});let a=null;try{o.proxy?await t.createIncognitoBrowserContext({proxyServer:o.proxy}):await t.createIncognitoBrowserContext({proxyServer:void 0}),o.headers&&await s.setExtraHTTPHeaders(o.headers),o.cookies&&await s.setCookie(...function(e,t){const r=[];return"string"==typeof t?t.split("; ").forEach((t=>{const 
n=t.split("=");r.push({name:n[0],value:n[1],url:e})})):Array.isArray(t)?t.forEach((t=>{t.url||(t.url=e),r.push(t)})):"object"==typeof t&&t&&(t.url||(t.url=e),r.push(t)),r}(o.url,o.cookies)),a=await s.goto(o.url,{timeout:o.timeout})}catch(e){let t=i.get(r);throw t||(t=new Map,i.set(r,t)),t.get(n)||t.set(n,s),e}return{response:a,page:s}}return async function(u,c){const l=++s;o||(o=!0,r=n.launch().then((e=>{t=e}))),r&&(await r,r&&(r=null));const{requestConfigs:f,intervalTime:m}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:s,cookies:i,intervalTime:a,maxRetry:u}=t;r.proxy=o,r.cookies=i,r.intervalTime=a,r.maxRetry=u,r.timeout=s,n.push(...b(e))}else{const e=b(t);n.push(...e)}return O(e,n,r),h(r.cookies)||r.requestConfigs.forEach((e=>{const{cookies:t}=e;h(t)&&!h(r.cookies)&&(e.cookies=r.cookies)})),r}(e,u),p=(await q("page",e.mode,f,m,l,a)).map((e=>{const{id:r,isSuccess:n,maxRetry:o,crawlCount:s,errorQueue:a,crawlSingleRes:u}=e;let f=null;if(n&&u)f={browser:t,...u};else{const e=i.get(l).get(r);f={browser:t,response:null,page:e}}const m={id:r,isSuccess:n,maxRetry:o,crawlCount:s,retryCount:s-1,errorQueue:a,data:f};return c&&c(m),m}));return i.delete(l),y(u)||g(u)&&Object.hasOwn(u,"requestConfigs")?p:p[0]}}function k(e){return async function(t,r){const{requestConfigs:n,intervalTime:o}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:s,intervalTime:i,maxRetry:a}=t;r.proxy=o,r.intervalTime=i,r.maxRetry=a,r.timeout=s,n.push(...b(e))}else{const e=b(t);n.push(...b(e))}return O(e,n,r),r}(e,t),s=(await q("data",e.mode,n,o,void 0,T)).map((e=>{const{id:t,isSuccess:n,maxRetry:o,crawlCount:s,errorQueue:i,crawlSingleRes:a}=e,u={id:t,isSuccess:n,maxRetry:o,crawlCount:s,retryCount:s-1,errorQueue:i,data:null};if(n&&a){const e=(a.headers["content-type"]??"").includes("text")?a.data.toString():JSON.parse(a.data.toString());u.data={...a,data:e}}return r&&r(u),u}));return y(t)||g(t)&&Object.hasOwn(t,"requestConfigs")?s:s[0]}}function P(n){return async function(o,s){const{requestConfigs:i,intervalTime:a,fileConfig:u}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:s,intervalTime:i,maxRetry:a,fileConfig:u}=t;r.proxy=o,r.intervalTime=i,r.maxRetry=a,r.timeout=s,r.fileConfig=u,n.push(...b(e))}else n.push(...y(t)?t:[t]);return O(e,n,r),h(r.fileConfig?.storeDir)&&h(r.fileConfig?.extension)||r.requestConfigs.forEach((e=>{h(e.storeDir)&&!h(r.fileConfig?.storeDir)&&(e.storeDir=r.fileConfig.storeDir),h(e.extension)&&!h(r.fileConfig?.extension)&&(e.extension=r.fileConfig.extension)})),r}(n,o),l=await q("file",n.mode,i,a,void 0,T),p=[],d=[],w=l.map((n=>{const{id:o,isSuccess:i,maxRetry:a,crawlCount:c,errorQueue:l,crawlSingleRes:f,requestConfig:m}=n,g={id:o,isSuccess:i,maxRetry:a,crawlCount:c,retryCount:c-1,errorQueue:l,data:null};if(i&&f){const n=f.headers["content-type"]??"",i=m.fileName??`${o}-${(new Date).getTime()}`,a=m.extension??`.${n.split("/").pop()}`;h(m.storeDir)||e.existsSync(m.storeDir)||(y=m.storeDir,r.resolve(y).split(r.sep).reduce(((t,n,o)=>{const s=0!==o?r.join(t,n):n;return e.existsSync(s)||e.mkdirSync(s),s}),""));const c=m.storeDir??__dirname,l=r.resolve(c,i+a);let w=f.data;if(u?.beforeSave){const e=u.beforeSave({id:o,fileName:i,filePath:l,data:w});e&&(w=e)}const x=t.writeFile(l,w).catch((e=>{const t=`File save error at id ${o}: ${e.message}`;return 
d.push({message:t,valueOf:()=>o}),!0})).then((e=>{const t=f.data.length,r=!e;g.data={...f,data:{isSuccess:r,fileName:i,fileExtension:a,mimeType:n,size:t,filePath:l}},s&&s(g)}));p.push(x)}else s&&s(g);var y;return g}));var x;await Promise.all(p),(x=d,function e(t,r){if(t>=r)return;const n=x[r];let o=t,s=r-1;for(;o<=s;){for(;x[o]n;)s--;o<=s&&(S(x,o,s),o++,s--)}S(x,o,r),e(t,o-1),e(o+1,r)}(0,x.length-1),x).forEach((e=>c(m(e.message))));const C=[],v=[];return w.forEach((e=>{e.data?.data.isSuccess?C.push(e.id):v.push(e.id)})),c("Save file final result:"),c(f(` Success - total: ${C.length}, ids: [ ${C.join(" - ")} ]`)),c(m(` Error - total: ${v.length}, ids: [ ${v.join(" - ")} ]`)),y(o)||g(o)&&Object.hasOwn(o,"requestConfigs")?w:w[0]}}function E(e,t){const{d:r,h:n,m:o}=e,s=(h(r)?0:1e3*r*60*60*24)+(h(n)?0:1e3*n*60*60)+(h(o)?0:1e3*o*60);let i=0;u();const a=setInterval(u,s);function u(){console.log(f(`Start the ${p.bold(++i)} polling`)),t(i,c)}function c(){clearInterval(a),console.log(f("Stop the polling"))}}const D=function(e){const t=function(e){const t=e||{};return t.mode||(t.mode="async"),h(e?.timeout)&&(t.timeout=1e4),h(e?.maxRetry)&&(t.maxRetry=0),t}(e);return function(e){return{crawlPage:j(e),crawlData:k(e),crawlFile:P(e),startPolling:E}}(t)}({maxRetry:3,intervalTime:{max:3e3,min:2e3}});D.startPolling({d:1},(async(e,t)=>{const r=await D.crawlPage(["https://zh.airbnb.com/s/hawaii/experiences","https://zh.airbnb.com/s/hawaii/plus_homes"]),n=[],o=[".c14whb16",".a1stauiv"];for(const e of r){const{id:t}=e,{page:r}=e.data,s=await r.$(o[t-1]),i=await s.$$eval("picture img",(e=>e.map((e=>e.src))));n.push(...i),r.close()}D.crawlFile({requestConfigs:n,fileConfig:{storeDir:"./upload"}})})); +"use strict";var e=require("node:fs"),t=require("node:fs/promises"),r=require("node:path"),o=require("puppeteer"),n=require("chalk"),s=require("node:http"),i=require("node:https"),a=require("node:url"),u=require("https-proxy-agent"),c=require("sharp"),l=require("path");const f=console.log,p=n.hex("#a57fff"),h=n.green,m=n.red,d=n.yellow;function g(e){return void 0===e}function y(e){return"number"==typeof e}function w(e){return"object"==typeof e&&e&&!Array.isArray(e)}function x(e){return Array.isArray(e)}async function C(e,t,r,o){if(e&&o>1){const e=t?r:function(e,t=0){let r=Math.floor(Math.random()*e);for(;rsetTimeout(t,e)))}(e)}else f(`Id: ${p(o)} - Crawl does not need to sleep, send immediately`)}async function S(e,t,r,o){const n=!g(t),s=y(t),i=[];for(const a of e){const{id:e}=a;await C(n,s,t,e),a.crawlCount++;const u=o(a,r).catch((e=>(a.errorQueue.push(e),!1))).then((e=>{!1!==e&&(a.isSuccess=!0,a.crawlSingleRes=e)}));i.push(u)}await Promise.all(i)}async function v(e,t,r,o){const n=!g(t),s=y(t);for(const i of e){const{id:e}=i;await C(n,s,t,e),i.crawlCount++;try{i.crawlSingleRes=await o(i,r),i.isSuccess=!0}catch(e){i.errorQueue.push(e)}}}function q(e,t,r){const o=e[t];e[t]=e[r],e[r]=o}function R(e){if(1===e.length)return e;const t=Math.floor(e.length/2),r=R(e.slice(0,t)),o=R(e.slice(t)),n=[];let s=0,i=0;for(;s=o[i]?(n.push(r[s]),s++):(n.push(o[i]),i++);return se.priority===r[0].priority))?R(r.map((e=>({...e,valueOf:()=>e.priority})))):r).map(((e,t)=>({id:t+1,isSuccess:!1,maxRetry:e.maxRetry,crawlCount:0,errorQueue:[],requestConfig:e,crawlSingleRes:null})));f(`${h("Start crawling")} - name: ${d(e)}, mode: ${d(t)}, total: ${p(i.length)} `);const a="async"===t?S:v;let u=0,c=i;for(;c.length;)if(await 
a(c,o,n,s),c=c.filter((e=>e.maxRetry&&!e.isSuccess&&e.crawlCount<=e.maxRetry)),c.length){const e=c.map((e=>e.id));f(d(`Retry: ${++u} - Ids to retry: [ ${e.join(" - ")} ]`))}const l=[],g=[];return i.forEach((e=>{e.isSuccess?l.push(e.id):g.push(e.id)})),f("Crawl the final result:"),f(h(` Success - total: ${l.length}, ids: [ ${l.join(" - ")} ]`)),f(m(` Error - total: ${g.length}, ids: [ ${g.join(" - ")} ]`)),i}function T(e,t){let r=e?`${e}`:"?";if(t)for(const e in t){r+=`&${e}=${t[e]}`}else r=e;return r}function b(e){const{protocol:t,hostname:r,port:o,pathname:n,search:c}=new a.URL(e.url),l="http:"===t,f={agent:e.proxy?u(e.proxy):l?new s.Agent:new i.Agent,protocol:t,hostname:r,port:o,path:n,search:T(c,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const r={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(r["Content-Type"]="application/json",r["Content-Length"]=Buffer.byteLength(e.data)),r}(e,f),f}async function O(e){const{requestConfig:t}=e;return await(r=t,new Promise(((e,t)=>{const o=g(r.data);r.data=o?r.data:JSON.stringify(r.data);const n=b(r);function a(t){const{statusCode:r,headers:o}=t,n=[];t.on("data",(e=>n.push(e))),t.on("end",(()=>{const t=Buffer.concat(n);e({statusCode:r,headers:o,data:t})}))}let u;u="http:"===n.protocol?s.request(n,a):i.request(n,a),u.on("timeout",(()=>{t(new Error(`Timeout ${r.timeout}ms`))})),u.on("error",(e=>{t(e)})),"POST"!==n.method||o||u.write(r.data),u.end()})));var r}function j(e){return x(e)?e.map((e=>w(e)?e:{url:e})):[w(e)?e:{url:e}]}function k(e,t,r){r.requestConfigs=t.map((t=>{let{url:o,timeout:n,proxy:s,maxRetry:i,priority:a}=t;return g(e.baseUrl)||(o=e.baseUrl+o),g(n)&&(n=g(r.timeout)?e.timeout:r.timeout),g(s)&&(g(r.proxy)?g(e.proxy)||(s=e.proxy):s=r.proxy),g(i)&&(i=g(r.maxRetry)?e.maxRetry:r.maxRetry),g(a)&&(a=0),{...t,url:o,timeout:n,proxy:s,maxRetry:i,priority:a}})),g(r.intervalTime)&&!g(e.intervalTime)&&(r.intervalTime=e.intervalTime)}function E(e){let t=null,r=null,n=!1,s=0;const i=new Map;async function a(e,r){const{id:o,requestConfig:n}=e,s=await t.newPage();await s.setViewport({width:1280,height:1024});let a=null;try{n.proxy?await t.createIncognitoBrowserContext({proxyServer:n.proxy}):await t.createIncognitoBrowserContext({proxyServer:void 0}),n.headers&&await s.setExtraHTTPHeaders(n.headers),n.cookies&&await s.setCookie(...function(e,t){const r=[];return"string"==typeof t?t.split("; ").forEach((t=>{const o=t.split("=");r.push({name:o[0],value:o[1],url:e})})):Array.isArray(t)?t.forEach((t=>{t.url||(t.url=e),r.push(t)})):"object"==typeof t&&t&&(t.url||(t.url=e),r.push(t)),r}(n.url,n.cookies)),a=await s.goto(n.url,{timeout:n.timeout})}catch(e){let t=i.get(r);throw t||(t=new Map,i.set(r,t)),t.get(o)||t.set(o,s),e}return{response:a,page:s}}return async function(u,c){const l=++s;n||(n=!0,r=o.launch().then((e=>{t=e}))),r&&(await r,r&&(r=null));const{requestConfigs:f,intervalTime:p}=function(e,t){const r={requestConfigs:[]},o=[];if(w(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:n,timeout:s,cookies:i,intervalTime:a,maxRetry:u}=t;r.proxy=n,r.cookies=i,r.intervalTime=a,r.maxRetry=u,r.timeout=s,o.push(...j(e))}else{const e=j(t);o.push(...e)}return k(e,o,r),g(r.cookies)||r.requestConfigs.forEach((e=>{const{cookies:t}=e;g(t)&&!g(r.cookies)&&(e.cookies=r.cookies)})),r}(e,u),h=(await 
$("page",e.mode,f,p,l,a)).map((e=>{const{id:r,isSuccess:o,maxRetry:n,crawlCount:s,errorQueue:a,crawlSingleRes:u}=e;let f=null;if(o&&u)f={browser:t,...u};else{const e=i.get(l).get(r);f={browser:t,response:null,page:e}}const p={id:r,isSuccess:o,maxRetry:n,crawlCount:s,retryCount:s-1,errorQueue:a,data:f};return c&&c(p),p}));return i.delete(l),x(u)||w(u)&&Object.hasOwn(u,"requestConfigs")?h:h[0]}}function P(e){return async function(t,r){const{requestConfigs:o,intervalTime:n}=function(e,t){const r={requestConfigs:[]},o=[];if(w(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:n,timeout:s,intervalTime:i,maxRetry:a}=t;r.proxy=n,r.intervalTime=i,r.maxRetry=a,r.timeout=s,o.push(...j(e))}else{const e=j(t);o.push(...j(e))}return k(e,o,r),r}(e,t),s=(await $("data",e.mode,o,n,void 0,O)).map((e=>{const{id:t,isSuccess:o,maxRetry:n,crawlCount:s,errorQueue:i,crawlSingleRes:a}=e,u={id:t,isSuccess:o,maxRetry:n,crawlCount:s,retryCount:s-1,errorQueue:i,data:null};if(o&&a){const e=(a.headers["content-type"]??"").includes("text")?a.data.toString():JSON.parse(a.data.toString());u.data={...a,data:e}}return r&&r(u),u}));return x(t)||w(t)&&Object.hasOwn(t,"requestConfigs")?s:s[0]}}function D(o){return async function(n,s){const{requestConfigs:i,intervalTime:a,fileConfig:u}=function(e,t){const r={requestConfigs:[]},o=[];if(w(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:n,timeout:s,intervalTime:i,maxRetry:a,fileConfig:u}=t;r.proxy=n,r.intervalTime=i,r.maxRetry=a,r.timeout=s,r.fileConfig=u,o.push(...j(e))}else o.push(...x(t)?t:[t]);return k(e,o,r),g(r.fileConfig?.storeDir)&&g(r.fileConfig?.extension)||r.requestConfigs.forEach((e=>{g(e.storeDir)&&!g(r.fileConfig?.storeDir)&&(e.storeDir=r.fileConfig.storeDir),g(e.extension)&&!g(r.fileConfig?.extension)&&(e.extension=r.fileConfig.extension)})),r}(o,n),c=await $("file",o.mode,i,a,void 0,O),l=[],p=[],d=c.map((o=>{const{id:n,isSuccess:i,maxRetry:a,crawlCount:c,errorQueue:f,crawlSingleRes:h,requestConfig:m}=o,d={id:n,isSuccess:i,maxRetry:a,crawlCount:c,retryCount:c-1,errorQueue:f,data:null};if(i&&h){const o=h.headers["content-type"]??"",i=m.fileName??`${n}-${(new Date).getTime()}`,a=m.extension??`.${o.split("/").pop()}`;g(m.storeDir)||e.existsSync(m.storeDir)||(y=m.storeDir,r.resolve(y).split(r.sep).reduce(((t,o,n)=>{const s=0!==n?r.join(t,o):o;return e.existsSync(s)||e.mkdirSync(s),s}),""));const c=m.storeDir??__dirname,f=r.resolve(c,i+a),w=h.data;let x=Promise.resolve(w);u?.beforeSave&&(x=u.beforeSave({id:n,fileName:i,filePath:f,data:w}));const C=x.then((async e=>{let r=!0;try{await t.writeFile(f,e)}catch(e){r=!1;const t=`File save error at id ${n}: ${e.message}`,o=()=>n;p.push({message:t,valueOf:o})}const u=e.length;d.data={...h,data:{isSuccess:r,fileName:i,fileExtension:a,mimeType:o,size:u,filePath:f}},s&&s(d)}));l.push(C)}else s&&s(d);var y;return d}));var y;await Promise.all(l),(y=p,function e(t,r){if(t>=r)return;const o=y[r];let n=t,s=r-1;for(;n<=s;){for(;y[n]o;)s--;n<=s&&(q(y,n,s),n++,s--)}q(y,n,r),e(t,n-1),e(n+1,r)}(0,y.length-1),y).forEach((e=>f(m(e.message))));const C=[],S=[];return d.forEach((e=>{e.data?.data.isSuccess?C.push(e.id):S.push(e.id)})),f("Save file final result:"),f(h(` Success - total: ${C.length}, ids: [ ${C.join(" - ")} ]`)),f(m(` Error - total: ${S.length}, ids: [ ${S.join(" - ")} ]`)),x(n)||w(n)&&Object.hasOwn(n,"requestConfigs")?d:d[0]}}function A(e,t){const{d:r,h:o,m:n}=e,s=(g(r)?0:1e3*r*60*60*24)+(g(o)?0:1e3*o*60*60)+(g(n)?0:1e3*n*60);let i=0;u();const a=setInterval(u,s);function u(){console.log(h(`Start 
the ${d.bold(++i)} polling`)),t(i,c)}function c(){clearInterval(a),console.log(h("Stop the polling"))}}const M=function(e){const t=function(e){const t=e||{};return t.mode||(t.mode="async"),g(e?.timeout)&&(t.timeout=1e4),g(e?.maxRetry)&&(t.maxRetry=0),t}(e);return function(e){return{crawlPage:E(e),crawlData:P(e),crawlFile:D(e),startPolling:A}}(t)}();M.crawlFile({requestConfigs:["https://raw.githubusercontent.com/coder-hxl/airbnb-upload/master/area/4401.jpg"],proxy:"http://localhost:14892",fileConfig:{storeDir:l.resolve(__dirname,"./upload"),beforeSave:e=>c(e.data).resize(200).toBuffer()}}).then((async e=>{e.forEach((e=>{console.log(e.data?.data.isSuccess)}))})); diff --git a/test/start/index.ts b/test/start/index.ts index 96336ae6..4def8053 100644 --- a/test/start/index.ts +++ b/test/start/index.ts @@ -1,42 +1,24 @@ -// 1.Import module ES/CJS import xCrawl from 'x-crawl' +import sharp from 'sharp' +import path from 'path' -// 2.Create a crawler instance -const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 3000, min: 2000 } }) +const testXCrawl = xCrawl() -// 3.Set the crawling task -/* - Call the startPolling API to start the polling function, - and the callback function will be called every other day -*/ -myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { - // Call crawlPage API to crawl Page - const res = await myXCrawl.crawlPage([ - 'https://zh.airbnb.com/s/hawaii/experiences', - 'https://zh.airbnb.com/s/hawaii/plus_homes' - ]) - - // Store the image URL - const imgUrls: string[] = [] - const elSelectorMap = ['.c14whb16', '.a1stauiv'] - for (const item of res) { - const { id } = item - const { page } = item.data - - // Gets the URL of the page's wheel image element - const boxHandle = await page.$(elSelectorMap[id - 1]) - const urls = await boxHandle!.$$eval('picture img', (imgEls) => { - return imgEls.map((item) => item.src) +testXCrawl + .crawlFile({ + requestConfigs: [ + 'https://raw.githubusercontent.com/coder-hxl/airbnb-upload/master/area/4401.jpg' + ], + proxy: 'http://localhost:14892', + fileConfig: { + storeDir: path.resolve(__dirname, './upload'), + beforeSave(info) { + return sharp(info.data).resize(200).toBuffer() + } + } + }) + .then(async (res) => { + res.forEach((item) => { + console.log(item.data?.data.isSuccess) }) - imgUrls.push(...urls) - - // Close page - page.close() - } - - // Call the crawlFile API to crawl pictures - myXCrawl.crawlFile({ - requestConfigs: imgUrls, - fileConfig: { storeDir: './upload' } }) -}) diff --git a/test/start/package.json b/test/start/package.json new file mode 100644 index 00000000..9dcdf9d5 --- /dev/null +++ b/test/start/package.json @@ -0,0 +1,15 @@ +{ + "name": "start", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [], + "author": "", + "license": "ISC", + "devDependencies": { + "sharp": "^0.32.0" + } +} diff --git a/test/start/pnpm-lock.yaml b/test/start/pnpm-lock.yaml new file mode 100644 index 00000000..b8113d5d --- /dev/null +++ b/test/start/pnpm-lock.yaml @@ -0,0 +1,290 @@ +lockfileVersion: 5.4 + +specifiers: + sharp: ^0.32.0 + +devDependencies: + sharp: 0.32.0 + +packages: + + /base64-js/1.5.1: + resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} + dev: true + + /bl/4.1.0: + resolution: {integrity: 
sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==} + dependencies: + buffer: 5.7.1 + inherits: 2.0.4 + readable-stream: 3.6.2 + dev: true + + /buffer/5.7.1: + resolution: {integrity: sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==} + dependencies: + base64-js: 1.5.1 + ieee754: 1.2.1 + dev: true + + /chownr/1.1.4: + resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==} + dev: true + + /color-convert/2.0.1: + resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==} + engines: {node: '>=7.0.0'} + dependencies: + color-name: 1.1.4 + dev: true + + /color-name/1.1.4: + resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} + dev: true + + /color-string/1.9.1: + resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} + dependencies: + color-name: 1.1.4 + simple-swizzle: 0.2.2 + dev: true + + /color/4.2.3: + resolution: {integrity: sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==} + engines: {node: '>=12.5.0'} + dependencies: + color-convert: 2.0.1 + color-string: 1.9.1 + dev: true + + /decompress-response/6.0.0: + resolution: {integrity: sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==} + engines: {node: '>=10'} + dependencies: + mimic-response: 3.1.0 + dev: true + + /deep-extend/0.6.0: + resolution: {integrity: sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==} + engines: {node: '>=4.0.0'} + dev: true + + /detect-libc/2.0.1: + resolution: {integrity: sha512-463v3ZeIrcWtdgIg6vI6XUncguvr2TnGl4SzDXinkt9mSLpBJKXT3mW6xT3VQdDN11+WVs29pgvivTc4Lp8v+w==} + engines: {node: '>=8'} + dev: true + + /end-of-stream/1.4.4: + resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==} + dependencies: + once: 1.4.0 + dev: true + + /expand-template/2.0.3: + resolution: {integrity: sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==} + engines: {node: '>=6'} + dev: true + + /fs-constants/1.0.0: + resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==} + dev: true + + /github-from-package/0.0.0: + resolution: {integrity: sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==} + dev: true + + /ieee754/1.2.1: + resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==} + dev: true + + /inherits/2.0.4: + resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} + dev: true + + /ini/1.3.8: + resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==} + dev: true + + /is-arrayish/0.3.2: + resolution: {integrity: sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==} + dev: true + + /lru-cache/6.0.0: + resolution: {integrity: sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==} + engines: {node: '>=10'} + dependencies: + yallist: 4.0.0 + dev: true + + 
/mimic-response/3.1.0: + resolution: {integrity: sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==} + engines: {node: '>=10'} + dev: true + + /minimist/1.2.8: + resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} + dev: true + + /mkdirp-classic/0.5.3: + resolution: {integrity: sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==} + dev: true + + /napi-build-utils/1.0.2: + resolution: {integrity: sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==} + dev: true + + /node-abi/3.35.0: + resolution: {integrity: sha512-jAlSOFR1Bls963NmFwxeQkNTzqjUF0NThm8Le7eRIRGzFUVJuMOFZDLv5Y30W/Oaw+KEebEJLAigwO9gQHoEmw==} + engines: {node: '>=10'} + dependencies: + semver: 7.4.0 + dev: true + + /node-addon-api/6.0.0: + resolution: {integrity: sha512-GyHvgPvUXBvAkXa0YvYnhilSB1A+FRYMpIVggKzPZqdaZfevZOuzfWzyvgzOwRLHBeo/MMswmJFsrNF4Nw1pmA==} + dev: true + + /once/1.4.0: + resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==} + dependencies: + wrappy: 1.0.2 + dev: true + + /prebuild-install/7.1.1: + resolution: {integrity: sha512-jAXscXWMcCK8GgCoHOfIr0ODh5ai8mj63L2nWrjuAgXE6tDyYGnx4/8o/rCgU+B4JSyZBKbeZqzhtwtC3ovxjw==} + engines: {node: '>=10'} + hasBin: true + dependencies: + detect-libc: 2.0.1 + expand-template: 2.0.3 + github-from-package: 0.0.0 + minimist: 1.2.8 + mkdirp-classic: 0.5.3 + napi-build-utils: 1.0.2 + node-abi: 3.35.0 + pump: 3.0.0 + rc: 1.2.8 + simple-get: 4.0.1 + tar-fs: 2.1.1 + tunnel-agent: 0.6.0 + dev: true + + /pump/3.0.0: + resolution: {integrity: sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==} + dependencies: + end-of-stream: 1.4.4 + once: 1.4.0 + dev: true + + /rc/1.2.8: + resolution: {integrity: sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==} + hasBin: true + dependencies: + deep-extend: 0.6.0 + ini: 1.3.8 + minimist: 1.2.8 + strip-json-comments: 2.0.1 + dev: true + + /readable-stream/3.6.2: + resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==} + engines: {node: '>= 6'} + dependencies: + inherits: 2.0.4 + string_decoder: 1.3.0 + util-deprecate: 1.0.2 + dev: true + + /safe-buffer/5.2.1: + resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} + dev: true + + /semver/7.4.0: + resolution: {integrity: sha512-RgOxM8Mw+7Zus0+zcLEUn8+JfoLpj/huFTItQy2hsM4khuC1HYRDp0cU482Ewn/Fcy6bCjufD8vAj7voC66KQw==} + engines: {node: '>=10'} + hasBin: true + dependencies: + lru-cache: 6.0.0 + dev: true + + /sharp/0.32.0: + resolution: {integrity: sha512-yLAypVcqj1toSAqRSwbs86nEzfyZVDYqjuUX8grhFpeij0DDNagKJXELS/auegDBRDg1XBtELdOGfo2X1cCpeA==} + engines: {node: '>=14.15.0'} + requiresBuild: true + dependencies: + color: 4.2.3 + detect-libc: 2.0.1 + node-addon-api: 6.0.0 + prebuild-install: 7.1.1 + semver: 7.4.0 + simple-get: 4.0.1 + tar-fs: 2.1.1 + tunnel-agent: 0.6.0 + dev: true + + /simple-concat/1.0.1: + resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} + dev: true + + /simple-get/4.0.1: + resolution: {integrity: sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==} + dependencies: + decompress-response: 
6.0.0 + once: 1.4.0 + simple-concat: 1.0.1 + dev: true + + /simple-swizzle/0.2.2: + resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} + dependencies: + is-arrayish: 0.3.2 + dev: true + + /string_decoder/1.3.0: + resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==} + dependencies: + safe-buffer: 5.2.1 + dev: true + + /strip-json-comments/2.0.1: + resolution: {integrity: sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==} + engines: {node: '>=0.10.0'} + dev: true + + /tar-fs/2.1.1: + resolution: {integrity: sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==} + dependencies: + chownr: 1.1.4 + mkdirp-classic: 0.5.3 + pump: 3.0.0 + tar-stream: 2.2.0 + dev: true + + /tar-stream/2.2.0: + resolution: {integrity: sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==} + engines: {node: '>=6'} + dependencies: + bl: 4.1.0 + end-of-stream: 1.4.4 + fs-constants: 1.0.0 + inherits: 2.0.4 + readable-stream: 3.6.2 + dev: true + + /tunnel-agent/0.6.0: + resolution: {integrity: sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==} + dependencies: + safe-buffer: 5.2.1 + dev: true + + /util-deprecate/1.0.2: + resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} + dev: true + + /wrappy/1.0.2: + resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} + dev: true + + /yallist/4.0.0: + resolution: {integrity: sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==} + dev: true