diff --git a/CHANGELOG.md b/CHANGELOG.md index 86cbdde7..ad72c7ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +# [v5.0.1](https://github.com/coder-hxl/x-crawl/compare/v5.0.0...v5.0.1) (2023-04-08) + +### 🚀 Features + +- New adjustments to the document. + +--- + +### 🚀 Features + +- 文档新的调整。 + # [v5.0.0](https://github.com/coder-hxl/x-crawl/compare/v4.0.1...v5.0.0) (2023-04-06) ### 🚨 Breaking Changes diff --git a/README.md b/README.md index b86284e6..5986e9fa 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,8 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [Interval time](#Interval-time) - [Fail retry](#Fail-retry) - [Priority queue](#Priority-queue) - - [About the results](#About the results) + - [About results](#About-results) + - [TypeScript](#TypeScript) - [API](#API) - [xCrawl](#xCrawl) - [Type](#Type) @@ -64,31 +65,34 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [Example](#Example-5) - [Types](#Types) - [API Config](#API-Config) - - [IntervalTime](#IntervalTime) - - [Method](#Method) - - [PageRequestConfigCookies](#PageRequestConfigCookies) - - [PageRequestConfig](#PageRequestConfig) - - [DataRequestConfig](#DataRequestConfig) - - [FileRequestConfig](#FileRequestConfig) - - [CrawlPageConfigObject](#CrawlPageConfigObject) - - [CrawlDataConfigObject](#CrawlDataConfigObject) - - [CrawlFileConfigObject](#CrawlFileConfigObject) - - [XCrawlBaseConfig](#XCrawlBaseConfig) - - [CrawlPageConfig](#CrawlPageConfig) - - [CrawlDataConfig](#CrawlDataConfig) - - [CrawlFileConfig](#CrawlFileConfig) - - [StartPollingConfig](#StartPollingConfig) - - [API Result](#API-Result) - - [XCrawlInstance](#XCrawlInstance) - - [CrawlCommonRes](#CrawlCommonRes) - - [CrawlPageSingleRes](#CrawlPageSingleRes) - - [CrawlDataSingleRes](#CrawlDataSingleRes) - - [CrawlFileSingleRes](#CrawlFileSingleRes) - - 
[CrawlPageRes](#CrawlPageRes) - - [CrawlDataRes](#CrawlDataRes) - - [CrawlFileRes](#CrawlFileRes) - - [API Other](#API-Other) - - [AnyObject](#AnyObject) + - [API Config Other](#API-Config-Other) + - [IntervalTime](#IntervalTime) + - [Method](#Method) + - [PageRequestConfigCookies](#PageRequestConfigCookies) + - [API Config Request](#API-Config-Request) + - [PageRequestConfig](#PageRequestConfig) + - [DataRequestConfig](#DataRequestConfig) + - [FileRequestConfig](#FileRequestConfig) + - [API Config Crawl](#API-Config-Crawl) + - [XCrawlBaseConfig](#XCrawlBaseConfig) + - [CrawlPageConfigObject](#CrawlPageConfigObject) + - [CrawlDataConfigObject](#CrawlDataConfigObject) + - [CrawlFileConfigObject](#CrawlFileConfigObject) + - [CrawlPageConfig](#CrawlPageConfig) + - [CrawlDataConfig](#CrawlDataConfig) + - [CrawlFileConfig](#CrawlFileConfig) + - [StartPollingConfig](#StartPollingConfig) + - [API Result](#API-Result) + - [XCrawlInstance](#XCrawlInstance) + - [CrawlCommonRes](#CrawlCommonRes) + - [CrawlPageSingleRes](#CrawlPageSingleRes) + - [CrawlDataSingleRes](#CrawlDataSingleRes) + - [CrawlFileSingleRes](#CrawlFileSingleRes) + - [CrawlPageRes](#CrawlPageRes) + - [CrawlDataRes](#CrawlDataRes) + - [CrawlFileRes](#CrawlFileRes) + - [API Other](#API-Other) + - [AnyObject](#AnyObject) - [More](#More) ## Install @@ -101,7 +105,7 @@ npm install x-crawl ## Example -Timing capture: Take the automatic capture of the cover image of Airbnb Plus listings every day as an example: +Take some pictures of Airbnb hawaii experience and Plus listings automatically every day as an example: ```js // 1.Import module ES/CJS @@ -117,23 +121,34 @@ const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 2000 } }) */ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { // Call crawlPage API to crawl Page - const res = await myXCrawl.crawlPage('https://zh.airbnb.com/s/*/plus_homes') - const { page } = res.data - - // set request configuration - const plusBoxHandle 
= await page.$('.a1stauiv') - const requestConfigs = await plusBoxHandle!.$$eval( - 'picture img', - (imgEls) => { + const res = await myXCrawl.crawlPage([ + 'https://zh.airbnb.com/s/hawaii/experiences', + 'https://zh.airbnb.com/s/hawaii/plus_homes' + ]) + + // Store the image URL + const imgUrls: string[] = [] + const elSelectorMap = ['.c14whb16', '.a1stauiv'] + for (const item of res) { + const { id } = item + const { page } = item.data + + // Get the URLs of the page's carousel image elements + const boxHandle = await page.$(elSelectorMap[id - 1]) + const urls = await boxHandle!.$$eval('picture img', (imgEls) => { return imgEls.map((item) => item.src) - } - ) + }) + imgUrls.push(...urls) - // Call the crawlFile API to crawl pictures - myXCrawl.crawlFile({ requestConfigs, fileConfig: { storeDir: './upload' } }) + // Close page + page.close() + } - // Close page - page.close() + // Call the crawlFile API to crawl pictures + myXCrawl.crawlFile({ + requestConfigs: imgUrls, + fileConfig: { storeDir: './upload' } + }) }) ``` @@ -322,11 +337,11 @@ Callback function parameters: Some general configuration can be set in three places: -- Examples of crawler applications -- Spider API -- request configuration +- Crawler application instance (global) +- Crawler API (local) +- Request configuration (separate) -The priority is: request config > API config > base config +The priority is: request config > API config > application config ### Interval time @@ -356,14 +371,12 @@ The intervalTime option defaults to undefined . If there is a setting value, it ### Fail retry -Failed retries can be re-requested when timeouts and the like. +Fail retry: when an error such as a timeout occurs, the request waits for the current round of requests to finish and is then retried. 
```js import xCrawl from 'x-crawl' -const myXCrawl = xCrawl({ - intervalTime: { max: 3000, min: 1000 } -}) +const myXCrawl = xCrawl() myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {}) ``` @@ -377,9 +390,7 @@ A priority queue allows a request to be sent first. ```js import xCrawl from 'x-crawl' -const myXCrawl = xCrawl({ - intervalTime: { max: 3000, min: 1000 } -}) +const myXCrawl = xCrawl() myXCrawl .crawlData([ @@ -392,10 +403,20 @@ myXCrawl The larger the value of the priority attribute, the higher the priority in the current crawling queue. -### About the results +### About results For the result, the result of each request is uniformly wrapped with an object that provides information about the result of the request, such as id, result, success or not, maximum retry, number of retries, error information collected, and so on. Automatically determine whether the return value is wrapped in an array depending on the configuration you choose, and the type fits perfectly in TS. +The id of each object is determined according to the order of requests in your configuration, and if there is a priority used, it will be sorted by priority. + +Details about configuration methods and results are as follows: [crawlPage config](#config), [crawlData config](#config-1), [crawlFile config](#config-2). + +### TypeScript + +Type systems like TypeScript can detect many common errors at compile time through static analysis. This reduces runtime errors and gives us more confidence when refactoring large projects. TypeScript also improves the development experience and efficiency through type-based auto-completion in the IDE. + +x-crawl itself is written in TypeScript and supports TypeScript. Comes with a type declaration file, out of the box. + ## API ### xCrawl @@ -480,6 +501,8 @@ const myXCrawl = xCrawl() myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {}) ``` +The res you get will be an object. 
+ **2. PageRequestConfig** More configuration options of PageRequestConfig can be found in [PageRequestConfig](#PageRequestConfig) . @@ -500,6 +523,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an object. + **3.(string | PageRequestConfig)[]** More configuration options of PageRequestConfig can be found in [PageRequestConfig](#PageRequestConfig) . @@ -516,6 +541,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an array of objects. + **4. CrawlPageConfigObject** For more configuration options of CrawlPageConfigObject, please refer to [CrawlPageConfigObject](#CrawlPageConfigObject) . @@ -538,7 +565,9 @@ myXCrawl.crawlPage({ }).then((res) => {}) ``` -It can be selected according to the actual situation. +The res you get will be an array of objects. + +More information about the results can be found at [About results](#About-results). The configuration method can be selected according to the actual situation. ### crawlData @@ -600,6 +629,8 @@ const myXCrawl = xCrawl() myXCrawl.crawlData('https://xxx.com/xxxx').then((res) => {}) ``` +The res you get will be an object. + **2. DataRequestConfig** More configuration options of DataRequestConfig can be found in [DataRequestConfig](#DataRequestConfig) . @@ -620,6 +651,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an object. + **3.(string | DataRequestConfig)[]** More configuration options of DataRequestConfig can be found in [DataRequestConfig](#DataRequestConfig) . @@ -636,6 +669,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an array of objects. + **4. CrawlDataConfigObject** For more configuration options of CrawlPageConfigObject, please refer to [CrawlPageConfigObject](#CrawlPageConfigObject) . @@ -658,7 +693,9 @@ myXCrawl.crawlData({ }).then((res) => {}) ``` -It can be selected according to the actual situation. +The res you get will be an array of objects. 
+ +More information about the results can be found at [About results](#About-results). The configuration method can be selected according to the actual situation. ### crawlFile @@ -729,6 +766,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an object. + **2. FileRequestConfig[]** More configuration options of FileRequestConfig can be found in [FileRequestConfig](#FileRequestConfig) . @@ -748,6 +787,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an array of objects. + **3. CrawlFileConfigObject** For more configuration options of CrawlFileConfigObject, please refer to [CrawlFileConfigObject](#CrawlFileConfigObject) . @@ -770,7 +811,9 @@ myXCrawl.crawlFile({ }).then((res) => {}) ``` -It can be selected according to the actual situation. +The res you get will be an array of objects. + +More information about the results can be found at [About results](#About-results). The configuration method can be selected according to the actual situation. ### startPolling @@ -808,13 +851,15 @@ myXCrawl.startPolling({ h: 2, m: 30 }, (count, stopPolling) => { ### API Config -#### IntervalTime +#### API Config Other + +##### IntervalTime ```ts export type IntervalTime = number | { max: number; min?: number } ``` -#### Method +##### Method ```ts export type Method = @@ -840,7 +885,7 @@ export type Method = | 'UNLINK' ``` -#### PageRequestConfigCookies +##### PageRequestConfigCookies ```ts export type PageRequestConfigCookies = @@ -849,7 +894,9 @@ export type PageRequestConfigCookies = | Protocol.Network.CookieParam[] ``` -#### PageRequestConfig +#### API Config Request + +##### PageRequestConfig ```ts export interface PageRequestConfig { @@ -863,7 +910,7 @@ export interface PageRequestConfig { } ``` -#### DataRequestConfig +##### DataRequestConfig ```ts export interface DataRequestConfig { @@ -879,7 +926,7 @@ export interface DataRequestConfig { } ``` -#### FileRequestConfig +##### FileRequestConfig ```ts export interface FileRequestConfig { @@ -895,7 +942,22 @@ export interface 
FileRequestConfig { } ``` -#### CrawlPageConfigObject +#### API Config Crawl + +##### XCrawlBaseConfig + +```ts +export interface XCrawlBaseConfig { + baseUrl?: string + timeout?: number + intervalTime?: IntervalTime + mode?: 'async' | 'sync' + proxy?: string + maxRetry?: number +} +``` + +##### CrawlPageConfigObject ```ts export interface CrawlPageConfigObject { @@ -908,7 +970,7 @@ export interface CrawlPageConfigObject { } ``` -#### CrawlDataConfigObject +##### CrawlDataConfigObject ```ts export interface CrawlDataConfigObject { @@ -920,7 +982,7 @@ export interface CrawlDataConfigObject { } ``` -#### CrawlFileConfigObject +##### CrawlFileConfigObject ```ts export interface CrawlFileConfigObject { @@ -942,20 +1004,7 @@ export interface CrawlFileConfigObject { } ``` -#### XCrawlBaseConfig - -```ts -export interface XCrawlBaseConfig { - baseUrl?: string - timeout?: number - intervalTime?: IntervalTime - mode?: 'async' | 'sync' - proxy?: string - maxRetry?: number -} -``` - -#### CrawlPageConfig +##### CrawlPageConfig ```ts export type CrawlPageConfig = @@ -965,7 +1014,7 @@ export type CrawlPageConfig = | CrawlPageConfigObject ``` -#### CrawlDataConfig +##### CrawlDataConfig ```ts export type CrawlDataConfig = @@ -975,13 +1024,13 @@ export type CrawlDataConfig = | CrawlDataConfigObject ``` -#### CrawlFileConfig +##### CrawlFileConfig ```ts export type CrawlFileConfig = FileRequestConfig | FileRequestConfig[] | CrawlFileConfigObject ``` -#### StartPollingConfig +##### StartPollingConfig ```js export interface StartPollingConfig { diff --git a/assets/cn/crawler-result.png b/assets/cn/crawler-result.png index 199d8115..e6e233e0 100644 Binary files a/assets/cn/crawler-result.png and b/assets/cn/crawler-result.png differ diff --git a/assets/cn/crawler.png b/assets/cn/crawler.png index 2505f5d6..0c44f114 100644 Binary files a/assets/cn/crawler.png and b/assets/cn/crawler.png differ diff --git a/assets/en/crawler-result.png b/assets/en/crawler-result.png index 
b3bc679b..55f3c2b4 100644 Binary files a/assets/en/crawler-result.png and b/assets/en/crawler-result.png differ diff --git a/assets/en/crawler.png b/assets/en/crawler.png index 98da689e..84f94061 100644 Binary files a/assets/en/crawler.png and b/assets/en/crawler.png differ diff --git a/docs/cn.md b/docs/cn.md index ab0fb49e..ad3593f8 100644 --- a/docs/cn.md +++ b/docs/cn.md @@ -43,6 +43,7 @@ crawlPage API 内部使用 [puppeteer](https://github.com/puppeteer/puppeteer) - [失败重试](#失败重试) - [优先队列](#优先队列) - [关于结果](#关于结果) + - [TypeScript](#TypeScript) - [API](#API) - [xCrawl](#xCrawl) - [类型](#类型) @@ -62,22 +63,26 @@ crawlPage API 内部使用 [puppeteer](https://github.com/puppeteer/puppeteer) - [startPolling](#startPolling) - [类型](#类型-4) - [示例](#示例-5) -- [类型](#类型-5) + - [类型](#类型-5) +- [类型](#类型-6) - [API Config](#API-Config) - - [IntervalTime](#IntervalTime) - - [Method](#Method) - - [PageRequestConfigCookies](#PageRequestConfigCookies) - - [PageRequestConfig](#PageRequestConfig) - - [DataRequestConfig](#DataRequestConfig) - - [FileRequestConfig](#FileRequestConfig) - - [CrawlPageConfigObject](#CrawlPageConfigObject) - - [CrawlDataConfigObject](#CrawlDataConfigObject) - - [CrawlFileConfigObject](#CrawlFileConfigObject) - - [XCrawlBaseConfig](#XCrawlBaseConfig) - - [CrawlPageConfig](#CrawlPageConfig) - - [CrawlDataConfig](#CrawlDataConfig) - - [CrawlFileConfig](#CrawlFileConfig) - - [StartPollingConfig](#StartPollingConfig) + - [API Config Other](#API-Config-Other) + - [IntervalTime](#IntervalTime) + - [Method](#Method) + - [PageRequestConfigCookies](#PageRequestConfigCookies) + - [API Config Request](#API-Config-Request) + - [PageRequestConfig](#PageRequestConfig) + - [DataRequestConfig](#DataRequestConfig) + - [FileRequestConfig](#FileRequestConfig) + - [API Config Crawl](#API-Config-Crawl) + - [XCrawlBaseConfig](#XCrawlBaseConfig) + - [CrawlPageConfigObject](#CrawlPageConfigObject) + - [CrawlDataConfigObject](#CrawlDataConfigObject) + - 
[CrawlFileConfigObject](#CrawlFileConfigObject) + - [CrawlPageConfig](#CrawlPageConfig) + - [CrawlDataConfig](#CrawlDataConfig) + - [CrawlFileConfig](#CrawlFileConfig) + - [StartPollingConfig](#StartPollingConfig) - [API Result](#API-Result) - [XCrawlInstance](#XCrawlInstance) - [CrawlCommonRes](#CrawlCommonRes) @@ -101,32 +106,47 @@ npm install x-crawl ## 示例 -每天自动获取 bilibili 国漫主页的轮播图片为例: +每天自动获取 bilibili 首页、国漫、电影这三个页面的轮播图片为例: ```js // 1.导入模块 ES/CJS import xCrawl from 'x-crawl' // 2.创建一个爬虫实例 -const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 2000 } }) +const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 3000, min: 2000 } }) // 3.设置爬取任务 // 调用 startPolling API 开始轮询功能,每隔一天会调用回调函数 myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { - // 调用 crawlPage API 爬取 Page - const res = await myXCrawl.crawlPage('https://www.bilibili.com/guochuang/') - const { page } = res.data + // 调用 crawlPage API 爬取 首页、国漫、电影 这三个页面 + const res = await myXCrawl.crawlPage([ + 'https://www.bilibili.com', + 'https://www.bilibili.com/guochuang', + 'https://www.bilibili.com/movie' + ]) - // 设置请求配置,获取轮播图片的 URL - const requestConfigs = await page.$$eval('.chief-recom-item img', (imgEls) => - imgEls.map((item) => item.src) - ) + // 存放图片 URL + const imgUrls: string[] = [] + const elSelectorMap = ['.carousel-inner img', '.chief-recom-item img', '.bg-item img'] + for (const item of res) { + const { id } = item + const { page } = item.data + + // 获取页面轮播图片元素的 URL + const urls = await page.$$eval(elSelectorMap[id - 1], (imgEls) => + imgEls.map((item) => item.src) + ) + imgUrls.push(...urls) + + // 关闭页面 + page.close() + } // 调用 crawlFile API 爬取图片 - await myXCrawl.crawlFile({ requestConfigs, fileConfig: { storeDir: './upload' } }) - - // 关闭页面 - page.close() + await myXCrawl.crawlFile({ + requestConfigs: imgUrls, + fileConfig: { storeDir: './upload' } + }) }) ``` @@ -312,11 +332,11 @@ 
myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => { 一些通用的配置可以在三个地方设置: -- 爬虫应用实例 -- 爬虫 API -- 请求配置 +- 爬虫应用实例(全局) +- 爬虫 API (局部) +- 请求配置(单独) -优先级为:request config > API config > base config +优先级为:request config > API config > application config ### 间隔时间 @@ -346,14 +366,12 @@ intervalTime 选项默认为 undefined 。若有设置值,则会在请求前 ### 失败重试 -失败重试可以在超时之类时会进行重新请求。 +失败重试在超时之类的错误发生时,将会等待这一轮请求结束后重新请求。 ```js import xCrawl from 'x-crawl' -const myXCrawl = xCrawl({ - intervalTime: { max: 3000, min: 1000 } -}) +const myXCrawl = xCrawl() myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {}) ``` @@ -367,9 +385,7 @@ maxRetry 属性决定要重试几次。 ```js import xCrawl from 'x-crawl' -const myXCrawl = xCrawl({ - intervalTime: { max: 3000, min: 1000 } -}) +const myXCrawl = xCrawl() myXCrawl .crawlData([ @@ -386,6 +402,16 @@ priority 属性的值越大就在当前爬取队列中越优先。 对于结果,每个请求的结果将统一使用对象包裹着,该对象提供了关于这次请求结果的信息,比如:id、结果、是否成功、最大重试、重试次数、收集到错误信息等。自动根据你选用的配置方式决定返回值是否包裹在一个数组中,并且在 TS 中类型完美适配。 +每个对象的 id 是根据你配置里的请求顺序决定的,如果有使用优先级,则会根据优先级排序。 + +相关的配置方式和结果详情查看:[crawlPage 配置](#配置)、[crawlData 配置](#配置-1)、[crawlFile 配置](#配置-2) 。 + +### TypeScript + +像 TypeScript 这样的类型系统可以在编译时通过静态分析检测出很多常见错误。这减少了运行时错误,也让我们在重构大型项目的时候更有信心。通过 IDE 中基于类型的自动补全,TypeScript 还改善了开发体验和效率。 + +x-crawl 本身就是用 TypeScript 编写的,并对 TypeScript 提供了支持。自带类型声明文件,开箱即用。 + ## API ### xCrawl @@ -468,6 +494,8 @@ const myXCrawl = xCrawl() myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {}) ``` +拿到的 res 将是一个对象。 + **2.PageRequestConfig** PageRequestConfig 的更多配置选项可以查看 [PageRequestConfig](#PageRequestConfig) 。 @@ -488,6 +516,8 @@ myXCrawl .then((res) => {}) ``` +拿到的 res 将是一个对象。 + **3.(string | PageRequestConfig)[]** PageRequestConfig 的更多配置选项可以查看 [PageRequestConfig](#PageRequestConfig) 。 @@ -504,6 +534,8 @@ myXCrawl .then((res) => {}) ``` +拿到的 res 将是一个数组,里面是对象。 + **4.CrawlPageConfigObject** CrawlPageConfigObject 的更多配置选项可以查看 [CrawlPageConfigObject](#CrawlPageConfigObject) 。 @@ -526,7 +558,9 @@ 
myXCrawl.crawlPage({ }).then((res) => {}) ``` -可以根据实际情况选用即可。 +拿到的 res 将是一个数组,里面是对象。 + +关于结果的更多信息可查看 [关于结果](#关于结果) ,可以根据实际情况选用即可。 ### crawlData @@ -589,6 +623,8 @@ const myXCrawl = xCrawl() myXCrawl.crawlData('https://xxx.com/xxxx').then((res) => {}) ``` +拿到的 res 将是一个对象。 + **2.DataRequestConfig** DataRequestConfig 的更多配置选项可以查看 [DataRequestConfig](#DataRequestConfig) 。 @@ -609,6 +645,8 @@ myXCrawl .then((res) => {}) ``` +拿到的 res 将是一个对象。 + **3.(string | DataRequestConfig)[]** DataRequestConfig 的更多配置选项可以查看 [DataRequestConfig](#DataRequestConfig) 。 @@ -625,6 +663,8 @@ myXCrawl .then((res) => {}) ``` +拿到的 res 将是一个数组,里面是对象。 + **4.CrawlDataConfigObject** CrawlPageConfigObject 的更多配置选项可以查看 [CrawlPageConfigObject](#CrawlPageConfigObject) 。 @@ -647,7 +687,9 @@ myXCrawl.crawlData({ }).then((res) => {}) ``` -可以根据实际情况选用即可。 +拿到的 res 将是一个数组,里面是对象。 + +关于结果的更多信息可查看 [关于结果](#关于结果) ,可以根据实际情况选用即可。 ### crawlFile @@ -717,6 +759,8 @@ myXCrawl .then((res) => {}) ``` +拿到的 res 将是一个对象。 + **2.FileRequestConfig[]** FileRequestConfig 的更多配置选项可以查看 [FileRequestConfig](#FileRequestConfig) 。 @@ -736,6 +780,8 @@ myXCrawl .then((res) => {}) ``` +拿到的 res 将是一个数组,里面是对象。 + **3.CrawlFileConfigObject** CrawlFileConfigObject 的更多配置选项可以查看 [CrawlFileConfigObject](#CrawlFileConfigObject) 。 @@ -758,7 +804,9 @@ myXCrawl.crawlFile({ }).then((res) => {}) ``` -可以根据实际情况选用即可。 +拿到的 res 将是一个数组,里面是对象。 + +关于结果的更多信息可查看 [关于结果](#关于结果) ,可以根据实际情况选用即可。 ### startPolling @@ -796,13 +844,15 @@ myXCrawl.startPolling({ h: 2, m: 30 }, (count, stopPolling) => { ### API Config -#### IntervalTime +#### API Config Other + +##### IntervalTime ```ts export type IntervalTime = number | { max: number; min?: number } ``` -#### Method +##### Method ```ts export type Method = @@ -828,7 +878,7 @@ export type Method = | 'UNLINK' ``` -#### PageRequestConfigCookies +##### PageRequestConfigCookies ```ts export type PageRequestConfigCookies = @@ -837,7 +887,9 @@ export type PageRequestConfigCookies = | Protocol.Network.CookieParam[] 
``` -#### PageRequestConfig +#### API Config Request + +##### PageRequestConfig ```ts export interface PageRequestConfig { @@ -851,7 +903,7 @@ export interface PageRequestConfig { } ``` -#### DataRequestConfig +##### DataRequestConfig ```ts export interface DataRequestConfig { @@ -867,7 +919,7 @@ export interface DataRequestConfig { } ``` -#### FileRequestConfig +##### FileRequestConfig ```ts export interface FileRequestConfig { @@ -883,7 +935,22 @@ export interface FileRequestConfig { } ``` -#### CrawlPageConfigObject +#### API Config Crawl + +##### XCrawlBaseConfig + +```ts +export interface XCrawlBaseConfig { + baseUrl?: string + timeout?: number + intervalTime?: IntervalTime + mode?: 'async' | 'sync' + proxy?: string + maxRetry?: number +} +``` + +##### CrawlPageConfigObject ```ts export interface CrawlPageConfigObject { @@ -896,7 +963,7 @@ export interface CrawlPageConfigObject { } ``` -#### CrawlDataConfigObject +##### CrawlDataConfigObject ```ts export interface CrawlDataConfigObject { @@ -908,7 +975,7 @@ export interface CrawlDataConfigObject { } ``` -#### CrawlFileConfigObject +##### CrawlFileConfigObject ```ts export interface CrawlFileConfigObject { @@ -930,20 +997,7 @@ export interface CrawlFileConfigObject { } ``` -#### XCrawlBaseConfig - -```ts -export interface XCrawlBaseConfig { - baseUrl?: string - timeout?: number - intervalTime?: IntervalTime - mode?: 'async' | 'sync' - proxy?: string - maxRetry?: number -} -``` - -#### CrawlPageConfig +##### CrawlPageConfig ```ts export type CrawlPageConfig = @@ -953,7 +1007,7 @@ export type CrawlPageConfig = | CrawlPageConfigObject ``` -#### CrawlDataConfig +##### CrawlDataConfig ```ts export type CrawlDataConfig = @@ -963,13 +1017,13 @@ export type CrawlDataConfig = | CrawlDataConfigObject ``` -#### CrawlFileConfig +##### CrawlFileConfig ```ts export type CrawlFileConfig = FileRequestConfig | FileRequestConfig[] | CrawlFileConfigObject ``` -#### StartPollingConfig +##### StartPollingConfig ```js export 
interface StartPollingConfig { diff --git a/package.json b/package.json index 8b3d60dd..6ddb0455 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "x-crawl", - "version": "5.0.0", + "version": "5.0.1", "author": "coderHXL", "description": "x-crawl is a flexible nodejs crawler library.", "license": "MIT", diff --git a/publish/README.md b/publish/README.md index b86284e6..5986e9fa 100644 --- a/publish/README.md +++ b/publish/README.md @@ -42,7 +42,8 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [Interval time](#Interval-time) - [Fail retry](#Fail-retry) - [Priority queue](#Priority-queue) - - [About the results](#About the results) + - [About results](#About-results) + - [TypeScript](#TypeScript) - [API](#API) - [xCrawl](#xCrawl) - [Type](#Type) @@ -64,31 +65,34 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p - [Example](#Example-5) - [Types](#Types) - [API Config](#API-Config) - - [IntervalTime](#IntervalTime) - - [Method](#Method) - - [PageRequestConfigCookies](#PageRequestConfigCookies) - - [PageRequestConfig](#PageRequestConfig) - - [DataRequestConfig](#DataRequestConfig) - - [FileRequestConfig](#FileRequestConfig) - - [CrawlPageConfigObject](#CrawlPageConfigObject) - - [CrawlDataConfigObject](#CrawlDataConfigObject) - - [CrawlFileConfigObject](#CrawlFileConfigObject) - - [XCrawlBaseConfig](#XCrawlBaseConfig) - - [CrawlPageConfig](#CrawlPageConfig) - - [CrawlDataConfig](#CrawlDataConfig) - - [CrawlFileConfig](#CrawlFileConfig) - - [StartPollingConfig](#StartPollingConfig) - - [API Result](#API-Result) - - [XCrawlInstance](#XCrawlInstance) - - [CrawlCommonRes](#CrawlCommonRes) - - [CrawlPageSingleRes](#CrawlPageSingleRes) - - [CrawlDataSingleRes](#CrawlDataSingleRes) - - [CrawlFileSingleRes](#CrawlFileSingleRes) - - [CrawlPageRes](#CrawlPageRes) - - [CrawlDataRes](#CrawlDataRes) - - 
[CrawlFileRes](#CrawlFileRes) - - [API Other](#API-Other) - - [AnyObject](#AnyObject) + - [API Config Other](#API-Config-Other) + - [IntervalTime](#IntervalTime) + - [Method](#Method) + - [PageRequestConfigCookies](#PageRequestConfigCookies) + - [API Config Request](#API-Config-Request) + - [PageRequestConfig](#PageRequestConfig) + - [DataRequestConfig](#DataRequestConfig) + - [FileRequestConfig](#FileRequestConfig) + - [API Config Crawl](#API-Config-Crawl) + - [XCrawlBaseConfig](#XCrawlBaseConfig) + - [CrawlPageConfigObject](#CrawlPageConfigObject) + - [CrawlDataConfigObject](#CrawlDataConfigObject) + - [CrawlFileConfigObject](#CrawlFileConfigObject) + - [CrawlPageConfig](#CrawlPageConfig) + - [CrawlDataConfig](#CrawlDataConfig) + - [CrawlFileConfig](#CrawlFileConfig) + - [StartPollingConfig](#StartPollingConfig) + - [API Result](#API-Result) + - [XCrawlInstance](#XCrawlInstance) + - [CrawlCommonRes](#CrawlCommonRes) + - [CrawlPageSingleRes](#CrawlPageSingleRes) + - [CrawlDataSingleRes](#CrawlDataSingleRes) + - [CrawlFileSingleRes](#CrawlFileSingleRes) + - [CrawlPageRes](#CrawlPageRes) + - [CrawlDataRes](#CrawlDataRes) + - [CrawlFileRes](#CrawlFileRes) + - [API Other](#API-Other) + - [AnyObject](#AnyObject) - [More](#More) ## Install @@ -101,7 +105,7 @@ npm install x-crawl ## Example -Timing capture: Take the automatic capture of the cover image of Airbnb Plus listings every day as an example: +Take some pictures of Airbnb hawaii experience and Plus listings automatically every day as an example: ```js // 1.Import module ES/CJS @@ -117,23 +121,34 @@ const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 2000 } }) */ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { // Call crawlPage API to crawl Page - const res = await myXCrawl.crawlPage('https://zh.airbnb.com/s/*/plus_homes') - const { page } = res.data - - // set request configuration - const plusBoxHandle = await page.$('.a1stauiv') - const requestConfigs = await 
plusBoxHandle!.$$eval( - 'picture img', - (imgEls) => { + const res = await myXCrawl.crawlPage([ + 'https://zh.airbnb.com/s/hawaii/experiences', + 'https://zh.airbnb.com/s/hawaii/plus_homes' + ]) + + // Store the image URL + const imgUrls: string[] = [] + const elSelectorMap = ['.c14whb16', '.a1stauiv'] + for (const item of res) { + const { id } = item + const { page } = item.data + + // Get the URLs of the page's carousel image elements + const boxHandle = await page.$(elSelectorMap[id - 1]) + const urls = await boxHandle!.$$eval('picture img', (imgEls) => { return imgEls.map((item) => item.src) - } - ) + }) + imgUrls.push(...urls) - // Call the crawlFile API to crawl pictures - myXCrawl.crawlFile({ requestConfigs, fileConfig: { storeDir: './upload' } }) + // Close page + page.close() + } - // Close page - page.close() + // Call the crawlFile API to crawl pictures + myXCrawl.crawlFile({ + requestConfigs: imgUrls, + fileConfig: { storeDir: './upload' } + }) }) ``` @@ -322,11 +337,11 @@ Callback function parameters: Some general configuration can be set in three places: -- Examples of crawler applications -- Spider API -- request configuration +- Crawler application instance (global) +- Crawler API (local) +- Request configuration (separate) -The priority is: request config > API config > base config +The priority is: request config > API config > application config ### Interval time @@ -356,14 +371,12 @@ The intervalTime option defaults to undefined . If there is a setting value, it ### Fail retry -Failed retries can be re-requested when timeouts and the like. +Fail retry: when an error such as a timeout occurs, the request waits for the current round of requests to finish and is then retried. 
```js import xCrawl from 'x-crawl' -const myXCrawl = xCrawl({ - intervalTime: { max: 3000, min: 1000 } -}) +const myXCrawl = xCrawl() myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {}) ``` @@ -377,9 +390,7 @@ A priority queue allows a request to be sent first. ```js import xCrawl from 'x-crawl' -const myXCrawl = xCrawl({ - intervalTime: { max: 3000, min: 1000 } -}) +const myXCrawl = xCrawl() myXCrawl .crawlData([ @@ -392,10 +403,20 @@ myXCrawl The larger the value of the priority attribute, the higher the priority in the current crawling queue. -### About the results +### About results For the result, the result of each request is uniformly wrapped with an object that provides information about the result of the request, such as id, result, success or not, maximum retry, number of retries, error information collected, and so on. Automatically determine whether the return value is wrapped in an array depending on the configuration you choose, and the type fits perfectly in TS. +The id of each object is determined according to the order of requests in your configuration, and if there is a priority used, it will be sorted by priority. + +Details about configuration methods and results are as follows: [crawlPage config](#config), [crawlData config](#config-1), [crawlFile config](#config-2). + +### TypeScript + +Type systems like TypeScript can detect many common errors at compile time through static analysis. This reduces runtime errors and gives us more confidence when refactoring large projects. TypeScript also improves the development experience and efficiency through type-based auto-completion in the IDE. + +x-crawl itself is written in TypeScript and supports TypeScript. Comes with a type declaration file, out of the box. + ## API ### xCrawl @@ -480,6 +501,8 @@ const myXCrawl = xCrawl() myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {}) ``` +The res you get will be an object. 
+ **2. PageRequestConfig** More configuration options of PageRequestConfig can be found in [PageRequestConfig](#PageRequestConfig) . @@ -500,6 +523,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an object. + **3.(string | PageRequestConfig)[]** More configuration options of PageRequestConfig can be found in [PageRequestConfig](#PageRequestConfig) . @@ -516,6 +541,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an array of objects. + **4. CrawlPageConfigObject** For more configuration options of CrawlPageConfigObject, please refer to [CrawlPageConfigObject](#CrawlPageConfigObject) . @@ -538,7 +565,9 @@ myXCrawl.crawlPage({ }).then((res) => {}) ``` -It can be selected according to the actual situation. +The res you get will be an array of objects. + +More information about the results can be found at [About results](#About-results). The configuration method can be selected according to the actual situation. ### crawlData @@ -600,6 +629,8 @@ const myXCrawl = xCrawl() myXCrawl.crawlData('https://xxx.com/xxxx').then((res) => {}) ``` +The res you get will be an object. + **2. DataRequestConfig** More configuration options of DataRequestConfig can be found in [DataRequestConfig](#DataRequestConfig) . @@ -620,6 +651,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an object. + **3.(string | DataRequestConfig)[]** More configuration options of DataRequestConfig can be found in [DataRequestConfig](#DataRequestConfig) . @@ -636,6 +669,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an array of objects. + **4. CrawlDataConfigObject** For more configuration options of CrawlPageConfigObject, please refer to [CrawlPageConfigObject](#CrawlPageConfigObject) . @@ -658,7 +693,9 @@ myXCrawl.crawlData({ }).then((res) => {}) ``` -It can be selected according to the actual situation. +The res you get will be an array of objects. 
+ +More information about the results can be found at [About results](#About-results). The configuration form can be selected according to the actual situation. ### crawlFile @@ -729,6 +766,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an object. + **2. FileRequestConfig[]** More configuration options of FileRequestConfig can be found in [FileRequestConfig](#FileRequestConfig) . @@ -748,6 +787,8 @@ myXCrawl .then((res) => {}) ``` +The res you get will be an array of objects. + **3. CrawlFileConfigObject** For more configuration options of CrawlFileConfigObject, please refer to [CrawlFileConfigObject](#CrawlFileConfigObject) . @@ -770,7 +811,9 @@ myXCrawl.crawlFile({ }).then((res) => {}) ``` -It can be selected according to the actual situation. +The res you get will be an array of objects. + +More information about the results can be found at [About results](#About-results). The configuration form can be selected according to the actual situation. ### startPolling @@ -808,13 +851,15 @@ myXCrawl.startPolling({ h: 2, m: 30 }, (count, stopPolling) => { ### API Config -#### IntervalTime +#### API Config Other + +##### IntervalTime ```ts export type IntervalTime = number | { max: number; min?: number } ``` -#### Method +##### Method ```ts export type Method = @@ -840,7 +885,7 @@ export type Method = | 'UNLINK' ``` -#### PageRequestConfigCookies +##### PageRequestConfigCookies ```ts export type PageRequestConfigCookies = @@ -849,7 +894,9 @@ export type PageRequestConfigCookies = | Protocol.Network.CookieParam[] ``` -#### PageRequestConfig +#### API Config Request + +##### PageRequestConfig ```ts export interface PageRequestConfig { @@ -863,7 +910,7 @@ export interface PageRequestConfig { } ``` -#### DataRequestConfig +##### DataRequestConfig ```ts export interface DataRequestConfig { @@ -879,7 +926,7 @@ export interface DataRequestConfig { } ``` -#### FileRequestConfig +##### FileRequestConfig ```ts export interface FileRequestConfig { @@ -895,7 +942,22 @@ export interface 
FileRequestConfig { } ``` -#### CrawlPageConfigObject +#### API Config Crawl + +##### XCrawlBaseConfig + +```ts +export interface XCrawlBaseConfig { + baseUrl?: string + timeout?: number + intervalTime?: IntervalTime + mode?: 'async' | 'sync' + proxy?: string + maxRetry?: number +} +``` + +##### CrawlPageConfigObject ```ts export interface CrawlPageConfigObject { @@ -908,7 +970,7 @@ export interface CrawlPageConfigObject { } ``` -#### CrawlDataConfigObject +##### CrawlDataConfigObject ```ts export interface CrawlDataConfigObject { @@ -920,7 +982,7 @@ export interface CrawlDataConfigObject { } ``` -#### CrawlFileConfigObject +##### CrawlFileConfigObject ```ts export interface CrawlFileConfigObject { @@ -942,20 +1004,7 @@ export interface CrawlFileConfigObject { } ``` -#### XCrawlBaseConfig - -```ts -export interface XCrawlBaseConfig { - baseUrl?: string - timeout?: number - intervalTime?: IntervalTime - mode?: 'async' | 'sync' - proxy?: string - maxRetry?: number -} -``` - -#### CrawlPageConfig +##### CrawlPageConfig ```ts export type CrawlPageConfig = @@ -965,7 +1014,7 @@ export type CrawlPageConfig = | CrawlPageConfigObject ``` -#### CrawlDataConfig +##### CrawlDataConfig ```ts export type CrawlDataConfig = @@ -975,13 +1024,13 @@ export type CrawlDataConfig = | CrawlDataConfigObject ``` -#### CrawlFileConfig +##### CrawlFileConfig ```ts export type CrawlFileConfig = FileRequestConfig | FileRequestConfig[] | CrawlFileConfigObject ``` -#### StartPollingConfig +##### StartPollingConfig ```js export interface StartPollingConfig { diff --git a/publish/package.json b/publish/package.json index 58bb7095..c2130ab8 100644 --- a/publish/package.json +++ b/publish/package.json @@ -1,6 +1,6 @@ { "name": "x-crawl", - "version": "5.0.0", + "version": "5.0.1", "author": "coderHXL", "description": "x-crawl is a flexible nodejs crawler library.", "license": "MIT", diff --git a/test/start/index.js b/test/start/index.js index 619e59c8..e8320697 100644 --- a/test/start/index.js +++ 
b/test/start/index.js @@ -1 +1 @@ -"use strict";var e=require("node:fs"),t=require("node:fs/promises"),r=require("node:path"),n=require("puppeteer"),o=require("chalk"),i=require("node:http"),s=require("node:https"),a=require("node:url"),u=require("https-proxy-agent");const c=console.log,l=o.hex("#a57fff"),f=o.green,m=o.red,p=o.yellow;function h(e){return void 0===e}function d(e){return"number"==typeof e}function g(e){return"object"==typeof e&&e&&!Array.isArray(e)}function y(e){return Array.isArray(e)}async function w(e,t,r,n){if(e&&n>1){const e=t?r:function(e,t=0){let r=Math.floor(Math.random()*e);for(;rsetTimeout(t,e)))}(e)}else c(`Id: ${l(n)} - Crawl does not need to sleep, send immediately`)}async function x(e,t,r,n){const o=!h(t),i=d(t),s=[];for(const a of e){const{id:e}=a;await w(o,i,t,e),a.crawlCount++;const u=n(a,r).catch((e=>(a.errorQueue.push(e),!1))).then((e=>{!1!==e&&(a.isSuccess=!0,a.crawlSingleRes=e)}));s.push(u)}await Promise.all(s)}async function C(e,t,r,n){const o=!h(t),i=d(t);for(const s of e){const{id:e}=s;await w(o,i,t,e),s.crawlCount++;try{s.crawlSingleRes=await n(s,r),s.isSuccess=!0}catch(e){s.errorQueue.push(e)}}}function S(e,t,r){const n=e[t];e[t]=e[r],e[r]=n}function v(e){if(1===e.length)return e;const t=Math.floor(e.length/2),r=v(e.slice(0,t)),n=v(e.slice(t)),o=[];let i=0,s=0;for(;i=n[s]?(o.push(r[i]),i++):(o.push(n[s]),s++);return ie.priority===r[0].priority))?v(r.map((e=>({...e,valueOf:()=>e.priority})))):r).map(((e,t)=>({id:t+1,isSuccess:!1,maxRetry:e.maxRetry,crawlCount:0,errorQueue:[],requestConfig:e,crawlSingleRes:null})));c(`${f("Start crawling")} - name: ${p(e)}, mode: ${p(t)}, total: ${l(s.length)} `);const a="async"===t?x:C;let u=s;for(;u.length;)if(await a(u,n,o,i),u=u.filter((e=>e.maxRetry&&!e.isSuccess&&e.crawlCount<=e.maxRetry)),u.length){const e=u.map((e=>e.id));c(p(`Ids to retry: [ ${e.join(" - ")} ]`))}const h=[],d=[];return s.forEach((e=>{e.isSuccess?h.push(e.id):d.push(e.id)})),c("Crawl the final result:"),c(f(` Success - 
total: ${h.length}, ids: [ ${h.join(" - ")} ]`)),c(m(` Error - total: ${d.length}, ids: [ ${d.join(" - ")} ]`)),s}function R(e,t){let r=e?`${e}`:"?";if(t)for(const e in t){r+=`&${e}=${t[e]}`}else r=e;return r}function $(e){const{protocol:t,hostname:r,port:n,pathname:o,search:c}=new a.URL(e.url),l="http:"===t,f={agent:e.proxy?u(e.proxy):l?new i.Agent:new s.Agent,protocol:t,hostname:r,port:n,path:o,search:R(c,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const r={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(r["Content-Type"]="application/json",r["Content-Length"]=Buffer.byteLength(e.data)),r}(e,f),f}async function T(e){const{requestConfig:t}=e;return await(r=t,new Promise(((e,t)=>{const n=h(r.data);r.data=n?r.data:JSON.stringify(r.data);const o=$(r);function a(t){const{statusCode:r,headers:n}=t,o=[];t.on("data",(e=>o.push(e))),t.on("end",(()=>{const t=Buffer.concat(o);e({statusCode:r,headers:n,data:t})}))}let u;u="http:"===o.protocol?i.request(o,a):s.request(o,a),u.on("timeout",(()=>{t(new Error(`Timeout ${r.timeout}ms`))})),u.on("error",(e=>{t(e)})),"POST"!==o.method||n||u.write(r.data),u.end()})));var r}function b(e){return y(e)?e.map((e=>g(e)?e:{url:e})):[g(e)?e:{url:e}]}function O(e,t,r){r.requestConfigs=t.map((t=>{let{url:n,timeout:o,proxy:i,maxRetry:s,priority:a}=t;return h(e.baseUrl)||(n=e.baseUrl+n),h(o)&&(o=h(r.timeout)?e.timeout:r.timeout),h(i)&&(h(r.proxy)?h(e.proxy)||(i=e.proxy):i=r.proxy),h(s)&&(s=h(r.maxRetry)?e.maxRetry:r.maxRetry),h(a)&&(a=0),{...t,url:n,timeout:o,proxy:i,maxRetry:s,priority:a}})),h(r.intervalTime)&&!h(e.intervalTime)&&(r.intervalTime=e.intervalTime)}function j(e){let t=null,r=null,o=!1,i=0;const s=new Map;async function a(e,r){const{id:n,requestConfig:o}=e,i=await t.newPage();await i.setViewport({width:1280,height:1024});let 
a=null;try{o.proxy?await t.createIncognitoBrowserContext({proxyServer:o.proxy}):await t.createIncognitoBrowserContext({proxyServer:void 0}),o.headers&&await i.setExtraHTTPHeaders(o.headers),o.cookies&&await i.setCookie(...function(e,t){const r=[];return"string"==typeof t?t.split("; ").forEach((t=>{const n=t.split("=");r.push({name:n[0],value:n[1],url:e})})):Array.isArray(t)?t.forEach((t=>{t.url||(t.url=e),r.push(t)})):"object"==typeof t&&t&&(t.url||(t.url=e),r.push(t)),r}(o.url,o.cookies)),a=await i.goto(o.url,{timeout:o.timeout})}catch(e){let t=s.get(r);throw t||(t=new Map,s.set(r,t)),t.get(n)||t.set(n,i),e}return{response:a,page:i}}return async function(u,c){const l=++i;o||(o=!0,r=n.launch().then((e=>{t=e}))),r&&(await r,r&&(r=null));const{requestConfigs:f,intervalTime:m}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:i,cookies:s,intervalTime:a,maxRetry:u}=t;r.proxy=o,r.cookies=s,r.intervalTime=a,r.maxRetry=u,r.timeout=i,n.push(...b(e))}else{const e=b(t);n.push(...e)}return O(e,n,r),h(r.cookies)||r.requestConfigs.forEach((e=>{const{cookies:t}=e;h(t)&&!h(r.cookies)&&(e.cookies=r.cookies)})),r}(e,u),p=(await q("page",e.mode,f,m,l,a)).map((e=>{const{id:r,isSuccess:n,maxRetry:o,crawlCount:i,errorQueue:a,crawlSingleRes:u}=e;let f=null;if(n&&u)f={browser:t,...u};else{const e=s.get(l).get(r);f={browser:t,response:null,page:e}}const m={id:r,isSuccess:n,maxRetry:o,crawlCount:i,retryCount:i-1,errorQueue:a,data:f};return c&&c(m),m}));return s.delete(l),y(u)||g(u)&&Object.hasOwn(u,"requestConfigs")?p:p[0]}}function k(e){return async function(t,r){const{requestConfigs:n,intervalTime:o}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:i,intervalTime:s,maxRetry:a}=t;r.proxy=o,r.intervalTime=s,r.maxRetry=a,r.timeout=i,n.push(...b(e))}else{const e=b(t);n.push(...b(e))}return O(e,n,r),r}(e,t),i=(await 
q("data",e.mode,n,o,void 0,T)).map((e=>{const{id:t,isSuccess:n,maxRetry:o,crawlCount:i,errorQueue:s,crawlSingleRes:a}=e,u={id:t,isSuccess:n,maxRetry:o,crawlCount:i,retryCount:i-1,errorQueue:s,data:null};if(n&&a){const e=(a.headers["content-type"]??"").includes("text")?a.data.toString():JSON.parse(a.data.toString());u.data={...a,data:e}}return r&&r(u),u}));return y(t)||g(t)&&Object.hasOwn(t,"requestConfigs")?i:i[0]}}function P(n){return async function(o,i){const{requestConfigs:s,intervalTime:a,fileConfig:u}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:i,intervalTime:s,maxRetry:a,fileConfig:u}=t;r.proxy=o,r.intervalTime=s,r.maxRetry=a,r.timeout=i,r.fileConfig=u,n.push(...b(e))}else n.push(...y(t)?t:[t]);return O(e,n,r),h(r.fileConfig?.storeDir)&&h(r.fileConfig?.extension)||r.requestConfigs.forEach((e=>{h(e.storeDir)&&!h(r.fileConfig?.storeDir)&&(e.storeDir=r.fileConfig.storeDir),h(e.extension)&&!h(r.fileConfig?.extension)&&(e.extension=r.fileConfig.extension)})),r}(n,o),l=await q("file",n.mode,s,a,void 0,T),p=[],d=[],w=l.map((n=>{const{id:o,isSuccess:s,maxRetry:a,crawlCount:c,errorQueue:l,crawlSingleRes:f,requestConfig:m}=n,g={id:o,isSuccess:s,maxRetry:a,crawlCount:c,retryCount:c-1,errorQueue:l,data:null};if(s&&f){const n=f.headers["content-type"]??"",s=m.fileName??`${o}-${(new Date).getTime()}`,a=m.extension??`.${n.split("/").pop()}`;h(m.storeDir)||e.existsSync(m.storeDir)||(y=m.storeDir,r.resolve(y).split(r.sep).reduce(((t,n,o)=>{const i=0!==o?r.join(t,n):n;return e.existsSync(i)||e.mkdirSync(i),i}),""));const c=m.storeDir??__dirname,l=r.resolve(c,s+a);let w=f.data;if(u?.beforeSave){const e=u.beforeSave({id:o,fileName:s,filePath:l,data:w});e&&(w=e)}const x=t.writeFile(l,w).catch((e=>{const t=`File save error at id ${o}: ${e.message}`;return d.push({message:t,valueOf:()=>o}),!0})).then((e=>{const 
t=f.data.length,r=!e;g.data={...f,data:{isSuccess:r,fileName:s,fileExtension:a,mimeType:n,size:t,filePath:l}},i&&i(g)}));p.push(x)}else i&&i(g);var y;return g}));var x;await Promise.all(p),(x=d,function e(t,r){if(t>=r)return;const n=x[r];let o=t,i=r-1;for(;o<=i;){for(;x[o]n;)i--;o<=i&&(S(x,o,i),o++,i--)}S(x,o,r),e(t,o-1),e(o+1,r)}(0,x.length-1),x).forEach((e=>c(m(e.message))));const C=[],v=[];return w.forEach((e=>{e.data?.data.isSuccess?C.push(e.id):v.push(e.id)})),c("Save file final result:"),c(f(` Success - total: ${C.length}, ids: [ ${C.join(" - ")} ]`)),c(m(` Error - total: ${v.length}, ids: [ ${v.join(" - ")} ]`)),y(o)||g(o)&&Object.hasOwn(o,"requestConfigs")?w:w[0]}}function E(e,t){const{d:r,h:n,m:o}=e,i=(h(r)?0:1e3*r*60*60*24)+(h(n)?0:1e3*n*60*60)+(h(o)?0:1e3*o*60);let s=0;u();const a=setInterval(u,i);function u(){console.log(f(`Start the ${p.bold(++s)} polling`)),t(s,c)}function c(){clearInterval(a),console.log(f("Stop the polling"))}}const D=function(e){const t=function(e){const t=e||{};return t.mode||(t.mode="async"),h(e?.timeout)&&(t.timeout=1e4),h(e?.maxRetry)&&(t.maxRetry=0),t}(e);return function(e){return{crawlPage:j(e),crawlData:k(e),crawlFile:P(e),startPolling:E}}(t)}({intervalTime:{max:3e3,min:2e3}});D.startPolling({d:1},(async(e,t)=>{const r=await D.crawlPage("https://www.bilibili.com/guochuang/"),{page:n}=r.data,o=await n.$$eval(".chief-recom-item img",(e=>e.map((e=>e.src))));await D.crawlFile({requestConfigs:o,fileConfig:{storeDir:"./upload"}}),n.close()})); +"use strict";var e=require("node:fs"),t=require("node:fs/promises"),r=require("node:path"),n=require("puppeteer"),o=require("chalk"),s=require("node:http"),i=require("node:https"),a=require("node:url"),u=require("https-proxy-agent");const c=console.log,l=o.hex("#a57fff"),f=o.green,m=o.red,p=o.yellow;function h(e){return void 0===e}function d(e){return"number"==typeof e}function g(e){return"object"==typeof e&&e&&!Array.isArray(e)}function y(e){return 
Array.isArray(e)}async function w(e,t,r,n){if(e&&n>1){const e=t?r:function(e,t=0){let r=Math.floor(Math.random()*e);for(;rsetTimeout(t,e)))}(e)}else c(`Id: ${l(n)} - Crawl does not need to sleep, send immediately`)}async function x(e,t,r,n){const o=!h(t),s=d(t),i=[];for(const a of e){const{id:e}=a;await w(o,s,t,e),a.crawlCount++;const u=n(a,r).catch((e=>(a.errorQueue.push(e),!1))).then((e=>{!1!==e&&(a.isSuccess=!0,a.crawlSingleRes=e)}));i.push(u)}await Promise.all(i)}async function C(e,t,r,n){const o=!h(t),s=d(t);for(const i of e){const{id:e}=i;await w(o,s,t,e),i.crawlCount++;try{i.crawlSingleRes=await n(i,r),i.isSuccess=!0}catch(e){i.errorQueue.push(e)}}}function S(e,t,r){const n=e[t];e[t]=e[r],e[r]=n}function v(e){if(1===e.length)return e;const t=Math.floor(e.length/2),r=v(e.slice(0,t)),n=v(e.slice(t)),o=[];let s=0,i=0;for(;s=n[i]?(o.push(r[s]),s++):(o.push(n[i]),i++);return se.priority===r[0].priority))?v(r.map((e=>({...e,valueOf:()=>e.priority})))):r).map(((e,t)=>({id:t+1,isSuccess:!1,maxRetry:e.maxRetry,crawlCount:0,errorQueue:[],requestConfig:e,crawlSingleRes:null})));c(`${f("Start crawling")} - name: ${p(e)}, mode: ${p(t)}, total: ${l(i.length)} `);const a="async"===t?x:C;let u=i;for(;u.length;)if(await a(u,n,o,s),u=u.filter((e=>e.maxRetry&&!e.isSuccess&&e.crawlCount<=e.maxRetry)),u.length){const e=u.map((e=>e.id));c(p(`Ids to retry: [ ${e.join(" - ")} ]`))}const h=[],d=[];return i.forEach((e=>{e.isSuccess?h.push(e.id):d.push(e.id)})),c("Crawl the final result:"),c(f(` Success - total: ${h.length}, ids: [ ${h.join(" - ")} ]`)),c(m(` Error - total: ${d.length}, ids: [ ${d.join(" - ")} ]`)),i}function R(e,t){let r=e?`${e}`:"?";if(t)for(const e in t){r+=`&${e}=${t[e]}`}else r=e;return r}function $(e){const{protocol:t,hostname:r,port:n,pathname:o,search:c}=new a.URL(e.url),l="http:"===t,f={agent:e.proxy?u(e.proxy):l?new s.Agent:new 
i.Agent,protocol:t,hostname:r,port:n,path:o,search:R(c,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const r={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(r["Content-Type"]="application/json",r["Content-Length"]=Buffer.byteLength(e.data)),r}(e,f),f}async function T(e){const{requestConfig:t}=e;return await(r=t,new Promise(((e,t)=>{const n=h(r.data);r.data=n?r.data:JSON.stringify(r.data);const o=$(r);function a(t){const{statusCode:r,headers:n}=t,o=[];t.on("data",(e=>o.push(e))),t.on("end",(()=>{const t=Buffer.concat(o);e({statusCode:r,headers:n,data:t})}))}let u;u="http:"===o.protocol?s.request(o,a):i.request(o,a),u.on("timeout",(()=>{t(new Error(`Timeout ${r.timeout}ms`))})),u.on("error",(e=>{t(e)})),"POST"!==o.method||n||u.write(r.data),u.end()})));var r}function b(e){return y(e)?e.map((e=>g(e)?e:{url:e})):[g(e)?e:{url:e}]}function O(e,t,r){r.requestConfigs=t.map((t=>{let{url:n,timeout:o,proxy:s,maxRetry:i,priority:a}=t;return h(e.baseUrl)||(n=e.baseUrl+n),h(o)&&(o=h(r.timeout)?e.timeout:r.timeout),h(s)&&(h(r.proxy)?h(e.proxy)||(s=e.proxy):s=r.proxy),h(i)&&(i=h(r.maxRetry)?e.maxRetry:r.maxRetry),h(a)&&(a=0),{...t,url:n,timeout:o,proxy:s,maxRetry:i,priority:a}})),h(r.intervalTime)&&!h(e.intervalTime)&&(r.intervalTime=e.intervalTime)}function j(e){let t=null,r=null,o=!1,s=0;const i=new Map;async function a(e,r){const{id:n,requestConfig:o}=e,s=await t.newPage();await s.setViewport({width:1280,height:1024});let a=null;try{o.proxy?await t.createIncognitoBrowserContext({proxyServer:o.proxy}):await t.createIncognitoBrowserContext({proxyServer:void 0}),o.headers&&await s.setExtraHTTPHeaders(o.headers),o.cookies&&await s.setCookie(...function(e,t){const r=[];return"string"==typeof t?t.split("; ").forEach((t=>{const 
n=t.split("=");r.push({name:n[0],value:n[1],url:e})})):Array.isArray(t)?t.forEach((t=>{t.url||(t.url=e),r.push(t)})):"object"==typeof t&&t&&(t.url||(t.url=e),r.push(t)),r}(o.url,o.cookies)),a=await s.goto(o.url,{timeout:o.timeout})}catch(e){let t=i.get(r);throw t||(t=new Map,i.set(r,t)),t.get(n)||t.set(n,s),e}return{response:a,page:s}}return async function(u,c){const l=++s;o||(o=!0,r=n.launch().then((e=>{t=e}))),r&&(await r,r&&(r=null));const{requestConfigs:f,intervalTime:m}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:s,cookies:i,intervalTime:a,maxRetry:u}=t;r.proxy=o,r.cookies=i,r.intervalTime=a,r.maxRetry=u,r.timeout=s,n.push(...b(e))}else{const e=b(t);n.push(...e)}return O(e,n,r),h(r.cookies)||r.requestConfigs.forEach((e=>{const{cookies:t}=e;h(t)&&!h(r.cookies)&&(e.cookies=r.cookies)})),r}(e,u),p=(await q("page",e.mode,f,m,l,a)).map((e=>{const{id:r,isSuccess:n,maxRetry:o,crawlCount:s,errorQueue:a,crawlSingleRes:u}=e;let f=null;if(n&&u)f={browser:t,...u};else{const e=i.get(l).get(r);f={browser:t,response:null,page:e}}const m={id:r,isSuccess:n,maxRetry:o,crawlCount:s,retryCount:s-1,errorQueue:a,data:f};return c&&c(m),m}));return i.delete(l),y(u)||g(u)&&Object.hasOwn(u,"requestConfigs")?p:p[0]}}function k(e){return async function(t,r){const{requestConfigs:n,intervalTime:o}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:s,intervalTime:i,maxRetry:a}=t;r.proxy=o,r.intervalTime=i,r.maxRetry=a,r.timeout=s,n.push(...b(e))}else{const e=b(t);n.push(...b(e))}return O(e,n,r),r}(e,t),s=(await q("data",e.mode,n,o,void 0,T)).map((e=>{const{id:t,isSuccess:n,maxRetry:o,crawlCount:s,errorQueue:i,crawlSingleRes:a}=e,u={id:t,isSuccess:n,maxRetry:o,crawlCount:s,retryCount:s-1,errorQueue:i,data:null};if(n&&a){const 
e=(a.headers["content-type"]??"").includes("text")?a.data.toString():JSON.parse(a.data.toString());u.data={...a,data:e}}return r&&r(u),u}));return y(t)||g(t)&&Object.hasOwn(t,"requestConfigs")?s:s[0]}}function P(n){return async function(o,s){const{requestConfigs:i,intervalTime:a,fileConfig:u}=function(e,t){const r={requestConfigs:[]},n=[];if(g(t)&&Object.hasOwn(t,"requestConfigs")){const{requestConfigs:e,proxy:o,timeout:s,intervalTime:i,maxRetry:a,fileConfig:u}=t;r.proxy=o,r.intervalTime=i,r.maxRetry=a,r.timeout=s,r.fileConfig=u,n.push(...b(e))}else n.push(...y(t)?t:[t]);return O(e,n,r),h(r.fileConfig?.storeDir)&&h(r.fileConfig?.extension)||r.requestConfigs.forEach((e=>{h(e.storeDir)&&!h(r.fileConfig?.storeDir)&&(e.storeDir=r.fileConfig.storeDir),h(e.extension)&&!h(r.fileConfig?.extension)&&(e.extension=r.fileConfig.extension)})),r}(n,o),l=await q("file",n.mode,i,a,void 0,T),p=[],d=[],w=l.map((n=>{const{id:o,isSuccess:i,maxRetry:a,crawlCount:c,errorQueue:l,crawlSingleRes:f,requestConfig:m}=n,g={id:o,isSuccess:i,maxRetry:a,crawlCount:c,retryCount:c-1,errorQueue:l,data:null};if(i&&f){const n=f.headers["content-type"]??"",i=m.fileName??`${o}-${(new Date).getTime()}`,a=m.extension??`.${n.split("/").pop()}`;h(m.storeDir)||e.existsSync(m.storeDir)||(y=m.storeDir,r.resolve(y).split(r.sep).reduce(((t,n,o)=>{const s=0!==o?r.join(t,n):n;return e.existsSync(s)||e.mkdirSync(s),s}),""));const c=m.storeDir??__dirname,l=r.resolve(c,i+a);let w=f.data;if(u?.beforeSave){const e=u.beforeSave({id:o,fileName:i,filePath:l,data:w});e&&(w=e)}const x=t.writeFile(l,w).catch((e=>{const t=`File save error at id ${o}: ${e.message}`;return d.push({message:t,valueOf:()=>o}),!0})).then((e=>{const t=f.data.length,r=!e;g.data={...f,data:{isSuccess:r,fileName:i,fileExtension:a,mimeType:n,size:t,filePath:l}},s&&s(g)}));p.push(x)}else s&&s(g);var y;return g}));var x;await Promise.all(p),(x=d,function e(t,r){if(t>=r)return;const n=x[r];let 
o=t,s=r-1;for(;o<=s;){for(;x[o]n;)s--;o<=s&&(S(x,o,s),o++,s--)}S(x,o,r),e(t,o-1),e(o+1,r)}(0,x.length-1),x).forEach((e=>c(m(e.message))));const C=[],v=[];return w.forEach((e=>{e.data?.data.isSuccess?C.push(e.id):v.push(e.id)})),c("Save file final result:"),c(f(` Success - total: ${C.length}, ids: [ ${C.join(" - ")} ]`)),c(m(` Error - total: ${v.length}, ids: [ ${v.join(" - ")} ]`)),y(o)||g(o)&&Object.hasOwn(o,"requestConfigs")?w:w[0]}}function E(e,t){const{d:r,h:n,m:o}=e,s=(h(r)?0:1e3*r*60*60*24)+(h(n)?0:1e3*n*60*60)+(h(o)?0:1e3*o*60);let i=0;u();const a=setInterval(u,s);function u(){console.log(f(`Start the ${p.bold(++i)} polling`)),t(i,c)}function c(){clearInterval(a),console.log(f("Stop the polling"))}}const D=function(e){const t=function(e){const t=e||{};return t.mode||(t.mode="async"),h(e?.timeout)&&(t.timeout=1e4),h(e?.maxRetry)&&(t.maxRetry=0),t}(e);return function(e){return{crawlPage:j(e),crawlData:k(e),crawlFile:P(e),startPolling:E}}(t)}({maxRetry:3,intervalTime:{max:3e3,min:2e3}});D.startPolling({d:1},(async(e,t)=>{const r=await D.crawlPage(["https://zh.airbnb.com/s/hawaii/experiences","https://zh.airbnb.com/s/hawaii/plus_homes"]),n=[],o=[".c14whb16",".a1stauiv"];for(const e of r){const{id:t}=e,{page:r}=e.data,s=await r.$(o[t-1]),i=await s.$$eval("picture img",(e=>e.map((e=>e.src))));n.push(...i),r.close()}D.crawlFile({requestConfigs:n,fileConfig:{storeDir:"./upload"}})})); diff --git a/test/start/index.ts b/test/start/index.ts index 5f2f7636..96336ae6 100644 --- a/test/start/index.ts +++ b/test/start/index.ts @@ -1,50 +1,42 @@ -// import path from 'node:path' -// import xCrawl from 'x-crawl' - -// const testXCrawl = xCrawl({ -// timeout: 10000, -// intervalTime: { max: 3000, min: 1000 }, -// proxy: 'http://localhost:14892' -// }) - -// testXCrawl -// .crawlData({ -// requestConfigs: [ -// { url: '/room/597664', priority: 3 }, -// { url: '/room/92507', priority: 8 }, -// { url: 
'/room/193581217', priority: 3 } -// ] -// }) -// .then((res) => { -// res.forEach((item) => { -// console.log(item.data?.data.data.id) -// }) -// }) - -// 1.导入模块 ES/CJS +// 1.Import module ES/CJS import xCrawl from 'x-crawl' -// 2.创建一个爬虫实例 -const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 2000 } }) +// 2.Create a crawler instance +const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 3000, min: 2000 } }) -// 3.设置爬取任务 -// 调用 startPolling API 开始轮询功能,每隔一天会调用回调函数 +// 3.Set the crawling task +/* + Call the startPolling API to start the polling function, + and the callback function will be called once a day +*/ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { - // 调用 crawlPage API 爬取 Page - const res = await myXCrawl.crawlPage('https://www.bilibili.com/guochuang/') - const { page } = res.data - - // 设置请求配置,获取轮播图片的 URL - const requestConfigs = await page.$$eval('.chief-recom-item img', (imgEls) => - imgEls.map((item) => item.src) - ) - - // 调用 crawlFile API 爬取图片 - await myXCrawl.crawlFile({ - requestConfigs, + // Call crawlPage API to crawl Page + const res = await myXCrawl.crawlPage([ + 'https://zh.airbnb.com/s/hawaii/experiences', + 'https://zh.airbnb.com/s/hawaii/plus_homes' + ]) + + // Store the image URLs + const imgUrls: string[] = [] + const elSelectorMap = ['.c14whb16', '.a1stauiv'] + for (const item of res) { + const { id } = item + const { page } = item.data + + // Get the URLs of the page's carousel images + const boxHandle = await page.$(elSelectorMap[id - 1]) + const urls = await boxHandle!.$$eval('picture img', (imgEls) => { + return imgEls.map((item) => item.src) + }) + imgUrls.push(...urls) + + // Close page + page.close() + } + + // Call the crawlFile API to crawl pictures + myXCrawl.crawlFile({ + requestConfigs: imgUrls, fileConfig: { storeDir: './upload' } }) - - // 关闭页面 - page.close() })