diff --git a/README.md b/README.md index 0ed27dab..bf7cc8f2 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ XCrawl is a Nodejs multifunctional crawler library. - Crawl HTML, JSON, file resources, etc. with simple configuration - Use the JSDOM library to parse HTML, or parse HTML by yourself -- Optional mode asynchronous/synchronous for batch requests +- The request method supports asynchronous/synchronous +- Support Promise/Callback - Polling function - Anthropomorphic request interval - Written in TypeScript @@ -47,6 +48,7 @@ XCrawl is a Nodejs multifunctional crawler library. * [IFetchFileConfig](#IFetchFileConfig) * [IFetchPollingConfig](#IFetchPollingConfig) * [IFetchCommon](#IFetchCommon) + * [IFetchCommonArr](#IFetchCommonArr) * [IFileInfo](#IFileInfo) * [IFetchHTML](#IFetchHTML) - [More](#More) @@ -92,10 +94,26 @@ Create a crawler instance via new XCrawl. The request queue is maintained by the ```ts class XCrawl { constructor(baseConfig?: IXCrawlBaseConifg) - fetchHTML(config: IFetchHTMLConfig): Promise - fetchData(config: IFetchDataConfig): Promise> - fetchFile(config: IFetchFileConfig): Promise> - fetchPolling(config: IFetchPollingConfig, callback: (count: number) => void): void + + fetchHTML( + config: IFetchHTMLConfig, + callback?: (res: IFetchHTML) => void + ): Promise + + fetchData( + config: IFetchDataConfig, + callback?: (res: IFetchCommon) => void + ): Promise> + + fetchFile( + config: IFetchFileConfig, + callback?: (res: IFetchCommon) => void + ): Promise> + + fetchPolling( + config: IFetchPollingConfig, + callback: (count: number) => void + ): void } ``` @@ -142,7 +160,10 @@ fetchHTML is the method of the above [myXCrawl](https://github.com/coder-hxl/x-c #### Type ```ts -function fetchHTML(config: IFetchHTMLConfig): Promise +fetchHTML( + config: IFetchHTMLConfig, + callback?: (res: IFetchHTML) => void +): Promise ``` #### Example @@ -161,7 +182,10 @@ fetchData is the method of the above [myXCrawl](#Example-1) instance, 
which is u #### Type ```ts -function fetchData(config: IFetchDataConfig): Promise> +fetchData( + config: IFetchDataConfig, + callback?: (res: IFetchCommon) => void +): Promise> ``` #### Example @@ -188,7 +212,10 @@ fetchFile is the method of the above [myXCrawl](#Example-1) instance, which is u #### Type ```ts -function fetchFile(config: IFetchFileConfig): Promise> +fetchFile( + config: IFetchFileConfig, + callback?: (res: IFetchCommon) => void +): Promise> ``` #### Example @@ -331,12 +358,18 @@ interface IFetchPollingConfig { ### IFetchCommon ```ts -type IFetchCommon = { +interface IFetchCommon { id: number statusCode: number | undefined - headers: IncomingHttpHeaders // node:http type + headers: IncomingHttpHeaders // node:http type data: T -}[] +} +``` + +### IFetchCommonArr + +```ts +type IFetchCommonArr = IFetchCommon[] ``` ### IFileInfo diff --git a/document/cn.md b/document/cn.md index f9dbdc5d..ccd3763d 100644 --- a/document/cn.md +++ b/document/cn.md @@ -8,7 +8,8 @@ XCrawl 是 Nodejs 多功能爬虫库。 - 只需简单的配置即可抓取 HTML 、JSON、文件资源等等 - 使用 JSDOM 库对 HTML 解析,也可自行解析 HTML -- 批量请求时可选择模式 异步/同步 +- 请求方式支持 异步/同步 +- 支持 Promise/Callback - 轮询功能 - 拟人化的请求间隔时间 - 使用 TypeScript 编写 @@ -47,6 +48,7 @@ XCrawl 是 Nodejs 多功能爬虫库。 * [IFetchFileConfig](#IFetchFileConfig) * [IFetchPollingConfig](#IFetchPollingConfig) * [IFetchCommon](#IFetchCommon) + * [IFetchCommonArr](#IFetchCommonArr) * [IFileInfo](#IFileInfo) * [IFetchHTML](#IFetchHTML) - [更多](#更多) @@ -104,10 +106,26 @@ myXCrawl.fetchPolling({ d: 1 }, () => { ```ts class XCrawl { constructor(baseConfig?: IXCrawlBaseConifg) - fetchHTML(config: IFetchHTMLConfig): Promise - fetchData(config: IFetchDataConfig): Promise> - fetchFile(config: IFetchFileConfig): Promise> - fetchPolling(config: IFetchPollingConfig, callback: (count: number) => void): void + + fetchHTML( + config: IFetchHTMLConfig, + callback?: (res: IFetchHTML) => void + ): Promise + + fetchData( + config: IFetchDataConfig, + callback?: (res: IFetchCommon) => void + ): Promise> + + 
fetchFile( + config: IFetchFileConfig, + callback?: (res: IFetchCommon) => void + ): Promise> + + fetchPolling( + config: IFetchPollingConfig, + callback: (count: number) => void + ): void } ``` @@ -154,7 +172,10 @@ fetchHTML 是 [myXCrawl](https://github.com/coder-hxl/x-crawl/blob/main/document #### 类型 ```ts -function fetchHTML(config: IFetchHTMLConfig): Promise +fetchHTML( + config: IFetchHTMLConfig, + callback?: (res: IFetchHTML) => void +): Promise ``` #### 示例 @@ -173,7 +194,10 @@ fetch 是 [myXCrawl](#示例-1) 实例的方法,通常用于爬取 API ,可 #### 类型 ```ts -function fetchData(config: IFetchDataConfig): Promise> +fetchData( + config: IFetchDataConfig, + callback?: (res: IFetchCommon) => void +): Promise> ``` #### 示例 @@ -200,7 +224,10 @@ fetchFile 是 [myXCrawl](#示例-1) 实例的方法,通常用于爬取文件 #### 类型 ```ts -function fetchFile(config: IFetchFileConfig): Promise> +fetchFile( + config: IFetchFileConfig, + callback?: (res: IFetchCommon) => void +): Promise> ``` #### 示例 @@ -343,12 +370,18 @@ interface IFetchPollingConfig { ### IFetchCommon ```ts -type IFetchCommon = { +interface IFetchCommon { id: number statusCode: number | undefined headers: IncomingHttpHeaders // node:http 类型 data: T -}[] +} +``` + +### IFetchCommonArr + +```ts +type IFetchCommonArr = IFetchCommon[] ``` ### IFileInfo diff --git a/package.json b/package.json index 5967a80a..b179186a 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "x-crawl", - "version": "0.4.0", + "version": "1.0.0", "author": "CoderHxl", "description": "XCrawl is a Nodejs multifunctional crawler library.", "license": "MIT", diff --git a/publish/README.md b/publish/README.md index 0ed27dab..bf7cc8f2 100644 --- a/publish/README.md +++ b/publish/README.md @@ -8,7 +8,8 @@ XCrawl is a Nodejs multifunctional crawler library. - Crawl HTML, JSON, file resources, etc. 
with simple configuration - Use the JSDOM library to parse HTML, or parse HTML by yourself -- Optional mode asynchronous/synchronous for batch requests +- The request method supports asynchronous/synchronous +- Support Promise/Callback - Polling function - Anthropomorphic request interval - Written in TypeScript @@ -47,6 +48,7 @@ XCrawl is a Nodejs multifunctional crawler library. * [IFetchFileConfig](#IFetchFileConfig) * [IFetchPollingConfig](#IFetchPollingConfig) * [IFetchCommon](#IFetchCommon) + * [IFetchCommonArr](#IFetchCommonArr) * [IFileInfo](#IFileInfo) * [IFetchHTML](#IFetchHTML) - [More](#More) @@ -92,10 +94,26 @@ Create a crawler instance via new XCrawl. The request queue is maintained by the ```ts class XCrawl { constructor(baseConfig?: IXCrawlBaseConifg) - fetchHTML(config: IFetchHTMLConfig): Promise - fetchData(config: IFetchDataConfig): Promise> - fetchFile(config: IFetchFileConfig): Promise> - fetchPolling(config: IFetchPollingConfig, callback: (count: number) => void): void + + fetchHTML( + config: IFetchHTMLConfig, + callback?: (res: IFetchHTML) => void + ): Promise + + fetchData( + config: IFetchDataConfig, + callback?: (res: IFetchCommon) => void + ): Promise> + + fetchFile( + config: IFetchFileConfig, + callback?: (res: IFetchCommon) => void + ): Promise> + + fetchPolling( + config: IFetchPollingConfig, + callback: (count: number) => void + ): void } ``` @@ -142,7 +160,10 @@ fetchHTML is the method of the above [myXCrawl](https://github.com/coder-hxl/x-c #### Type ```ts -function fetchHTML(config: IFetchHTMLConfig): Promise +fetchHTML( + config: IFetchHTMLConfig, + callback?: (res: IFetchHTML) => void +): Promise ``` #### Example @@ -161,7 +182,10 @@ fetchData is the method of the above [myXCrawl](#Example-1) instance, which is u #### Type ```ts -function fetchData(config: IFetchDataConfig): Promise> +fetchData( + config: IFetchDataConfig, + callback?: (res: IFetchCommon) => void +): Promise> ``` #### Example @@ -188,7 
+212,10 @@ fetchFile is the method of the above [myXCrawl](#Example-1) instance, which is u #### Type ```ts -function fetchFile(config: IFetchFileConfig): Promise> +fetchFile( + config: IFetchFileConfig, + callback?: (res: IFetchCommon) => void +): Promise> ``` #### Example @@ -331,12 +358,18 @@ interface IFetchPollingConfig { ### IFetchCommon ```ts -type IFetchCommon = { +interface IFetchCommon { id: number statusCode: number | undefined - headers: IncomingHttpHeaders // node:http type + headers: IncomingHttpHeaders // node:http type data: T -}[] +} +``` + +### IFetchCommonArr + +```ts +type IFetchCommonArr = IFetchCommon[] ``` ### IFileInfo diff --git a/publish/package.json b/publish/package.json index c6611155..5b00866d 100644 --- a/publish/package.json +++ b/publish/package.json @@ -1,6 +1,6 @@ { "name": "x-crawl", - "version": "0.4.0", + "version": "1.0.0", "author": "CoderHxl", "description": "XCrawl is a Nodejs multifunctional crawler library.", "license": "MIT", diff --git a/src/index.ts b/src/index.ts index cf146c75..24b2d8ef 100644 --- a/src/index.ts +++ b/src/index.ts @@ -21,12 +21,13 @@ import { IFetchFileConfig, IFetchPollingConfig, IFetchBaseConifg, - IFetchCommon, IFileInfo, IFetchHTML, IRequestResItem, IRequestConfig, - IIntervalTime + IIntervalTime, + IFetchCommon, + IFetchCommonArr } from './types' export default class XCrawl { @@ -73,23 +74,24 @@ export default class XCrawl { private async useBatchRequestByMode( requestConifg: IRequestConfig | IRequestConfig[], - intervalTime: IIntervalTime | undefined + intervalTime: IIntervalTime | undefined, + callback: (requestResItem: IRequestResItem) => void ) { const requestConfigQueue = isArray(requestConifg) ? 
requestConifg : [requestConifg] - let requestRes: IRequestResItem[] = [] if (this.baseConfig.mode !== 'sync') { - requestRes = await batchRequest(requestConfigQueue, intervalTime) + await batchRequest(requestConfigQueue, intervalTime, callback) } else { - requestRes = await syncBatchRequest(requestConfigQueue, intervalTime) + await syncBatchRequest(requestConfigQueue, intervalTime, callback) } - - return requestRes } - async fetchHTML(config: IFetchHTMLConfig): Promise { + async fetchHTML( + config: IFetchHTMLConfig, + callback?: (res: IFetchHTML) => void + ): Promise { const { requestConifg } = this.mergeConfig({ requestConifg: isString(config) ? { url: config } : config }) @@ -105,44 +107,50 @@ export default class XCrawl { } } + if (callback) { + callback(res) + } + return res } - async fetchData(config: IFetchDataConfig): Promise> { + async fetchData( + config: IFetchDataConfig, + callback?: (res: IFetchCommon) => void + ): Promise> { const { requestConifg, intervalTime } = this.mergeConfig(config) - const requestRes = await this.useBatchRequestByMode( - requestConifg, - intervalTime - ) - - const container: IFetchCommon = [] - - requestRes.forEach((item) => { - const contentType = item.headers['content-type'] ?? '' - const rawData = item.data + const container: IFetchCommonArr = [] + function handleResItem(requestResItem: IRequestResItem) { + const contentType = requestResItem.headers['content-type'] ?? '' + const rawData = requestResItem.data const data = contentType.includes('text') ? 
rawData.toString() : JSON.parse(rawData.toString()) - container.push({ ...item, data }) - }) + const itemRes = { ...requestResItem, data } + + if (callback) { + callback(itemRes) + } + + container.push(itemRes) + } + + await this.useBatchRequestByMode(requestConifg, intervalTime, handleResItem) return container } - async fetchFile(config: IFetchFileConfig): Promise> { + async fetchFile( + config: IFetchFileConfig, + callback?: (res: IFetchCommon) => void + ): Promise> { const { requestConifg, intervalTime, fileConfig } = this.mergeConfig(config) - const requestRes = await this.useBatchRequestByMode( - requestConifg, - intervalTime - ) - - const container: IFetchCommon = [] - - requestRes.forEach((requestResItem) => { + const container: IFetchCommonArr = [] + function handleResItem(requestResItem: IRequestResItem) { const { id, headers, data } = requestResItem const mimeType = headers['content-type'] ?? '' @@ -156,16 +164,24 @@ export default class XCrawl { try { fs.writeFileSync(filePath, data) - container.push({ + const res = { ...requestResItem, data: { fileName, mimeType, size: data.length, filePath } - }) + } + + if (callback) { + callback(res) + } + + container.push(res) } catch (error: any) { log(logError(`File save error at id ${id}: ${error.message}`)) } - }) + } + + await this.useBatchRequestByMode(requestConifg, intervalTime, handleResItem) - const saveTotal = requestRes.length + const saveTotal = isArray(requestConifg) ? 
requestConifg.length : 1 const success = container.length const error = saveTotal - success log( @@ -188,12 +204,12 @@ export default class XCrawl { const total = year + month + day + hour + minute let count = 0 - function cb() { + function startCallback() { console.log(logWarn(`Start the ${logWarn.bold(++count)} polling`)) callback(count) } - cb() - setInterval(cb, total) + startCallback() + setInterval(startCallback, total) } } diff --git a/src/request.ts b/src/request.ts index cf368315..d4b617c9 100644 --- a/src/request.ts +++ b/src/request.ts @@ -163,7 +163,8 @@ async function useSleepByBatch( export async function batchRequest( requestConifgs: IRequestConfig[], - intervalTime: IIntervalTime | undefined + intervalTime: IIntervalTime | undefined, + callback: (requestResItem: IRequestResItem) => void ) { const isHaveIntervalTime = !isUndefined(intervalTime) const isNumberIntervalTime = isNumber(intervalTime) @@ -172,9 +173,10 @@ export async function batchRequest( `Begin execution, mode: async, total: ${logNumber(requestConifgs.length)} ` ) - const requestQueue: Promise[] = [] - let index = 0 + let successTotal = 0 + let errorTotal = 0 + const requestQueue: Promise[] = [] for (const requestConifg of requestConifgs) { const id = ++index @@ -187,12 +189,14 @@ export async function batchRequest( const requestItem = request(requestConifg) .catch((error: any) => { + errorTotal++ return `Request ${id} is an error: ${error.message}` }) .then((requestRes) => { if (typeof requestRes === 'string') return requestRes - return { id, ...requestRes } + successTotal++ + callback({ id, ...requestRes }) }) requestQueue.push(requestItem) @@ -202,32 +206,20 @@ export async function batchRequest( const res = await Promise.all(requestQueue) - const success: IRequestResItem[] = [] - const error: string[] = [] - - // 通过类型分类 - res.forEach((item) => { - if (typeof item === 'string') { - return error.push(item) - } - - success.push(item) - }) - - error.forEach((message) => 
log(logError(message))) + // 打印错误消息 + res.forEach((item) => (item ? log(logError(item)) : '')) log( `requestsTotal: ${logNumber(requestConifgs.length)}, success: ${logSuccess( - success.length - )}, error: ${logError(error.length)}` + successTotal + )}, error: ${logError(errorTotal)}` ) - - return success } export async function syncBatchRequest( requestConifgs: IRequestConfig[], - intervalTime: IIntervalTime | undefined + intervalTime: IIntervalTime | undefined, + callback: (requestResItem: IRequestResItem) => void ) { const isHaveIntervalTime = !isUndefined(intervalTime) const isNumberIntervalTime = isNumber(intervalTime) @@ -239,7 +231,6 @@ export async function syncBatchRequest( let id = 0 let successTotal = 0 let errorTotal = 0 - const requestRes: IRequestResItem[] = [] for (const requestConifg of requestConifgs) { id++ @@ -250,15 +241,22 @@ export async function syncBatchRequest( id ) + let isRequestSuccess = true + let requestResItem: IRequestResItem | null = null try { - const requestResItem = await request(requestConifg) - requestRes.push({ id, ...requestResItem }) + const requestRes = await request(requestConifg) + requestResItem = { id, ...requestRes } log(logSuccess(`Request ${logNumber(id)} is an success`)) successTotal++ } catch (error: any) { + isRequestSuccess = false log(logError(`Request ${id} is an error: ${error.message}`)) errorTotal++ } + + if (isRequestSuccess && callback) { + callback(requestResItem as IRequestResItem) + } } log(logSuccess('All requests are over!')) @@ -268,6 +266,4 @@ export async function syncBatchRequest( successTotal )}, error: ${logError(errorTotal)}` ) - - return requestRes } diff --git a/src/types.ts b/src/types.ts index 8b264ac3..fc2ae68e 100644 --- a/src/types.ts +++ b/src/types.ts @@ -89,12 +89,14 @@ export interface IFetchPollingConfig { m?: number } -export type IFetchCommon = { +export interface IFetchCommon { id: number statusCode: number | undefined headers: IncomingHttpHeaders data: T -}[] +} + +export type 
IFetchCommonArr = IFetchCommon[] export interface IFileInfo { fileName: string diff --git a/test/start/index.js b/test/start/index.js index 4b8bb0ae..199e1f7a 100644 --- a/test/start/index.js +++ b/test/start/index.js @@ -1 +1 @@ -"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),r=require("node:https"),s=require("node:url"),a=require("https-proxy-agent"),i=require("chalk");const c=console.log,u=i.hex("#a57fff"),l=i.green,h=i.red,f=i.yellow;function d(e){return void 0===e}function g(e){return"number"==typeof e}function m(e){return Array.isArray(e)}function p(e,t){let o=e?`${e}`:"?";if(t)for(const e in t){o+=`&${e}=${t[e]}`}else o=e;return o}function y(e){const{protocol:t,hostname:o,port:i,pathname:c,search:u}=new s.URL(e.url),l="http:"===t,h={agent:e.proxy?a(e.proxy):l?new n.Agent:new r.Agent,protocol:t,hostname:o,port:i,path:c,search:p(u,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return h.headers=function(e,t){const o={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(o["Content-Type"]="application/json",o["Content-Length"]=Buffer.byteLength(e.data)),o}(e,h),h}function q(e){return new Promise(((t,o)=>{const s=d(e.data);e.data=s?e.data:JSON.stringify(e.data);const a=y(e);function i(e){const{statusCode:o,headers:n}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:o,headers:n,data:e})}))}let c;c="http:"===a.protocol?n.request(a,i):r.request(a,i),c.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),c.on("error",(e=>{o(e)})),"POST"!==a.method||s||c.write(e.data),c.end()}))}async function w(e,t,o,n){if(e&&n>1){const e=t?o:function(e,t=0){let o=Math.floor(Math.random()*e);for(;osetTimeout(t,e)))}(e)}else c(`Request ${u(n)} does not need to sleep, send immediately`)}const $=new 
class{baseConfig;constructor(e={}){this.baseConfig=e}mergeConfig(e){const t=this.baseConfig,o=structuredClone(e),n=m(o.requestConifg)?o.requestConifg:[o.requestConifg];for(const e of n){const{url:o,timeout:n,proxy:r}=e;d(t.baseUrl)||(e.url=t.baseUrl+o),d(n)&&(e.timeout=t.timeout),d(r)&&(e.proxy=t.proxy)}return d(o.intervalTime)&&(o.intervalTime=t.intervalTime),o}async useBatchRequestByMode(e,t){const o=m(e)?e:[e];let n=[];return n="sync"!==this.baseConfig.mode?await async function(e,t){const o=!d(t),n=g(t);c(`Begin execution, mode: async, total: ${u(e.length)} `);const r=[];let s=0;for(const a of e){const e=++s;await w(o,n,t,e);const i=q(a).catch((t=>`Request ${e} is an error: ${t.message}`)).then((t=>"string"==typeof t?t:{id:e,...t}));r.push(i)}c(l("All requests have been sent!"));const a=await Promise.all(r),i=[],f=[];return a.forEach((e=>{if("string"==typeof e)return f.push(e);i.push(e)})),f.forEach((e=>c(h(e)))),c(`requestsTotal: ${u(e.length)}, success: ${l(i.length)}, error: ${h(f.length)}`),i}(o,t):await async function(e,t){const o=!d(t),n=g(t);c(`Begin execution, mode: sync, total: ${u(e.length)} `);let r=0,s=0,a=0;const i=[];for(const f of e){r++,await w(o,n,t,r);try{const e=await q(f);i.push({id:r,...e}),c(l(`Request ${u(r)} is an success`)),s++}catch(e){c(h(`Request ${r} is an error: ${e.message}`)),a++}}return c(l("All requests are over!")),c(`requestsTotal: ${u(e.length)}, success: ${l(s)}, error: ${h(a)}`),i}(o,t),n}async fetchHTML(e){const{requestConifg:t}=this.mergeConfig({requestConifg:(n=e,"string"==typeof n?{url:e}:e)});var n;const r=await q(t),s=r.data.toString();return{...r,data:{html:s,jsdom:new o.JSDOM(s)}}}async fetchData(e){const{requestConifg:t,intervalTime:o}=this.mergeConfig(e),n=await this.useBatchRequestByMode(t,o),r=[];return n.forEach((e=>{const t=e.headers["content-type"]??"",o=e.data,n=t.includes("text")?o.toString():JSON.parse(o.toString());r.push({...e,data:n})})),r}async 
fetchFile(o){const{requestConifg:n,intervalTime:r,fileConfig:s}=this.mergeConfig(o),a=await this.useBatchRequestByMode(n,r),i=[];a.forEach((o=>{const{id:n,headers:r,data:a}=o,u=r["content-type"]??"",l=s.extension??u.split("/").pop(),f=(new Date).getTime().toString(),d=e.resolve(s.storeDir,`${f}.${l}`);try{t.writeFileSync(d,a),i.push({...o,data:{fileName:f,mimeType:u,size:a.length,filePath:d}})}catch(e){c(h(`File save error at id ${n}: ${e.message}`))}}));const f=a.length,d=i.length,g=f-d;return c(`saveTotal: ${u(f)}, success: ${l(d)}, error: ${h(g)}`),i}fetchPolling(e,t){const{Y:o,M:n,d:r,h:s,m:a}=e,i=(d(o)?0:1e3*o*60*60*24*365)+(d(n)?0:1e3*n*60*60*24*30)+(d(r)?0:1e3*r*60*60*24)+(d(s)?0:1e3*s*60*60)+(d(a)?0:1e3*a*60);let c=0;function u(){console.log(f(`Start the ${f.bold(++c)} polling`)),t(c)}u(),setInterval(u,i)}}({timeout:1e4,intervalTime:{max:2e3,min:1e3},mode:"async"});$.fetchHTML({url:"https://www.google.com.hk/",proxy:"http://127.0.0.1:14892"}).then((t=>{console.log(t.statusCode);const{jsdom:o}=t.data,n=o.window.document.querySelector(".lnXdpd");$.fetchFile({requestConifg:{url:"https://www.google.com.hk/"+n.src,proxy:"http://127.0.0.1:14892"},fileConfig:{storeDir:e.resolve(__dirname,"./upload"),extension:"jpg"}})})); +"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),s=require("node:https"),r=require("node:url"),a=require("https-proxy-agent"),i=require("chalk");const c=console.log,u=i.hex("#a57fff"),l=i.green,h=i.red,f=i.yellow;function d(e){return void 0===e}function m(e){return"number"==typeof e}function g(e){return Array.isArray(e)}function p(e,t){let o=e?`${e}`:"?";if(t)for(const e in t){o+=`&${e}=${t[e]}`}else o=e;return o}function y(e){const{protocol:t,hostname:o,port:i,pathname:c,search:u}=new r.URL(e.url),l="http:"===t,h={agent:e.proxy?a(e.proxy):l?new n.Agent:new 
s.Agent,protocol:t,hostname:o,port:i,path:c,search:p(u,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return h.headers=function(e,t){const o={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(o["Content-Type"]="application/json",o["Content-Length"]=Buffer.byteLength(e.data)),o}(e,h),h}function q(e){return new Promise(((t,o)=>{const r=d(e.data);e.data=r?e.data:JSON.stringify(e.data);const a=y(e);function i(e){const{statusCode:o,headers:n}=e,s=[];e.on("data",(e=>s.push(e))),e.on("end",(()=>{const e=Buffer.concat(s);t({statusCode:o,headers:n,data:e})}))}let c;c="http:"===a.protocol?n.request(a,i):s.request(a,i),c.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),c.on("error",(e=>{o(e)})),"POST"!==a.method||r||c.write(e.data),c.end()}))}async function $(e,t,o,n){if(e&&n>1){const e=t?o:function(e,t=0){let o=Math.floor(Math.random()*e);for(;osetTimeout(t,e)))}(e)}else c(`Request ${u(n)} does not need to sleep, send immediately`)}const w=new class{baseConfig;constructor(e={}){this.baseConfig=e}mergeConfig(e){const t=this.baseConfig,o=structuredClone(e),n=g(o.requestConifg)?o.requestConifg:[o.requestConifg];for(const e of n){const{url:o,timeout:n,proxy:s}=e;d(t.baseUrl)||(e.url=t.baseUrl+o),d(n)&&(e.timeout=t.timeout),d(s)&&(e.proxy=t.proxy)}return d(o.intervalTime)&&(o.intervalTime=t.intervalTime),o}async useBatchRequestByMode(e,t,o){const n=g(e)?e:[e];"sync"!==this.baseConfig.mode?await async function(e,t,o){const n=!d(t),s=m(t);c(`Begin execution, mode: async, total: ${u(e.length)} `);let r=0,a=0,i=0;const f=[];for(const c of e){const e=++r;await $(n,s,t,e);const u=q(c).catch((t=>(i++,`Request ${e} is an error: ${t.message}`))).then((t=>{if("string"==typeof t)return t;a++,o({id:e,...t})}));f.push(u)}c(l("All requests have been sent!")),(await 
Promise.all(f)).forEach((e=>e?c(h(e)):"")),c(`requestsTotal: ${u(e.length)}, success: ${l(a)}, error: ${h(i)}`)}(n,t,o):await async function(e,t,o){const n=!d(t),s=m(t);c(`Begin execution, mode: sync, total: ${u(e.length)} `);let r=0,a=0,i=0;for(const f of e){r++,await $(n,s,t,r);let e=!0,d=null;try{d={id:r,...await q(f)},c(l(`Request ${u(r)} is an success`)),a++}catch(t){e=!1,c(h(`Request ${r} is an error: ${t.message}`)),i++}e&&o&&o(d)}c(l("All requests are over!")),c(`requestsTotal: ${u(e.length)}, success: ${l(a)}, error: ${h(i)}`)}(n,t,o)}async fetchHTML(e,t){const{requestConifg:n}=this.mergeConfig({requestConifg:(s=e,"string"==typeof s?{url:e}:e)});var s;const r=await q(n),a=r.data.toString(),i={...r,data:{html:a,jsdom:new o.JSDOM(a)}};return t&&t(i),i}async fetchData(e,t){const{requestConifg:o,intervalTime:n}=this.mergeConfig(e),s=[];return await this.useBatchRequestByMode(o,n,(function(e){const o=e.headers["content-type"]??"",n=e.data,r=o.includes("text")?n.toString():JSON.parse(n.toString()),a={...e,data:r};t&&t(a),s.push(a)})),s}async fetchFile(o,n){const{requestConifg:s,intervalTime:r,fileConfig:a}=this.mergeConfig(o),i=[];await this.useBatchRequestByMode(s,r,(function(o){const{id:s,headers:r,data:u}=o,l=r["content-type"]??"",f=a.extension??l.split("/").pop(),d=(new Date).getTime().toString(),m=e.resolve(a.storeDir,`${d}.${f}`);try{t.writeFileSync(m,u);const e={...o,data:{fileName:d,mimeType:l,size:u.length,filePath:m}};n&&n(e),i.push(e)}catch(e){c(h(`File save error at id ${s}: ${e.message}`))}}));const f=g(s)?s.length:1,d=i.length,m=f-d;return c(`saveTotal: ${u(f)}, success: ${l(d)}, error: ${h(m)}`),i}fetchPolling(e,t){const{Y:o,M:n,d:s,h:r,m:a}=e,i=(d(o)?0:1e3*o*60*60*24*365)+(d(n)?0:1e3*n*60*60*24*30)+(d(s)?0:1e3*s*60*60*24)+(d(r)?0:1e3*r*60*60)+(d(a)?0:1e3*a*60);let c=0;function u(){console.log(f(`Start the ${f.bold(++c)} 
polling`)),t(c)}u(),setInterval(u,i)}}({timeout:1e4,intervalTime:{max:2e3,min:1e3},mode:"async"});w.fetchPolling({m:3},(()=>{w.fetchHTML("https://www.bilibili.com/guochuang/",(e=>{console.log("fetchHTML Callback: ",e.statusCode)})).then((t=>{const{jsdom:o}=t.data,n=[];o.window.document.querySelectorAll(".chief-recom-item").forEach((e=>n.push(e.querySelector("img").src)));const s=n.map((e=>({url:`https:${e}`})));s.pop(),w.fetchFile({requestConifg:s,fileConfig:{storeDir:e.resolve(__dirname,"./upload")}},(e=>{console.log(e.id,e.statusCode,e.data.fileName)}))}))})); diff --git a/test/start/index.ts b/test/start/index.ts index acb072c9..f42683c8 100644 --- a/test/start/index.ts +++ b/test/start/index.ts @@ -17,46 +17,30 @@ const testXCrawl = new XCrawl({ // ] // }) -// testXCrawl.fetchPolling({ m: 3 }, () => { -// testXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => { -// const { jsdom } = res.data - -// const imgSrc: string[] = [] -// const recomEls = jsdom.window.document.querySelectorAll('.chief-recom-item') -// recomEls.forEach((item) => imgSrc.push(item.querySelector('img')!.src)) - -// const requestConifg = imgSrc.map((src) => ({ url: `https:${src}` })) -// requestConifg.pop() - -// testXCrawl.fetchFile({ -// requestConifg, -// fileConfig: { storeDir: path.resolve(__dirname, './upload') } -// }) -// }) -// }) - -// 'http://127.0.0.1:14892' -testXCrawl - .fetchHTML({ - url: 'https://www.google.com.hk/', - proxy: 'http://127.0.0.1:14892' - }) - .then((res) => { - console.log(res.statusCode) - - const { jsdom } = res.data - - const imgEl = - jsdom.window.document.querySelector('.lnXdpd') - - testXCrawl.fetchFile({ - requestConifg: { - url: 'https://www.google.com.hk/' + imgEl!.src, - proxy: 'http://127.0.0.1:14892' - }, - fileConfig: { - storeDir: path.resolve(__dirname, './upload'), - 
extension: 'jpg' - } +testXCrawl.fetchPolling({ m: 3 }, () => { + testXCrawl + .fetchHTML('https://www.bilibili.com/guochuang/', (res) => { + console.log('fetchHTML Callback: ', res.statusCode) }) - }) + .then((res) => { + const { jsdom } = res.data + + const imgSrc: string[] = [] + const recomEls = + jsdom.window.document.querySelectorAll('.chief-recom-item') + recomEls.forEach((item) => imgSrc.push(item.querySelector('img')!.src)) + + const requestConifg = imgSrc.map((src) => ({ url: `https:${src}` })) + requestConifg.pop() + + testXCrawl.fetchFile( + { + requestConifg, + fileConfig: { storeDir: path.resolve(__dirname, './upload') } + }, + (res) => { + console.log(res.id, res.statusCode, res.data.fileName) + } + ) + }) +})