diff --git a/README.md b/README.md
index 3b664fad..94075467 100644
--- a/README.md
+++ b/README.md
@@ -6,13 +6,19 @@ x-crawl is a Nodejs multifunctional crawler library.
 
 ## Feature
 
-- Crawl HTML, JSON, file resources, etc. with simple configuration
-- Use puppeteer to crawl HTML, and use JSDOM library to parse HTML, or parse HTML by yourself
-- Support asynchronous/synchronous way to crawl data
-- Support Promise/Callback way to get the result
-- Polling function
-- Anthropomorphic request interval
-- Written in TypeScript, provides generics
+- Crawl HTML, JSON, file resources, etc. with simple configuration.
+- Use the built-in puppeteer to crawl HTML, and parse it with the JSDOM library.
+- Supports crawling data asynchronously or synchronously.
+- Supports getting results via Promise or Callback.
+- Polling function.
+- Anthropomorphic request interval.
+- Written in TypeScript, provides generics.
+
+## Benefits provided by using puppeteer
+
+- Generate screenshots and PDFs of pages.
+- Crawl a SPA (Single-Page Application) and generate pre-rendered content, i.e. "SSR" (Server-Side Rendering).
+- Automate form submission, UI testing, keyboard input, etc.
 
 # Table of Contents
 
@@ -41,14 +47,15 @@ x-crawl is a Nodejs multifunctional crawler library.
     * [Method](#Method)
     * [RequestConfig](#RequestConfig)
     * [IntervalTime](#IntervalTime)
-    * [FetchBaseConifg](#FetchBaseConifg)
     * [XCrawlBaseConifg](#XCrawlBaseConifg)
+    * [FetchBaseConifgV1](#FetchBaseConifgV1)
+    * [FetchBaseConifgV2](#FetchBaseConifgV2)
     * [FetchHTMLConfig](#FetchHTMLConfig )
-    * [FetchDataConfig](#FetchDataConfig)
+    * [FetchDataConfig](#FetchDataConfig)
     * [FetchFileConfig](#FetchFileConfig)
     * [StartPollingConfig](#StartPollingConfig)
-    * [FetchCommon](#FetchCommon)
-    * [FetchCommonArr](#FetchCommonArr)
+    * [FetchResCommonV1](#FetchResCommonV1)
+    * [FetchResCommonArrV1](#FetchResCommonArrV1)
     * [FileInfo](#FileInfo)
     * [FetchHTML](#FetchHTML)
 - [More](#More)
@@ -318,7 +325,6 @@ interface FetchBaseConifgV1 {
 ```ts
 interface FetchBaseConifgV2 {
   url: string
-  header?: AnyObject
   timeout?: number
   proxy?: string
 }
@@ -364,7 +370,7 @@ interface StartPollingConfig {
 interface FetchCommon<T> {
   id: number
   statusCode: number | undefined
-  headers: IncomingHttpHeaders // node: http type
+  headers: IncomingHttpHeaders // nodejs: http type
   data: T
 }
 ```
@@ -392,8 +398,7 @@ interface FileInfo {
 interface FetchHTML {
   httpResponse: HTTPResponse | null // The type of HTTPResponse in the puppeteer library
   data: {
-    page: Page
-    content: string
+    page: Page // The type of Page in the puppeteer library
     jsdom: JSDOM // The type of JSDOM in the jsdom library
   }
 }
diff --git a/assets/crawler-result.png b/assets/crawler-result.png
index 97813ce6..de8885ed 100644
Binary files a/assets/crawler-result.png and b/assets/crawler-result.png differ
diff --git a/assets/crawler.png b/assets/crawler.png
index b89679e3..67b5f2a4 100644
Binary files a/assets/crawler.png and b/assets/crawler.png differ
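The feature list above notes that crawled HTML comes back already parsed by the JSDOM library. A minimal sketch of what that looks like, assuming the v2.1.0 API shown in this diff (the URL is only illustrative):

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({ timeout: 10000 })

myXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => {
  // res.data.jsdom is a JSDOM instance, so the standard DOM API applies
  const { jsdom } = res.data
  console.log(jsdom.window.document.title)
})
```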
diff --git a/docs/cn.md b/docs/cn.md
index 5b955b8a..5d9f1813 100644
--- a/docs/cn.md
+++ b/docs/cn.md
@@ -6,13 +6,19 @@ x-crawl 是 Nodejs 多功能爬虫库。
 
 ## 特征
 
-- 只需简单的配置即可抓取 HTML 、JSON、文件资源等等
-- 使用 puppeteer 爬取 HTML ,并用 JSDOM 库对 HTML 解析,也可自行解析 HTML
-- 支持 异步/同步 方式爬取数据
-- 支持 Promise/Callback 方式获取结果
-- 轮询功能
-- 拟人化的请求间隔时间
-- 使用 TypeScript 编写,提供泛型
+- 只需简单的配置即可抓取 HTML 、JSON、文件资源等等。
+- 内置 puppeteer 爬取 HTML ,并用 JSDOM 库对 HTML 解析。
+- 支持 异步/同步 方式爬取数据。
+- 支持 Promise/Callback 方式获取结果。
+- 轮询功能。
+- 拟人化的请求间隔时间。
+- 使用 TypeScript 编写,提供泛型。
+
+## 使用 puppeteer 提供的好处
+
+- 生成页面的屏幕截图和 PDF。
+- 抓取 SPA(单页应用程序)并生成预渲染内容(即“SSR”(服务器端渲染))。
+- 自动化表单提交、UI 测试、键盘输入等。
 
 # 目录
 
@@ -41,14 +47,15 @@ x-crawl 是 Nodejs 多功能爬虫库。
     * [Method](#Method)
     * [RequestConfig](#RequestConfig)
     * [IntervalTime](#IntervalTime)
-    * [FetchBaseConifg](#FetchBaseConifg)
     * [XCrawlBaseConifg](#XCrawlBaseConifg)
+    * [FetchBaseConifgV1](#FetchBaseConifgV1)
+    * [FetchBaseConifgV2](#FetchBaseConifgV2)
     * [FetchHTMLConfig](#FetchHTMLConfig )
     * [FetchDataConfig](#FetchDataConfig)
     * [FetchFileConfig](#FetchFileConfig)
-    * [FetchPollingConfig](#FetchPollingConfig)
-    * [FetchCommon](#FetchCommon)
-    * [FetchCommonArr](#FetchCommonArr)
+    * [StartPollingConfig](#StartPollingConfig)
+    * [FetchResCommonV1](#FetchResCommonV1)
+    * [FetchResCommonArrV1](#FetchResCommonArrV1)
     * [FileInfo](#FileInfo)
     * [FetchHTML](#FetchHTML)
 - [更多](#更多)
@@ -63,7 +70,7 @@ npm install x-crawl
 
 ## 示例
 
-每隔一天就获取 bilibili 国漫主页的推荐轮播图片为例:
+每隔一天就获取 bilibili 国漫主页的轮播图片为例:
 
 ```js
 // 1.导入模块 ES/CJS
@@ -76,14 +83,14 @@ const myXCrawl = xCrawl({
 })
 
 // 3.设置爬取任务
-// 调用 fetchPolling API 开始轮询功能,每隔一天会调用回调函数
-myXCrawl.fetchPolling({ d: 1 }, () => {
+// 调用 startPolling API 开始轮询功能,每隔一天会调用回调函数
+myXCrawl.startPolling({ d: 1 }, () => {
   // 调用 fetchHTML API 爬取 HTML
   myXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => {
     const { jsdom } = res.data // 默认使用了 JSDOM 库解析 HTML
 
     // 获取轮播图片元素
-    const imgEls = jsdom.window.document.querySelectorAll('.chief-recom-item img')
+    const imgEls = jsdom.window.document.querySelectorAll('.carousel-wrapper .chief-recom-item img')
 
     // 设置请求配置
     const requestConifg = []
@@ -342,7 +349,6 @@ interface FetchBaseConifgV1 {
 ```ts
 interface FetchBaseConifgV2 {
   url: string
-  header?: AnyObject
   timeout?: number
   proxy?: string
 }
@@ -388,7 +394,7 @@ interface StartPollingConfig {
 interface FetchCommon<T> {
   id: number
   statusCode: number | undefined
-  headers: IncomingHttpHeaders // node: http 类型
+  headers: IncomingHttpHeaders // nodejs: http 类型
   data: T
 }
 ```
@@ -416,8 +422,7 @@ interface FileInfo {
 interface FetchHTML {
   httpResponse: HTTPResponse | null // puppeteer 库的 HTTPResponse 类型
   data: {
-    page: Page
-    content: string
+    page: Page // puppeteer 库的 Page 类型
     jsdom: JSDOM
   }
 }
diff --git a/package.json b/package.json
index 940d0d8b..bbc92cdc 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "private": true,
   "name": "x-crawl",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "author": "coderHXL",
   "description": "XCrawl is a Nodejs multifunctional crawler library.",
   "license": "MIT",
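The docs example above renames `fetchPolling` to `startPolling`. Its first argument takes `d`/`h`/`m` (days, hours, minutes) that are summed into a single interval, and the callback receives the current polling count. A short sketch, assuming the v2.1.0 API shown in this diff:

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

// { d: 1 } means a one-day interval; d/h/m values are added together
myXCrawl.startPolling({ d: 1 }, (count) => {
  console.log(`Polling round ${count}`)
})
```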
diff --git a/publish/README.md b/publish/README.md
index 3b664fad..94075467 100644
--- a/publish/README.md
+++ b/publish/README.md
@@ -6,13 +6,19 @@ x-crawl is a Nodejs multifunctional crawler library.
 
 ## Feature
 
-- Crawl HTML, JSON, file resources, etc. with simple configuration
-- Use puppeteer to crawl HTML, and use JSDOM library to parse HTML, or parse HTML by yourself
-- Support asynchronous/synchronous way to crawl data
-- Support Promise/Callback way to get the result
-- Polling function
-- Anthropomorphic request interval
-- Written in TypeScript, provides generics
+- Crawl HTML, JSON, file resources, etc. with simple configuration.
+- Use the built-in puppeteer to crawl HTML, and parse it with the JSDOM library.
+- Supports crawling data asynchronously or synchronously.
+- Supports getting results via Promise or Callback.
+- Polling function.
+- Anthropomorphic request interval.
+- Written in TypeScript, provides generics.
+
+## Benefits provided by using puppeteer
+
+- Generate screenshots and PDFs of pages.
+- Crawl a SPA (Single-Page Application) and generate pre-rendered content, i.e. "SSR" (Server-Side Rendering).
+- Automate form submission, UI testing, keyboard input, etc.
 
 # Table of Contents
 
@@ -41,14 +47,15 @@ x-crawl is a Nodejs multifunctional crawler library.
     * [Method](#Method)
     * [RequestConfig](#RequestConfig)
     * [IntervalTime](#IntervalTime)
-    * [FetchBaseConifg](#FetchBaseConifg)
     * [XCrawlBaseConifg](#XCrawlBaseConifg)
+    * [FetchBaseConifgV1](#FetchBaseConifgV1)
+    * [FetchBaseConifgV2](#FetchBaseConifgV2)
     * [FetchHTMLConfig](#FetchHTMLConfig )
-    * [FetchDataConfig](#FetchDataConfig)
+    * [FetchDataConfig](#FetchDataConfig)
     * [FetchFileConfig](#FetchFileConfig)
     * [StartPollingConfig](#StartPollingConfig)
-    * [FetchCommon](#FetchCommon)
-    * [FetchCommonArr](#FetchCommonArr)
+    * [FetchResCommonV1](#FetchResCommonV1)
+    * [FetchResCommonArrV1](#FetchResCommonArrV1)
     * [FileInfo](#FileInfo)
     * [FetchHTML](#FetchHTML)
 - [More](#More)
@@ -318,7 +325,6 @@ interface FetchBaseConifgV1 {
 ```ts
 interface FetchBaseConifgV2 {
   url: string
-  header?: AnyObject
   timeout?: number
   proxy?: string
 }
@@ -364,7 +370,7 @@ interface StartPollingConfig {
 interface FetchCommon<T> {
   id: number
   statusCode: number | undefined
-  headers: IncomingHttpHeaders // node: http type
+  headers: IncomingHttpHeaders // nodejs: http type
   data: T
 }
 ```
@@ -392,8 +398,7 @@ interface FileInfo {
 interface FetchHTML {
   httpResponse: HTTPResponse | null // The type of HTTPResponse in the puppeteer library
   data: {
-    page: Page
-    content: string
+    page: Page // The type of Page in the puppeteer library
     jsdom: JSDOM // The type of JSDOM in the jsdom library
   }
 }
diff --git a/publish/package.json b/publish/package.json
index 8872a949..a7382ba4 100644
--- a/publish/package.json
+++ b/publish/package.json
@@ -1,6 +1,6 @@
 {
   "name": "x-crawl",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "author": "coderHXL",
   "description": "XCrawl is a Nodejs multifunctional crawler library.",
   "license": "MIT",
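Both README diffs above drop the raw `content` string from `FetchHTML['data']`. Where the serialized HTML is still needed, it can be rebuilt from the returned JSDOM instance; a sketch under that assumption (`serialize` is the jsdom library's own API, not something this diff adds):

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({ timeout: 10000 })

myXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => {
  // jsdom.serialize() returns the document as an HTML string,
  // standing in for the removed `content` field
  const content = res.data.jsdom.serialize()
  console.log(content.length)
})
```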
diff --git a/src/api.ts b/src/api.ts
index 1e695864..0562deaa 100644
--- a/src/api.ts
+++ b/src/api.ts
@@ -29,11 +29,11 @@ import {
   IntervalTime,
   StartPollingConfig
 } from './types/api'
-import { XCrawlBaseConifg } from './types'
+import { LoaderXCrawlBaseConifg } from './types'
 import { RequestConfig, RequestResItem } from './types/request'
 
 function mergeConfig<T>(
-  baseConfig: XCrawlBaseConifg,
+  baseConfig: LoaderXCrawlBaseConifg,
   rawConfig: T
 ): T {
   const newConfig = structuredClone(rawConfig)
@@ -70,7 +70,7 @@
 }
 
 async function useBatchRequestByMode(
-  mode: 'async' | 'sync' | undefined,
+  mode: 'async' | 'sync',
   requestConifg: RequestConfig | RequestConfig[],
   intervalTime: IntervalTime | undefined,
   callback: (requestRestem: RequestResItem) => void
@@ -79,14 +79,14 @@
     ? requestConifg
     : [requestConifg]
 
-  if (mode !== 'sync') {
+  if (mode === 'async') {
     await batchRequest(requestConfigQueue, intervalTime, callback)
   } else {
     await syncBatchRequest(requestConfigQueue, intervalTime, callback)
   }
 }
 
-export function createFetchHTML(baseConfig: XCrawlBaseConifg) {
+export function createFetchHTML(baseConfig: LoaderXCrawlBaseConifg) {
   let browser: Browser | null = null
   let createBrowserState: Promise<void> | null = null
   let callTotal = 0
@@ -95,7 +95,7 @@ export function createFetchHTML(baseConfig: XCrawlBaseConifg) {
     config: FetchHTMLConfig,
     callback?: (res: FetchHTML) => void
  ): Promise<FetchHTML> {
-    // 记录调用次数, 为关闭浏览器
+    // 记录调用次数, 目的: 关闭浏览器
     callTotal++
 
     // 只创建一次浏览器
@@ -129,22 +129,20 @@ export function createFetchHTML(baseConfig: XCrawlBaseConifg) {
       })
     }
 
-    const httpResponse = await page!.goto(requestConifg.url)
+    const httpResponse = await page!.goto(requestConifg.url, {
+      timeout: requestConifg.timeout
+    })
 
     const content = await page!.content()
 
     // 关闭浏览器
     if (--callTotal === 0) {
-      await browser!.close()
+      browser!.close()
     }
 
     const res: FetchHTML = {
       httpResponse,
-      data: {
-        page,
-        content,
-        jsdom: new JSDOM(content)
-      }
+      data: { page, jsdom: new JSDOM(content) }
     }
 
     if (callback) {
@@ -157,7 +155,7 @@ export function createFetchHTML(baseConfig: XCrawlBaseConifg) {
   return fetchHTML
 }
 
-export function createFetchData(baseConfig: XCrawlBaseConifg) {
+export function createFetchData(baseConfig: LoaderXCrawlBaseConifg) {
   async function fetchData<T = any>(
     config: FetchDataConfig,
     callback?: (res: FetchResCommonV1<T>) => void
@@ -198,7 +196,7 @@ export function createFetchData(baseConfig: XCrawlBaseConifg) {
   return fetchData
 }
 
-export function createFetchFile(baseConfig: XCrawlBaseConifg) {
+export function createFetchFile(baseConfig: LoaderXCrawlBaseConifg) {
   async function fetchFile(
     config: FetchFileConfig,
     callback?: (res: FetchResCommonV1<FileInfo>) => void
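`useBatchRequestByMode` now receives a definite `'async' | 'sync'` (the default is filled in by `src/index.ts` below) and dispatches to `batchRequest` or `syncBatchRequest`. From the caller's side the two modes look like this; a sketch with illustrative URLs:

```js
import xCrawl from 'x-crawl'

// 'async': requests are fired without waiting for the previous response
const asyncCrawl = xCrawl({ mode: 'async', intervalTime: { max: 3000, min: 1000 } })

// 'sync': each request waits for the previous one to finish
const syncCrawl = xCrawl({ mode: 'sync', intervalTime: 2000 })

syncCrawl.fetchData({
  requestConifg: [{ url: 'https://example.com/api/1' }, { url: 'https://example.com/api/2' }]
})
```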
diff --git a/src/index.ts b/src/index.ts
index 5b664e6a..1d4be0cc 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -5,9 +5,25 @@ import {
   startPolling
 } from './api'
 
-import { XCrawlBaseConifg, XCrawlInstance } from './types'
+import {
+  LoaderXCrawlBaseConifg,
+  XCrawlBaseConifg,
+  XCrawlInstance
+} from './types'
+
+function loaderBaseConifg(
+  baseConfig: XCrawlBaseConifg | undefined
+): LoaderXCrawlBaseConifg {
+  const loaderBaseConfig = baseConfig ? baseConfig : {}
+
+  if (!loaderBaseConfig.mode) {
+    loaderBaseConfig.mode = 'async'
+  }
 
-function createnstance(baseConfig: XCrawlBaseConifg): XCrawlInstance {
+  return loaderBaseConfig as LoaderXCrawlBaseConifg
+}
+
+function createnInstance(baseConfig: LoaderXCrawlBaseConifg): XCrawlInstance {
   const instance: XCrawlInstance = {
     fetchHTML: createFetchHTML(baseConfig),
     fetchData: createFetchData(baseConfig),
@@ -18,10 +34,10 @@ function createnstance(baseConfig: XCrawlBaseConifg): XCrawlInstance {
   return instance
 }
 
-export default function xCrawl(
-  baseConfig: XCrawlBaseConifg = {}
-): XCrawlInstance {
-  const instance = createnstance(baseConfig)
+export default function xCrawl(baseConfig?: XCrawlBaseConifg): XCrawlInstance {
+  const newBaseConfig = loaderBaseConifg(baseConfig)
+
+  const instance = createnInstance(newBaseConfig)
 
   return instance
 }
diff --git a/src/request.ts b/src/request.ts
index 9c54d61e..a5d92f22 100644
--- a/src/request.ts
+++ b/src/request.ts
@@ -11,6 +11,7 @@ import {
   logError,
   logNumber,
   logSuccess,
+  logWarn,
   random,
   sleep
 } from './utils'
@@ -166,7 +167,9 @@ export async function batchRequest(
   const isNumberntervalTime = isNumber(intervalTime)
 
   log(
-    `Begin execution, mode: async, total: ${logNumber(requestConifgs.length)} `
+    `Begin execution, mode: ${logWarn('async')}, total: ${logNumber(
+      requestConifgs.length
+    )} `
   )
 
   let index = 0
@@ -227,7 +230,9 @@ export async function syncBatchRequest(
   const isNumberntervalTime = isNumber(intervalTime)
 
   log(
-    `Begin execution, mode: sync, total: ${logNumber(requestConifgs.length)} `
+    `Begin execution, mode: ${logWarn('sync')}, total: ${logNumber(
+      requestConifgs.length
+    )} `
   )
 
   let id = 0
diff --git a/src/types/api.ts b/src/types/api.ts
index 3c43a8ae..daae3fd0 100644
--- a/src/types/api.ts
+++ b/src/types/api.ts
@@ -3,7 +3,6 @@ import { HTTPResponse, Page } from 'puppeteer'
 import { JSDOM } from 'jsdom'
 
 import { RequestConfig } from './request'
-import { AnyObject } from './common'
 
 export type IntervalTime = number | { max: number; min?: number }
 
@@ -14,7 +13,6 @@ export interface FetchBaseConifgV1 {
 
 export interface FetchBaseConifgV2 {
   url: string
-  header?: AnyObject
   timeout?: number
   proxy?: string
 }
@@ -56,7 +54,6 @@ export interface FetchHTML {
   httpResponse: HTTPResponse | null
   data: {
     page: Page
-    content: string
     jsdom: JSDOM
   }
 }
diff --git a/src/types/index.ts b/src/types/index.ts
index 1bb30ce7..c43bf032 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -9,6 +9,7 @@ import {
   StartPollingConfig,
   IntervalTime
 } from './api'
+import { MapTypeObject } from './common'
 
 export interface XCrawlBaseConifg {
   baseUrl?: string
@@ -18,6 +19,13 @@ export interface XCrawlBaseConifg {
   proxy?: string
 }
 
+interface LoaderXCrawlBaseConifgValue {
+  mode: 'async' | 'sync'
+}
+
+export type LoaderXCrawlBaseConifg = XCrawlBaseConifg &
+  MapTypeObject<LoaderXCrawlBaseConifgValue>
+
 export interface XCrawlInstance {
   fetchHTML: (
     config: FetchHTMLConfig,
diff --git a/test/start/index.js b/test/start/index.js
index 8830bc11..569b7a40 100644
--- a/test/start/index.js
+++ b/test/start/index.js
@@ -1 +1 @@
-(minified test bundle: previous build output, omitted)
+(minified test bundle: rebuilt output, omitted)
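With `loaderBaseConifg` supplying `mode: 'async'`, the factory's `baseConfig` argument becomes optional; both call shapes below are valid under the v2.1.0 typings above:

```js
import xCrawl from 'x-crawl'

// No argument: loaderBaseConifg fills in mode: 'async'
const defaultCrawl = xCrawl()

// An explicit config still works as before
const tunedCrawl = xCrawl({ timeout: 10000, mode: 'sync' })
```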
diff --git a/test/start/index.ts b/test/start/index.ts
index 3d1beeed..e73dfc29 100644
--- a/test/start/index.ts
+++ b/test/start/index.ts
@@ -1,11 +1,11 @@
 import path from 'node:path'
 
-import XCrawl from '../../src'
+import xCrawl from '../../src'
 
-const testXCrawl = XCrawl({
-  timeout: 10000,
-  intervalTime: { max: 3000, min: 1000 },
-  mode: 'async'
-})
+// const testXCrawl = xCrawl({
+//   timeout: 10000,
+//   intervalTime: { max: 3000, min: 1000 },
+//   mode: 'async'
+// })
 
 // testXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => {
 //   const { jsdom } = res.data
@@ -28,15 +28,3 @@ const testXCrawl = XCrawl({
 //   fileConfig: { storeDir: path.resolve(__dirname, 'upload') }
 // })
-
-testXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => {
-  console.log(1)
-})
-
-testXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => {
-  console.log(2)
-})
-
-testXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => {
-  console.log(3)
-})
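The commented-out block in the test above exercised the file-saving flow; for reference, a compact sketch of it (the image URL is illustrative, the rest follows the documented `fetchFile` API):

```js
import path from 'node:path'
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })

myXCrawl.fetchFile({
  requestConifg: [{ url: 'https://example.com/image.png' }],
  // files are written to storeDir, named by timestamp with an extension
  // inferred from the response content-type
  fileConfig: { storeDir: path.resolve(__dirname, './upload') }
})
```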