diff --git a/README.md b/README.md index 88f51f39..e694d48a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ If it helps you, please give the [repository](https://github.com/coder-hxl/x-cra - The built-in puppeteer crawls the page, and uses the jsdom library to parse the page. - Support asynchronous/synchronous way to crawl data. - Support Promise/Callback method to get the result. -- Polling function, fixed-point crawling. +- Polling function, timing crawling. - Anthropomorphic request interval. - Written in TypeScript, providing generics. @@ -86,7 +86,7 @@ npm install x-crawl ## Example -Example of fetching featured video cover image for youtube homepage every other day: +Regular crawling: Get the recommended pictures of the youtube homepage every other day as an example: ```js // 1.Import module ES/CJS @@ -135,7 +135,7 @@ running result: -**Note:** Do not crawl randomly, here is just to demonstrate how to use x-crawl, and control the request frequency within 3000ms to 2000ms. +**Note:** Do not crawl at will, you can check the **robots.txt** protocol before crawling. This is just to demonstrate how to use x-crawl. ## Core concepts diff --git a/docs/cn.md b/docs/cn.md index 67493326..96661607 100644 --- a/docs/cn.md +++ b/docs/cn.md @@ -12,7 +12,7 @@ x-crawl 是 Nodejs 多功能爬虫库。 - 内置 puppeteer 爬取页面 ,并用采用 jsdom 库对页面解析。 - 支持 异步/同步 方式爬取数据。 - 支持 Promise/Callback 方式获取结果。 -- 轮询功能,定点爬取。 +- 轮询功能,定时爬取。 - 拟人化的请求间隔时间。 - 使用 TypeScript 编写,提供泛型。 @@ -85,7 +85,7 @@ npm install x-crawl ## 示例 -每隔一天就获取 bilibili 国漫主页的轮播图片为例: +定时爬取: 每隔一天就获取 bilibili 国漫主页的轮播图片为例: ```js // 1.导入模块 ES/CJS @@ -126,8 +126,7 @@ myXCrawl.startPolling({ d: 1 }, () => {
- -**注意:** 请勿随意爬取,这里只是为了演示如何使用 x-crawl ,并将请求频率控制在 3000ms 到 2000ms 内。 +**注意:** 请勿随意爬取,爬取前可查看 **robots.txt** 协议。这里只是为了演示如何使用 x-crawl 。 ## 核心概念 diff --git a/package.json b/package.json index d9ef57c0..5e524c5a 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "x-crawl", - "version": "2.4.0", + "version": "2.4.2", "author": "coderHXL", "description": "XCrawl is a Nodejs multifunctional crawler library.", "license": "MIT", diff --git a/publish/README.md b/publish/README.md index 88f51f39..e694d48a 100644 --- a/publish/README.md +++ b/publish/README.md @@ -12,7 +12,7 @@ If it helps you, please give the [repository](https://github.com/coder-hxl/x-cra - The built-in puppeteer crawls the page, and uses the jsdom library to parse the page. - Support asynchronous/synchronous way to crawl data. - Support Promise/Callback method to get the result. -- Polling function, fixed-point crawling. +- Polling function, timing crawling. - Anthropomorphic request interval. - Written in TypeScript, providing generics. @@ -86,7 +86,7 @@ npm install x-crawl ## Example -Example of fetching featured video cover image for youtube homepage every other day: +Regular crawling: Get the recommended pictures of the youtube homepage every other day as an example: ```js // 1.Import module ES/CJS @@ -135,7 +135,7 @@ running result: -**Note:** Do not crawl randomly, here is just to demonstrate how to use x-crawl, and control the request frequency within 3000ms to 2000ms. +**Note:** Do not crawl at will, you can check the **robots.txt** protocol before crawling. This is just to demonstrate how to use x-crawl. ## Core concepts diff --git a/publish/package.json b/publish/package.json index 65fe6bc8..eebe39ee 100644 --- a/publish/package.json +++ b/publish/package.json @@ -1,6 +1,6 @@ { "name": "x-crawl", - "version": "2.4.0", + "version": "2.4.2", "author": "coderHXL", "description": "XCrawl is a Nodejs multifunctional crawler library.", "license": "MIT", diff --git a/src/api.ts b/src/api.ts index 883d1f9c..0083d853 100644 --- a/src/api.ts +++ b/src/api.ts @@ -62,10 +62,7 @@ function mergeConfig( } // 2.处理 intervalTime - if ( - Object.hasOwn(newConfig, 'intervalTime') && - isUndefined(newConfig.intervalTime) - ) { + if (isUndefined(newConfig.intervalTime)) { newConfig.intervalTime = baseConfig.intervalTime } diff --git a/test/start/index.js b/test/start/index.js index 0d277c2a..b9e37437 100644 --- a/test/start/index.js +++ b/test/start/index.js @@ -1 +1 @@ -"use strict";var e=require("node:fs"),t=require("node:fs/promises"),n=require("node:path"),o=require("jsdom"),r=require("puppeteer"),s=require("node:http"),a=require("node:https"),i=require("node:url"),u=require("https-proxy-agent"),c=require("chalk");function l(e,t,n){const o=e[t];e[t]=e[n],e[n]=o}function f(e){return function t(n,o){if(n>=o)return;const r=e[o];let s=n,a=o-1;for(;s<=a;){for(;e[s]r;)a--;s<=a&&(l(e,s,a),s++,a--)}l(e,s,o),t(n,s-1),t(s+1,o)}(0,e.length-1),e}const h=console.log,m=c.hex("#a57fff"),d=c.green,p=c.red,g=c.yellow;function w(e){return void 0===e}function y(e){return"number"==typeof e}function $(e){return Array.isArray(e)}function q(e,t){let n=e?`${e}`:"?";if(t)for(const e in t){n+=`&${e}=${t[e]}`}else n=e;return n}function v(e){const{protocol:t,hostname:n,port:o,pathname:r,search:c}=new i.URL(e.url),l="http:"===t,f={agent:e.proxy?u(e.proxy):l?new s.Agent:new a.Agent,protocol:t,hostname:n,port:o,path:r,search:q(c,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const n={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(n["Content-Type"]="application/json",n["Content-Length"]=Buffer.byteLength(e.data)),n}(e,f),f}function x(e){return new Promise(((t,n)=>{const o=w(e.data);e.data=o?e.data:JSON.stringify(e.data);const r=v(e);function i(e){const{statusCode:n,headers:o}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:n,headers:o,data:e})}))}let u;u="http:"===r.protocol?s.request(r,i):a.request(r,i),u.on("timeout",(()=>{n(new Error(`Timeout ${e.timeout}ms`))})),u.on("error",(e=>{n(e)})),"POST"!==r.method||o||u.write(e.data),u.end()}))}async function T(e,t,n,o){if(e&&o>1){const e=t?n:function(e,t=0){let n=Math.floor(Math.random()*e);for(;nsetTimeout(t,e)))}(e)}else h(`Request ${m(o)} does not need to sleep, send immediately`)}function C(e,t){const n=structuredClone(t),o=$(n.requestConfig)?n.requestConfig:[n.requestConfig];for(const t of o){const{url:n,timeout:o,proxy:r}=t;w(e.baseUrl)||(t.url=e.baseUrl+n),w(o)&&(t.timeout=e.timeout),w(r)&&(t.proxy=e.proxy)}return w(n.intervalTime)&&(n.intervalTime=e.intervalTime),n}async function S(e,t,n,o){const r=$(t)?t:[t];"async"===e?await async function(e,t,n){const o=!w(t),r=y(t);h(`${d("Begin execution:")} mode: ${g("async")}, total: ${m(e.length)} `);let s=0,a=0,i=0;const u=[],c=[];for(const l of e){const e=++s;await T(o,r,t,e);const f=x(l).catch((t=>{i++;const n=`Request ${e} is an error: ${t.message}`;c.push({message:n,valueOf:()=>e})})).then((t=>{t&&(a++,n({id:e,...t}))}));u.push(f)}h(d("All requests have been sent!")),await Promise.all(u),f(c).forEach((e=>h(p(e.message)))),h(`requestsTotal: ${m(e.length)}, success: ${d(a)}, error: ${p(i)}`)}(r,n,o):await async function(e,t,n){const o=!w(t),r=y(t);h(`${d("Begin execution:")} mode: ${g("sync")}, total: ${m(e.length)}`);let s=0,a=0,i=0;for(const u of e){s++,await T(o,r,t,s);let e=!0,c=null;try{c={id:s,...await x(u)},h(d(`Request ${m(s)} is an success`)),a++}catch(t){e=!1,h(p(`Request ${s} is an error: ${t.message}`)),i++}e&&n&&n(c)}h(d("All requests are over!")),h(`requestsTotal: ${m(e.length)}, success: ${d(a)}, error: ${p(i)}`)}(r,n,o)}function P(e){let t=null,n=null,s=0;return async function(a,i){s++,1===s&&(n=r.launch().then((e=>{t=e}))),n&&(await Promise.all([n]),n=null);const u=await t.newPage();await u.setViewport({width:1280,height:1024});const{requestConfig:c}=C(e,{requestConfig:(l=a,"string"==typeof l?{url:a}:a)});var l;c.proxy?await t.createIncognitoBrowserContext({proxyServer:c.proxy}):await t.createIncognitoBrowserContext({proxyServer:void 0});const f=await u.goto(c.url,{timeout:c.timeout}),h=await u.content();0==--s&&t.close();const m={httpResponse:f,data:{page:u,jsdom:new o.JSDOM(h)}};return i&&i(m),m}}function b(e){return async function(t,n){const{requestConfig:o,intervalTime:r}=C(e,t),s=[];return await S(e.mode,o,r,(function(e){const t=e.headers["content-type"]??"",o=e.data,r=t.includes("text")?o.toString():JSON.parse(o.toString()),a={...e,data:r};n&&n(a),s.push(a)})),f(s.map((e=>({...e,valueOf:()=>e.id}))))}}function A(o){return async function(r,s){const{requestConfig:a,intervalTime:i,fileConfig:u}=C(o,r),c=[],l=[],g=[];e.existsSync(u.storeDir)||e.mkdirSync(u.storeDir),await S(o.mode,a,i,(function(e){const{id:o,headers:r,data:a}=e,i=r["content-type"]??"",f=u.extension??i.split("/").pop(),h=(new Date).getTime().toString(),m=n.resolve(u.storeDir,`${h}.${f}`),d=t.writeFile(m,a).catch((e=>{const t=`File save error at id ${o}: ${e.message}`;return g.push({message:t,valueOf:()=>o}),!0})).then((t=>{if(t)return;const n={...e,data:{fileName:h,mimeType:i,size:a.length,filePath:m}};s&&s(n),c.push(n)}));l.push(d)})),await Promise.all(l),f(g).forEach((e=>h(p(e.message))));const w=$(a)?a.length:1,y=c.length,q=w-y;return h(`saveFileTotal: ${m(w)}, success: ${d(y)}, error: ${p(q)}`),f(c.map((e=>({...e,valueOf:()=>e.id}))))}}function O(e,t){const{d:n,h:o,m:r}=e,s=(w(n)?0:1e3*n*60*60*24)+(w(o)?0:1e3*o*60*60)+(w(r)?0:1e3*r*60);let a=0;function i(){console.log(g(`Start the ${g.bold(++a)} polling`)),t(a)}i(),setInterval(i,s)}function D(e){const t=function(e){const t=e||{};return t.mode||(t.mode="async"),t}(e),n=function(e){return{fetchPage:P(e),fetchData:b(e),fetchFile:A(e),startPolling:O}}(t);return n}D({timeout:1e4,intervalTime:{max:3e3,min:1e3}});const M=D({timeout:1e4,intervalTime:{max:3e3,min:2e3},proxy:"http://localhost:14892"});M.startPolling({d:1},(()=>{M.fetchPage("https://www.youtube.com/").then((e=>{const{jsdom:t}=e.data,n=t.window.document.querySelectorAll(".yt-core-image--fill-parent-width"),o=[];n.forEach((e=>{e.src&&o.push({url:e.src})})),M.fetchFile({requestConfig:o,fileConfig:{storeDir:"./upload"}})}))})); +"use strict";var e=require("node:fs"),t=require("node:fs/promises"),n=require("node:path"),o=require("jsdom"),r=require("puppeteer"),s=require("node:http"),a=require("node:https"),i=require("node:url"),u=require("https-proxy-agent"),c=require("chalk");function l(e,t,n){const o=e[t];e[t]=e[n],e[n]=o}function f(e){return function t(n,o){if(n>=o)return;const r=e[o];let s=n,a=o-1;for(;s<=a;){for(;e[s]r;)a--;s<=a&&(l(e,s,a),s++,a--)}l(e,s,o),t(n,s-1),t(s+1,o)}(0,e.length-1),e}const m=console.log,h=c.hex("#a57fff"),d=c.green,p=c.red,g=c.yellow;function w(e){return void 0===e}function y(e){return"number"==typeof e}function $(e){return Array.isArray(e)}function q(e,t){let n=e?`${e}`:"?";if(t)for(const e in t){n+=`&${e}=${t[e]}`}else n=e;return n}function v(e){const{protocol:t,hostname:n,port:o,pathname:r,search:c}=new i.URL(e.url),l="http:"===t,f={agent:e.proxy?u(e.proxy):l?new s.Agent:new a.Agent,protocol:t,hostname:n,port:o,path:r,search:q(c,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const n={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(n["Content-Type"]="application/json",n["Content-Length"]=Buffer.byteLength(e.data)),n}(e,f),f}function x(e){return new Promise(((t,n)=>{const o=w(e.data);e.data=o?e.data:JSON.stringify(e.data);const r=v(e);function i(e){const{statusCode:n,headers:o}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:n,headers:o,data:e})}))}let u;u="http:"===r.protocol?s.request(r,i):a.request(r,i),u.on("timeout",(()=>{n(new Error(`Timeout ${e.timeout}ms`))})),u.on("error",(e=>{n(e)})),"POST"!==r.method||o||u.write(e.data),u.end()}))}async function T(e,t,n,o){if(e&&o>1){const e=t?n:function(e,t=0){let n=Math.floor(Math.random()*e);for(;nsetTimeout(t,e)))}(e)}else m(`Request ${h(o)} does not need to sleep, send immediately`)}function C(e,t){const n=structuredClone(t),o=$(n.requestConfig)?n.requestConfig:[n.requestConfig];for(const t of o){const{url:n,timeout:o,proxy:r}=t;w(e.baseUrl)||(t.url=e.baseUrl+n),w(o)&&(t.timeout=e.timeout),w(r)&&(t.proxy=e.proxy)}return w(n.intervalTime)&&(n.intervalTime=e.intervalTime),n}async function S(e,t,n,o){const r=$(t)?t:[t];"async"===e?await async function(e,t,n){const o=!w(t),r=y(t);m(`${d("Begin execution:")} mode: ${g("async")}, total: ${h(e.length)} `);let s=0,a=0,i=0;const u=[],c=[];for(const l of e){const e=++s;await T(o,r,t,e);const f=x(l).catch((t=>{i++;const n=`Request ${e} is an error: ${t.message}`;c.push({message:n,valueOf:()=>e})})).then((t=>{t&&(a++,n({id:e,...t}))}));u.push(f)}m(d("All requests have been sent!")),await Promise.all(u),f(c).forEach((e=>m(p(e.message)))),m(`requestsTotal: ${h(e.length)}, success: ${d(a)}, error: ${p(i)}`)}(r,n,o):await async function(e,t,n){const o=!w(t),r=y(t);m(`${d("Begin execution:")} mode: ${g("sync")}, total: ${h(e.length)}`);let s=0,a=0,i=0;for(const u of e){s++,await T(o,r,t,s);let e=!0,c=null;try{c={id:s,...await x(u)},m(d(`Request ${h(s)} is an success`)),a++}catch(t){e=!1,m(p(`Request ${s} is an error: ${t.message}`)),i++}e&&n&&n(c)}m(d("All requests are over!")),m(`requestsTotal: ${h(e.length)}, success: ${d(a)}, error: ${p(i)}`)}(r,n,o)}function P(e){let t=null,n=null,s=0;return async function(a,i){s++,1===s&&(n=r.launch().then((e=>{t=e}))),n&&(await Promise.all([n]),n=null);const u=await t.newPage();await u.setViewport({width:1280,height:1024});const{requestConfig:c}=C(e,{requestConfig:(l=a,"string"==typeof l?{url:a}:a)});var l;c.proxy?await t.createIncognitoBrowserContext({proxyServer:c.proxy}):await t.createIncognitoBrowserContext({proxyServer:void 0});const f=await u.goto(c.url,{timeout:c.timeout}),m=await u.content();0==--s&&t.close();const h={httpResponse:f,data:{page:u,jsdom:new o.JSDOM(m)}};return i&&i(h),h}}function b(e){return async function(t,n){const{requestConfig:o,intervalTime:r}=C(e,t),s=[];return await S(e.mode,o,r,(function(e){const t=e.headers["content-type"]??"",o=e.data,r=t.includes("text")?o.toString():JSON.parse(o.toString()),a={...e,data:r};n&&n(a),s.push(a)})),f(s.map((e=>({...e,valueOf:()=>e.id}))))}}function A(o){return async function(r,s){const{requestConfig:a,intervalTime:i,fileConfig:u}=C(o,r),c=[],l=[],g=[];e.existsSync(u.storeDir)||e.mkdirSync(u.storeDir),await S(o.mode,a,i,(function(e){const{id:o,headers:r,data:a}=e,i=r["content-type"]??"",f=u.extension??i.split("/").pop(),m=(new Date).getTime().toString(),h=n.resolve(u.storeDir,`${m}.${f}`),d=t.writeFile(h,a).catch((e=>{const t=`File save error at id ${o}: ${e.message}`;return g.push({message:t,valueOf:()=>o}),!0})).then((t=>{if(t)return;const n={...e,data:{fileName:m,mimeType:i,size:a.length,filePath:h}};s&&s(n),c.push(n)}));l.push(d)})),await Promise.all(l),f(g).forEach((e=>m(p(e.message))));const w=$(a)?a.length:1,y=c.length,q=w-y;return m(`saveFileTotal: ${h(w)}, success: ${d(y)}, error: ${p(q)}`),f(c.map((e=>({...e,valueOf:()=>e.id}))))}}function O(e,t){const{d:n,h:o,m:r}=e,s=(w(n)?0:1e3*n*60*60*24)+(w(o)?0:1e3*o*60*60)+(w(r)?0:1e3*r*60);let a=0;function i(){console.log(g(`Start the ${g.bold(++a)} polling`)),t(a)}i(),setInterval(i,s)}function D(e){const t=function(e){const t=e||{};return t.mode||(t.mode="async"),t}(e),n=function(e){return{fetchPage:P(e),fetchData:b(e),fetchFile:A(e),startPolling:O}}(t);return n}D({timeout:1e4,intervalTime:{max:3e3,min:1e3}});const M=D({timeout:1e4,intervalTime:{max:3e3,min:2e3}});M.startPolling({d:1},(()=>{M.fetchPage("https://www.bilibili.com/guochuang/").then((e=>{const{jsdom:t}=e.data,n=t.window.document.querySelectorAll(".carousel-wrapper .chief-recom-item img"),o=[];n.forEach((e=>o.push({url:`https:${e.src}`}))),M.fetchFile({requestConfig:o,fileConfig:{storeDir:"./upload"}})}))}));