diff --git a/README.md b/README.md
index 88f51f39..e694d48a 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ If it helps you, please give the [repository](https://github.com/coder-hxl/x-cra
- The built-in puppeteer crawls the page, and uses the jsdom library to parse the page.
- Support asynchronous/synchronous way to crawl data.
- Support Promise/Callback method to get the result.
-- Polling function, fixed-point crawling.
+- Polling function, timed crawling.
- Anthropomorphic request interval.
- Written in TypeScript, providing generics.
@@ -86,7 +86,7 @@ npm install x-crawl
## Example
-Example of fetching featured video cover image for youtube homepage every other day:
+Timed crawling: for example, fetch the recommended images from the YouTube homepage once a day:
```js
// 1.Import module ES/CJS
@@ -135,7 +135,7 @@ running result:
-**Note:** Do not crawl randomly, here is just to demonstrate how to use x-crawl, and control the request frequency within 3000ms to 2000ms.
+**Note:** Do not crawl indiscriminately; check the site's **robots.txt** before crawling. This example only demonstrates how to use x-crawl.
## Core concepts
diff --git a/docs/cn.md b/docs/cn.md
index 67493326..96661607 100644
--- a/docs/cn.md
+++ b/docs/cn.md
@@ -12,7 +12,7 @@ x-crawl 是 Nodejs 多功能爬虫库。
- 内置 puppeteer 爬取页面 ,并用采用 jsdom 库对页面解析。
- 支持 异步/同步 方式爬取数据。
- 支持 Promise/Callback 方式获取结果。
-- 轮询功能,定点爬取。
+- 轮询功能,定时爬取。
- 拟人化的请求间隔时间。
- 使用 TypeScript 编写,提供泛型。
@@ -85,7 +85,7 @@ npm install x-crawl
## 示例
-每隔一天就获取 bilibili 国漫主页的轮播图片为例:
+定时爬取: 以每隔一天获取一次 bilibili 国漫主页的轮播图片为例:
```js
// 1.导入模块 ES/CJS
@@ -126,8 +126,7 @@ myXCrawl.startPolling({ d: 1 }, () => {
-
-**注意:** 请勿随意爬取,这里只是为了演示如何使用 x-crawl ,并将请求频率控制在 3000ms 到 2000ms 内。
+**注意:** 请勿随意爬取,爬取前可查看 **robots.txt** 协议。这里只是为了演示如何使用 x-crawl 。
## 核心概念
diff --git a/package.json b/package.json
index d9ef57c0..5e524c5a 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
{
"private": true,
"name": "x-crawl",
- "version": "2.4.0",
+ "version": "2.4.2",
"author": "coderHXL",
"description": "XCrawl is a Nodejs multifunctional crawler library.",
"license": "MIT",
diff --git a/publish/README.md b/publish/README.md
index 88f51f39..e694d48a 100644
--- a/publish/README.md
+++ b/publish/README.md
@@ -12,7 +12,7 @@ If it helps you, please give the [repository](https://github.com/coder-hxl/x-cra
- The built-in puppeteer crawls the page, and uses the jsdom library to parse the page.
- Support asynchronous/synchronous way to crawl data.
- Support Promise/Callback method to get the result.
-- Polling function, fixed-point crawling.
+- Polling function, timed crawling.
- Anthropomorphic request interval.
- Written in TypeScript, providing generics.
@@ -86,7 +86,7 @@ npm install x-crawl
## Example
-Example of fetching featured video cover image for youtube homepage every other day:
+Timed crawling: for example, fetch the recommended images from the YouTube homepage once a day:
```js
// 1.Import module ES/CJS
@@ -135,7 +135,7 @@ running result:
-**Note:** Do not crawl randomly, here is just to demonstrate how to use x-crawl, and control the request frequency within 3000ms to 2000ms.
+**Note:** Do not crawl indiscriminately; check the site's **robots.txt** before crawling. This example only demonstrates how to use x-crawl.
## Core concepts
diff --git a/publish/package.json b/publish/package.json
index 65fe6bc8..eebe39ee 100644
--- a/publish/package.json
+++ b/publish/package.json
@@ -1,6 +1,6 @@
{
"name": "x-crawl",
- "version": "2.4.0",
+ "version": "2.4.2",
"author": "coderHXL",
"description": "XCrawl is a Nodejs multifunctional crawler library.",
"license": "MIT",
diff --git a/src/api.ts b/src/api.ts
index 883d1f9c..0083d853 100644
--- a/src/api.ts
+++ b/src/api.ts
@@ -62,10 +62,7 @@ function mergeConfig(
}
// 2.处理 intervalTime
- if (
- Object.hasOwn(newConfig, 'intervalTime') &&
- isUndefined(newConfig.intervalTime)
- ) {
+ if (isUndefined(newConfig.intervalTime)) {
newConfig.intervalTime = baseConfig.intervalTime
}
diff --git a/test/start/index.js b/test/start/index.js
index 0d277c2a..b9e37437 100644
--- a/test/start/index.js
+++ b/test/start/index.js
@@ -1 +1 @@
-"use strict";var e=require("node:fs"),t=require("node:fs/promises"),n=require("node:path"),o=require("jsdom"),r=require("puppeteer"),s=require("node:http"),a=require("node:https"),i=require("node:url"),u=require("https-proxy-agent"),c=require("chalk");function l(e,t,n){const o=e[t];e[t]=e[n],e[n]=o}function f(e){return function t(n,o){if(n>=o)return;const r=e[o];let s=n,a=o-1;for(;s<=a;){for(;e[s]r;)a--;s<=a&&(l(e,s,a),s++,a--)}l(e,s,o),t(n,s-1),t(s+1,o)}(0,e.length-1),e}const h=console.log,m=c.hex("#a57fff"),d=c.green,p=c.red,g=c.yellow;function w(e){return void 0===e}function y(e){return"number"==typeof e}function $(e){return Array.isArray(e)}function q(e,t){let n=e?`${e}`:"?";if(t)for(const e in t){n+=`&${e}=${t[e]}`}else n=e;return n}function v(e){const{protocol:t,hostname:n,port:o,pathname:r,search:c}=new i.URL(e.url),l="http:"===t,f={agent:e.proxy?u(e.proxy):l?new s.Agent:new a.Agent,protocol:t,hostname:n,port:o,path:r,search:q(c,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const n={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(n["Content-Type"]="application/json",n["Content-Length"]=Buffer.byteLength(e.data)),n}(e,f),f}function x(e){return new Promise(((t,n)=>{const o=w(e.data);e.data=o?e.data:JSON.stringify(e.data);const r=v(e);function i(e){const{statusCode:n,headers:o}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:n,headers:o,data:e})}))}let u;u="http:"===r.protocol?s.request(r,i):a.request(r,i),u.on("timeout",(()=>{n(new Error(`Timeout ${e.timeout}ms`))})),u.on("error",(e=>{n(e)})),"POST"!==r.method||o||u.write(e.data),u.end()}))}async function T(e,t,n,o){if(e&&o>1){const e=t?n:function(e,t=0){let n=Math.floor(Math.random()*e);for(;nsetTimeout(t,e)))}(e)}else h(`Request ${m(o)} does not need to sleep, send 
immediately`)}function C(e,t){const n=structuredClone(t),o=$(n.requestConfig)?n.requestConfig:[n.requestConfig];for(const t of o){const{url:n,timeout:o,proxy:r}=t;w(e.baseUrl)||(t.url=e.baseUrl+n),w(o)&&(t.timeout=e.timeout),w(r)&&(t.proxy=e.proxy)}return w(n.intervalTime)&&(n.intervalTime=e.intervalTime),n}async function S(e,t,n,o){const r=$(t)?t:[t];"async"===e?await async function(e,t,n){const o=!w(t),r=y(t);h(`${d("Begin execution:")} mode: ${g("async")}, total: ${m(e.length)} `);let s=0,a=0,i=0;const u=[],c=[];for(const l of e){const e=++s;await T(o,r,t,e);const f=x(l).catch((t=>{i++;const n=`Request ${e} is an error: ${t.message}`;c.push({message:n,valueOf:()=>e})})).then((t=>{t&&(a++,n({id:e,...t}))}));u.push(f)}h(d("All requests have been sent!")),await Promise.all(u),f(c).forEach((e=>h(p(e.message)))),h(`requestsTotal: ${m(e.length)}, success: ${d(a)}, error: ${p(i)}`)}(r,n,o):await async function(e,t,n){const o=!w(t),r=y(t);h(`${d("Begin execution:")} mode: ${g("sync")}, total: ${m(e.length)}`);let s=0,a=0,i=0;for(const u of e){s++,await T(o,r,t,s);let e=!0,c=null;try{c={id:s,...await x(u)},h(d(`Request ${m(s)} is an success`)),a++}catch(t){e=!1,h(p(`Request ${s} is an error: ${t.message}`)),i++}e&&n&&n(c)}h(d("All requests are over!")),h(`requestsTotal: ${m(e.length)}, success: ${d(a)}, error: ${p(i)}`)}(r,n,o)}function P(e){let t=null,n=null,s=0;return async function(a,i){s++,1===s&&(n=r.launch().then((e=>{t=e}))),n&&(await Promise.all([n]),n=null);const u=await t.newPage();await u.setViewport({width:1280,height:1024});const{requestConfig:c}=C(e,{requestConfig:(l=a,"string"==typeof l?{url:a}:a)});var l;c.proxy?await t.createIncognitoBrowserContext({proxyServer:c.proxy}):await t.createIncognitoBrowserContext({proxyServer:void 0});const f=await u.goto(c.url,{timeout:c.timeout}),h=await u.content();0==--s&&t.close();const m={httpResponse:f,data:{page:u,jsdom:new o.JSDOM(h)}};return i&&i(m),m}}function b(e){return async 
function(t,n){const{requestConfig:o,intervalTime:r}=C(e,t),s=[];return await S(e.mode,o,r,(function(e){const t=e.headers["content-type"]??"",o=e.data,r=t.includes("text")?o.toString():JSON.parse(o.toString()),a={...e,data:r};n&&n(a),s.push(a)})),f(s.map((e=>({...e,valueOf:()=>e.id}))))}}function A(o){return async function(r,s){const{requestConfig:a,intervalTime:i,fileConfig:u}=C(o,r),c=[],l=[],g=[];e.existsSync(u.storeDir)||e.mkdirSync(u.storeDir),await S(o.mode,a,i,(function(e){const{id:o,headers:r,data:a}=e,i=r["content-type"]??"",f=u.extension??i.split("/").pop(),h=(new Date).getTime().toString(),m=n.resolve(u.storeDir,`${h}.${f}`),d=t.writeFile(m,a).catch((e=>{const t=`File save error at id ${o}: ${e.message}`;return g.push({message:t,valueOf:()=>o}),!0})).then((t=>{if(t)return;const n={...e,data:{fileName:h,mimeType:i,size:a.length,filePath:m}};s&&s(n),c.push(n)}));l.push(d)})),await Promise.all(l),f(g).forEach((e=>h(p(e.message))));const w=$(a)?a.length:1,y=c.length,q=w-y;return h(`saveFileTotal: ${m(w)}, success: ${d(y)}, error: ${p(q)}`),f(c.map((e=>({...e,valueOf:()=>e.id}))))}}function O(e,t){const{d:n,h:o,m:r}=e,s=(w(n)?0:1e3*n*60*60*24)+(w(o)?0:1e3*o*60*60)+(w(r)?0:1e3*r*60);let a=0;function i(){console.log(g(`Start the ${g.bold(++a)} polling`)),t(a)}i(),setInterval(i,s)}function D(e){const t=function(e){const t=e||{};return t.mode||(t.mode="async"),t}(e),n=function(e){return{fetchPage:P(e),fetchData:b(e),fetchFile:A(e),startPolling:O}}(t);return n}D({timeout:1e4,intervalTime:{max:3e3,min:1e3}});const M=D({timeout:1e4,intervalTime:{max:3e3,min:2e3},proxy:"http://localhost:14892"});M.startPolling({d:1},(()=>{M.fetchPage("https://www.youtube.com/").then((e=>{const{jsdom:t}=e.data,n=t.window.document.querySelectorAll(".yt-core-image--fill-parent-width"),o=[];n.forEach((e=>{e.src&&o.push({url:e.src})})),M.fetchFile({requestConfig:o,fileConfig:{storeDir:"./upload"}})}))}));
+"use strict";var e=require("node:fs"),t=require("node:fs/promises"),n=require("node:path"),o=require("jsdom"),r=require("puppeteer"),s=require("node:http"),a=require("node:https"),i=require("node:url"),u=require("https-proxy-agent"),c=require("chalk");function l(e,t,n){const o=e[t];e[t]=e[n],e[n]=o}function f(e){return function t(n,o){if(n>=o)return;const r=e[o];let s=n,a=o-1;for(;s<=a;){for(;e[s]r;)a--;s<=a&&(l(e,s,a),s++,a--)}l(e,s,o),t(n,s-1),t(s+1,o)}(0,e.length-1),e}const m=console.log,h=c.hex("#a57fff"),d=c.green,p=c.red,g=c.yellow;function w(e){return void 0===e}function y(e){return"number"==typeof e}function $(e){return Array.isArray(e)}function q(e,t){let n=e?`${e}`:"?";if(t)for(const e in t){n+=`&${e}=${t[e]}`}else n=e;return n}function v(e){const{protocol:t,hostname:n,port:o,pathname:r,search:c}=new i.URL(e.url),l="http:"===t,f={agent:e.proxy?u(e.proxy):l?new s.Agent:new a.Agent,protocol:t,hostname:n,port:o,path:r,search:q(c,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const n={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(n["Content-Type"]="application/json",n["Content-Length"]=Buffer.byteLength(e.data)),n}(e,f),f}function x(e){return new Promise(((t,n)=>{const o=w(e.data);e.data=o?e.data:JSON.stringify(e.data);const r=v(e);function i(e){const{statusCode:n,headers:o}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:n,headers:o,data:e})}))}let u;u="http:"===r.protocol?s.request(r,i):a.request(r,i),u.on("timeout",(()=>{n(new Error(`Timeout ${e.timeout}ms`))})),u.on("error",(e=>{n(e)})),"POST"!==r.method||o||u.write(e.data),u.end()}))}async function T(e,t,n,o){if(e&&o>1){const e=t?n:function(e,t=0){let n=Math.floor(Math.random()*e);for(;nsetTimeout(t,e)))}(e)}else m(`Request ${h(o)} does not need to sleep, send 
immediately`)}function C(e,t){const n=structuredClone(t),o=$(n.requestConfig)?n.requestConfig:[n.requestConfig];for(const t of o){const{url:n,timeout:o,proxy:r}=t;w(e.baseUrl)||(t.url=e.baseUrl+n),w(o)&&(t.timeout=e.timeout),w(r)&&(t.proxy=e.proxy)}return w(n.intervalTime)&&(n.intervalTime=e.intervalTime),n}async function S(e,t,n,o){const r=$(t)?t:[t];"async"===e?await async function(e,t,n){const o=!w(t),r=y(t);m(`${d("Begin execution:")} mode: ${g("async")}, total: ${h(e.length)} `);let s=0,a=0,i=0;const u=[],c=[];for(const l of e){const e=++s;await T(o,r,t,e);const f=x(l).catch((t=>{i++;const n=`Request ${e} is an error: ${t.message}`;c.push({message:n,valueOf:()=>e})})).then((t=>{t&&(a++,n({id:e,...t}))}));u.push(f)}m(d("All requests have been sent!")),await Promise.all(u),f(c).forEach((e=>m(p(e.message)))),m(`requestsTotal: ${h(e.length)}, success: ${d(a)}, error: ${p(i)}`)}(r,n,o):await async function(e,t,n){const o=!w(t),r=y(t);m(`${d("Begin execution:")} mode: ${g("sync")}, total: ${h(e.length)}`);let s=0,a=0,i=0;for(const u of e){s++,await T(o,r,t,s);let e=!0,c=null;try{c={id:s,...await x(u)},m(d(`Request ${h(s)} is an success`)),a++}catch(t){e=!1,m(p(`Request ${s} is an error: ${t.message}`)),i++}e&&n&&n(c)}m(d("All requests are over!")),m(`requestsTotal: ${h(e.length)}, success: ${d(a)}, error: ${p(i)}`)}(r,n,o)}function P(e){let t=null,n=null,s=0;return async function(a,i){s++,1===s&&(n=r.launch().then((e=>{t=e}))),n&&(await Promise.all([n]),n=null);const u=await t.newPage();await u.setViewport({width:1280,height:1024});const{requestConfig:c}=C(e,{requestConfig:(l=a,"string"==typeof l?{url:a}:a)});var l;c.proxy?await t.createIncognitoBrowserContext({proxyServer:c.proxy}):await t.createIncognitoBrowserContext({proxyServer:void 0});const f=await u.goto(c.url,{timeout:c.timeout}),m=await u.content();0==--s&&t.close();const h={httpResponse:f,data:{page:u,jsdom:new o.JSDOM(m)}};return i&&i(h),h}}function b(e){return async 
function(t,n){const{requestConfig:o,intervalTime:r}=C(e,t),s=[];return await S(e.mode,o,r,(function(e){const t=e.headers["content-type"]??"",o=e.data,r=t.includes("text")?o.toString():JSON.parse(o.toString()),a={...e,data:r};n&&n(a),s.push(a)})),f(s.map((e=>({...e,valueOf:()=>e.id}))))}}function A(o){return async function(r,s){const{requestConfig:a,intervalTime:i,fileConfig:u}=C(o,r),c=[],l=[],g=[];e.existsSync(u.storeDir)||e.mkdirSync(u.storeDir),await S(o.mode,a,i,(function(e){const{id:o,headers:r,data:a}=e,i=r["content-type"]??"",f=u.extension??i.split("/").pop(),m=(new Date).getTime().toString(),h=n.resolve(u.storeDir,`${m}.${f}`),d=t.writeFile(h,a).catch((e=>{const t=`File save error at id ${o}: ${e.message}`;return g.push({message:t,valueOf:()=>o}),!0})).then((t=>{if(t)return;const n={...e,data:{fileName:m,mimeType:i,size:a.length,filePath:h}};s&&s(n),c.push(n)}));l.push(d)})),await Promise.all(l),f(g).forEach((e=>m(p(e.message))));const w=$(a)?a.length:1,y=c.length,q=w-y;return m(`saveFileTotal: ${h(w)}, success: ${d(y)}, error: ${p(q)}`),f(c.map((e=>({...e,valueOf:()=>e.id}))))}}function O(e,t){const{d:n,h:o,m:r}=e,s=(w(n)?0:1e3*n*60*60*24)+(w(o)?0:1e3*o*60*60)+(w(r)?0:1e3*r*60);let a=0;function i(){console.log(g(`Start the ${g.bold(++a)} polling`)),t(a)}i(),setInterval(i,s)}function D(e){const t=function(e){const t=e||{};return t.mode||(t.mode="async"),t}(e),n=function(e){return{fetchPage:P(e),fetchData:b(e),fetchFile:A(e),startPolling:O}}(t);return n}D({timeout:1e4,intervalTime:{max:3e3,min:1e3}});const M=D({timeout:1e4,intervalTime:{max:3e3,min:2e3}});M.startPolling({d:1},(()=>{M.fetchPage("https://www.bilibili.com/guochuang/").then((e=>{const{jsdom:t}=e.data,n=t.window.document.querySelectorAll(".carousel-wrapper .chief-recom-item img"),o=[];n.forEach((e=>o.push({url:`https:${e.src}`}))),M.fetchFile({requestConfig:o,fileConfig:{storeDir:"./upload"}})}))}));