From 6bcb1729d34a331c64892338ed4c45719e129c01 Mon Sep 17 00:00:00 2001 From: coderhxl Date: Mon, 20 Feb 2023 19:16:23 +0800 Subject: [PATCH] Sorting of error messages and fetchData/fetchFile API results --- package.json | 2 +- publish/package.json | 2 +- src/index.ts | 11 ++++++++-- src/request.ts | 21 +++++++++++++------ src/sort.ts | 48 ++++++++++++++++++++++++++++++++++++++++++++ test/start/index.js | 2 +- test/start/index.ts | 35 ++++++++++++++++---------------- 7 files changed, 93 insertions(+), 28 deletions(-) create mode 100644 src/sort.ts diff --git a/package.json b/package.json index b179186a..eee73570 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "x-crawl", - "version": "1.0.0", + "version": "1.0.1", "author": "CoderHxl", "description": "XCrawl is a Nodejs multifunctional crawler library.", "license": "MIT", diff --git a/publish/package.json b/publish/package.json index 5b00866d..164aa408 100644 --- a/publish/package.json +++ b/publish/package.json @@ -1,6 +1,6 @@ { "name": "x-crawl", - "version": "1.0.0", + "version": "1.0.1", "author": "CoderHxl", "description": "XCrawl is a Nodejs multifunctional crawler library.", "license": "MIT", diff --git a/src/index.ts b/src/index.ts index 24b2d8ef..c0aa3848 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,6 +3,7 @@ import path from 'node:path' import { JSDOM } from 'jsdom' import { batchRequest, syncBatchRequest, request } from './request' +import { quickSort } from './sort' import { isArray, isString, @@ -140,7 +141,10 @@ export default class XCrawl { await this.useBatchRequestByMode(requestConifg, intervalTime, handleResItem) - return container + const res = quickSort( + container.map((item) => ({ ...item, valueOf: () => item.id })) + ) + return res } async fetchFile( @@ -189,8 +193,11 @@ export default class XCrawl { success )}, error: ${logError(error)}` ) + const res = quickSort( + container.map((item) => ({ ...item, valueOf: () => item.id })) + ) - return container + return res } fetchPolling(config: IFetchPollingConfig, callback: (count: number) => void) { diff --git a/src/request.ts b/src/request.ts index d4b617c9..8a284853 100644 --- a/src/request.ts +++ b/src/request.ts @@ -3,6 +3,7 @@ import https from 'node:https' import Url, { URL } from 'node:url' import HttpsProxyAgent from 'https-proxy-agent' +import { quickSort } from './sort' import { isNumber, isUndefined, @@ -176,7 +177,9 @@ export async function batchRequest( let index = 0 let successTotal = 0 let errorTotal = 0 - const requestQueue: Promise[] = [] + const requestQueue: Promise[] = [] + const errorMessage: { id: number; message: string; valueOf: () => number }[] = + [] for (const requestConifg of requestConifgs) { const id = ++index @@ -190,10 +193,15 @@ export async function batchRequest( const requestItem = request(requestConifg) .catch((error: any) => { errorTotal++ - return `Request ${id} is an error: ${error.message}` + + const message = `Request ${id} is an error: ${error.message}` + // valueOf 为排序做准备 + const valueOf = () => id + + errorMessage.push({ id, message, valueOf }) }) .then((requestRes) => { - if (typeof requestRes === 'string') return requestRes + if (!requestRes) return successTotal++ callback({ id, ...requestRes }) @@ -204,10 +212,11 @@ export async function batchRequest( log(logSuccess('All requests have been sent!')) - const res = await Promise.all(requestQueue) + // 等待所有请求结束 + await Promise.all(requestQueue) - // 打印错误消息 - res.forEach((item) => (item ? log(logError(item)) : '')) + // 排序后打印错误消息 + quickSort(errorMessage).forEach((item) => log(logError(item.message))) log( `requestsTotal: ${logNumber(requestConifgs.length)}, success: ${logSuccess( diff --git a/src/sort.ts b/src/sort.ts new file mode 100644 index 00000000..d1115dd5 --- /dev/null +++ b/src/sort.ts @@ -0,0 +1,48 @@ +function swap(arr: any[], i: number, j: number) { + const temp = arr[i] + arr[i] = arr[j] + arr[j] = temp +} + +export function quickSort(arr: T): T { + const n = arr.length + + partition(0, n - 1) + + function partition(left: number, right: number) { + if (left >= right) return + + // 1.找基准元素 + const pivot = arr[right] + + // 2.定义双指针进行交换(左小右大) + let i = left + let j = right - 1 + while (i <= j) { + while (arr[i] < pivot) { + i++ + } + + while (arr[j] > pivot) { + j-- + } + + if (i <= j) { + swap(arr, i, j) + i++ + j-- + } + } + + // 3.将 pivot 放到正确位置 + swap(arr, i, right) + + // 4.左右划分区域 + partition(left, i - 1) + partition(i + 1, right) + } + + return arr +} + +// console.log(quickSort([7, 3, 6, 4, 9, 2, 1, 5])) diff --git a/test/start/index.js b/test/start/index.js index 199e1f7a..c699ef39 100644 --- a/test/start/index.js +++ b/test/start/index.js @@ -1 +1 @@ -"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),s=require("node:https"),r=require("node:url"),a=require("https-proxy-agent"),i=require("chalk");const c=console.log,u=i.hex("#a57fff"),l=i.green,h=i.red,f=i.yellow;function d(e){return void 0===e}function m(e){return"number"==typeof e}function g(e){return Array.isArray(e)}function p(e,t){let o=e?`${e}`:"?";if(t)for(const e in t){o+=`&${e}=${t[e]}`}else o=e;return o}function y(e){const{protocol:t,hostname:o,port:i,pathname:c,search:u}=new r.URL(e.url),l="http:"===t,h={agent:e.proxy?a(e.proxy):l?new n.Agent:new s.Agent,protocol:t,hostname:o,port:i,path:c,search:p(u,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return h.headers=function(e,t){const o={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(o["Content-Type"]="application/json",o["Content-Length"]=Buffer.byteLength(e.data)),o}(e,h),h}function q(e){return new Promise(((t,o)=>{const r=d(e.data);e.data=r?e.data:JSON.stringify(e.data);const a=y(e);function i(e){const{statusCode:o,headers:n}=e,s=[];e.on("data",(e=>s.push(e))),e.on("end",(()=>{const e=Buffer.concat(s);t({statusCode:o,headers:n,data:e})}))}let c;c="http:"===a.protocol?n.request(a,i):s.request(a,i),c.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),c.on("error",(e=>{o(e)})),"POST"!==a.method||r||c.write(e.data),c.end()}))}async function $(e,t,o,n){if(e&&n>1){const e=t?o:function(e,t=0){let o=Math.floor(Math.random()*e);for(;osetTimeout(t,e)))}(e)}else c(`Request ${u(n)} does not need to sleep, send immediately`)}const w=new class{baseConfig;constructor(e={}){this.baseConfig=e}mergeConfig(e){const t=this.baseConfig,o=structuredClone(e),n=g(o.requestConifg)?o.requestConifg:[o.requestConifg];for(const e of n){const{url:o,timeout:n,proxy:s}=e;d(t.baseUrl)||(e.url=t.baseUrl+o),d(n)&&(e.timeout=t.timeout),d(s)&&(e.proxy=t.proxy)}return d(o.intervalTime)&&(o.intervalTime=t.intervalTime),o}async useBatchRequestByMode(e,t,o){const n=g(e)?e:[e];"sync"!==this.baseConfig.mode?await async function(e,t,o){const n=!d(t),s=m(t);c(`Begin execution, mode: async, total: ${u(e.length)} `);let r=0,a=0,i=0;const f=[];for(const c of e){const e=++r;await $(n,s,t,e);const u=q(c).catch((t=>(i++,`Request ${e} is an error: ${t.message}`))).then((t=>{if("string"==typeof t)return t;a++,o({id:e,...t})}));f.push(u)}c(l("All requests have been sent!")),(await Promise.all(f)).forEach((e=>e?c(h(e)):"")),c(`requestsTotal: ${u(e.length)}, success: ${l(a)}, error: ${h(i)}`)}(n,t,o):await async function(e,t,o){const n=!d(t),s=m(t);c(`Begin execution, mode: sync, total: ${u(e.length)} `);let r=0,a=0,i=0;for(const f of e){r++,await $(n,s,t,r);let e=!0,d=null;try{d={id:r,...await q(f)},c(l(`Request ${u(r)} is an success`)),a++}catch(t){e=!1,c(h(`Request ${r} is an error: ${t.message}`)),i++}e&&o&&o(d)}c(l("All requests are over!")),c(`requestsTotal: ${u(e.length)}, success: ${l(a)}, error: ${h(i)}`)}(n,t,o)}async fetchHTML(e,t){const{requestConifg:n}=this.mergeConfig({requestConifg:(s=e,"string"==typeof s?{url:e}:e)});var s;const r=await q(n),a=r.data.toString(),i={...r,data:{html:a,jsdom:new o.JSDOM(a)}};return t&&t(i),i}async fetchData(e,t){const{requestConifg:o,intervalTime:n}=this.mergeConfig(e),s=[];return await this.useBatchRequestByMode(o,n,(function(e){const o=e.headers["content-type"]??"",n=e.data,r=o.includes("text")?n.toString():JSON.parse(n.toString()),a={...e,data:r};t&&t(a),s.push(a)})),s}async fetchFile(o,n){const{requestConifg:s,intervalTime:r,fileConfig:a}=this.mergeConfig(o),i=[];await this.useBatchRequestByMode(s,r,(function(o){const{id:s,headers:r,data:u}=o,l=r["content-type"]??"",f=a.extension??l.split("/").pop(),d=(new Date).getTime().toString(),m=e.resolve(a.storeDir,`${d}.${f}`);try{t.writeFileSync(m,u);const e={...o,data:{fileName:d,mimeType:l,size:u.length,filePath:m}};n&&n(e),i.push(e)}catch(e){c(h(`File save error at id ${s}: ${e.message}`))}}));const f=g(s)?s.length:1,d=i.length,m=f-d;return c(`saveTotal: ${u(f)}, success: ${l(d)}, error: ${h(m)}`),i}fetchPolling(e,t){const{Y:o,M:n,d:s,h:r,m:a}=e,i=(d(o)?0:1e3*o*60*60*24*365)+(d(n)?0:1e3*n*60*60*24*30)+(d(s)?0:1e3*s*60*60*24)+(d(r)?0:1e3*r*60*60)+(d(a)?0:1e3*a*60);let c=0;function u(){console.log(f(`Start the ${f.bold(++c)} polling`)),t(c)}u(),setInterval(u,i)}}({timeout:1e4,intervalTime:{max:2e3,min:1e3},mode:"async"});w.fetchPolling({m:3},(()=>{w.fetchHTML("https://www.bilibili.com/guochuang/",(e=>{console.log("fetchHTML Callback: ",e.statusCode)})).then((t=>{const{jsdom:o}=t.data,n=[];o.window.document.querySelectorAll(".chief-recom-item").forEach((e=>n.push(e.querySelector("img").src)));const s=n.map((e=>({url:`https:${e}`})));s.pop(),w.fetchFile({requestConifg:s,fileConfig:{storeDir:e.resolve(__dirname,"./upload")}},(e=>{console.log(e.id,e.statusCode,e.data.fileName)}))}))})); +"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),s=require("node:https"),r=require("node:url"),a=require("https-proxy-agent"),i=require("chalk");const c=console.log,u=i.hex("#a57fff"),l=i.green,f=i.red,h=i.yellow;function d(e){return void 0===e}function m(e){return"number"==typeof e}function g(e){return Array.isArray(e)}function p(e,t,o){const n=e[t];e[t]=e[o],e[o]=n}function y(e){return function t(o,n){if(o>=n)return;const s=e[n];let r=o,a=n-1;for(;r<=a;){for(;e[r]s;)a--;r<=a&&(p(e,r,a),r++,a--)}p(e,r,n),t(o,r-1),t(r+1,n)}(0,e.length-1),e}function q(e,t){let o=e?`${e}`:"?";if(t)for(const e in t){o+=`&${e}=${t[e]}`}else o=e;return o}function $(e){const{protocol:t,hostname:o,port:i,pathname:c,search:u}=new r.URL(e.url),l="http:"===t,f={agent:e.proxy?a(e.proxy):l?new n.Agent:new s.Agent,protocol:t,hostname:o,port:i,path:c,search:q(u,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return f.headers=function(e,t){const o={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(o["Content-Type"]="application/json",o["Content-Length"]=Buffer.byteLength(e.data)),o}(e,f),f}function w(e){return new Promise(((t,o)=>{const r=d(e.data);e.data=r?e.data:JSON.stringify(e.data);const a=$(e);function i(e){const{statusCode:o,headers:n}=e,s=[];e.on("data",(e=>s.push(e))),e.on("end",(()=>{const e=Buffer.concat(s);t({statusCode:o,headers:n,data:e})}))}let c;c="http:"===a.protocol?n.request(a,i):s.request(a,i),c.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),c.on("error",(e=>{o(e)})),"POST"!==a.method||r||c.write(e.data),c.end()}))}async function C(e,t,o,n){if(e&&n>1){const e=t?o:function(e,t=0){let o=Math.floor(Math.random()*e);for(;osetTimeout(t,e)))}(e)}else c(`Request ${u(n)} does not need to sleep, send immediately`)}const T=new class{baseConfig;constructor(e={}){this.baseConfig=e}mergeConfig(e){const t=this.baseConfig,o=structuredClone(e),n=g(o.requestConifg)?o.requestConifg:[o.requestConifg];for(const e of n){const{url:o,timeout:n,proxy:s}=e;d(t.baseUrl)||(e.url=t.baseUrl+o),d(n)&&(e.timeout=t.timeout),d(s)&&(e.proxy=t.proxy)}return d(o.intervalTime)&&(o.intervalTime=t.intervalTime),o}async useBatchRequestByMode(e,t,o){const n=g(e)?e:[e];"sync"!==this.baseConfig.mode?await async function(e,t,o){const n=!d(t),s=m(t);c(`Begin execution, mode: async, total: ${u(e.length)} `);let r=0,a=0,i=0;const h=[],g=[];for(const c of e){const e=++r;await C(n,s,t,e);const u=w(c).catch((t=>{i++;const o=`Request ${e} is an error: ${t.message}`;g.push({id:e,message:o,valueOf:()=>e})})).then((t=>{t&&(a++,o({id:e,...t}))}));h.push(u)}c(l("All requests have been sent!")),await Promise.all(h),y(g).forEach((e=>c(f(e.message)))),c(`requestsTotal: ${u(e.length)}, success: ${l(a)}, error: ${f(i)}`)}(n,t,o):await async function(e,t,o){const n=!d(t),s=m(t);c(`Begin execution, mode: sync, total: ${u(e.length)} `);let r=0,a=0,i=0;for(const h of e){r++,await C(n,s,t,r);let e=!0,d=null;try{d={id:r,...await w(h)},c(l(`Request ${u(r)} is an success`)),a++}catch(t){e=!1,c(f(`Request ${r} is an error: ${t.message}`)),i++}e&&o&&o(d)}c(l("All requests are over!")),c(`requestsTotal: ${u(e.length)}, success: ${l(a)}, error: ${f(i)}`)}(n,t,o)}async fetchHTML(e,t){const{requestConifg:n}=this.mergeConfig({requestConifg:(s=e,"string"==typeof s?{url:e}:e)});var s;const r=await w(n),a=r.data.toString(),i={...r,data:{html:a,jsdom:new o.JSDOM(a)}};return t&&t(i),i}async fetchData(e,t){const{requestConifg:o,intervalTime:n}=this.mergeConfig(e),s=[];await this.useBatchRequestByMode(o,n,(function(e){const o=e.headers["content-type"]??"",n=e.data,r=o.includes("text")?n.toString():JSON.parse(n.toString()),a={...e,data:r};t&&t(a),s.push(a)}));return y(s.map((e=>({...e,valueOf:()=>e.id}))))}async fetchFile(o,n){const{requestConifg:s,intervalTime:r,fileConfig:a}=this.mergeConfig(o),i=[];await this.useBatchRequestByMode(s,r,(function(o){const{id:s,headers:r,data:u}=o,l=r["content-type"]??"",h=a.extension??l.split("/").pop(),d=(new Date).getTime().toString(),m=e.resolve(a.storeDir,`${d}.${h}`);try{t.writeFileSync(m,u);const e={...o,data:{fileName:d,mimeType:l,size:u.length,filePath:m}};n&&n(e),i.push(e)}catch(e){c(f(`File save error at id ${s}: ${e.message}`))}}));const h=g(s)?s.length:1,d=i.length,m=h-d;c(`saveTotal: ${u(h)}, success: ${l(d)}, error: ${f(m)}`);return y(i.map((e=>({...e,valueOf:()=>e.id}))))}fetchPolling(e,t){const{Y:o,M:n,d:s,h:r,m:a}=e,i=(d(o)?0:1e3*o*60*60*24*365)+(d(n)?0:1e3*n*60*60*24*30)+(d(s)?0:1e3*s*60*60*24)+(d(r)?0:1e3*r*60*60)+(d(a)?0:1e3*a*60);let c=0;function u(){console.log(h(`Start the ${h.bold(++c)} polling`)),t(c)}u(),setInterval(u,i)}}({timeout:1e4,intervalTime:{max:0,min:0},mode:"async"});T.fetchHTML("https://www.bilibili.com/guochuang/",(e=>{console.log("fetchHTML Callback: ",e.statusCode)})).then((t=>{const{jsdom:o}=t.data,n=[];o.window.document.querySelectorAll(".chief-recom-item").forEach((e=>n.push(e.querySelector("img").src)));const s=n.map((e=>({url:`https:${e}`})));s.pop(),T.fetchFile({requestConifg:s,fileConfig:{storeDir:e.resolve(__dirname,"./upload")}},(e=>{console.log(e.id,e.statusCode,e.data.fileName)})).then((e=>console.log(e)))})); diff --git a/test/start/index.ts b/test/start/index.ts index f42683c8..480469c5 100644 --- a/test/start/index.ts +++ b/test/start/index.ts @@ -3,7 +3,7 @@ import XCrawl from '../../src' const testXCrawl = new XCrawl({ timeout: 10000, - intervalTime: { max: 2000, min: 1000 }, + intervalTime: { max: 0, min: 0 }, mode: 'async' }) @@ -17,23 +17,23 @@ const testXCrawl = new XCrawl({ // ] // }) -testXCrawl.fetchPolling({ m: 3 }, () => { - testXCrawl - .fetchHTML('https://www.bilibili.com/guochuang/', (res) => { - console.log('fetchHTML Callback: ', res.statusCode) - }) - .then((res) => { - const { jsdom } = res.data +// testXCrawl.fetchPolling({ m: 3 }, () => { +testXCrawl + .fetchHTML('https://www.bilibili.com/guochuang/', (res) => { + console.log('fetchHTML Callback: ', res.statusCode) + }) + .then((res) => { + const { jsdom } = res.data - const imgSrc: string[] = [] - const recomEls = - jsdom.window.document.querySelectorAll('.chief-recom-item') - recomEls.forEach((item) => imgSrc.push(item.querySelector('img')!.src)) + const imgSrc: string[] = [] + const recomEls = jsdom.window.document.querySelectorAll('.chief-recom-item') + recomEls.forEach((item) => imgSrc.push(item.querySelector('img')!.src)) - const requestConifg = imgSrc.map((src) => ({ url: `https:${src}` })) - requestConifg.pop() + const requestConifg = imgSrc.map((src) => ({ url: `https:${src}` })) + requestConifg.pop() - testXCrawl.fetchFile( + testXCrawl + .fetchFile( { requestConifg, fileConfig: { storeDir: path.resolve(__dirname, './upload') } @@ -42,5 +42,6 @@ testXCrawl.fetchPolling({ m: 3 }, () => { console.log(res.id, res.statusCode, res.data.fileName) } ) - }) -}) + .then((res) => console.log(res)) + }) +// })