Skip to content

Callmeboy #8

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion scripts/LeetCodeProvider.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ const {

module.exports = LeetCodeProvider = {
getProblemsTitle() {
Logger.success('开始抓取问题列表。。。。')
return Utils.httpGet(PROBLEMS_URL)
.then((body) => {
let titles = [];
let sHtml = Iconv.decode(body, "utf-8").toString();
cheerio
.load(sHtml)(QUESTION_DOM_SELECTOR)
.each((idx, ele) => titles.push(ele.attribs["title"]));
Logger.success("获取问题列表成功");
Logger.success("获取问题列表成功");
/**
* 由于QUESTION_DOM_SELECTOR 所选择的结构包含非问题标签,获取title会是undefined,在此需将其过滤掉
*/
Expand Down
12 changes: 12 additions & 0 deletions scripts/constants.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
module.exports = {

/**
* 爬取的数据源站点 github | gitee
*/

CRAWL_SITE: "github",

/**
* 需解析的语言类型
*/
Expand Down Expand Up @@ -45,4 +52,9 @@ module.exports = {
* 过滤英文文档末尾标识
*/
ENGLISH_MARKDOWN_SIGN: ".en.md",

/**
* 爬虫抓取同一文件时的最大失败次数(多为网络原因导致)
*/
MAX_CRAWL_RETRY_NUMBER : 100
};
29 changes: 21 additions & 8 deletions scripts/curlleetcode.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,19 @@ const {
RAW_MARKDOWN_OUTPUT_DIR,
REQUEST_RATE,
IS_FORCE_UPDATE_MODE,
MAX_CRAWL_RETRY_NUMBER
} = require("./constants");

/**
 * Index of the problem currently being requested. This module-level value is
 * the starting index handed to getProblemDetail together with the list of
 * problem names.
 */
let requsetNumber = 0;
// Number of consecutive failed fetches: incremented when a fetch yields no
// markdown, reset to 0 after a successful fetch, and compared against
// MAX_CRAWL_RETRY_NUMBER to decide when to stop crawling.
let retryCounter = 0;

// Runs once at module load, before any problem is fetched.
// NOTE(review): presumably ensures the raw-markdown output directory exists —
// confirm against Utils.mkdirSync.
Utils.mkdirSync(RAW_MARKDOWN_OUTPUT_DIR);

const getProblemDetail = (questionsName, requsetNumber) => {

const cachedFilesName = Utils.getDirsFileNameSync(RAW_MARKDOWN_OUTPUT_DIR);

if (
Expand All @@ -30,14 +33,16 @@ const getProblemDetail = (questionsName, requsetNumber) => {

getProblemDetail(questionsName, requsetNumber);
} else {
Logger.success(`开始抓取${questionsName[requsetNumber]}`)
questionsName[requsetNumber] &&
LeetCodeProvider.getProblemDetail(questionsName[requsetNumber])
.then((markDown) => {
.then(markDown => {
if (markDown) {
retryCounter = 0;
Logger.success(
`问题: "${
questionsName[requsetNumber]
}" | 结果: ${JSON.stringify(markDown)}`
}" | 结果: ${JSON.stringify(markDown).slice(100)}...`
);

Utils.writeFileSync(
Expand All @@ -48,19 +53,27 @@ const getProblemDetail = (questionsName, requsetNumber) => {

requsetNumber++;
} else {
Logger.error(`获取${questionsName[requsetNumber]} markdown 失败!`);
retryCounter++;
Logger.error(`获取${questionsName[requsetNumber]} markdown 第${retryCounter}次 失败!`);
}
})
.catch(Logger.error)
.then(() => {
setTimeout(() => {
questionsName[requsetNumber] &&
getProblemDetail(questionsName, requsetNumber);
}, REQUEST_RATE);
if (retryCounter >= MAX_CRAWL_RETRY_NUMBER) {
Logger.error(
`抓去问题 "${questionsName[requsetNumber]}" 失败次数已达上限, 请调整抓取速率 [REQUEST_RATE] 或稍后再试`
);
process.exit(0);
} else {
setTimeout(() => {
questionsName[requsetNumber] &&
getProblemDetail(questionsName, requsetNumber);
}, REQUEST_RATE);
}
});
}
};

LeetCodeProvider.getProblemsTitle().then((questionsName) => {
LeetCodeProvider.getProblemsTitle().then(questionsName => {
getProblemDetail(questionsName, requsetNumber);
});
42 changes: 23 additions & 19 deletions scripts/generateleetcode.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,17 @@ const {
SUPPORT_LANGUAGE,
DB_JSON_OUTPUT_DIR,
RAW_MARKDOWN_OUTPUT_DIR,
} = require('./constants')



ENGLISH_MARKDOWN_SIGN
} = require("./constants");

const genertateLeetcodeToJson = () => {
console.time("genertateLeetcodeToJson");

const rawMarkdowns = Utils.getDirsFileNameSync(
RAW_MARKDOWN_OUTPUT_DIR
).filter((name) => !name.endsWith(ENGLISH_MARKDOWN_SIGN));
).filter(name => !name.endsWith(ENGLISH_MARKDOWN_SIGN));

rawMarkdowns.forEach((filename) => {
rawMarkdowns.forEach(filename => {
let languageResloved = [];
let preKnowledge = [];
let keyPoints = [];
Expand All @@ -39,21 +37,25 @@ const genertateLeetcodeToJson = () => {
markdown = markdown.replace(/```python/g, "```py");
markdown = markdown.replace(/```c\+\+/g, "```cpp");

SUPPORT_LANGUAGE.forEach((lang) => {
SUPPORT_LANGUAGE.forEach(lang => {
markdown.replace(Utils.genCodeRegByLang(lang), (noUseMatch, $1) => {
languageResloved.push({
language: lang,
text: $1,
text: $1
});
});
});
markdown.replace(Utils.getSatelliteDataReg().pre, (noUseMatch, $1) => {

preKnowledge.push({
text: $1.replace('-',''),
link: null,
color: "red",
});
$1.replace(/-/g, "")
.split("\n")
.filter(Boolean)
.forEach(preTagName => {
preKnowledge.push({
text: preTagName,
link: null,
color: "red"
});
});
});

markdown.replace(
Expand All @@ -62,8 +64,8 @@ const genertateLeetcodeToJson = () => {
keyPoints = $1
.replace(/\s/g, "")
.split("-")
.filter((s) => s && s !== "解析")
.map((s) => ({ text: s, link: null, color: "blue" }));
.filter(s => s && s !== "解析")
.map(s => ({ text: s, link: null, color: "blue" }));
}
);

Expand All @@ -80,20 +82,22 @@ const genertateLeetcodeToJson = () => {
pre: preKnowledge,
keyPoints,
solution: `https://github.com/azl397985856/leetcode/blob/master/problems/${filename}`,
code: languageResloved,
code: languageResloved
};

console.log(oCustomStruct);

Logger.success(`开始生成 "${filename}"`);

Utils.writeFileSync(
"spider/yield-db-json",
`${name}.json`,
`${filename}.json`,
JSON.stringify(oCustomStruct, null, 2)
);

Logger.success(`生成 "${filename}" 完毕`);
console.timeEnd("genertateLeetcodeToJson");
});
console.timeEnd("genertateLeetcodeToJson");
};

const generateCollectionIndexFile = () => {
Expand Down
Loading