Skip to content

Commit e2daac1

Browse files
committed
add 国内版 Python代码
1 parent 44cb2b9 commit e2daac1

File tree

2 files changed

+145
-8
lines changed

2 files changed

+145
-8
lines changed

国内版.py

+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# coding:utf-8
2+
import re
3+
import json
4+
import os
5+
import threading
6+
import time
7+
import requests
8+
from requests.exceptions import RequestException
9+
from bs4 import BeautifulSoup
10+
11+
def get_proble_set(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the response body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the caller forever.

    Returns:
        The response text when the status code is 200. ``None`` for any
        other status code, or when the request raises ``RequestException``
        (which includes timeouts).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text
19+
20+
def parse_proble_set(problemSet):
    """Start one download thread per problem that is not yet on disk.

    Each entry is expected to expose its slug at
    ``entry["stat"]["question__title_slug"]``. A problem counts as already
    handled when either ``originData/<slug>.json`` or
    ``originData/[no content]<slug>.json`` exists relative to the current
    working directory; such entries are skipped.

    For every remaining entry the function sleeps one second, starts a
    thread running ``construct_url(slug)`` and does not wait for it.
    Once a dispatched entry has an index greater than 15 the loop stops
    (marked "temporary" by the original author).
    """
    total = len(problemSet)
    print(total)  # the original author noted 2573 here
    for index, entry in enumerate(problemSet):
        slug = entry["stat"]["question__title_slug"]
        cached_paths = (
            "originData/[no content]{}.json".format(slug),
            "originData/{}.json".format(slug),
        )
        if any(os.path.exists(path) for path in cached_paths):
            print(index, "has been parsed.")
            continue

        # Pause before each new request so they are spaced one second apart.
        time.sleep(1)
        worker = threading.Thread(target=construct_url, args=(slug,))
        worker.start()

        # Printed right after the thread starts, not when it finishes.
        print(index, "is done.")
        if index > 15:
            break  # temporary cap
38+
39+
def construct_url(problemTitle):
    """Build the page URL for a problem slug and fetch that problem.

    The URL is the fixed ``.../problems/`` prefix, the slug, and a
    trailing slash. It is passed together with the slug to
    ``get_proble_content``.
    """
    page_url = "".join(
        ("https://leetcode-cn.com/problems/", problemTitle, "/")
    )
    get_proble_content(page_url, problemTitle)
44+
45+
def save_problem(title, content, editorType=""):
    """Write ``content`` to ``<title>.md`` or ``<title>.html`` as UTF-8.

    Args:
        title: Output path without extension.
        content: Text to write.
        editorType: ``"MARKDOWN"`` selects the ``.md`` extension; any
            other value (including the default) selects ``.html``.
    """
    extension = ".md" if editorType == "MARKDOWN" else ".html"
    with open(title + extension, 'w+', encoding="utf-8") as out:
        out.write(content)
53+
54+
def get_proble_content(problemUrl, title, timeout=30):
    """Fetch one problem through the GraphQL endpoint and save it to disk.

    Steps:
      1. Issue a throw-away request whose only purpose is to receive a
         ``csrftoken`` cookie.
      2. POST the ``questionData`` query for ``title`` with that token.
      3. Save the raw response under ``originData/``. When the question
         has content, also save the Chinese statement and, if one exists,
         the English statement via ``save_problem``.

    Any failure is reported on stdout as ``[error] <exception> <url>``
    and swallowed, so a single bad problem never raises to the caller.

    Args:
        problemUrl: Problem page URL; sent as the ``referer`` header and
            echoed in error messages.
        title: Problem slug used as the ``titleSlug`` query variable and
            in the ``originData`` file name.
        timeout: Seconds to wait on each HTTP request before giving up.
    """
    try:
        # NOTE(review): the request URLs below use host fanyv88.com with the
        # LeetCode address in the path -- confirm this is the intended
        # endpoint and not a mirroring artifact.
        # Any request will do here; it is made only to obtain a csrftoken.
        response = requests.get(
            'https://leetcode-cn.com/graphql/',
            data='''{"operationName":"userPremiumInfo","variables":{},"query":"query userPremiumInfo {\n userStatus {\n isPremium\n subscriptionPlanType\n __typename\n }\n}\n"}''',
            timeout=timeout,
        )
        setCookie = response.headers["set-cookie"]
        match = re.search(r".*?csrftoken=(.*?);.*?", setCookie, re.S)
        if match is None:
            raise ValueError("csrftoken not found in set-cookie header")
        token = match.group(1)

        url = "https://leetcode-cn.com/graphql"
        data = {
            "operationName": "questionData",
            "variables": {"titleSlug": title},
            "query": "query questionData($titleSlug: String!) {\n question(titleSlug: $titleSlug) {\n questionId\n questionFrontendId\n categoryTitle\n boundTopicId\n title\n titleSlug\n content\n translatedTitle\n translatedContent\n isPaidOnly\n difficulty\n likes\n dislikes\n isLiked\n similarQuestions\n contributors {\n username\n profileUrl\n avatarUrl\n __typename\n }\n langToValidPlayground\n topicTags {\n name\n slug\n translatedName\n __typename\n }\n companyTagStats\n codeSnippets {\n lang\n langSlug\n code\n __typename\n }\n stats\n hints\n solution {\n id\n canSeeDetail\n __typename\n }\n status\n sampleTestCase\n metaData\n judgerAvailable\n judgeType\n mysqlSchemas\n enableRunCode\n envInfo\n book {\n id\n bookName\n pressName\n source\n shortDescription\n fullDescription\n bookImgUrl\n pressImgUrl\n productUrl\n __typename\n }\n isSubscribed\n isDailyQuestion\n dailyRecordStatus\n editorType\n ugcQuestionId\n style\n exampleTestcases\n __typename\n }\n}\n"
        }
        headers = {
            'x-csrftoken': token,
            'referer': problemUrl,
            'content-type': 'application/json',
            'origin': 'https://leetcode-cn.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        # NOTE(review): everything except csrftoken is a hard-coded value,
        # and the ' _gat' key carries a leading space -- confirm both are
        # intentional.
        cookies = {
            '__cfduid': 'd9ce37537c705e759f6bea15fffc9c58b1525271602',
            '_ga': 'GA1.2.5783653.1525271604',
            '_gid': 'GA1.2.344320119.1533189808',
            'csrftoken': token,
            ' _gat': '1'
        }

        # The payload must be sent as a JSON document.
        dumpJsonData = json.dumps(data)
        response = requests.post(
            url,
            data=dumpJsonData,
            headers=headers,
            cookies=cookies,
            timeout=timeout,
        )
        dictInfo = json.loads(response.text)

        question = dictInfo["data"]["question"]
        if question is None:
            raise ValueError("response contains no question data")

        if question.get("content") is None:
            # Keep the raw answer so the slug is treated as handled later.
            saveJSON(dictInfo, "originData/[no content]" + title + ".json")
            return

        saveJSON(dictInfo, "originData/" + title + ".json")

        # English statement.
        content = question["content"]
        # Chinese statement.
        translatedContent = question["translatedContent"]
        translatedTitle = question["translatedTitle"]
        titleSlug = question["titleSlug"]
        # Per the original author's note: MARKDOWN or CKEDITOR.
        editorType = question["editorType"]

        save_problem(
            "problem (Chinese)/" + translatedTitle + " [{}]".format(titleSlug),
            translatedContent,
            editorType,
        )
        # Some problems have no English text; those are not saved.
        if content != "" and content != "<p>English description is not available for the problem. Please switch to Chinese.</p>":
            save_problem(
                "problem (English)/" + translatedTitle + "(English) [{}]".format(titleSlug),
                content,
            )
    except Exception as e:
        # Boundary of a worker: report the failure and carry on.
        print("[error] ", e, problemUrl)
118+
119+
def saveJSON(data, filename):
    """Serialise ``data`` as JSON into ``filename``.

    The file is written as UTF-8 with a four-space indent, and non-ASCII
    characters are kept as-is rather than escaped.
    """
    with open(filename, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)
122+
123+
def main():
    """Load the cached problem list and pass it to ``parse_proble_set``.

    Reads ``[cn]json2-problemset.json`` relative to the current working
    directory. The file is opened in a ``with`` block so the handle is
    closed as soon as the JSON has been parsed.
    """
    # One-off bootstrap that produced the cached files (kept for reference):
    # url = "https://leetcode-cn.com/api/problems/all/"
    # html = json.loads(get_proble_set(url))
    # problemset = html["stat_status_pairs"]
    # saveJSON(html, "[cn]json1-origin-data.json")
    # saveJSON(problemset, "[cn]json2-problemset.json")

    with open("[cn]json2-problemset.json", 'r', encoding='utf-8') as cached:
        problemset = json.load(cached)
    parse_proble_set(problemset)
132+
133+
134+
if __name__ == '__main__':
    folderName = "算法题(国内版)"
    # Create the output root first, then the three sub-folders that the
    # scraper writes into. Folders that already exist are left untouched.
    for suffix in ("", "/originData", "/problem (Chinese)", "/problem (English)"):
        target = folderName + suffix
        if not os.path.exists(target):
            os.mkdir(target)
    # Every path used by the scraper is relative to the output root.
    os.chdir(folderName)
    main()

国外版.py

-8
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ def parse_proble_set(problemSet):
2929
#construct_url(title)
3030
# time.sleep(0.5)
3131
time.sleep(1)
32-
# time.sleep(random.randint(0,9) / 10)
3332
t =threading.Thread(target=construct_url,args=(title,))
3433
t.start()
3534

@@ -108,13 +107,6 @@ def main():
108107
# saveJSON(html, "[en]json1-origin-data.json")
109108
# saveJSON(problemset, "[en]json2-problemset.json")
110109

111-
# url = "https://leetcode-cn.com/api/problems/all/"
112-
# html = json.loads(get_proble_set(url))
113-
# problemset = html["stat_status_pairs"]
114-
# saveJSON(html, "[cn]json1-origin-data.json")
115-
# saveJSON(problemset, "[cn]json2-problemset.json")
116-
# exit()
117-
118110
problemset = json.load(open("[en]json2-problemset.json", 'r', encoding='utf-8'))
119111
parse_proble_set(problemset)
120112

0 commit comments

Comments
 (0)