leetcode.py
# coding:utf-8
import re
import json
import os
import threading
import time

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup  # only used by the commented-out prettify step below
import urllib3

# The requests below use verify=False, so silence the InsecureRequestWarning noise.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
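
# Overall flow of this script:
#   1. main() downloads the full problem list from https://leetcode.com/api/problems/all/
#      and caches it as origin-data.json.
#   2. parse_proble_set() walks the list, skipping paid-only problems and problems that
#      already have a file under originData/, and spawns one thread per remaining problem.
#   3. construct_url() / get_proble_content() query the GraphQL endpoint for each problem
#      and save the raw response to originData/<slug>.json and the HTML body to problem/<slug>.html.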

def get_proble_set(url):
    """Fetch the raw problem-list JSON from the LeetCode API; return None on any failure."""
    try:
        response = requests.get(url, headers={
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
        }, verify=False)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_proble_set(problemSet):
    """Walk the problem list, skip saved and paid-only problems, and fetch the rest in threads."""
    for i in range(len(problemSet)):
        title = problemSet[i]["stat"]["question__title_slug"]
        # Skip problems that were already saved (with or without content).
        if os.path.exists("originData/[no content]{}.json".format(title)) or \
                os.path.exists("originData/{}.json".format(title)):
            print(i, "has been parsed.")
            continue
        # Skip premium (paid-only) problems.
        if problemSet[i].get("paid_only"):
            print(i, "paid-only problem, skipped.")
            continue
        time.sleep(1)  # throttle requests to avoid hammering the site
        t = threading.Thread(target=construct_url, args=(title,))
        t.start()
        print(i, "is done.")

def construct_url(problemTitle):
    """Build the problem's description URL and fetch its detail."""
    url = "https://leetcode.com/problems/" + problemTitle + "/description/"
    get_proble_content(url, problemTitle)

def save_problem(title, content):
    """Write a problem's HTML content to <title>.html."""
    filename = title + ".html"
    with open(filename, 'w+', encoding="utf-8") as f:
        f.write(content)

def get_proble_content(problemUrl, title):
    """Fetch one problem's detail through the GraphQL API and save it as JSON and HTML."""
    response = requests.get(problemUrl, headers={
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
    }, verify=False)
    setCookie = response.headers.get("Set-Cookie", "")
    try:
        # Pull the __cf_bm token out of the Set-Cookie header; it is reused as the CSRF token below.
        pattern = re.compile("__cf_bm=(.*?);.*?", re.S)
        csrftoken = re.search(pattern, setCookie)
        url = "https://leetcode.com/graphql"
        data = {
            "operationName": "questionData",
            "variables": {"titleSlug": title},
            "query": "query questionData($titleSlug: String!) {\n question(titleSlug: $titleSlug) {\n questionId\n questionFrontendId\n boundTopicId\n title\n titleSlug\n content\n translatedTitle\n translatedContent\n isPaidOnly\n difficulty\n likes\n dislikes\n isLiked\n similarQuestions\n exampleTestcases\n categoryTitle\n contributors {\n username\n profileUrl\n avatarUrl\n __typename\n }\n topicTags {\n name\n slug\n translatedName\n __typename\n }\n companyTagStats\n codeSnippets {\n lang\n langSlug\n code\n __typename\n }\n stats\n hints\n solution {\n id\n canSeeDetail\n paidOnly\n hasVideoSolution\n paidOnlyVideo\n __typename\n }\n status\n sampleTestCase\n metaData\n judgerAvailable\n judgeType\n mysqlSchemas\n enableRunCode\n enableTestMode\n enableDebugger\n envInfo\n libraryUrl\n adminUrl\n challengeQuestion {\n id\n date\n incompleteChallengeCount\n streakCount\n type\n __typename\n }\n __typename\n }\n}\n"
        }
        headers = {
            'x-csrftoken': csrftoken.group(1),
            'referer': problemUrl,
            'content-type': 'application/json',
            'origin': 'https://leetcode.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        cookies = {
            '__cfduid': 'd9ce37537c705e759f6bea15fffc9c58b1525271602',
            '_ga': 'GA1.2.5783653.1525271604',
            '_gid': 'GA1.2.344320119.1533189808',
            'csrftoken': csrftoken.group(1),
            '_gat': '1'
        }
        # The GraphQL payload must be posted as a JSON string.
        dumpJsonData = json.dumps(data)
        response = requests.post(url, data=dumpJsonData, headers=headers, cookies=cookies)
        dictInfo = json.loads(response.text)
        if dictInfo["data"]["question"].get("content") is not None:
            saveJSON(dictInfo, "originData/" + title + ".json")
            content = dictInfo["data"]["question"]["content"]
            save_problem("problem/" + title, content)
            # soup = BeautifulSoup(content, 'lxml')
            # save_problem(title, soup.prettify())
        else:
            # Problems whose content is hidden are saved with a "[no content]" prefix so they are not retried.
            saveJSON(dictInfo, "originData/[no content]" + title + ".json")
    except Exception as e:
        print("[error] ", e, problemUrl)

def saveJSON(data, filename):
    """Dump a dict to a UTF-8 JSON file with readable indentation."""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main():
    """Download the full problem list, cache it, and fetch every free problem."""
    url = "https://leetcode.com/api/problems/all/"
    jsonContent = get_proble_set(url)
    if jsonContent is None:
        print("Failed to fetch the problem list!")
        return
    html = json.loads(jsonContent)
    saveJSON(html, "origin-data.json")
    # html = json.load(open("origin-data.json", 'r', encoding='utf-8'))  # reuse the cached list instead
    problemset = html["stat_status_pairs"]
    parse_proble_set(problemset)

if __name__ == '__main__':
    # Create the output layout: leetcode/originData for raw JSON, leetcode/problem for HTML.
    folderName = "leetcode"
    if not os.path.exists(folderName):
        os.mkdir(folderName)
    if not os.path.exists(folderName + "/originData"):
        os.mkdir(folderName + "/originData")
    if not os.path.exists(folderName + "/problem"):
        os.mkdir(folderName + "/problem")
    os.chdir(folderName)
    main()
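
# Example (hypothetical one-off use, not part of the original flow): once the directories
# above exist, a single problem could be fetched without crawling the whole list, e.g.:
#
#   os.chdir("leetcode")
#   construct_url("two-sum")   # writes originData/two-sum.json and problem/two-sum.html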