1
+ # coding:utf-8
2
+ import re
3
+ import json
4
+ import os
5
+ import threading
6
+ import time
7
+ import requests
8
+ from requests .exceptions import RequestException
9
+ from bs4 import BeautifulSoup
10
+
11
def get_proble_set(url, timeout=10):
    """Fetch *url* with a GET request and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text
19
+
20
def parse_proble_set(problemSet):
    """Start a download thread for each problem that has no saved JSON yet.

    Each entry of *problemSet* is read as
    ``entry["stat"]["question__title_slug"]``. A problem counts as already
    handled when either ``originData/<slug>.json`` or
    ``originData/[no content]<slug>.json`` exists relative to the current
    working directory.

    Args:
        problemSet: Sequence of problem entries as described above.
    """
    print(len(problemSet))  # original author's note: 2573
    for index, entry in enumerate(problemSet):
        slug = entry["stat"]["question__title_slug"]
        saved_paths = (
            "originData/[no content]{}.json".format(slug),
            "originData/{}.json".format(slug),
        )
        if any(os.path.exists(path) for path in saved_paths):
            print(index, "has been parsed.")
            continue

        # Wait one second before launching each worker thread.
        time.sleep(1)
        worker = threading.Thread(target=construct_url, args=(slug,))
        worker.start()

        # Printed right after the thread is started; the thread is not joined.
        print(index, "is done.")
        if index > 15:
            break  # temporary cap left in by the original author
38
+
39
def construct_url(problemTitle):
    """Build the problem page URL for *problemTitle* and fetch that problem.

    The URL is passed to ``get_proble_content`` together with the slug.

    Args:
        problemTitle: Title slug of the problem (a string).
    """
    # NOTE(review): the "" segment looks like a
    # proxy-rewrite artifact - confirm the intended host.
    problem_url = "".join(
        ("https://leetcode-cn.com/problems/", problemTitle, "/")
    )
    get_proble_content(problem_url, problemTitle)
44
+
45
def save_problem(title, content, editorType=""):
    """Write *content* to a UTF-8 text file named after *title*.

    Args:
        title: Path of the output file without its extension.
        content: Text to write.
        editorType: ``"MARKDOWN"`` selects a ``.md`` extension; any other
            value produces ``.html``.
    """
    extension = ".md" if editorType == "MARKDOWN" else ".html"
    with open(title + extension, 'w+', encoding="utf-8") as out:
        out.write(content)
53
+
54
def _fetch_csrf_token(timeout):
    """Request the GraphQL endpoint once and return the csrftoken it sets.

    Raises:
        ValueError: If the response's ``set-cookie`` header contains no
            ``csrftoken=...;`` entry.
    """
    # Any request would do here; its only purpose is to obtain a csrf token.
    # NOTE(review): the "" segment looks like a
    # proxy-rewrite artifact - confirm the intended host.
    body = '''{"operationName":"userPremiumInfo","variables":{},"query":"query userPremiumInfo {\n userStatus {\n isPremium\n subscriptionPlanType\n __typename\n }\n }\n "}'''
    response = requests.get('https://leetcode-cn.com/graphql/', data=body, timeout=timeout)
    set_cookie = response.headers.get("set-cookie", "")
    match = re.search(".*?csrftoken=(.*?);.*?", set_cookie, re.S)
    if match is None:
        raise ValueError("csrftoken not found in set-cookie header")
    return match.group(1)


def get_proble_content(problemUrl, title, timeout=10):
    """Download one problem through the GraphQL API and save it to disk.

    The raw response is written with ``saveJSON`` to
    ``originData/<title>.json``, or to ``originData/[no content]<title>.json``
    when the question has no ``content``. When content is present, the
    translated (Chinese) text is saved under ``problem (Chinese)/`` and the
    English text under ``problem (English)/``, unless the English text is
    empty or the "not available" placeholder.

    Any failure (network error, missing token, unexpected response shape,
    file error) is reported on stdout as ``[error] <exception> <problemUrl>``
    and the function returns normally, so a worker thread never dies with an
    unhandled exception.

    Args:
        problemUrl: Problem page URL, sent as the ``referer`` header.
        title: Title slug of the problem, used in the query and file names.
        timeout: Seconds to wait for each HTTP request.
    """
    try:
        csrftoken = _fetch_csrf_token(timeout)
        url = "https://leetcode-cn.com/graphql"
        data = {
            "operationName": "questionData",
            "variables": {"titleSlug": title},
            "query": "query questionData($titleSlug: String!) {\n question(titleSlug: $titleSlug) {\n questionId\n questionFrontendId\n categoryTitle\n boundTopicId\n title\n titleSlug\n content\n translatedTitle\n translatedContent\n isPaidOnly\n difficulty\n likes\n dislikes\n isLiked\n similarQuestions\n contributors {\n username\n profileUrl\n avatarUrl\n __typename\n }\n langToValidPlayground\n topicTags {\n name\n slug\n translatedName\n __typename\n }\n companyTagStats\n codeSnippets {\n lang\n langSlug\n code\n __typename\n }\n stats\n hints\n solution {\n id\n canSeeDetail\n __typename\n }\n status\n sampleTestCase\n metaData\n judgerAvailable\n judgeType\n mysqlSchemas\n enableRunCode\n envInfo\n book {\n id\n bookName\n pressName\n source\n shortDescription\n fullDescription\n bookImgUrl\n pressImgUrl\n productUrl\n __typename\n }\n isSubscribed\n isDailyQuestion\n dailyRecordStatus\n editorType\n ugcQuestionId\n style\n exampleTestcases\n __typename\n }\n }\n "
        }
        headers = {
            'x-csrftoken': csrftoken,
            'referer': problemUrl,
            'content-type': 'application/json',
            'origin': 'https://leetcode-cn.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        cookies = {
            '__cfduid': 'd9ce37537c705e759f6bea15fffc9c58b1525271602',
            '_ga': 'GA1.2.5783653.1525271604',
            '_gid': 'GA1.2.344320119.1533189808',
            'csrftoken': csrftoken,
            # NOTE(review): this key has a leading space - confirm intended.
            ' _gat': '1'
        }

        # The payload is sent as a JSON document.
        response = requests.post(url, data=json.dumps(data), headers=headers,
                                 cookies=cookies, timeout=timeout)
        dictInfo = json.loads(response.text)

        question = (dictInfo.get("data") or {}).get("question")
        if question is None:
            raise ValueError("response contains no question data")

        if question.get("content") is None:
            saveJSON(dictInfo, "originData/[no content]" + title + ".json")
            return

        saveJSON(dictInfo, "originData/" + title + ".json")

        # English version
        content = question["content"]

        # Chinese version
        translatedContent = question["translatedContent"]
        translatedTitle = question["translatedTitle"]
        titleSlug = question["titleSlug"]
        # Two editor types exist: MARKDOWN and CKEDITOR.
        editorType = question["editorType"]

        save_problem("problem (Chinese)/" + translatedTitle + " [{}]".format(titleSlug), translatedContent, editorType)

        # Some problems have no English text; those are not saved.
        if content != "" and content != "<p>English description is not available for the problem. Please switch to Chinese.</p>":
            save_problem("problem (English)/" + translatedTitle + "(English) [{}]".format(titleSlug), content)
    except Exception as e:
        # Thread boundary: report and carry on instead of crashing the worker.
        print("[error] ", e, problemUrl)
118
+
119
def saveJSON(data, filename):
    """Write *data* to *filename* as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is rather than ``\\uXXXX`` escaped.

    Args:
        data: JSON-serializable object.
        filename: Path of the file to create or overwrite.
    """
    encoder = json.JSONEncoder(ensure_ascii=False, indent=4)
    with open(filename, mode='w', encoding='utf-8') as out:
        # Stream the encoded chunks straight into the file.
        out.writelines(encoder.iterencode(data))
122
+
123
def main():
    """Load the cached problem list and process every problem in it.

    Reads ``[cn]json2-problemset.json`` from the current working directory
    and passes its content to ``parse_proble_set``.
    """
    # The commented-out steps below show how the cached files were produced.
    # NOTE(review): the "" segment in the URL looks like a
    # proxy-rewrite artifact - confirm the intended host.
    # url = "https://leetcode-cn.com/api/problems/all/"
    # html = json.loads(get_proble_set(url))
    # problemset = html["stat_status_pairs"]
    # saveJSON(html, "[cn]json1-origin-data.json")
    # saveJSON(problemset, "[cn]json2-problemset.json")

    # Use a context manager so the file handle is closed after loading.
    with open("[cn]json2-problemset.json", 'r', encoding='utf-8') as f:
        problemset = json.load(f)
    parse_proble_set(problemset)
132
+
133
+
134
if __name__ == '__main__':
    # Create the output folder and its three subfolders when missing, then
    # switch into the output folder before running main().
    folderName = "算法题(国内版)"
    subfolders = ("", "/originData", "/problem (Chinese)", "/problem (English)")
    for suffix in subfolders:
        target = folderName + suffix
        if not os.path.exists(target):
            os.mkdir(target)
    os.chdir(folderName)
    main()
0 commit comments