#!/usr/bin/env python
# PYTHON_ARGCOMPLETE_OK
"""
Main module for running "new and improved" Python package searches with better metrics.
"""
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import csv
from datetime import datetime, time as dt_time, timedelta
import json
import logging
import os
import re
import sys
from tempfile import NamedTemporaryFile
import time
import urlparse
from lxml.html import etree, HTMLParser
from namedlist import namedlist
from path import Path
import argcomplete
import dateutil.parser
import requests
import progbar
from generic_download_queue import GenericDownloadQueue
from queuing_thread import QueuingThread
try:
import sh
except ImportError:
import pbs as sh # On Windows, pbs takes the place of sh

if not logging.root.handlers:
logging.basicConfig(format='%(asctime)s-{0}'.format(logging.BASIC_FORMAT),
level=logging.INFO)

ARIA2C_OPTIONS = {"no-conf": True,
"timeout": 5,
"connect-timeout": 5,
"lowest-speed-limit": 256,
"file-allocation": "falloc",
"min-split-size": 1048576,
"summary-interval": 3,
"max-connection-per-server": 2,
"max-tries": 21,
"max-file-not-found": 5,
"max-resume-failure-tries": 5,
"retry-wait": 1,
"deferred-input": True,
"max-concurrent-downloads": 34,
"auto-file-renaming": False,
"allow-overwrite": True,
# "async-dns": True,
"conditional-get": True,
"remote-time": True,
"http-accept-gzip": True,
"enable-http-pipelining": True,
"enable-http-keep-alive": True,
"log-level": "notice",
"console-log-level": "notice",
"ca-certificate": "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem"}
ARIA2C_OPTIONS["check-certificate"] = os.path.exists(ARIA2C_OPTIONS["ca-certificate"])

# A mutable, namedtuple-like record; download_counts defaults to [] and last_update to None.
_PypiSearchResult = namedlist("_PypiSearchResult", ["link", "weight", "summary",
("download_counts", []),
("last_update", None)])


class PypiSearchResult(_PypiSearchResult):
"""
A named object representing a search result.
"""
    @classmethod
    def from_dict(cls, data_dict):
        last_update = data_dict.get("last_update")
        if isinstance(last_update, (str, unicode)):
            # to_dict() serializes the timestamp as an ISO string; parse it back here.
            last_update = dateutil.parser.parse(last_update)
        return PypiSearchResult(link=data_dict["link"],
                                weight=data_dict["weight"],
                                summary=data_dict["summary"] or "",
                                download_counts=data_dict.get("download_counts"),
                                last_update=last_update)
def to_dict(self):
return {"link": self.link,
"weight": self.weight,
"summary": self.summary,
"download_rate": self.download_rate,
"last_update": self.last_update.isoformat() if self.last_update is not None else None}
@property
def version(self):
"""
:return: The package version
:rtype: str
"""
return self.link.split("/")[-1]
@property
def name(self):
"""
:return: The name of this named object
:rtype: str
"""
return self.link.split("/")[-2]
@property
def ftp_page_url(self):
return "https://pypi.python.org/packages/source/{0[0]}/{0}/".format(self.name)
@property
def age(self):
        if self.last_update is None:
            # No update timestamp could be determined; treat the package as very old (~9.5 years).
            return 3488
return (datetime.now().date() - self.last_update.date()).days
@property
def scaled_age(self):
"""
        The age, scaled by rank, is 45506 / (age + 397.7) - 16.8: roughly 97.6 for a
        brand-new package, decaying toward -16.8 as the package ages.
"""
return 45506.0 / (self.age + 397.7) - 16.8
@property
def download_rate(self):
        # Estimate a daily rate from the weekly and monthly counts, taking the higher of the two.
        return max(self.download_counts[1] / 7.0, self.download_counts[2] / 30.0) if self.download_counts else -1
@property
def scaled_download_rate(self):
"""
The overall download rate, scaled by rank, is 98.21 - 742 / (6.404 + rate).
"""
return 98.21 - (742 / (self.download_rate + 6.404))
@property
def scaled_weight(self):
return (self.weight - 1) * 0.1
@property
def score(self):
"""
        The total score of this search result, scaled by category: the weight (according to PyPI)
        contributes up to roughly 2 points, the age up to roughly 3, and the download rate up to roughly 5.
:return: The total score for this search result
:rtype: float
"""
return self.scaled_weight * 2.0 + self.scaled_age * 3.0e-2 + self.scaled_download_rate * 5.0e-2
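    # Illustrative scoring example (hypothetical numbers, not taken from the source):
    # for weight=5, age=30 days and download_rate=100/day,
    #   scaled_weight        = (5 - 1) * 0.1               ~= 0.40 -> * 2.0    = 0.80
    #   scaled_age           = 45506 / (30 + 397.7) - 16.8 ~= 89.6 -> * 3.0e-2 = 2.69
    #   scaled_download_rate = 98.21 - 742 / (100 + 6.404) ~= 91.2 -> * 5.0e-2 = 4.56
    # giving a total score of roughly 8.05.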
def has_recent_download(self, search_dir, max_days):
"""
Return True if this file has been recently downloaded.
:param search_dir: The directory to search in
:type search_dir: str
:param max_days: The maximum number of days to consider "recent"
:type max_days: float
:return: True if there is a file recently downloaded, otherwise False
:rtype: bool
"""
cur_time = time.time()
target_file = os.path.join(search_dir, self.name)
if not os.path.exists(target_file):
return False
stats = os.stat(target_file)
file_time = max(stats.st_ctime, stats.st_mtime)
return cur_time - file_time < (max_days * 86400.0)
def apply_update(self, page_content):
"""
From the given page content, parse and add the download statistics to this search result.
"""
tree = etree.fromstring(page_content, HTMLParser())
counts = tree.xpath("//ul[@class='nodot'][li[strong[starts-with(text(), 'Downloads')]]]/li/span/text()")
self.download_counts = [float(count) for count in counts]
last_update = tree.xpath("//table[@class='list']/tr[@class]/td[4]/text()")
        if last_update:
self.last_update = dateutil.parser.parse(last_update[0], ignoretz=True)
return True
self.last_update = None
return False
def add_latest_date_from_ftp_page(self, page_content):
"""
From the given page content, parse and add the latest date listed.
"""
tree = etree.fromstring(page_content, HTMLParser())
xpath_arg = "//a[@href][starts-with(., '{0}')]".format(self.name)
link_elems = tree.xpath(xpath_arg)
max_date = datetime.min
for elem in link_elems:
date_size_parts = (elem.tail or "").strip().split()
if not date_size_parts:
continue
date_str = " ".join(date_size_parts[:-1])
date_val = dateutil.parser.parse(date_str, ignoretz=True)
# If parser returns default date, it's most likely an error, so skip over it.
default_date = datetime.combine(datetime.now().date(), dt_time.min)
if date_val == default_date:
continue
max_date = max(date_val, max_date)
        self.last_update = max_date if max_date > datetime.min else None
def run_backup_update(self):
"""
        Run a secondary update method to get the timestamp of the last project update, in case the
        primary method (parsing the PyPI project main page) fails. This entails trying up to two
        possible PyPI FTP directory spellings (lower-case and capitalized) and taking the latest date listed.
"""
if self.last_update is not None:
return
ftp_url = self.ftp_page_url
ftp_resp = requests.get(ftp_url)
if not ftp_resp.ok:
orig_part = "/{0[0]}/{0[0]}".format(self.name)
capitalized_url = ftp_url.replace(orig_part, orig_part.upper())
ftp_resp = requests.get(capitalized_url)
self.add_latest_date_from_ftp_page(ftp_resp.content)
def is_pip_result(self, search_term):
"""
Return True if this result would be expected in the list from pip search, otherwise False.
:param search_term: the specific search term to compare
:type search_term: str
"""
if self.weight >= 4:
return True
if self.weight in (2, 3):
return search_term.lower() in self.summary.lower()
return False
def to_aria2_input_entry(self):
"""
Return this result formatted as an aria2c input file entry.
"""
link_obj = urlparse.urlparse(self.link)._asdict()
versionless_path = Path(link_obj.pop('path')).dirname().dirname().joinpath('json')
new_link_obj = urlparse.ParseResult(path=versionless_path, **link_obj)
return '{0}\n out={1}\n'.format(new_link_obj.geturl(), self.name)
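    # Illustrative entry for a hypothetical link "https://pypi.python.org/pypi/requests/2.9.1/json"
    # (the version segment is stripped, so aria2c fetches the package-level JSON):
    #   https://pypi.python.org/pypi/requests/json
    #    out=requests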
def to_csv(self):
"""
Return a line of CSV for this result.
"""
csv_fmt = "\"{0.name}\",\"{0.version}\",{0.weight},{0.download_rate:0.2f},{0.age},{0.score:0.3f}"
return csv_fmt.format(self)
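    # Illustrative output line (hypothetical values), i.e. name, version, weight,
    # downloads/day, age in days and total score:
    #   "requests","2.9.1",10,1234.57,3,9.123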
@classmethod
def from_csv(cls, csv_line, ref_date=None):
"""
Given a line from a CSV file, read it and return a basic :class:`PypiSearchResult` object.
"""
ref_date = ref_date or datetime.utcnow()
csv_parts = list(csv.reader([csv_line]))[0] if isinstance(csv_line, (str, unicode)) else csv_line
link = "https://pypi.python.org/pypi/{0[0]}/{0[1]}".format(csv_parts)
weight = int(csv_parts[2])
rates = [float(csv_parts[3]), float(csv_parts[3]) * 7.0, float(csv_parts[3]) * 30.0]
start_date = ref_date - timedelta(days=int(csv_parts[4]))
return PypiSearchResult(link, weight, "", rates, start_date)


class PypiJsonSearchResult(PypiSearchResult):
    """
    A search result whose link points at the package's JSON metadata endpoint.
    """
@property
def version(self):
"""
:return: The package version
:rtype: str
"""
return self.link.split("/")[-2]
@property
def name(self):
"""
:return: The name of this named object
:rtype: str
"""
return self.link.split("/")[-3]
@classmethod
def from_csv(cls, csv_line, ref_date=None):
ref_date = ref_date or datetime.utcnow()
        csv_parts = list(csv.reader([csv_line]))[0] if isinstance(csv_line, (str, unicode)) else csv_line
link = "https://pypi.python.org/pypi/{0[0]}/json".format(csv_parts)
weight = int(csv_parts[2])
rates = [float(csv_parts[3]), float(csv_parts[3]) * 7.0, float(csv_parts[3]) * 30.0]
start_date = ref_date - timedelta(days=int(csv_parts[4]))
return PypiJsonSearchResult(link, weight, "", rates, start_date)
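    # apply_update() below expects the PyPI JSON endpoint's shape; an abridged sketch
    # showing only the keys that are actually read:
    #   {"info": {"downloads": {"last_day": 1, "last_week": 7, "last_month": 30}},
    #    "urls": [{"upload_time": "2016-01-01T00:00:00"}]}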
def apply_update(self, new_content):
try:
json_dict = json.loads(new_content)
except ValueError:
logging.exception("Error parsing JSON content update:\n%r", new_content)
self.download_counts = [-1.0, -1.0, -1.0]
self.last_update = None
return False
dl_info = json_dict["info"]["downloads"]
self.download_counts = [float(dl_info[k]) for k in ["last_day", "last_week", "last_month"]]
upload_time_strs = [url_info["upload_time"] for url_info in json_dict["urls"]]
upload_times = [dateutil.parser.parse(up_time, ignoretz=True) for up_time in upload_time_strs]
if upload_times:
self.last_update = max(upload_times)
return True
else:
self.last_update = None
return False


class DownloadMapper(QueuingThread):
"""
Class to handle the parallel downloading of named objects.
"""
    def __init__(self, queue, named_objects, max_age_days, aria2c_path):
        """
        :param queue: The queue used to report download progress
        :param named_objects: The list of named objects
        :type named_objects: [NamedObject]
        :param max_age_days: The maximum age (in days) for an existing download to be considered recent
        :type max_age_days: float
        :param aria2c_path: The path to the aria2c(.exe) executable, or None to search the PATH
        :type aria2c_path: str or None
        """
QueuingThread.__init__(self, queue)
        self.nrmap = {nobj.name: nobj for nobj in named_objects}
self.paths = []
self.backups_needed = []
self.max_age_days = max_age_days
self.aria2c_path = aria2c_path
self.ntf = NamedTemporaryFile(delete=False)
self.ntf_dir = self.get_proper_path(os.path.dirname(self.ntf.name))
for result in self.nrmap.values():
# Skip results that have already been downloaded recently.
if result.has_recent_download(self.ntf_dir, self.max_age_days):
self.paths.append(os.path.join(self.ntf_dir, result.name))
continue
self.ntf.write(result.to_aria2_input_entry())
self.ntf.close()
logging.info("aria2c input file saved to %r (dir: %r)", self.ntf.name, self.ntf_dir)
def get_proper_path(self, file_path):
"""
Get the "proper" format for :attr:`file_path` by removing Cygwin-specific formatting, if it exists.
This is necessary because aria2c.exe won't recognize Cygwin-formatted paths if it was built with MinGW.
:param str file_path: The path to convert if deemed necessary
:return str: The converted file path
"""
if "cygwin" in sys.platform.lower():
file_path = sh.cygpath("-w", file_path).stdout.strip()
        logging.debug("Path after Cygwin conversion: %r", file_path)
return file_path
def find_aria2c(self):
for path in (self.aria2c_path, 'aria2c', 'aria2c.exe'):
if path is None:
continue
next_path = sh.which(path)
if next_path:
return next_path
logging.error('aria2c is missing from the current configuration!')
return None
def build_command(self):
aria2c_path = self.find_aria2c()
aria2_cmd = sh.Command(aria2c_path)
# Run and observe the above aria2c executable, reporting download progress to the appropriate logger.
local_aria2c_options = {"input-file": self.get_proper_path(self.ntf.name),
"dir": self.ntf_dir,
"max-download-result": len(self.nrmap)}
aria2_cmd = aria2_cmd.bake(ARIA2C_OPTIONS).bake(local_aria2c_options)
        # aria2c exits with code 1 when some downloads fail; accept it alongside 0.
        aria2_cmd = aria2_cmd.bake(_iter=True, _tty_out=False, _ok_code=[0, 1])
logging.info("Command to execute: %s", aria2_cmd)
return aria2_cmd
def compile_progress_regex(self):
        return re.compile(r"Download complete:\s+(?P<path>.*)$")
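    # The pattern above is meant to match aria2c summary lines such as (illustrative):
    #   Download complete: /tmp/tmpXXXXXX/requests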
def enqueue_progress_match(self, progress_match):
group_dict = progress_match.groupdict()
self.paths.append(group_dict["path"])
msg_dict = {"value": len(self.paths),
"maximum": len(self.nrmap),
"status": "Download complete: {0}".format(group_dict["path"])}
self.queue.put(msg_dict)
def run(self):
"""
Run aria2c to execute all the downloads and save their file paths.
:param max_age_days: The maximum age a file should be in order to be considered "recent" (and skipped over)
:type max_age_days: float
:param aria2c_path: The path to the aria2c(.exe) executable, or None to search for it in the PATH environment
:type aria2c_path: str or None
"""
if self.paths:
log_fmt = "Download mapper has already run or is currently running! (%d paths came back)"
logging.error(log_fmt, len(self.paths)) # TODO:ABC: make this raise some kind of exception?
return
QueuingThread.run(self)
self.update_objects()
def update_objects(self):
"""
Apply the downloaded updates to all their corresponding named objects.
"""
if not self.paths:
logging.error("No paths to update! Make sure the download has actually been executed")
return # raise MyException(err_msg, errorcodes.DOWNLOAD_MAPPER_MISSING_PATHS)
for path in self.paths:
with open(path, 'r') as f:
new_content = f.read()
# Look up and apply the relevant update.
original_name = os.path.split(path)[-1] # TODO:ABC: mapping path to name to be done by named object?
original_result = self.nrmap[original_name]
update_status = original_result.apply_update(new_content) # TODO:ABC: make this generic!
if not update_status:
self.backups_needed.append(original_name)
def update_required_backups(self):
"""
Run backup updates on any named objects that require them.
"""
if not self.paths:
logging.error("No paths to update! Make sure the download has actually been executed")
return
for backup_name in self.backups_needed:
self.nrmap[backup_name].run_backup_update()
@property
def named_objects(self):
"""
        :rtype: list[NamedObject]
"""
return self.nrmap.values()
@property
def names(self):
return self.nrmap.keys()


def query_initial_packages(search_term):
"""
Perform an initial package search on PyPI with the given :attr:`search_term`, and return a list of
    :class:`PypiJsonSearchResult` named objects.
    :param str search_term: The initial search query
    :return: The list of search results
    :rtype: list[PypiJsonSearchResult]
"""
logging.info("Querying initial packages for %s...", search_term)
result_page = requests.get("https://pypi.python.org/pypi", params={":action": "search", "term": search_term})
result_tree = etree.fromstring(result_page.content, HTMLParser())
result_tree.make_links_absolute(result_page.url)
result_tags = result_tree.xpath("//table[@class='list']/tr[@class][td]")
results = []
for lxml_element in result_tags:
result_obj = PypiJsonSearchResult(link="{0}/json".format(lxml_element[0][0].get("href")),
weight=int(lxml_element[1].text),
summary=lxml_element[2].text or '')
if result_obj.is_pip_result(search_term):
results.append(result_obj)
return results


def search_packages(search_term, collect_stats=True, backup_search=False,
max_age_days=0.5, aria2c_path=None):
"""
Search for packages matching :attr:`search_term`, optionally collecting stats
and/or running backup updates for any packages whose age was not determined
initially.
:param str search_term: The search term
:param bool collect_stats: True to collect stats, otherwise False
:param bool backup_search: True to run backup searches, otherwise False
:param float max_age_days: The maximum days of age files should be
:param str aria2c_path: The path to the aria2c executable, or None to look for it on PATH
:return: The resulting search results
:rtype: list[:class:`PypiSearchResult`]
"""
initial_results = query_initial_packages(search_term)
if not collect_stats:
return initial_results
thread_creator = lambda queue: DownloadMapper(queue, initial_results, max_age_days, aria2c_path)
# Create a generic progress bar dialog for monitoring the download progress.
try:
stats_progbar = progbar.GenericProgressBar(title="Downloading packages...",
maximum=len(initial_results),
value=0,
status="Starting aria2c...",
thread_creator=thread_creator)
except Exception as e:
logging.exception("Exception was raised drawing progress bar: %s", e)
stats_progbar = GenericDownloadQueue(thread_creator=thread_creator)
with stats_progbar:
pass
if not backup_search:
return stats_progbar.thread.named_objects
stats_progbar.thread.update_required_backups()
return stats_progbar.thread.named_objects
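
# Minimal usage sketch (hypothetical values; assumes aria2c is reachable on the PATH):
#   results = search_packages("requests")
#   best = max(results, key=lambda result: result.score)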


class OutputFile(object):
    """
    The CSV output file for a given search term, with helpers for its path, age and reuse.
    """
def __init__(self, search_term):
self.search_term = search_term
def __repr__(self):
return '{0}({1})'.format(self.__class__.__name__, self.search_term)
def __str__(self):
return repr(self)
@property
def file_name(self):
return "{0}.csv".format(self.search_term)
@property
def path(self):
return os.path.abspath(self.file_name)
@property
def ref_date(self):
try:
file_stats = os.stat(self.path)
cm_time = max(file_stats.st_mtime, file_stats.st_ctime)
except OSError:
cm_time = -1
return datetime.utcfromtimestamp(cm_time)
@property
def age(self):
        age_td = datetime.utcnow() - self.ref_date
        return age_td.total_seconds() / 86400.0
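    # For example, a CSV file written 12 hours ago yields age == 0.5; main() compares this
    # value against --max-age-days to decide whether the cached CSV can be reused.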


def main(args):
"""
:type args: list
"""
parser = ArgumentParser(description="Search for python packages using better metrics",
formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument("search_term",
type=str,
help="The search term or phrase to query")
parser.add_argument("-S", "--disable-stats",
dest="collect_stats",
action="store_false",
help="Disable extra stats collection (i.e. revert to old behavior)")
parser.add_argument("-s", "--enable-stats",
dest="collect_stats",
action="store_true",
help="Enable extra stats collection (i.e. the default)")
parser.set_defaults(collect_stats=True)
parser.add_argument("-B", "--disable-backup-search",
dest="backup_search",
action="store_false",
help="Disable backup search for last update (i.e. the default)")
parser.add_argument("-b", "--enable-backup-search",
dest="backup_search",
action="store_true",
help="Enable backup search for last update (can be slow!)")
parser.set_defaults(backup_search=False)
parser.add_argument("-d", "--max-age-days",
dest="max_age_days",
type=float,
help="Max days to consider recent when downloading already-existing files")
parser.set_defaults(max_age_days=0.5)
parser.add_argument("-p", "--path-to-aria2c",
dest="aria2c_path",
type=str,
help="The path to aria2c(.exe) if not in current PATH environment")
parser.set_defaults(aria2c_path=None)
argcomplete.autocomplete(parser)
parser_ns = parser.parse_args(args)
out_obj = OutputFile(parser_ns.search_term)
if out_obj.age < parser_ns.max_age_days:
with open(out_obj.path, 'r') as f:
csv_lines = f.read().splitlines()
packages = [PypiSearchResult.from_csv(line, ref_date=out_obj.ref_date) for line in csv_lines]
else:
packages = search_packages(parser_ns.search_term, parser_ns.collect_stats,
parser_ns.backup_search, parser_ns.max_age_days,
parser_ns.aria2c_path)
    # Rank the best-scoring packages first.
    packages.sort(key=lambda package: package.score, reverse=True)
logging.info("Saving CSV entries to %s", out_obj.path)
with open(out_obj.path, "w") as f:
for package in packages:
f.write(package.to_csv())
f.write(os.linesep)


if __name__ == "__main__":
main(sys.argv[1:])
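# Example invocation (hypothetical): "python pypi_pip_search.py requests -d 1.0" writes the
# ranked results to requests.csv, reusing a cached CSV younger than one day.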