-
-
Notifications
You must be signed in to change notification settings - Fork 181
/
Copy pathmirror_manager.py
executable file
·546 lines (477 loc) · 18.4 KB
/
mirror_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
This script manages Sage Mirrors
Features:
- tests if they are online
- tests if they are in sync
- generates suitable download page for the website + python list
Aim:
- don't present outdated or offline mirrors to the downloader
- provide list of up-to-date mirrors (i.e. eval(urllib.urlopen(...).read()))
Notes:
- you don't have to publish the page with go_live, see "TARGET" variable below
- script is designed to be called by crontab every 10 minutes
to edit the crontab entry, do "$ crontab -e" as user sage and enter this line:
*/10 * * * * /home/sage/www2-dev/mirror_manager.py 2> /home/sage/www2-dev/mirror_manager.error > /home/sage/www2-dev/mirror_manager.out
or, better, use the wrapper script to ensure that only one instance runs at a time
*/10 * * * * /home/sage/www2-dev/mirror_manager_wrapper.sh
COPYRIGHT: Harald Schilly <[email protected]>, 2009, Vienna, Austria
LICENSE: GPL2+
'''
from __future__ import unicode_literals
try: # python3
from urllib.request import urlopen
except ImportError: # python2
from urllib2 import urlopen
# import subprocess #we use curl, urllib2.urlopen doesn't work :(
import re
import time
from threading import Thread
import os
from os.path import join
import sys
import yaml
import codecs
# Safeguard: don't run the script two times at the same time.
# Runs used to stall in an odd way; first suspects were NFS / VMware, but it
# was the open-socket problem that urllib does not expose (see below).
# The wrapper script uses GNU's
#   flock -xn ./lockfile python script.py RUNME
# to ensure that only one instance is running; RUNME is the marker argument
# checked here.
if len(sys.argv) <= 1 or sys.argv[1] != 'RUNME':
    # One pre-formatted line: without print_function, a parenthesised
    # multi-argument print shows a tuple repr on Python 2. Diagnostics go to
    # stderr so they do not mix with the regular run output.
    sys.stderr.write('ERROR: USE THE %s_wrapper.sh SCRIPT !!!\n' % sys.argv[0])
    sys.exit(1)

# Everything stalls sometimes since timeouts are not part of urllib!
# http://viralcontentnetwork.blogspot.com/2007/08/handling-timeouts-with-urllib2-in.html
# The process-wide default below applies to every socket created afterwards.
import socket
socket.setdefaulttimeout(10)

# The script uses relative paths, so switch to the directory it lives in.
os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))
##################### constants and data #######################
# Location of the timestamp page below each mirror's base URL, the pattern
# that pulls the timestamp text out of that page, and the token delimiter
# used when the timestamp text is dissected.
TIMESTAMP_SUFFIX = 'zzz/timestamp.html'
TIMESTAMP_RE = re.compile(r'<pre>(.*)</pre>')
DELIM_RE = re.compile(r' / ')

# Input: the mirror definitions.
MIRROR_YAML = join('..', 'conf', 'mirrors.yaml')

# Output: generated templates and helper files. The list-valued constants
# are iterated when writing, so each entry receives the same content.
OUTDIR = join('..', 'templates')
TARGETS = [join(OUTDIR, 'mirrorselector.html')]
TARGETS_SPKG = [join(OUTDIR, 'mirrorselector-src.html')]
TARGETS_LIST = ['mirror_list']
OUTPUT_FILE = 'mirror_manager.out'
METALINK_FILE = ['metalink.helper']
TORRENT_FILE = ['torrent.helper']

# all-mirrors.html receives the rows for *all* active mirrors
# (the return value of mirrors_html()).
MIRRORS_HTML = join(OUTDIR, 'all-mirrors.html')
# One log file per (year, week-of-year) in UTC; mirror_manager.log is a
# symlink that is re-pointed at the current file on every run.
tslog = time.strftime('%Y-%U', time.gmtime())
LOGFILE = './mirror-log/mirror_manager_%s.log' % tslog
SYMLINK = './mirror-log/mirror_manager.log'
if os.path.islink(SYMLINK):
    os.remove(SYMLINK)
# A relative symlink target is resolved against the directory that contains
# the link, not against the current working directory. Both files live in
# ./mirror-log/, so the target must be the bare file name; using the
# cwd-relative LOGFILE path would leave a dangling link.
os.symlink(os.path.basename(LOGFILE), SYMLINK)

# Accumulates the human-readable run report; functions append to it via
# "global OUTPUT" and publish() writes it to OUTPUT_FILE.
OUTPUT = ''
# Categories of the mirror list (key -> display name), used to group the
# mirrors. Wherever the categories are rendered they are sorted by display
# name; the leading blank in " Global" sorts before any letter.
CATEGORY = {
    'g': ' Global',
    'af': 'Africa',
    'na': 'America, North',
    'sa': 'America, South',
    'a': 'Asia',
    'aus': 'Australia',
    'e': 'Europe',
}
class Mirror(object):
    """
    One download mirror: all info needed to contact and analyze it.

    Constructor arguments (typically the keys of one mirrors.yaml entry):

    - name: display name, stored as ``str(name)``
    - url: base URL of the mirror; must end with "/"
    - cat: category key, must be one of the keys of CATEGORY
    - active: bool, whether the mirror takes part in the checks
    - flag: base name of the flag image; defaults to ``country``
    - country: country code; defaults to the last dot-separated label of
      the host part of ``url``
    - priority: stored unchanged (written to the metalink helper file)

    Raises RuntimeError when ``cat`` is unknown, ``active`` is not a bool
    or ``url`` lacks the trailing slash.
    """

    def __init__(self, name, url, cat, active=True, flag=None, country=None, priority=50):
        if cat not in CATEGORY:
            raise RuntimeError('category %s does not exist' % cat)
        if not isinstance(active, bool):
            raise RuntimeError('type(active) is not bool')
        # Explicit check instead of assert: asserts vanish under "python -O",
        # and entry() relies on the trailing slash when building links.
        if not url.endswith("/"):
            raise RuntimeError("url %s does not end with '/'" % url)
        self.name = str(name)
        self.cat = cat
        self.url = url
        self.active = active
        if country is None:
            # "scheme://host/..." -> host -> last label, e.g. "de"
            self.country = url.split(r'/')[2].split(r'.')[-1]
        else:
            self.country = country
        self.flag = flag if flag is not None else self.country
        self.priority = priority

    def entry(self):
        """
        Return the HTML link (flag image + name) for the mirror selector.

        The href contains the template placeholder ``{{ mirrordir }}``.
        self.url *always* ends with a "/", hence mirrordir needs to end
        with a "/", too; that happens in the template macro.
        """
        flag_img = '<img src="http://www.sagemath.org/pix/flags/%s.png" width="22" height="14"></img>' % self.flag
        return '<a href="%s{{ mirrordir }}index.html">%s %s</a>\n' % (self.url, flag_img, self.name)

    def __unicode__(self):
        """Return a short human-readable description of the mirror."""
        return u"Mirror '{0}' in {1}".format(self.name, self.country)

    if sys.version_info[0] >= 3:
        # Python 3 never calls __unicode__; expose the same text via str().
        # Python 2 keeps its default __str__ to avoid implicit ASCII encoding.
        __str__ = __unicode__
# Load the mirror definitions. The file is opened in a with-block so the
# handle is closed as soon as the YAML has been parsed; SafeLoader only
# builds plain Python data, each entry becomes the kwargs of one Mirror.
with codecs.open(MIRROR_YAML, "r", "utf8") as mirror_conf:
    MIRRORS = [Mirror(**m) for m in yaml.load(mirror_conf, Loader=yaml.SafeLoader)]
# Number of configured mirrors, counted before inactive ones are dropped.
TOTAL_NUMBER = len(MIRRORS)
# define master server as Seattle. this one is the "Correct" one.
# It is taken from the unfiltered list, so it stays the master even when it
# is not active (invisible, but still available as the reference).
# TODO: maybe, it's better to automatically define the most recent one as the master? - currently NO!
MASTER = MIRRORS[0]
# filter only actives
MIRRORS = [mirror for mirror in MIRRORS if mirror.active]
# Intro text: opening part of the generated mirror selector template.
# It opens a "mirrors(mirrordir)" macro (closed again by TS_OUTRO) and makes
# sure a non-empty mirrordir ends with "/".
# NOTE(review): the {% ... %} / {# ... #} syntax looks like Jinja2 - confirm.
TS_INTRO = r"""
{# ATTENTION: THIS FILE IS AUTOGENERATED BY mirror_manager.py #}
{% macro mirrors(mirrordir) %}
{% if mirrordir != "" %}
{% set mirrordir = mirrordir + "/" %}
{% endif %}
Please select a download server close to your location below.
"""
# Closing part of the generated template: ends the macro opened in TS_INTRO.
TS_OUTRO = """
{% endmacro %}
"""
#r"""You can see the full list of mirror servers <a href="./mirrors.html">here</a>."""
# Syncing info: added to the page by build_mirrorselector() when five or
# fewer mirrors are left in the list it renders.
TS_SYNC = r"""
<br/>
<strong>Download servers in the mirror network are currently synchronizing. Please try again later.</strong>
"""
# Announcement of an upcoming release; within this file it is only
# referenced from commented-out code in build_mirrorselector().
TS_NEW = r"""
<br/>
<strong>A new release</strong> is upcoming. Maybe come back later for the next version of Sage.
"""
# Delimiter sign (a middle dot); not referenced by the code in this file.
TS_DELIM = r"""·"""
################# functions ####################
def info(txt, width=127):
    """
    Append a section banner for ``txt`` to the global OUTPUT report.

    The banner is a blank line, a rule of "=" characters, the text centred
    in ``width - 4`` columns (after one leading blank) and a rule of "-".
    """
    global OUTPUT
    banner = (
        '\n' + '=' * width + '\n',
        ' ' + txt.center(width - 4) + '\n',
        '-' * width + '\n',
    )
    OUTPUT += ''.join(banner)
def log(mirrors):
    """
    Append one record for this run to LOGFILE.

    The record is the current UTC date/time followed by the names of the
    given mirrors in sorted order; every field, including the last one, is
    terminated by ";" and the record ends with a newline.
    """
    stamp = '%4s%2s%2s-%2s:%2s:%2s' % time.gmtime()[0:6]
    fields = [stamp] + sorted(m.name for m in mirrors)
    record = ''.join(field + ';' for field in fields) + '\n'
    with codecs.open(LOGFILE, 'a', "utf8") as logfile:
        logfile.write(record)
def mirrors_html():
    """
    Build the HTML table rows that list *all* active mirrors.

    The mirrors in MIRRORS are grouped by category. Categories appear in
    the order of their display name and only when they contain at least
    one mirror; inside a category the mirrors are ordered by name.
    Each category contributes a heading row followed by, per mirror, a row
    with icon and link plus a row embedding the mirror's timestamp page.

    @return the concatenated rows as one string (empty if no mirrors)
    """
    from string import Template

    # template for the heading of one category
    heading_tpl = Template("""
<tr>
<td><h2>${NAME}</h2></td>
</tr>
""")
    # template for the two rows of one mirror
    mirror_tpl = Template("""
<tr>
<td>
<img alt="" src="${URL}sageicon.png" width="16" height="16" />
<a href="${URL}index.html">${NAME}</a>
</td>
</tr>
<tr>
<td><iframe scrolling="no" src="${URL}zzz/timestamp.html"></iframe></td>
</tr>
""")

    by_name = sorted(MIRRORS, key=lambda x: x.name)
    pieces = []
    for cat_key, cat_title in sorted(CATEGORY.items(), key=lambda x: x[1]):
        rows = [
            mirror_tpl.substitute(URL=m.url, NAME=m.name)
            for m in by_name
            if m.cat == cat_key
        ]
        if not rows:
            continue
        pieces.append(heading_tpl.substitute(NAME=cat_title))
        pieces.append('\n'.join(rows))
    return ''.join(pieces)
# TODO urllib.quote(u) -- replace spaces by %20 and more ... (use this everywhere!)
def fetch_timestamps():
    """
    Fetch the timestamp page of every mirror in MIRRORS in parallel.

    One thread per mirror downloads ``mirror.url + TIMESTAMP_SUFFIX``.
    The elapsed time (and the error, if any) of each download is written
    to stdout and appended to the global OUTPUT report.

    @return dict mapping mirror -> decoded content of its timestamp page;
            mirrors whose download failed are missing from the dict
    """
    # local imports: neither name is imported at the top of the file
    from contextlib import closing
    from threading import Lock

    info("fetching timestamps in parallel")
    ret = {}
    # "OUTPUT += ..." is a read-modify-write on a shared global; without the
    # lock, concurrently finishing workers can lose each other's lines.
    report_lock = Lock()

    def report(msg):
        global OUTPUT
        with report_lock:
            OUTPUT += msg
            sys.stdout.write(" " + msg)
            sys.stdout.flush()

    def fetch_task(mirror):
        time1 = time.time()
        try:
            # closing() releases the connection on Python 2 and 3 alike
            with closing(urlopen(mirror.url + TIMESTAMP_SUFFIX)) as response:
                ret[mirror] = response.read().decode('utf-8')
            report('%8.2f [ms] %s\n' % ((time.time() - time1) * 1000.0, mirror.name))
        except Exception as err:
            # Deliberately broad: any failure (URL error, timeout, bad
            # encoding, ...) only disqualifies this one mirror.
            m = '%8.2f [ms] %s' % ((time.time() - time1) * 1000.0, mirror.name)
            m += " -> %s\n" % str(err)
            report(m)

    tasks = []
    for mirror in MIRRORS:
        t = Thread(target=fetch_task, args=(mirror,))
        t.start()
        tasks.append(t)
        # start the requests slightly staggered
        time.sleep(0.1)
    for t in tasks:
        t.join(timeout=50)  # a bit more than the socket timeout
    return ret
def extract_timestamps(TS):
    """
    Pull the timestamp text out of each downloaded timestamp.html.

    TS maps mirror -> page content. Mirrors are processed in order of
    their name; a mirror whose page does not match TIMESTAMP_RE is
    reported in OUTPUT and left out of the result.

    @return dict mapping mirror -> text captured by TIMESTAMP_RE
    """
    global OUTPUT
    info("extracting timestamps")
    extracted = {}
    for mirror in sorted(TS.keys(), key=lambda m: m.name):
        match = TIMESTAMP_RE.search(TS[mirror])
        if match is None or len(match.groups()) == 0:
            OUTPUT += "%-20s %s\n" % (mirror.name, "TIMESTAMP RETRIEVAL ERROR (404 page, garbage, ...)")
            continue
        stamp = match.group(1)
        OUTPUT += "%-20s %s\n" % (mirror.name, stamp)
        extracted[mirror] = stamp
    return extracted
def dissect_timestamps(TS):
    """
    Dissect the timestamp strings into individual tokens for later analysis.

    TS maps mirror -> timestamp text. The text is split at DELIM_RE; the
    first token is parsed as '%Y-%m-%d %H:%M %Z', the others are kept as
    strings. Depending on the number of tokens the result value is

    - 3 tokens: (tokens[1], tokens[2], " ", parsed_time)
    - 4 tokens: (tokens[1], tokens[2], tokens[3], parsed_time)
    - 5 tokens: (tokens[1], tokens[2], tokens[3], parsed_time, tokens[4])

    A mirror with any other number of tokens, or with a first token that
    cannot be parsed, is reported in OUTPUT and left out of the result
    instead of aborting the whole run.

    @return dict mapping mirror -> tuple as described above
    """
    from datetime import datetime
    global OUTPUT
    info("dissecting timestamps")
    ret = {}
    for mirror in sorted(TS.keys(), key=lambda m: m.name):
        tokens = DELIM_RE.split(TS[mirror])
        if not 3 <= len(tokens) <= 5:
            OUTPUT += "%-20s ERROR, len(tokens)=%s\n" % (mirror.name, len(tokens))
            continue
        try:
            # named "stamp" so the module-level "time" is not shadowed
            stamp = datetime.strptime(tokens[0], '%Y-%m-%d %H:%M %Z')
        except ValueError as err:
            OUTPUT += "%-20s ERROR, unparsable time: %s\n" % (mirror.name, err)
            continue
        if len(tokens) == 3:
            OUTPUT += "%-20s %-5s %-5s \n" % (mirror.name, tokens[1], tokens[2])
            ret[mirror] = (tokens[1], tokens[2], " ", stamp)
        elif len(tokens) == 4:
            OUTPUT += "%-20s %-5s %-5s %s\n" % (mirror.name, tokens[1], tokens[2], tokens[3])
            ret[mirror] = (tokens[1], tokens[2], tokens[3], stamp)
        else:
            OUTPUT += "%-20s %-5s %-5s %s %s\n" % (mirror.name, tokens[1], tokens[2], tokens[3], tokens[4])
            ret[mirror] = (tokens[1], tokens[2], tokens[3], stamp, tokens[4])
    return ret
def find_reference(TS):
    """
    Select the reference tokens: the mirror with the most recent time counts.

    TS maps mirror -> tuple as produced by dissect_timestamps(); element 3
    is the parsed time, element 2 the token compared by good_mirrors() and
    the optional element 4 the token compared by good_mirrors_spkg().

    @return (reference token, reference spkg token); the spkg token is
            None when the selected entry has no fifth element
    Raises RuntimeError when TS is empty.
    """
    global OUTPUT
    info("selecting reference timestamp")
    if not TS:
        raise RuntimeError('no timestamps available, cannot select a reference')
    # of all timestamps, sort them by date and take the *last* one
    ref_mirror, ref_tokens = sorted(TS.items(), key=lambda item: item[1][3])[-1]
    OUTPUT += '%s @ %s selected\n' % (ref_mirror.name, ref_mirror.url)
    # entries built from 3 or 4 tokens have no spkg element
    ref_spkg = ref_tokens[4] if len(ref_tokens) > 4 else None
    return ref_tokens[2], ref_spkg
def good_mirrors(TS, ref):
    """
    Return the list of good mirrors - that's what this is all about!

    A mirror is good when element 2 of its TS entry equals ``ref``.
    Mirrors are checked in order of their name and every verdict is
    appended to OUTPUT.
    """
    global OUTPUT
    info("compiling list of good mirrors")
    good = []
    for mirror in sorted(TS.keys(), key=lambda m: m.name):
        if TS[mirror][2] != ref:
            OUTPUT += '- %s rejected\n' % mirror.name
            continue
        good.append(mirror)
        OUTPUT += '+ %s\n' % mirror.name
    OUTPUT += '\n'
    # Disabled policies, kept as a note:
    # - remove the master (Seattle) from the good list when enough other
    #   synced mirrors exist in North America ('na'), to save bandwidth for
    #   the other services hosted there;
    # - remove the secondary UW mirror (MIRRORS[1]) while the primary one
    #   (MIRRORS[0]) is online.
    return good
def good_mirrors_spkg(TS, ref_spkg):
    """
    Return the list of good mirrors for the src/spkgs.

    A mirror is good when its TS entry has at least five elements and
    element 4 equals ``ref_spkg``. Mirrors are checked in order of their
    name and every verdict is appended to OUTPUT.
    """
    global OUTPUT
    info("compiling list of good mirrors for the src/spkgs")
    good = []
    for mirror in sorted(TS.keys(), key=lambda m: m.name):
        tokens = TS[mirror]
        if len(tokens) < 5 or tokens[4] != ref_spkg:
            OUTPUT += '- %s rejected\n' % mirror.name
            continue
        good.append(mirror)
        OUTPUT += '+ %s\n' % mirror.name
    OUTPUT += '\n'
    return good
def build_mirrorselector(mirrors, TS, best_mirror):
    """
    Build the content of a mirror selector template page.

    - mirrors: the mirrors to offer; grouped by category (categories
      ordered by display name, empty ones skipped), ordered by name inside
      a category to keep the output stable between runs
    - TS: currently unused (only needed by a disabled "new release" check)
    - best_mirror: mirror whose ``torrents.html`` the torrent hint links to

    TS_SYNC is inserted when five or fewer mirrors are offered. The names
    per category are also appended to OUTPUT.

    @return the page as a string (TS_INTRO ... TS_OUTRO)
    """
    # URLs always use "/"; os.path.join would pick the platform separator.
    import posixpath
    global OUTPUT
    page = TS_INTRO
    if len(mirrors) <= 5:
        page += TS_SYNC
    # disabled: if bin != src version, a new release is upcoming -> TS_NEW
    page += '\n<table id="mirror">\n'
    by_name = sorted(mirrors, key=lambda x: x.name)
    # category by name
    for cat_key, cat_title in sorted(CATEGORY.items(), key=lambda x: x[1]):
        # No shuffling (once used to spread load off the first entry):
        # stable output avoids unnecessary commits & pushes.
        in_category = [m for m in by_name if m.cat == cat_key]
        OUTPUT += cat_title + ' : '
        OUTPUT += ', '.join(m.name for m in in_category) + '\n'
        if in_category:
            page += '<tr>\n'
            page += '<td>' + cat_title + '</td><td>\n'
            page += '<br/>\n'.join(m.entry() for m in in_category)
            page += '</td></tr>\n'
    page += '</table>\n'
    # global category for metalinks, then continent/server
    page += '''\
<h4>Distributed / P2P</h4>
<!-- <ul>
<li style="font-size: 90%%;">
-->
<div>
Consider downloading via Torrent using <b><a href="%(server)s">BitTorrent web-seed files</a></b>!
This automatically balances and parallelizes the download across all servers,
makes it resumable, and the checksum is automatically verified.
This gives you maximum speed and protection against corrupt/malicious data.
Either install a libtorrent based client like <a href="https://deluge-torrent.org/">Deluge</a>,
<a href="https://transmissionbt.com/about/">Transmission</a> (default in many Linux distributions),
<a href="https://www.vuze.com/">Vuze</a>,
or <a href="http://aria2.sourceforge.net/">Aria2</a> for the command-line.
(Install Aria2 via <code>sudo apt-get install aria2</code> and then <code>$ aria2c http://...*.torrent</code>).
<a href="%(server)s">Sage web-seed files</a>.
</div>
<!--
</li>
<li style="font-size: 90%%;">
<a href="http://files.sagemath.org/metalinks.html">Metalinks</a> &mdash;
provide fast, stable and resumeable downloads via a
<a href="http://www.metalinker.org/samples.html">download client</a> (<a href="https://en.wikipedia.org/wiki/Metalink">read more</a>).
</li>
</ul>
-->
''' % {"server": posixpath.join(best_mirror.url, "torrents.html")}
    page += TS_OUTRO
    return page
def build_mirror_list(good):
    """
    Build the content of the mirror_list file.

    The file consists of "#" comment lines (a header with the local time
    of generation and a usage hint) followed by a Python list literal with
    the URLs of the given mirrors, ordered by URL.
    NOTE(review): the usage hint written into the file tells clients to
    eval() the downloaded text; evaluating remote content is risky on the
    client side. The hint is part of the published file and kept as is.

    @return the file content as a string; it is also appended to OUTPUT
    """
    global OUTPUT
    info('building mirror_link page')
    ret = '# Sage Mirror List - %s\n' % time.asctime()
    ret += '# python usage:\n'
    ret += '# import urllib\n'
    ret += '# eval(urllib.urlopen(\'http://www.sagemath.org/mirror_list\').read())\n'
    ret += '[' + ','.join('"%s"' % m.url for m in sorted(good, key=lambda x: x.url)) + ']'
    OUTPUT += ret + '\n'
    return ret
def publish(good, good_spkg, TS, best_mirror):
    """
    Build the pages and the mirror list and write them to their targets.

    - the selector page for ``good`` goes to every file in TARGETS
    - the selector page for ``good_spkg`` goes to every file in TARGETS_SPKG
    - the mirror list (built from ``good_spkg``) goes to every file in
      TARGETS_LIST

    Finally the OUTPUT report collected so far is written to OUTPUT_FILE.
    """
    global OUTPUT
    info('building mirrorselector page')
    out = build_mirrorselector(good, TS, best_mirror)
    info('building mirrorselector page for src/spkg')
    out_spkg = build_mirrorselector(good_spkg, TS, best_mirror)
    out_list = build_mirror_list(good_spkg)

    info('publishing')
    # (target files, content, report line) - processed in this order
    jobs = (
        (TARGETS, out, 'published mirrorselector page to %s\n'),
        (TARGETS_SPKG, out_spkg, 'published mirrorselector page for src/spkg to %s\n'),
        (TARGETS_LIST, out_list, 'published mirror_list page to %s\n'),
    )
    for targets, content, report_line in jobs:
        for target in targets:
            with codecs.open(target, 'w', "utf8") as handle:
                handle.write(content)
            OUTPUT += report_line % target

    with codecs.open(OUTPUT_FILE, 'w', "utf8") as handle:
        handle.write(OUTPUT)
def metalink_helper(M):
    """
    Write the helper files for metalinks and torrents.

    Every file in METALINK_FILE gets one line per mirror of M, in reversed
    order, with country, priority, a "%" separator and URL, terminated by
    a newline. Every file in TORRENT_FILE gets the URLs of M, one per
    line, without a trailing newline. Both texts are appended to OUTPUT.
    """
    global OUTPUT
    info("metalink helper file")
    # also includes master with priority 1, reversed(M[1:]) to avoid master completely
    rows = []
    for m in reversed(M):
        rows.append(' '.join([m.country, str(m.priority), ' % ', m.url]))
    metalink_text = '\n'.join(rows) + '\n'
    for path in METALINK_FILE:
        with codecs.open(path, 'w', "utf8") as handle:
            handle.write(metalink_text)
    OUTPUT += metalink_text

    info("torren helper file")
    # torrent.helper
    torrent_text = '\n'.join(m.url for m in M)
    for path in TORRENT_FILE:
        with codecs.open(path, "w", "utf8") as handle:
            handle.write(torrent_text)
    OUTPUT += torrent_text
############### main program and logic ###################
# split into blocks for easier understanding and editing #
if __name__ == '__main__':
    info('Mirror Management Script started %s' % time.asctime(time.gmtime()))

    # page listing all active mirrors, independent of their sync state
    with codecs.open(MIRRORS_HTML, 'w', "utf8") as MH:
        MH.write(mirrors_html())

    # download -> extract -> dissect; each step drops the mirrors it
    # cannot handle
    TS = fetch_timestamps()
    TS = extract_timestamps(TS)
    TS = dissect_timestamps(TS)
    if not TS:
        # Without a single usable timestamp there is no reference to compare
        # against. Stop with a clear message and leave the previously
        # published files untouched.
        sys.stderr.write('ERROR: no mirror delivered a usable timestamp, nothing published\n')
        sys.exit(1)

    ref, ref_spkg = find_reference(TS)
    good = good_mirrors(TS, ref)
    log(good)
    good_spkg = good_mirrors_spkg(TS, ref_spkg)

    info("list of valid mirrors")
    for m in good:
        OUTPUT += m.name + '\n'
    info("list of valid mirrors for src/spkg")
    for m in good_spkg:
        OUTPUT += m.name + '\n'

    metalink_helper(MIRRORS)

    # The torrent hint on the page links to one good, non-ftp mirror.
    non_ftp = [gm for gm in good if not gm.url.startswith("ftp")]
    if non_ftp:
        one_good_mirror = non_ftp[0]
    else:
        # NOTE(review): falling back to MASTER assumes the master serves
        # torrents.html as well - confirm.
        one_good_mirror = MASTER
        OUTPUT += 'no good non-ftp mirror, linking torrents to master\n'
    publish(good, good_spkg, TS, one_good_mirror)

    # calling visualization -- disabled, we don't use this any more
    #os.system("python mirror_log_visualize.py")