"""
Process a set of configuration-defined sanitizations on a given feed.
"""

# Standard library modules
import time

# Planet modules
import planet, config, shell
from planet import feedparser

# map shorthand type names from the config onto full MIME types
type_map = {'text': 'text/plain', 'html': 'text/html',
            'xhtml': 'application/xhtml+xml'}
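
# Each block in scrub() below handles one per-feed configuration option:
# ignore_in_feed, title_type, summary_type, content_type, name_type,
# future_dates, and xml_base.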
def scrub(feed_uri, data):

    # some data is not trustworthy: delete any elements named by the
    # per-feed ignore_in_feed option (a space-separated list)
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang') >= 0: tag = 'language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]
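
    # For illustration, a hypothetical per-subscription config stanza that
    # would exercise the blocks in this function:
    #
    #   [http://example.com/feed.xml]
    #   ignore_in_feed = id updated
    #   title_type = text
    #   future_dates = ignore_entry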

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
                data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                    entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                        source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))
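
    # future_dates semantics: 'ignore_date' strips only the offending
    # timestamp fields from the feed or entry, while 'ignore_entry' drops
    # any entry dated in the future entirely.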

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        # use a struct_time so comparisons against the *_parsed struct_time
        # values are like-for-like
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
        data.entries = [entry for entry in data.entries if
            (not entry.has_key('published_parsed') or not entry['published_parsed']
                or entry['published_parsed'] <= now) and
            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
                or entry['updated_parsed'] <= now)]
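
    # xml_base override semantics: 'feed_alternate' rebases html content on
    # the feed's (or the entry source's) alternate link, 'entry_alternate' on
    # the entry's own link; any other value is treated as a URI and joined
    # against the existing base.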
    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'): continue
            if not 'html' in node['type']: continue
            if not node.has_key('value'): continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                                entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            # Run this through HTML5's sanitizer
            doc = None
            if 'xhtml' in node['type']:
                try:
                    from xml.dom import minidom
                    doc = minidom.parseString(node['value'])
                except:
                    # not well-formed XML after all; fall back to tag soup
                    node['type'] = 'text/html'

            if not doc:
                from html5lib import html5parser, treebuilders
                p = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
                doc = p.parseFragment(node['value'], encoding='utf-8')

            from html5lib import treewalkers, serializer
            from html5lib.filters import sanitizer
            walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
            xhtml = serializer.XHTMLSerializer(inject_meta_charset=False)
            tree = xhtml.serialize(walker, encoding='utf-8')

            node['value'] = ''.join([str(token) for token in tree])
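
# For illustration, the sanitizer pass above in isolation (hypothetical
# snippet, using the same old-style html5lib API this module imports):
#
#   p = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
#   doc = p.parseFragment('<p onclick="alert(1)">hi</p>', encoding='utf-8')
#   walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
#   xhtml = serializer.XHTMLSerializer(inject_meta_charset=False)
#   ''.join(str(t) for t in xhtml.serialize(walker, encoding='utf-8'))
#   # -> '<p>hi</p>' with the onclick handler stripped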