-
Notifications
You must be signed in to change notification settings - Fork 342
/
Copy pathhtml_diff
executable file
·264 lines (252 loc) · 11.2 KB
/
html_diff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env python3
# Script to compare two html files, ignoring differences that we consider
# to be unimportant. The output is a unified diff of formatted html meant
# to be readable and precise at identifying differences.
#
# This script is designed to be run in the container managed by the
# Dockerfile at the root of this repository.
from bs4 import BeautifulSoup, NavigableString
import difflib
import re
def normalize_html(html):
    """Normalizes html to remove expected differences between AsciiDoc's
    output and Asciidoctor's output.

    The markup is parsed with BeautifulSoup's 'lxml' parser and then rewritten
    in place by a sequence of passes. Each pass erases one known, visually
    irrelevant difference between the two toolchains. The passes run in the
    order written and later passes select on classes that earlier passes
    add or remove, so the order matters.

    Args:
        html: The html markup to normalize. It is handed directly to
            BeautifulSoup.

    Returns:
        The normalized document as an indented ("prettified") string.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Remove all duplicate classes because that doesn't make any
    # visual difference. Going through a set loses the original order but
    # the classes are sorted in the final pass below anyway.
    for e in soup.descendants:
        if hasattr(e, 'attrs') and 'class' in e.attrs:
            e['class'] = list(set(e['class']))
    # Replace many whitespace characters with a single space in some elements
    # kind of like a browser does.
    for e in soup.select(':not(script,pre,code,style)'):
        for part in e:
            # Only the direct text children are touched here; nested tags are
            # visited on their own by the outer loop if they match.
            if isinstance(part, NavigableString):
                crunched = NavigableString(re.sub(r'\s+', ' ', part))
                # Only swap the node when the text actually changed.
                if crunched != part:
                    part.replace_with(crunched)
    # Asciidoctor adds a "content" wrapper. It doesn't really change the layout
    # so we're ok with it.
    for e in soup.select('#content'):
        e.unwrap()
    # Docbook adds <span class="emphasis"> around <em> tags. We don't need them
    # and it isn't worth making Asciidoctor make them.
    for e in soup.select('.emphasis'):
        e.unwrap()
    # Asciidoctor adds a "ulist" class to all unordered lists which doesn't
    # hurt anything so we can ignore it.
    for e in soup.select('.itemizedlist.ulist'):
        e['class'].remove('ulist')
    # Docbook adds type which we override in css.
    for e in soup.select('ul'):
        if 'type' in e.attrs:
            del e['type']
    # Asciidoctor adds a "olist" class to all ordered lists which doesn't
    # hurt anything so we can ignore it.
    for e in soup.select('.orderedlist.olist'):
        e['class'].remove('olist')
    # Docbook adds type="1" to ol which we override with css.
    for e in soup.select('ol'):
        if 'type' in e.attrs:
            del e['type']
    # Docbook emits images with the 'inlinemediaobject' class and Asciidoctor
    # has the 'image' class. We've updated our styles to make both work.
    for e in soup.select('.inlinemediaobject'):
        e['class'].remove('inlinemediaobject')
        e['class'].append('image')
    # Docbook emits screenshot images with the 'informalfigure' class and
    # Asciidoctor has the 'imageblock' class. We've updated our styles to make
    # both work.
    for e in soup.select('.informalfigure'):
        e['class'].remove('informalfigure')
        e['class'].append('imageblock')
    # Docbook emits images with titles as 'figure' but Asciidoctor just uses
    # 'imageblock'. And all kinds of other things are different. But they don't
    # *look* different.
    for e in soup.select('.imageblock.text-center'):
        e['class'].remove('text-center')
    # Move an id on the imageblock itself onto a new <a id="..."> inserted as
    # its first child.
    for e in soup.select('.imageblock[id]'):
        anchor = soup.new_tag('a')
        anchor['id'] = e['id']
        e.insert(0, anchor)
        del e['id']
    for e in soup.select('.figure'):
        e['class'].remove('figure')
        e['class'].append('imageblock')
        # Drop anchors whose id starts with 'id-'.
        for a in e.select('a[id]'):
            if a['id'].startswith('id-'):
                a.extract()
        # Replace each title with a <div class="title"> appended to the end of
        # the figure. The new div holds the first child of the old title's
        # first child.
        for t in e.select('.title'):
            t.extract()
            new_t = soup.new_tag("div")
            new_t['class'] = ['title']
            new_t.append(t.contents[0].extract().contents[0])
            e.append(new_t)
        for c in e.select('.figure-contents'):
            c['class'].remove('figure-contents')
            c['class'].append('mediaobject')
            # Remove the tag of the first child but keep that child's contents
            # in place.
            c.contents[0].unwrap()
    # Same for mediaobject and content
    for e in soup.select('.content'):
        e['class'].remove('content')
        e['class'].append('mediaobject')
    # Blast some "defaults" that docbook renders and asciidoctor doesn't.
    for e in soup.select('.imageblock .mediaobject[align=center]'):
        del e['align']
    for e in soup.select('.imageblock img[align=middle]'):
        del e['align']
    # Asciidoctor emits examples as "exampleblock" instead of "informalexample"
    # but these don't make any visual difference.
    for e in soup.select('.exampleblock'):
        e['class'].remove('exampleblock')
        e['class'].append('informalexample')
        # These are the former 'content' wrappers renamed by the pass above.
        for c in e.select('.mediaobject'):
            c.unwrap()
    # Docbook adds ids to all examples and asciidoctor doesn't. We don't need
    # the ids so we ignore them.
    for e in soup.select('.example a[id]'):
        if e['id'].startswith('id-'):
            e.extract()
    # Docbook links with `<a class="link"` when linking from one page of a book
    # to another. Asciidoctor emits `<a class="xref"`. Both look fine.
    for e in soup.select('a.xref'):
        e['class'].remove('xref')
        e['class'].append('link')
    # Docbook sprinkles `class="simpara"` all over the place and I'm not 100%
    # sure why. But we don't need it anyway.
    for e in soup.select('.simpara'):
        e['class'].remove('simpara')
    # Docbook *never* makes a <dd> tag that contains only a <p> tag.
    # Asciidoctor *can* do that if the description is formed in a very
    # particular way. Visually, it all looks the same.
    for e in soup.select('dd > p:only-child'):
        e.unwrap()
    # Docbook adds ids to every "question and answer" style dlist. Asciidoctor
    # doesn't and we have no chance of emulating docbook's generated ids. Since
    # we don't *think* anyone does anything with the ids we ignore them.
    for e in soup.select('.qandaset a[id]'):
        if e['id'].startswith('id-'):
            e.extract()
    # Docbook would sometimes spit out empty <p> tags. We're fine with
    # dropping those.
    for e in soup.select('p:empty'):
        e.extract()
    # Docbook makes `<code class="literal"><code class="literal">` sometimes.
    # Just one `<code>` is fine though.
    for e in soup.select("code.literal > code.literal"):
        e.unwrap()
    # Docbook adds non-breaking spaces inside the title attributes for some
    # links. It doesn't seem like we need that.
    for e in soup.select('link[title]'):
        e['title'] = e['title'].replace('\xa0', ' ')
    # Docbook adds a generated id to tables with ids. They aren't used and we
    # have no chance of generating it properly so we just ignore it.
    for a in soup.select('.table a[id]'):
        if a['id'].startswith('id-'):
            a.extract()
    # Docbook renders partintro's slightly differently but the difference isn't
    # visually distinct so we can ignore it.
    for e in soup.select('.partintro:not(.openblock) > div:empty'):
        e.extract()
    # Turn the bare partintro into a 'mediaobject' nested inside a new
    # <div class="openblock partintro"> wrapper.
    for e in soup.select('.partintro:not(.openblock)'):
        e['class'].remove('partintro')
        e['class'].append('mediaobject')
        wrapper = soup.new_tag("div")
        wrapper['class'] = ['openblock', 'partintro']
        e.wrap(wrapper)
    # Asciidoctor adds 'blockquote' and 'quoteblock'. Docbook doesn't and we
    # don't need them.
    for e in soup.select('.blockquote'):
        e['class'].remove('blockquote')
    for e in soup.select('.quoteblock'):
        e['class'].remove('quoteblock')
    # Docbook doesn't add `Admonishment--change` but Asciidoctor does to keep
    # it consistent with the care admonitions.
    for e in soup.select('.Admonishment--change'):
        e['class'].remove('Admonishment--change')
    # Docbook adds `<span class="quote"` whenever it sees quotes. These don't
    # change the rendering and Asciidoctor doesn't add them.
    for e in soup.select('span.quote'):
        parent = e.parent
        e.unwrap()
        # Merge the text nodes that are now adjacent in the parent.
        parent.smooth()
    # Asciidoctor supports adding a `<meta name="description"` if a book
    # defines a `:description:` attribute. Docbook doesn't. We're quite ok with
    # adding it but we can ignore it in the diff because, well, we don't need
    # to see it.
    for e in soup.select("meta[name='description']"):
        e.extract()
    # Docbook renders "indexterms" with an inline anchor and asciidoctor
    # doesn't. You can't see them and no one is linking to them and the links
    # don't really do anything. So we're ok ignoring them.
    for e in soup.select("a.indexterm"):
        parent = e.parent
        e.extract()
        # Merge the text nodes that are now adjacent in the parent.
        parent.smooth()
    # Docbook sometimes renders the xpack tag *after* the edit me link but
    # asciidoctor always renders it before the edit me link. Either way is fine
    # so we ignore the difference. We always move the edit me link to the end
    # of its parent.
    for e in soup.select("a.edit_me"):
        parent = e.parent
        parent.append(e.extract())
    # Docbook passes the `exclude` class on some paragraphs and asciidoctor
    # doesn't which is fine because it doesn't make a visual difference.
    for e in soup.select('.exclude'):
        e['class'].remove('exclude')
    # Asciidoctor adds 'px' to the end of raw pixel values but docbook doesn't.
    # We're quite happy to have the new px.
    for e in soup.select('img[width]'):
        if re.match('^\\d+$', e['width']):
            e['width'] += 'px'
    # Remove empty "class" attributes and sort the listed classes.
    for e in soup.select('*'):
        if 'class' in e.attrs:
            if e['class']:
                e['class'] = sorted(e['class'])
            else:
                del e['class']
    # Format the html with indentation so we can *see* things
    html = soup.prettify()
    # docbook spits out the long-form charset and asciidoctor spits out the
    # short form but they are equivalent
    html = html.replace(
        '<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>',
        '<meta charset="utf-8"/>')
    # Rewrite "Appendix X." followed by text as "Appendix X:" followed by the
    # same text. NOTE(review): presumably the two toolchains punctuate
    # appendix titles differently -- confirm.
    html = re.sub(r'(Appendix\s\w+)\.([^<]+)', '\\1:\\2', html)
    return html
def html_diff(lhs_name, lhs, rhs_name, rhs):
    """Build a unified diff of two html blobs after normalizing both.

    Each blob is passed through normalize_html so that the expected
    differences between AsciiDoc and Asciidoctor do not show up, then the
    normalized text is compared line by line.

    Args:
        lhs_name: Label for the left hand side in the diff header.
        lhs: Html markup for the left hand side.
        rhs_name: Label for the right hand side in the diff header.
        rhs: Html markup for the right hand side.

    Returns:
        A generator of diff report lines without line terminators. If it
        yields nothing then there is no diff.
    """
    before, after = (
        normalize_html(blob).splitlines() for blob in (lhs, rhs)
    )
    return difflib.unified_diff(
        before, after, lhs_name, rhs_name, lineterm='')
def html_file_diff(lhs, rhs):
    """Build a unified diff of two html files after normalizing both.

    Both files are read in full as utf-8 text and handed to html_diff, with
    the file paths used as the labels in the diff header.

    Args:
        lhs: Path of the left hand side file.
        rhs: Path of the right hand side file.

    Returns:
        A generator of diff report lines. If it yields nothing then there is
        no diff.
    """
    texts = []
    for path in (lhs, rhs):
        with open(path, encoding='utf-8') as handle:
            texts.append(handle.read())
    lhs_text, rhs_text = texts
    return html_diff(lhs, lhs_text, rhs, rhs_text)
if __name__ == '__main__':
    # Command line entry point: html_diff <lhs file> <rhs file>
    # Writes the diff to stderr and exits 1 if there was any difference or if
    # the arguments are wrong, 0 if the files normalize to the same html.
    import sys
    if len(sys.argv) != 3:
        print("Expected exactly 2 arguments but got %s" % sys.argv[1:])
        # sys.exit rather than the bare exit() builtin: exit() is injected by
        # the site module for interactive use and isn't always available.
        sys.exit(1)
    had_diff = False
    for line in html_file_diff(sys.argv[1], sys.argv[2]):
        had_diff = True
        # print doesn't like to print utf-8 in all cases but buffer.write is ok
        sys.stderr.buffer.write((line + "\n").encode('utf-8'))
    sys.exit(1 if had_diff else 0)