#!/usr/bin/env python3 # Script to compare two html files, ignoring differences that we consider # to be unimportant. The output is a unified diff of formatted html meant # to be readable and precise at identifying differences. # # This script is designed to be run in the container managed by the # Dockerfile at the root of this repository. from bs4 import BeautifulSoup, NavigableString import difflib import re def normalize_html(html): """Normalizes html to remove expected differences between AsciiDoc's output and Asciidoctor's output. """ soup = BeautifulSoup(html, 'lxml') # Remove all duplicate classes because that doesn't make any # visual difference. for e in soup.descendants: if hasattr(e, 'attrs') and 'class' in e.attrs: e['class'] = list(set(e['class'])) # Replace many whitespace characters with a single space in some elements # kind of like a browser does. for e in soup.select(':not(script,pre,code,style)'): for part in e: if isinstance(part, NavigableString): crunched = NavigableString(re.sub(r'\s+', ' ', part)) if crunched != part: part.replace_with(crunched) # Asciidoctor adds a "content" wrapper. It doesn't really change the layout # so we're ok with it. for e in soup.select('#content'): e.unwrap() # Docbook adds around tags. We don't need them # and it isn't worth making Asciidoctor make them. for e in soup.select('.emphasis'): e.unwrap() # Asciidoctor adds a "ulist" class to all unordered lists which doesn't # hurt anything so we can ignore it. for e in soup.select('.itemizedlist.ulist'): e['class'].remove('ulist') # Docbook adds type which we override in css. for e in soup.select('ul'): if 'type' in e.attrs: del e['type'] # Asciidoctor adds a "olist" class to all ordered lists which doesn't # hurt anything so we can ignore it. for e in soup.select('.orderedlist.olist'): e['class'].remove('olist') # Docbook adds type="1" to ol which we override with css. for e in soup.select('ol'): if 'type' in e.attrs: del e['type'] # Docbook emits images with the 'inlinemediaobject' class and Asciidoctor # has the 'image' class. We've updated our styles to make both work. for e in soup.select('.inlinemediaobject'): e['class'].remove('inlinemediaobject') e['class'].append('image') # Docbook emits screenshot images with the 'informalfigure' class and # Asciidoctor has the 'imageblock' class. We've updated our styles to make # both work. for e in soup.select('.informalfigure'): e['class'].remove('informalfigure') e['class'].append('imageblock') # Docbook emits images with titles as 'figure' but docbook just uses # 'imageblock'. And all kinds of other things are different. But they don't # *look* different. for e in soup.select('.imageblock.text-center'): e['class'].remove('text-center') for e in soup.select('.imageblock[id]'): anchor = soup.new_tag('a') anchor['id'] = e['id'] e.insert(0, anchor) del e['id'] for e in soup.select('.figure'): e['class'].remove('figure') e['class'].append('imageblock') for a in e.select('a[id]'): if a['id'].startswith('id-'): a.extract() for t in e.select('.title'): t.extract() new_t = soup.new_tag("div") new_t['class'] = ['title'] new_t.append(t.contents[0].extract().contents[0]) e.append(new_t) for c in e.select('.figure-contents'): c['class'].remove('figure-contents') c['class'].append('mediaobject') c.contents[0].unwrap() # Same for mediaobject and content for e in soup.select('.content'): e['class'].remove('content') e['class'].append('mediaobject') # Blast some "defaults" that docbook renders and asciidoctor doesn't. for e in soup.select('.imageblock .mediaobject[align=center]'): del e['align'] for e in soup.select('.imageblock img[align=middle]'): del e['align'] # Asciidoctor emits examples as "exampleblock" instead of "informalexample" # but these don't make any visual difference. for e in soup.select('.exampleblock'): e['class'].remove('exampleblock') e['class'].append('informalexample') for c in e.select('.mediaobject'): c.unwrap() # Docbook adds ids to all examples and asciidoctor doesn't. We don't need # the ids so we ignore them. for e in soup.select('.example a[id]'): if e['id'].startswith('id-'): e.extract() # Docbook links with ` tag that contains only a
tag. # Asciidoctor *can* do that if the description is formed in a very # particular way. Visually, it all looks the same. for e in soup.select('dd > p:only-child'): e.unwrap() # Docbook adds ids to every "question and answer" style dlist. Asciidoctor # doesn't and we have no chance of emulating docbook's generated ids. Since # we don't *think* anyone does anything with the ids we ignore them. for e in soup.select('.qandaset a[id]'): if e['id'].startswith('id-'): e.extract() # Docbook would sometimes spit out empty
tags. We're fine with # dropping those. for e in soup.select('p:empty'): e.extract() # Docbook makes `` sometimes. # Just one `` is fine though. for e in soup.select("code.literal > code.literal"): e.unwrap() # Docbook adds non-breaking spaces inside the title attributes for some # links. It doesn't seem like we need that. for e in soup.select('link[title]'): e['title'] = e['title'].replace('\xa0', ' ') # Docbook adds a generated id to tables with ids. They aren't used and we # have no change of generating it properly so we just ignore it. for a in soup.select('.table a[id]'): if a['id'].startswith('id-'): a.extract() # Docbook renders partintro's slightly differently but the difference isn't # visually distinct so we can ignore it. for e in soup.select('.partintro:not(.openblock) > div:empty'): e.extract() for e in soup.select('.partintro:not(.openblock)'): e['class'].remove('partintro') e['class'].append('mediaobject') wrapper = soup.new_tag("div") wrapper['class'] = ['openblock', 'partintro'] e.wrap(wrapper) # Asciidoctor adds 'blockquote' and 'quoteblock'. Docbook doesn't and we # don't need them. for e in soup.select('.blockquote'): e['class'].remove('blockquote') for e in soup.select('.quoteblock'): e['class'].remove('quoteblock') # Docbook doesn't add `Admonishment--change` but Asciidoctor does to keep # it consistent with the care admonitions. for e in soup.select('.Admonishment--change'): e['class'].remove('Admonishment--change') # Docbook adds `', '') html = re.sub(r'(Appendix\s\w+)\.([^<]+)', '\\1:\\2', html) return html def html_diff(lhs_name, lhs, rhs_name, rhs): """Compare two html blobs, ignoring expected differences between AsciiDoc and Asciidoctor. The result is a generator for lines in the diff report. If it is entirely empty then there is no diff. """ lhs_lines = normalize_html(lhs).splitlines() rhs_lines = normalize_html(rhs).splitlines() return difflib.unified_diff( lhs_lines, rhs_lines, fromfile=lhs_name, tofile=rhs_name, lineterm='') def html_file_diff(lhs, rhs): """Compare two html files, ignoring expected differences between AsciiDoc and Asciidoctor. The result is a generator for lines in the diff report. If it is entirely empty then there is no diff. """ with open(lhs, encoding='utf-8') as lhs_file: lhs_text = lhs_file.read() with open(rhs, encoding='utf-8') as rhs_file: rhs_text = rhs_file.read() return html_diff(lhs, lhs_text, rhs, rhs_text) if __name__ == '__main__': import sys if len(sys.argv) != 3: print("Expected exactly 2 arguments but got %s" % sys.argv[1:]) exit(1) had_diff = False for line in html_file_diff(sys.argv[1], sys.argv[2]): had_diff = True # print doesn't like to print utf-8 in all cases but buffer.write is ok sys.stderr.buffer.write(line.encode('utf-8')) sys.stderr.buffer.write("\n".encode('utf-8')) exit(1 if had_diff else 0)