integtest/html_diff

#!/usr/bin/env python3

# Script to compare two html files, ignoring differences that we consider
# to be unimportant. The output is a unified diff of formatted html meant
# to be readable and precise at identifying differences.
#
# This script is designed to be run in the container managed by the
# Dockerfile at the root of this repository.


from bs4 import BeautifulSoup, NavigableString
import difflib
import re


def normalize_html(html):
    """Normalizes html to remove expected differences between AsciiDoc's
    output and Asciidoctor's output.

    The markup is parsed with BeautifulSoup's 'lxml' parser and then a fixed
    sequence of rewrites is applied to the tree. The rewrites are order
    dependent: several later selectors match classes that earlier steps
    added or renamed (for example 'imageblock' and 'mediaobject'). Finally
    the tree is pretty-printed and two textual substitutions are applied.

    Args:
        html: the markup to normalize. It is handed directly to
            BeautifulSoup.

    Returns:
        The normalized, indented html produced by ``soup.prettify()`` after
        the final textual substitutions.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Remove all duplicate classes because that doesn't make any
    # visual difference. The order that set() produces doesn't matter
    # because the classes are sorted near the end of this function.
    for e in soup.descendants:
        if hasattr(e, 'attrs') and 'class' in e.attrs:
            e['class'] = list(set(e['class']))
    # Replace many whitespace characters with a single space in some elements
    # kind of like a browser does. Only the strings that are direct children
    # of each selected element are rewritten.
    # NOTE(review): the selector only tests the element itself, so text in
    # e.g. a <span> nested inside a <pre> is still crunched - confirm that
    # is intended.
    for e in soup.select(':not(script,pre,code,style)'):
        for part in e:
            if isinstance(part, NavigableString):
                crunched = NavigableString(re.sub(r'\s+', ' ', part))
                if crunched != part:
                    part.replace_with(crunched)
    # Asciidoctor adds a "content" wrapper. It doesn't really change the layout
    # so we're ok with it.
    for e in soup.select('#content'):
        e.unwrap()
    # Docbook adds <span class="emphasis"> around <em> tags. We don't need them
    # and it isn't worth making Asciidoctor make them.
    for e in soup.select('.emphasis'):
        e.unwrap()
    # Asciidoctor adds a "ulist" class to all unordered lists which doesn't
    # hurt anything so we can ignore it.
    for e in soup.select('.itemizedlist.ulist'):
        e['class'].remove('ulist')
    # Docbook adds type which we override in css.
    for e in soup.select('ul'):
        if 'type' in e.attrs:
            del e['type']
    # Asciidoctor adds a "olist" class to all ordered lists which doesn't
    # hurt anything so we can ignore it.
    for e in soup.select('.orderedlist.olist'):
        e['class'].remove('olist')
    # Docbook adds type="1" to ol which we override with css.
    for e in soup.select('ol'):
        if 'type' in e.attrs:
            del e['type']
    # Docbook emits images with the 'inlinemediaobject' class and Asciidoctor
    # has the 'image' class. We've updated our styles to make both work.
    for e in soup.select('.inlinemediaobject'):
        e['class'].remove('inlinemediaobject')
        e['class'].append('image')
    # Docbook emits screenshot images with the 'informalfigure' class and
    # Asciidoctor has the 'imageblock' class. We've updated our styles to make
    # both work.
    for e in soup.select('.informalfigure'):
        e['class'].remove('informalfigure')
        e['class'].append('imageblock')
    # Docbook emits images with titles as 'figure' but Asciidoctor just uses
    # 'imageblock'. And all kinds of other things are different. But they don't
    # *look* different.
    for e in soup.select('.imageblock.text-center'):
        e['class'].remove('text-center')
    # Move the id from the imageblock itself onto a new empty <a> inserted as
    # its first child.
    for e in soup.select('.imageblock[id]'):
        anchor = soup.new_tag('a')
        anchor['id'] = e['id']
        e.insert(0, anchor)
        del e['id']
    for e in soup.select('.figure'):
        e['class'].remove('figure')
        e['class'].append('imageblock')
        # Drop anchors whose ids start with "id-".
        for a in e.select('a[id]'):
            if a['id'].startswith('id-'):
                a.extract()
        # Replace each title with a plain <div class="title"> that holds only
        # the first child of the old title's first child, and put it at the
        # end of the figure.
        for t in e.select('.title'):
            t.extract()
            new_t = soup.new_tag("div")
            new_t['class'] = ['title']
            new_t.append(t.contents[0].extract().contents[0])
            e.append(new_t)
        for c in e.select('.figure-contents'):
            c['class'].remove('figure-contents')
            c['class'].append('mediaobject')
            # Remove the tag of the first child but keep that tag's children.
            c.contents[0].unwrap()
    # Same for mediaobject and content
    for e in soup.select('.content'):
        e['class'].remove('content')
        e['class'].append('mediaobject')
    # Blast some "defaults" that docbook renders and asciidoctor doesn't.
    for e in soup.select('.imageblock .mediaobject[align=center]'):
        del e['align']
    for e in soup.select('.imageblock img[align=middle]'):
        del e['align']
    # Asciidoctor emits examples as "exampleblock" instead of "informalexample"
    # but these don't make any visual difference.
    for e in soup.select('.exampleblock'):
        e['class'].remove('exampleblock')
        e['class'].append('informalexample')
        for c in e.select('.mediaobject'):
            c.unwrap()
    # Docbook adds ids to all examples and asciidoctor doesn't. We don't need
    # the ids so we ignore them.
    for e in soup.select('.example a[id]'):
        if e['id'].startswith('id-'):
            e.extract()
    # Docbook links with `<a class="link"` when linking from one page of a book
    # to another. Asciidoctor emits `<a class="xref"`. Both look fine so we
    # rewrite "xref" to "link".
    for e in soup.select('a.xref'):
        e['class'].remove('xref')
        e['class'].append('link')
    # Docbook sprinkles `class="simpara"` all over the place and I'm not 100%
    # sure why. But we don't need it anyway.
    for e in soup.select('.simpara'):
        e['class'].remove('simpara')
    # Docbook *never* makes a <dd> tag that contains only a <p> tag.
    # Asciidoctor *can* do that if the description is formed in a very
    # particular way. Visually, it all looks the same.
    for e in soup.select('dd > p:only-child'):
        e.unwrap()
    # Docbook adds ids to every "question and answer" style dlist. Asciidoctor
    # doesn't and we have no chance of emulating docbook's generated ids. Since
    # we don't *think* anyone does anything with the ids we ignore them.
    for e in soup.select('.qandaset a[id]'):
        if e['id'].startswith('id-'):
            e.extract()
    # Docbook would sometimes spit out empty <p> tags. We're fine with
    # dropping those.
    for e in soup.select('p:empty'):
        e.extract()
    # Docbook makes `<code class="literal"><code class="literal">` sometimes.
    # Just one `<code>` is fine though.
    for e in soup.select("code.literal > code.literal"):
        e.unwrap()
    # Docbook adds non-breaking spaces inside the title attributes for some
    # links. It doesn't seem like we need that.
    for e in soup.select('link[title]'):
        e['title'] = e['title'].replace('\xa0', ' ')
    # Docbook adds a generated id to tables with ids. They aren't used and we
    # have no chance of generating it properly so we just ignore it.
    for a in soup.select('.table a[id]'):
        if a['id'].startswith('id-'):
            a.extract()
    # Docbook renders partintro's slightly differently but the difference isn't
    # visually distinct so we can ignore it. A partintro that isn't already an
    # openblock is turned into a 'mediaobject' wrapped in a new
    # <div class="openblock partintro">.
    for e in soup.select('.partintro:not(.openblock) > div:empty'):
        e.extract()
    for e in soup.select('.partintro:not(.openblock)'):
        e['class'].remove('partintro')
        e['class'].append('mediaobject')
        wrapper = soup.new_tag("div")
        wrapper['class'] = ['openblock', 'partintro']
        e.wrap(wrapper)
    # Asciidoctor adds 'blockquote' and 'quoteblock'. Docbook doesn't and we
    # don't need them.
    for e in soup.select('.blockquote'):
        e['class'].remove('blockquote')
    for e in soup.select('.quoteblock'):
        e['class'].remove('quoteblock')
    # Docbook doesn't add `Admonishment--change` but Asciidoctor does to keep
    # it consistent with the care admonitions.
    for e in soup.select('.Admonishment--change'):
        e['class'].remove('Admonishment--change')
    # Docbook adds `<span class="quote"` whenever it sees quotes. These don't
    # change the rendering and Asciidoctor doesn't add them.
    for e in soup.select('span.quote'):
        parent = e.parent
        e.unwrap()
        # Merge the strings left side by side by the unwrap so they compare
        # equal to a single string on the other side of the diff.
        parent.smooth()
    # Asciidoctor supports adding a `<meta name="description"` if a book
    # defines a `:description:` attribute. Docbook doesn't. We're quite ok with
    # adding it but we can ignore it in the diff because, well, we don't need
    # to see it.
    for e in soup.select("meta[name='description']"):
        e.extract()
    # Docbook renders "indexterms" with an inline anchor and asciidoctor
    # doesn't. You can't see them and no one is linking to them and the links
    # don't really do anything. So we're ok ignoring them.
    for e in soup.select("a.indexterm"):
        parent = e.parent
        e.extract()
        # Merge the strings that became adjacent when the anchor was removed.
        parent.smooth()
    # Docbook sometimes renders the xpack tag *after* the edit me link but
    # asciidoctor always renders it before the edit me link. Either way is fine
    # so we ignore the difference. We do that by always moving the edit me
    # link to the end of its parent.
    for e in soup.select("a.edit_me"):
        parent = e.parent
        parent.append(e.extract())
    # Docbook passes the `exclude` class on some paragraphs and asciidoctor
    # doesn't which is fine because it doesn't make a visual difference.
    for e in soup.select('.exclude'):
        e['class'].remove('exclude')
    # Asciidoctor adds 'px' to the end of raw pixel values but docbook doesn't.
    # We're quite happy to have the new px.
    for e in soup.select('img[width]'):
        if re.match('^\\d+$', e['width']):
            e['width'] += 'px'

    # Remove empty "class" attributes and sort the listed classes.
    for e in soup.select('*'):
        if 'class' in e.attrs:
            if e['class']:
                e['class'] = sorted(e['class'])
            else:
                del e['class']
    # Format the html with indentation so we can *see* things
    html = soup.prettify()
    # docbook spits out the long-form charset and asciidoctor spits out the
    # short form but they are equivalent
    html = html.replace(
        '<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>',
        '<meta charset="utf-8"/>')
    # Turn the "." that follows "Appendix <word>" into a ":" when it is
    # followed by more text.
    # NOTE(review): presumably the two toolchains punctuate appendix titles
    # differently - confirm which one emits which.
    html = re.sub(r'(Appendix\s\w+)\.([^<]+)', '\\1:\\2', html)
    return html


def html_diff(lhs_name, lhs, rhs_name, rhs):
    """Build a unified diff of two html blobs after normalizing both.

    Each blob is passed through normalize_html and split into lines before
    being handed to difflib. lhs_name and rhs_name are used as the "from" and
    "to" file names in the diff header. The result is the generator returned
    by difflib.unified_diff; if it yields nothing then there is no diff.
    """
    normalized = [normalize_html(blob).splitlines() for blob in (lhs, rhs)]
    return difflib.unified_diff(
        *normalized, fromfile=lhs_name, tofile=rhs_name, lineterm='')


def html_file_diff(lhs, rhs):
    """Diff two html files, ignoring the expected differences between
    AsciiDoc and Asciidoctor.

    Both files are read as utf-8, lhs first and then rhs, and their paths
    double as the names shown in the diff header. The result is a generator
    for lines in the diff report. If it is entirely empty then there is no
    diff.
    """
    texts = []
    for path in (lhs, rhs):
        with open(path, encoding='utf-8') as handle:
            texts.append(handle.read())
    lhs_text, rhs_text = texts
    return html_diff(lhs, lhs_text, rhs, rhs_text)


if __name__ == '__main__':
    # Command line entry point. Takes exactly two arguments, the paths of the
    # html files to compare. The diff is written to stderr and the exit code
    # is 1 if the arguments are wrong or the files differ and 0 otherwise.
    import sys
    if len(sys.argv) != 3:
        print("Expected exactly 2 arguments but got %s" % sys.argv[1:])
        # Use sys.exit rather than the `exit` builtin: `exit` is injected by
        # the `site` module for interactive use and isn't always available.
        sys.exit(1)
    had_diff = False
    for line in html_file_diff(sys.argv[1], sys.argv[2]):
        had_diff = True
        # print doesn't like to print utf-8 in all cases but buffer.write is ok
        sys.stderr.buffer.write((line + "\n").encode('utf-8'))
    sys.exit(1 if had_diff else 0)