diff options
author | Magnus Hagander | 2008-10-22 20:58:02 +0000 |
---|---|---|
committer | Magnus Hagander | 2008-10-22 20:58:02 +0000 |
commit | ec4838b24eda8045d10c08e1b3bd4bb220ce5f5f (patch) | |
tree | d3c337170f478531c1fc8cb8a8c9905714025d70 | |
parent | 494c6c3dffb366c9a42562d7c710100ee0772665 (diff) |
Use HTMLTidy and some attribute rewriting (URL-quoting of href values) to make the output valid XHTML.
Change to XHTML Transitional, because that's really what it is.
Fix an issue where the appended "[...]" continuation marker wasn't always
rewritten into a proper HTML link in the HTML output.
git-svn-id: file:///Users/dpage/pgweb/svn-repo/trunk@2227 8f5c7a92-453e-0410-a47f-ad33c8a6b003
-rwxr-xr-x | planet/generator.py | 33 | ||||
-rw-r--r-- | planet/planethtml.py | 6 |
2 files changed, 30 insertions, 9 deletions
diff --git a/planet/generator.py b/planet/generator.py index c1850065..5fb4d3e0 100755 --- a/planet/generator.py +++ b/planet/generator.py @@ -11,12 +11,22 @@ import psycopg2 import PyRSS2Gen import datetime import sys +import tidy +import urllib from HTMLParser import HTMLParser from planethtml import PlanetHtml class Generator: def __init__(self,db): self.db = db + self.tidyopts = dict( drop_proprietary_attributes=1, + alt_text='', + hide_comments=1, + output_xhtml=1, + show_body_only=1, + clean=1, + ) + def Generate(self): rss = PyRSS2Gen.RSS2( @@ -48,6 +58,10 @@ class Generator: html.WriteFile("www/index.html") def TruncateAndCleanDescription(self, txt, title): + # First apply Tidy + txt = str(tidy.parseString(txt, **self.tidyopts)) + + # Then truncate as necessary ht = HtmlTruncator(1024, title) ht.feed(txt) out = ht.GetText() @@ -78,10 +92,19 @@ class HtmlTruncator(HTMLParser): if self.skiprest: return self.trunctxt += self.get_starttag_text() + def quoteurl(self, str): + p = str.split(":",2) + return p[0] + ":" + urllib.quote(p[1]) + + def cleanhref(self, attrs): + if attrs[0] == 'href': + return 'href', self.quoteurl(attrs[1]) + return attrs + def handle_starttag(self, tag, attrs): if self.skiprest: return self.trunctxt += "<" + tag - self.trunctxt += (' '.join([(' %s="%s"' % (k,v)) for k,v in attrs])) + self.trunctxt += (' '.join([(' %s="%s"' % (k,v)) for k,v in map(self.cleanhref, attrs)])) self.trunctxt += ">" self.tagstack.append(tag) @@ -102,11 +125,6 @@ class HtmlTruncator(HTMLParser): if self.len > self.maxlen: # Passed max length, so truncate text as close to the limit as possible self.trunctxt = self.trunctxt[0:len(self.trunctxt)-(self.len-self.maxlen)] - # Terminate at whitespace if possible, max 12 chars back - for i in range(len(self.trunctxt)-1, len(self.trunctxt)-12, -1): - if self.trunctxt[i].isspace(): - self.trunctxt = self.trunctxt[0:i] + " [...]" - break # Now append any tags that weren't properly closed self.tagstack.reverse() @@ -114,6 
+132,9 @@ class HtmlTruncator(HTMLParser): self.trunctxt += "</" + tag + ">" self.skiprest = True + # Finally, append the continuation chars + self.trunctxt += "[...]" + def GetText(self): if self.len > self.maxlen: return self.trunctxt diff --git a/planet/planethtml.py b/planet/planethtml.py index 6fd14def..dff287d8 100644 --- a/planet/planethtml.py +++ b/planet/planethtml.py @@ -15,8 +15,8 @@ class PlanetHtml: def __init__(self): self.items = [] self.feeds = [] - self.str = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" - "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + self.str = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" dir="ltr"> <head> <title>Planet PostgreSQL</title> @@ -46,7 +46,7 @@ class PlanetHtml: lastdate = None for post in self.items: if post[6].endswith('[...]'): - txt = post[6][:len(post[6])-4] + """<a href="%s">continue reading...</a>]""" % (post[1]) + txt = post[6][:len(post[6])-5] + """<p>[<a href="%s">continue reading...</a>]</p>""" % (post[1]) else: txt = post[6] |