summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Magnus Hagander, 2008-10-22 20:58:02 +0000
committer: Magnus Hagander, 2008-10-22 20:58:02 +0000
commit: ec4838b24eda8045d10c08e1b3bd4bb220ce5f5f (patch)
tree: d3c337170f478531c1fc8cb8a8c9905714025d70
parent: 494c6c3dffb366c9a42562d7c710100ee0772665 (diff)
Use HTMLTidy and some attribute trickery to make output XHTML valid.
Change to XHTML Transitional, because that's really what it is. Fix issue where the appended "[...]" markers weren't always rewritten to proper HTML links in the HTML output.

git-svn-id: file:///Users/dpage/pgweb/svn-repo/trunk@2227 8f5c7a92-453e-0410-a47f-ad33c8a6b003
-rwxr-xr-x planet/generator.py 33
-rw-r--r-- planet/planethtml.py 6
2 files changed, 30 insertions, 9 deletions
diff --git a/planet/generator.py b/planet/generator.py
index c1850065..5fb4d3e0 100755
--- a/planet/generator.py
+++ b/planet/generator.py
@@ -11,12 +11,22 @@ import psycopg2
import PyRSS2Gen
import datetime
import sys
+import tidy
+import urllib
from HTMLParser import HTMLParser
from planethtml import PlanetHtml
class Generator:
def __init__(self,db):
self.db = db
+ self.tidyopts = dict( drop_proprietary_attributes=1,
+ alt_text='',
+ hide_comments=1,
+ output_xhtml=1,
+ show_body_only=1,
+ clean=1,
+ )
+
def Generate(self):
rss = PyRSS2Gen.RSS2(
@@ -48,6 +58,10 @@ class Generator:
html.WriteFile("www/index.html")
def TruncateAndCleanDescription(self, txt, title):
+ # First apply Tidy
+ txt = str(tidy.parseString(txt, **self.tidyopts))
+
+ # Then truncate as necessary
ht = HtmlTruncator(1024, title)
ht.feed(txt)
out = ht.GetText()
@@ -78,10 +92,19 @@ class HtmlTruncator(HTMLParser):
if self.skiprest: return
self.trunctxt += self.get_starttag_text()
+ def quoteurl(self, str):
+ p = str.split(":",2)
+ return p[0] + ":" + urllib.quote(p[1])
+
+ def cleanhref(self, attrs):
+ if attrs[0] == 'href':
+ return 'href', self.quoteurl(attrs[1])
+ return attrs
+
def handle_starttag(self, tag, attrs):
if self.skiprest: return
self.trunctxt += "<" + tag
- self.trunctxt += (' '.join([(' %s="%s"' % (k,v)) for k,v in attrs]))
+ self.trunctxt += (' '.join([(' %s="%s"' % (k,v)) for k,v in map(self.cleanhref, attrs)]))
self.trunctxt += ">"
self.tagstack.append(tag)
@@ -102,11 +125,6 @@ class HtmlTruncator(HTMLParser):
if self.len > self.maxlen:
# Passed max length, so truncate text as close to the limit as possible
self.trunctxt = self.trunctxt[0:len(self.trunctxt)-(self.len-self.maxlen)]
- # Terminate at whitespace if possible, max 12 chars back
- for i in range(len(self.trunctxt)-1, len(self.trunctxt)-12, -1):
- if self.trunctxt[i].isspace():
- self.trunctxt = self.trunctxt[0:i] + " [...]"
- break
# Now append any tags that weren't properly closed
self.tagstack.reverse()
@@ -114,6 +132,9 @@ class HtmlTruncator(HTMLParser):
self.trunctxt += "</" + tag + ">"
self.skiprest = True
+ # Finally, append the continuation chars
+ self.trunctxt += "[...]"
+
def GetText(self):
if self.len > self.maxlen:
return self.trunctxt
diff --git a/planet/planethtml.py b/planet/planethtml.py
index 6fd14def..dff287d8 100644
--- a/planet/planethtml.py
+++ b/planet/planethtml.py
@@ -15,8 +15,8 @@ class PlanetHtml:
def __init__(self):
self.items = []
self.feeds = []
- self.str = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+ self.str = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" dir="ltr">
<head>
<title>Planet PostgreSQL</title>
@@ -46,7 +46,7 @@ class PlanetHtml:
lastdate = None
for post in self.items:
if post[6].endswith('[...]'):
- txt = post[6][:len(post[6])-4] + """<a href="%s">continue reading...</a>]""" % (post[1])
+ txt = post[6][:len(post[6])-5] + """<p>[<a href="%s">continue reading...</a>]</p>""" % (post[1])
else:
txt = post[6]