summaryrefslogtreecommitdiff
path: root/planet/aggregator.py
blob: 20de7f7590ecf6bc2540f83d9a85aeae18acdb6e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
# vim: ai ts=4 sts=4 sw=4
"""PostgreSQL Planet Aggregator

This file contains the functions to suck down RSS/Atom feeds 
(using feedparser) and store the results in a PostgreSQL database.

Copyright (C) 2008-2009 PostgreSQL Global Development Group
"""

import psycopg2
import feedparser
import datetime
import socket
import ConfigParser

class Aggregator:
	"""Download the feeds registered in planet.feeds and store their posts.

	New entries are written to planet.posts and the outcome of each fetch
	is recorded in planet.aggregatorlog.

	Attributes:
		db: database connection used for every query; Update() commits or
			rolls it back once per feed.
		stored: running count of posts inserted by this instance.
		authorfilter: author name that entries of the feed currently being
			parsed must match; None or empty disables the filter.
	"""

	def __init__(self, db):
		"""Remember the connection and set a 20 second socket timeout.

		socket.setdefaulttimeout() applies to sockets created afterwards
		anywhere in the process, so an unresponsive feed server cannot
		hang the whole run.
		"""
		self.db = db
		self.stored = 0
		self.authorfilter = None
		socket.setdefaulttimeout(20)

	def Update(self):
		"""Fetch every feed in planet.feeds, one transaction per feed.

		A feed that yields new posts gets a success row in
		planet.aggregatorlog. A feed that raises has its partial work
		rolled back and gets a failure row instead; processing then
		continues with the next feed. Feeds that yield nothing new are
		not logged.
		"""
		feeds = self.db.cursor()
		feeds.execute('SELECT id,feedurl,name,lastget,authorfilter FROM planet.feeds')
		for feed in feeds.fetchall():
			try:
				n = self.ParseFeed(feed)
				if n > 0:
					c = self.db.cursor()
					c.execute("INSERT INTO planet.aggregatorlog (feed, success, info) VALUES (%(feed)s, 't', %(info)s)", {
						'feed': feed[0],
						'info': 'Fetched %s posts.' % n,
					})
			except Exception as e:
				# "except ... as" and print(...) with a single pre-formatted
				# argument are valid on both Python 2.6+ and Python 3.
				print("Exception when parsing feed '%s': %s" % (feed[1], e))
				# Discard whatever this feed stored before it failed, then
				# record the failure in a clean transaction.
				self.db.rollback()
				c = self.db.cursor()
				c.execute("INSERT INTO planet.aggregatorlog (feed, success, info) VALUES (%(feed)s, 'f', %(info)s)", {
					'feed': feed[0],
					'info': 'Error: "%s"' % e,
				})
			self.db.commit()

	def ParseFeed(self, feedinfo):
		"""Download one feed and store the entries that are not yet known.

		Args:
			feedinfo: row from planet.feeds in the column order selected by
				Update(): (id, feedurl, name, lastget, authorfilter).

		Returns:
			Number of posts added; 0 when the server answered 304.

		Raises:
			Exception: when the download produced no HTTP status, or the
				status is neither 200 nor 304.
		"""
		numadded = 0

		# lastget is sent as the "modified" hint so an unchanged feed can be
		# answered with 304. Without a stored timestamp, fetch unconditionally
		# instead of crashing on None.timetuple().
		lastget = feedinfo[3]
		if lastget is None:
			modified = None
		else:
			modified = lastget.timetuple()
		feed = feedparser.parse(feedinfo[1], modified=modified)

		if not hasattr(feed, 'status'):
			# bozo_exception can seemingly be set when there is no error as well,
			# so make sure we only check if we didn't get a status.
			if hasattr(feed, 'bozo_exception'):
				raise Exception('Feed load error %s' % feed.bozo_exception)
			raise Exception('Feed load error with no exception!')

		if feed.status == 304:
			# not changed
			return 0
		if feed.status != 200:
			raise Exception('Feed returned status %s' % feed.status)

		# matches_filter() reads the filter of the feed being parsed from here.
		self.authorfilter = feedinfo[4]

		for entry in feed.entries:
			if not self.matches_filter(entry):
				continue

			# Grab the entry. At least atom feeds from wordpress store what we
			# want in entry.content[0].value and *also* have a summary that's
			# much shorter. Other blog software stores what we want in the
			# summary attribute. So try one after another until we hit something.
			try:
				txt = entry.content[0].value
			except (AttributeError, KeyError, IndexError, TypeError):
				# No usable content element; fall back to the summary below.
				txt = ''
			if not txt and 'summary' in entry:
				txt = entry.summary
			if not txt:
				# Not a critical error, we just ignore empty posts
				print("Failed to get text for entry at %s" % entry.link)
				continue

			# When the feed does not say otherwise, treat the guid as a permalink.
			guidisperma = entry.get('guidislink', True)
			if self.StoreEntry(feedinfo[0], entry.id, entry.date, entry.link, guidisperma, entry.title, txt) > 0:
				numadded += 1
		if numadded > 0:
			# Move lastget forward to the newest stored post of this feed.
			self.db.cursor().execute("UPDATE planet.feeds SET lastget=COALESCE((SELECT max(dat) FROM planet.posts WHERE planet.posts.feed=planet.feeds.id),'2000-01-01') WHERE planet.feeds.id=%(feed)s", {'feed': feedinfo[0]})
		return numadded

	def matches_filter(self, entry):
		"""Return True when the entry passes the filters of the current feed.

		For now only self.authorfilter is checked; in the future there may
		be more filters. With a filter set, an entry without author details,
		or without an author name, does not match.
		"""
		if self.authorfilter:
			# Match against an author filter
			if 'author_detail' in entry:
				# .get() so that a missing name means "no match" rather than
				# an AttributeError that would fail the whole feed.
				return entry.author_detail.get('name') == self.authorfilter
			return False

		# No filters, always return true
		return True

	def StoreEntry(self, feedid, guid, date, link, guidisperma, title, txt):
		"""Insert one post into planet.posts unless it is already stored.

		A post is identified by the (feed, guid) pair.

		Args:
			feedid: id of the feed the post belongs to.
			guid: entry identifier as delivered by the feed.
			date: entry date, stored in the dat column.
			link: URL of the post.
			guidisperma: whether the guid is also a permalink.
			title: post title.
			txt: post body.

		Returns:
			1 if a new row was inserted, 0 if the post already existed.
		"""
		c = self.db.cursor()
		c.execute("SELECT id FROM planet.posts WHERE feed=%(feed)s AND guid=%(guid)s", {'feed':feedid, 'guid':guid})
		if c.rowcount > 0:
			# Already stored on an earlier run.
			return 0
		print("Store entry %s from feed %s" % (guid, feedid))
		c.execute("INSERT INTO planet.posts (feed,guid,link,guidisperma,dat,title,txt) VALUES (%(feed)s,%(guid)s,%(link)s,%(guidisperma)s,%(date)s,%(title)s,%(txt)s)",
			{'feed': feedid,
			 'guid': guid,
			 'link': link,
			 'guidisperma': guidisperma,
			 'date': date,
			 'title': title,
			 'txt': txt})
		self.stored += 1
		return 1

if __name__ == "__main__":
	# Script entry point: the connection string is read from the "db" option
	# of the [planet] section in planet.ini, which is looked up relative to
	# the current working directory.
	config = ConfigParser.ConfigParser()
	config.read('planet.ini')
	dsn = config.get('planet', 'db')

	# Connect to the database, then fetch and store every configured feed.
	connection = psycopg2.connect(dsn)
	aggregator = Aggregator(connection)
	aggregator.Update()