#!/usr/bin/env python
"""Republish an RSS feed with the full article markup embedded in each item.

The script downloads the feed at FEED_URL, runs every entry's link through
goose to extract the article document, caches the serialized result on disk
under CACHE_DIR, and prints a new RSS 2.0 document to stdout in which each
item carries the extracted markup in a ``content:encoded`` element.
"""
import datetime
import hashlib
import os
import tempfile
import urllib

# http://www.feedparser.org/
import feedparser
import goose
import lxml.html
# http://www.dalkescientific.com/Python/PyRSS2Gen.html
import PyRSS2Gen

FEED_URL = 'http://www.drudgereportarchives.com/rss/popular.xml'

# Directory holding one cache file per (link, published date) pair.
CACHE_DIR = '/var/cache/rssmangle'

# Cache file name layout: "<sha256 of the link>#<published string>".
fn_template = '%s#%s'

# Namespace of the RSS "content" module that defines content:encoded.
CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/'

# Channel-level fields copied verbatim from the source feed (None if absent).
_CHANNEL_FIELDS = (
    "title",
    "link",
    "description",
    "language",
    "copyright",
    "managingEditor",
    "webMaster",
    "pubDate",
    "lastBuildDate",
    "categories",
    "generator",
    "docs",
)


class FullRSSItem(PyRSS2Gen.RSSItem):
    """RSS item that can additionally carry a ``content:encoded`` payload.

    Accepts every keyword argument of ``PyRSS2Gen.RSSItem`` plus an optional
    ``content`` keyword holding the full article markup.
    """

    def __init__(self, **kwargs):
        # Strip our extra keyword before delegating; the base initializer is
        # only handed the arguments that were not consumed here.
        self.content = kwargs.pop('content', None)
        # Explicit base-class call kept from the original code.
        # NOTE(review): PyRSS2Gen classes are presumably old-style under
        # Python 2, where super() would not work -- confirm before changing.
        PyRSS2Gen.RSSItem.__init__(self, **kwargs)

    def publish_extensions(self, handler):
        """Emit the ``content:encoded`` element when content is present."""
        if self.content is not None:
            PyRSS2Gen._opt_element(handler, "content:encoded", self.content)


def get_content(item):
    """Return the serialized article document for a feed entry, using a cache.

    ``item`` must expose ``link`` and ``published`` attributes. The cache file
    name is derived from the SHA-256 of the link and the published string.

    Returns the cached text if a cache file exists; otherwise extracts the
    article with goose, stores the serialized document and returns it. When
    goose yields no document, an empty string is stored and returned.
    """
    link = item.link
    # Hash bytes explicitly: hashing a unicode link with non-ASCII characters
    # would otherwise fail on the implicit ASCII encoding.
    if not isinstance(link, bytes):
        link = link.encode('utf-8')
    digest = hashlib.sha256(link).hexdigest()
    cache_key = os.path.join(CACHE_DIR, fn_template % (digest, item.published))

    if os.path.exists(cache_key):
        with open(cache_key) as f:
            return f.read()

    # Extract BEFORE touching the cache file, so that a failed extraction
    # does not leave an empty file behind that would be served as "cached".
    article = goose.Goose().extract(item.link)
    if article.doc is None:
        result = ''
    else:
        result = lxml.html.tostring(article.doc)

    # Write exactly what is returned (no trailing newline), so the first call
    # and later cached calls yield the same string. An empty result is still
    # stored, preserving the original behavior of not retrying such entries.
    with open(cache_key, 'w') as f:
        f.write(result)
    return result


def _entry_pub_date(entry):
    """Return the entry's parsed publication time as a datetime, or None."""
    parsed = getattr(entry, 'published_parsed', None)
    if not parsed:
        # The date is missing or could not be parsed; omit pubDate.
        return None
    # First six fields: year, month, day, hour, minute, second.
    return datetime.datetime(*parsed[:6])


# Get the data
parsed_feed = feedparser.parse(FEED_URL)

# Modify the parsed_feed data here
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)

items = [
    FullRSSItem(
        title=x.title,
        link=x.link,
        description=x.summary,
        guid=x.link,
        pubDate=_entry_pub_date(x),
        content=get_content(x),
    )
    for x in parsed_feed.entries
]

# make the RSS2 object
# Try to grab the title, link, language etc from the orig feed
feed = parsed_feed['feed']
channel = dict((name, feed.get(name)) for name in _CHANNEL_FIELDS)
rss = PyRSS2Gen.RSS2(items=items, **channel)

# Declare the namespace for the "content:" prefix used by FullRSSItem;
# without it the prefix is undeclared in the generated XML. Work on a copy
# stored on the instance so the class-level default is not mutated.
rss.rss_attrs = dict(rss.rss_attrs)
rss.rss_attrs['xmlns:content'] = CONTENT_NS

print(rss.to_xml())