git.fiddlerwoaroof.com
rssmangle.py
fe39ddf7
 #!/usr/bin/env python
 import datetime
 
 # http://www.feedparser.org/
 import feedparser
 # http://www.dalkescientific.com/Python/PyRSS2Gen.html
 import PyRSS2Gen
 import goose
 import lxml.html
 import os
 import tempfile
 
 # Download and parse the upstream feed; everything below works from this
 # parsed result.
 FEED_URL = 'http://www.drudgereportarchives.com/rss/popular.xml'
 parsed_feed = feedparser.parse(FEED_URL)
 
 class FullRSSItem(PyRSS2Gen.RSSItem):
     """RSS item that can additionally carry the full article body.

     Accepts every keyword argument of ``PyRSS2Gen.RSSItem`` plus an optional
     ``content`` keyword, which is stored on the instance and written out as
     a ``content:encoded`` element by :meth:`publish_extensions`.
     """

     def __init__(self, **kwargs):
         # Strip our own keyword before delegating: the remaining kwargs are
         # forwarded to the base class untouched.
         self.content = kwargs.pop('content', None)
         # NOTE(review): explicit base-class call kept as in the original;
         # confirm PyRSS2Gen.RSSItem is a new-style class before switching
         # to super().
         PyRSS2Gen.RSSItem.__init__(self, **kwargs)

     def publish_extensions(self, handler):
         """Write the ``content:encoded`` element when content is present.

         ``handler`` is passed straight through to
         ``PyRSS2Gen._opt_element``; presumably this method is the hook the
         base class invokes while serialising the item -- confirm against
         the library.
         """
         if self.content is None:
             return
         PyRSS2Gen._opt_element(handler, "content:encoded", self.content)
 
 
 # Modify the parsed_feed data here
 
 
 CACHE_DIR = '/var/cache/rssmangle'
 # Create the cache directory, including any missing parents.  Attempting
 # the creation and inspecting the failure avoids the window between an
 # "exists?" check and the mkdir in which another process could create it.
 try:
     os.makedirs(CACHE_DIR)
 except OSError:
     # Already present as a directory is fine; anything else (permission
     # denied, a regular file in the way, ...) is a real error.
     if not os.path.isdir(CACHE_DIR):
         raise
 
 import urllib
 import hashlib
 # Cache file name: "<sha256 of the link>#<published string>".
 fn_template = '%s#%s'
 def get_content(item):
     """Return the extracted article markup for a feed entry, cached on disk.

     ``item`` must expose ``link`` and ``published`` attributes.  The cache
     file lives in ``CACHE_DIR`` and is keyed on the SHA-256 of the link plus
     the entry's ``published`` string.

     On a cache hit the file's bytes are returned as-is.  On a miss the page
     is fetched and extracted with goose, the serialised document is stored
     and returned.  When goose yields no document an empty string is
     returned and, as before, an empty cache file is left behind so the page
     is not fetched again on later runs.

     Any exception raised by the extraction propagates to the caller; in
     that case no cache file is created.
     """
     link = item.link
     # hashlib needs bytes: encode text links explicitly so a non-ASCII URL
     # does not fail on an implicit ASCII conversion.  ASCII links hash to
     # the same digest as before, so existing cache files stay valid.
     if not isinstance(link, bytes):
         link = link.encode('utf-8')
     digest = hashlib.sha256(link).hexdigest()
     cache_key = os.path.join(CACHE_DIR, fn_template % (digest, item.published))

     if os.path.exists(cache_key):
         with open(cache_key, 'rb') as f:
             return f.read()

     # Extract *before* opening the cache file: opening it first would leave
     # an empty file behind whenever the fetch raised, and that empty file
     # would then be served as the cached content forever.
     article = goose.Goose().extract(item.link)
     if article.doc is None:
         result = b''
     else:
         result = lxml.html.tostring(article.doc)

     # Store exactly what is returned (no trailing newline), so a cache hit
     # yields the same bytes as the original extraction.
     with open(cache_key, 'wb') as f:
         f.write(result)
     return result
 
 
 # Convert every parsed entry into an output item, attaching the full
 # article content obtained through get_content().
 items = []
 for entry in parsed_feed.entries:
     items.append(FullRSSItem(
         title=entry.title,
         link=entry.link,
         description=entry.summary,
         # The link doubles as the item's guid.
         guid=entry.link,
         # Fields 0..5 of published_parsed are passed positionally to
         # datetime (year, month, day, hour, minute, second).
         pubDate=datetime.datetime(
             *(entry.published_parsed[i] for i in range(6))),
         content=get_content(entry),
     ))
 
 # Build the output RSS2 object, copying channel-level metadata (title,
 # link, language, ...) from the original feed where it is available;
 # missing keys come back as None from .get().
 feed_info = parsed_feed['feed']

 rss = PyRSS2Gen.RSS2(
     title = feed_info.get("title"),
     link = feed_info.get("link"),
     description = feed_info.get("description"),

     language = feed_info.get("language"),
     copyright = feed_info.get("copyright"),
     managingEditor = feed_info.get("managingEditor"),
     webMaster = feed_info.get("webMaster"),
     pubDate = feed_info.get("pubDate"),
     lastBuildDate = feed_info.get("lastBuildDate"),

     categories = feed_info.get("categories"),
     generator = feed_info.get("generator"),
     docs = feed_info.get("docs"),

     items = items
 )

 # The items emit <content:encoded>, so the "content" prefix has to be
 # declared on the root element or the output is not namespace-well-formed.
 # NOTE(review): relies on RSS2.rss_attrs being the attribute mapping
 # PyRSS2Gen writes on the <rss> element -- confirm against the installed
 # version.  A copy is modified so the class-level default stays untouched.
 rss.rss_attrs = dict(getattr(rss, 'rss_attrs', {"version": "2.0"}))
 rss.rss_attrs["xmlns:content"] = "http://purl.org/rss/1.0/modules/content/"
 
 
 # Write the regenerated feed to standard output.
 feed_xml = rss.to_xml()
 print(feed_xml)