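# NOTE: leftover page chrome, apparently from the GitList web viewer, kept as
# comments so the file parses: "GitList" / "Raw Blame History".
#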
#!/usr/bin/env python
import datetime

# http://www.feedparser.org/
import feedparser
# http://www.dalkescientific.com/Python/PyRSS2Gen.html
import PyRSS2Gen
import goose
import lxml.html
import os
import tempfile

# Download and parse the upstream feed whose entries are re-published below.
FEED_URL = 'http://www.drudgereportarchives.com/rss/popular.xml'
parsed_feed = feedparser.parse(FEED_URL)

class FullRSSItem(PyRSS2Gen.RSSItem):
    """RSS item that can additionally carry a full-content payload.

    Takes the same keyword arguments as ``PyRSS2Gen.RSSItem`` plus an
    optional ``content`` keyword.  The value is stored on ``self.content``
    (``None`` when the keyword is absent) and is written out as a
    ``content:encoded`` element by ``publish_extensions``.
    """

    def __init__(self, **kwargs):
        # Strip our own keyword first: the base constructor is only handed
        # the remaining keywords.
        self.content = kwargs.pop('content', None)
        PyRSS2Gen.RSSItem.__init__(self, **kwargs)

    def publish_extensions(self, handler):
        """Write ``content:encoded`` to *handler* when content is set.

        NOTE(review): presumably invoked by the base class while it
        serializes the item -- confirm against PyRSS2Gen.
        """
        if self.content is None:
            return
        PyRSS2Gen._opt_element(handler, "content:encoded", self.content)


# Modify the parsed_feed data here


# Directory in which get_content() stores one file per feed entry.
CACHE_DIR = '/var/cache/rssmangle'
# Create the directory (and any missing parents) up front.  Attempting the
# creation and re-checking on failure avoids the race in "test for existence,
# then mkdir" when two runs start at the same time.
try:
    os.makedirs(CACHE_DIR)
except OSError:
    # "Already exists as a directory" is fine; anything else (permissions,
    # a plain file in the way, ...) is a real error and must surface.
    if not os.path.isdir(CACHE_DIR):
        raise

import urllib
import hashlib
# Cache file name layout: "<sha256 hex digest of the link>#<published string>".
fn_template = '%s#%s'


def get_content(item):
    """Return the extracted article markup for a feed entry, cached on disk.

    The cache file lives in ``CACHE_DIR`` and is named from the SHA-256 hex
    digest of ``item.link`` combined with ``item.published``.  On a cache hit
    the file's bytes are returned.  On a miss the link is handed to goose,
    the resulting document is serialized with ``lxml.html.tostring``, stored
    in the cache and returned.

    Args:
        item: object exposing ``link`` and ``published`` attributes
            (the script passes feedparser entries).

    Returns:
        The serialized article document, or an empty string when goose
        produced no document.  The empty result is cached as well, so the
        link is not fetched again on later runs.

    Raises:
        Whatever goose, lxml or the file operations raise.  Nothing is
        written to the cache in that case, so a failed extraction is retried
        on the next run instead of being served as an empty cached article.
    """
    # sha256 needs bytes; a unicode link containing non-ASCII characters
    # cannot be hashed directly, so encode it explicitly.
    link = item.link
    if not isinstance(link, bytes):
        link = link.encode('utf-8')
    digest = hashlib.sha256(link).hexdigest()
    cache_key = os.path.join(CACHE_DIR, fn_template % (digest, item.published))

    if os.path.exists(cache_key):
        with open(cache_key, 'rb') as f:
            return f.read()

    # Extract BEFORE touching the cache so an exception cannot leave an
    # empty file behind that later runs would treat as a valid hit.
    article = goose.Goose().extract(item.link)
    if article.doc is None:
        result = b''
    else:
        result = lxml.html.tostring(article.doc)

    # Write to a private temp name and rename into place, so a crash in the
    # middle of the write never exposes a truncated cache entry.
    tmp_path = '%s.%d.tmp' % (cache_key, os.getpid())
    try:
        with open(tmp_path, 'wb') as f:
            # Store exactly what is returned (no trailing newline), so a
            # cached read and a fresh extraction yield the same value.
            f.write(result)
        os.rename(tmp_path, cache_key)
    finally:
        # Only still present if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return result


def _entry_pubdate(entry):
    """Return the entry's publication time as a datetime, or None.

    None is returned when the entry has no ``published_parsed`` value
    (missing, or set to None), so one undated entry does not abort the
    whole feed.
    """
    parsed = entry.get('published_parsed')
    if not parsed:
        return None
    # The first six fields are year, month, day, hour, minute, second.
    return datetime.datetime(*parsed[:6])


# One output item per upstream entry, with the extracted article attached as
# full content.  The link doubles as the guid.
items = [
    FullRSSItem(
        title=x.title,
        link=x.link,
        # Entries without a summary get no description instead of raising.
        description=x.get('summary'),
        guid=x.link,
        pubDate=_entry_pubdate(x),
        content=get_content(x),
    )
    for x in parsed_feed.entries
]

# Build the output RSS2 object, copying channel-level metadata (title, link,
# language, ...) from the original feed where it is available.
# NOTE(review): feedparser normalizes element names, so some of the keys
# looked up below may never be present and simply yield None -- verify.
channel = parsed_feed['feed']

rss = PyRSS2Gen.RSS2(
    title=channel.get("title"),
    link=channel.get("link"),
    description=channel.get("description"),

    language=channel.get("language"),
    copyright=channel.get("copyright"),
    managingEditor=channel.get("managingEditor"),
    webMaster=channel.get("webMaster"),
    pubDate=channel.get("pubDate"),
    lastBuildDate=channel.get("lastBuildDate"),

    categories=channel.get("categories"),
    generator=channel.get("generator"),
    docs=channel.get("docs"),

    items=items,
)

# The items emit <content:encoded>, so the "content" prefix has to be
# declared on the root <rss> element; without it the document is not
# namespace-well-formed.  Copy the attribute dict first so the class-level
# default shared by every RSS2 instance is not mutated.
rss.rss_attrs = dict(rss.rss_attrs)
rss.rss_attrs["xmlns:content"] = "http://purl.org/rss/1.0/modules/content/"


# Emit the finished feed on standard output.
xml_text = rss.to_xml()
print(xml_text)