fe39ddf7 |
#!/usr/bin/env python
import datetime
# http://www.feedparser.org/
import feedparser
# http://www.dalkescientific.com/Python/PyRSS2Gen.html
import PyRSS2Gen
import goose
import lxml.html
import os
import tempfile
# Get the data: parse the source feed whose entries are re-published below.
parsed_feed = feedparser.parse('http://www.drudgereportarchives.com/rss/popular.xml')
class FullRSSItem(PyRSS2Gen.RSSItem):
    """An ``RSSItem`` that can additionally carry a full-content payload.

    Takes the same keyword arguments as ``PyRSS2Gen.RSSItem`` plus an optional
    ``content`` keyword, which is kept on the instance as ``self.content``
    (``None`` when it was not supplied).
    """

    def __init__(self, **kwargs):
        # Take our extra keyword out first so the base constructor only
        # receives the remaining keyword arguments.
        self.content = kwargs.pop('content', None)
        PyRSS2Gen.RSSItem.__init__(self, **kwargs)

    def publish_extensions(self, handler):
        """Write ``self.content`` as a ``content:encoded`` element, if set."""
        if self.content is None:
            return
        PyRSS2Gen._opt_element(handler, "content:encoded", self.content)
# Modify the parsed_feed data here: attach the full article text to each entry
# Directory in which get_content() stores one file per extracted article.
CACHE_DIR = '/var/cache/rssmangle'
try:
    # Create the directory directly instead of testing for it first: this
    # avoids the check-then-create race between concurrent runs, and
    # makedirs also creates any missing parent directory.
    os.makedirs(CACHE_DIR)
except OSError:
    # An already existing directory is fine; anything else (permissions,
    # a regular file in the way, ...) is a real failure and is re-raised.
    if not os.path.isdir(CACHE_DIR):
        raise
import urllib
import hashlib
# Cache file name pattern: "<sha256 hex digest of the link>#<published string>".
fn_template = '%s#%s'


def get_content(item):
    """Return the extracted article markup for a feed entry, using a disk cache.

    The cache file lives in ``CACHE_DIR`` and is named after the SHA-256 of
    ``item.link`` combined with ``item.published``.

    Args:
        item: a feed entry exposing ``link`` and ``published`` attributes.

    Returns:
        The cached or freshly extracted markup, or ``''`` when the extractor
        produced no document. Empty results are never cached, so the article
        is tried again on the next run.
    """
    link = item.link
    if not isinstance(link, bytes):
        # hashlib needs bytes; encoding explicitly keeps non-ASCII links from
        # failing on an implicit ASCII conversion.
        link = link.encode('utf-8')
    digest = hashlib.sha256(link).hexdigest()
    # A path separator inside the date string would make the cache key point
    # into a subdirectory that does not exist.
    published = item.published.replace(os.sep, '_')
    cache_key = os.path.join(CACHE_DIR, fn_template % (digest, published))

    if os.path.exists(cache_key):
        with open(cache_key, 'rb') as f:
            cached = f.read()
        # A zero-byte file can only be left over from an extraction that
        # failed after the file had been created; treat it as a cache miss.
        if cached:
            return cached

    # Extract before touching the cache file, so that an exception or an
    # empty result never leaves a bogus cache entry behind.
    article = goose.Goose().extract(item.link)
    if article.doc is None:
        return ''
    result = lxml.html.tostring(article.doc)
    with open(cache_key, 'wb') as f:
        # Write the bytes verbatim (no trailing newline) so a later cache hit
        # returns exactly what this call returns.
        f.write(result)
    return result
# Rebuild every entry of the parsed feed as a FullRSSItem carrying the
# article content returned by get_content().
items = []
for entry in parsed_feed.entries:
    items.append(FullRSSItem(
        title=entry.title,
        link=entry.link,
        description=entry.summary,
        guid=entry.link,
        # The first six fields of published_parsed are passed positionally
        # to datetime (year, month, day, hour, minute, second).
        pubDate=datetime.datetime(*entry.published_parsed[:6]),
        content=get_content(entry),
    ))
# Make the RSS2 object.
# Try to grab the title, link, language etc. from the original feed; fields
# the parsed feed does not provide come back as None from .get().
feed = parsed_feed['feed']
rss = PyRSS2Gen.RSS2(
    title=feed.get("title"),
    link=feed.get("link"),
    description=feed.get("description"),
    language=feed.get("language"),
    copyright=feed.get("copyright"),
    managingEditor=feed.get("managingEditor"),
    webMaster=feed.get("webMaster"),
    pubDate=feed.get("pubDate"),
    lastBuildDate=feed.get("lastBuildDate"),
    categories=feed.get("categories"),
    generator=feed.get("generator"),
    docs=feed.get("docs"),
    items=items,
)
# The items emit <content:encoded> elements, so the "content" prefix has to be
# declared on the root <rss> element; without it the output is not
# namespace-well-formed. Work on a copy so the attribute dict shared at class
# level is left untouched.
# NOTE(review): relies on PyRSS2Gen.RSS2 exposing its root-element attributes
# as `rss_attrs` - confirm against the installed PyRSS2Gen version.
rss_attrs = dict(rss.rss_attrs)
rss_attrs["xmlns:content"] = "http://purl.org/rss/1.0/modules/content/"
rss.rss_attrs = rss_attrs
# Parenthesised single argument: prints the same text under the Python 2
# print statement as under the print function.
print(rss.to_xml())
|