import re import time import collections import urllib.request import urllib.parse import feedparser import json def urlopen(url): headers = {'User-Agent': "the/edgent"} req = urllib.request.Request(url, headers=headers) return urllib.request.urlopen(req) class URLHandler(object): registry = collections.OrderedDict() @classmethod def register(cls, pattern): def _inner(ncls): cls.registry[re.compile(pattern)] = ncls return ncls return _inner @classmethod def handle(cls, url, **args): print(args) for x in reversed(cls.registry): if x.match(url): return cls.registry[x](url).run(url, **args) def __init__(self, url): self.url = url def run(self, url, **args): data = self.get_data(url, **args) return self.postprocess(data) def get_data(self, url, **args): return urlopen(url) def postprocess(self, result): return result @URLHandler.register('.') class BasicHandler(URLHandler): def get_data(self, url, **args): return feedparser.parse(self.url, **args) @URLHandler.register(r'^http[s]?://(www\.)?reddit.com/r/[^/]*/$') class RedditJSONHandler(URLHandler): def get_data(self, url, **args): result = urlopen('%s.json' % url) result = result.read().decode(result.headers.get_content_charset()) return json.loads(result) def postprocess(self, data): result = feedparser.FeedParserDict() desc = urllib.parse.urljoin(self.url, 'about.json') desc = urlopen(desc) desc = json.loads(desc.read().decode(desc.headers.get_content_charset()))['data'] result['feed'] = feedparser.FeedParserDict() result.feed['title'] = desc['title'] result.feed['link'] = 'http://reddit.com/%s' % desc['url'] result['entries'] = [] result.etag = None result.modified = None result.status = 200 for x in data['data']['children']: result.entries.append(feedparser.FeedParserDict()) dat = x['data'] result.entries[-1]['title'] = dat['title'] result.entries[-1]['link'] = dat['url'] result.entries[-1]['published_parsed'] = time.gmtime(dat['created_utc']) result.entries[-1]['id'] = dat['id'] return result