import url_handler import feedparser import hashlib import json #import redis class Feed(object): @property def url(self): return self._url @url.setter def url(self, url): self.urlhash = hashlib.md5(url.encode('utf-8')).hexdigest() self._url = url def __init__(self, title, link, url, etag=None, modified=None, status=0): self.urlhash = hashlib.md5(url.encode('utf-8')).hexdigest() self.title, self.link, self.url = title, link, url print('__init__', 'etag is', etag, 'modified is', modified) self.etag = etag self.modified = modified self.status = int(status) self.entries = [] def add_entry(self, entry): if self.entries: self.entries[-1].sep = False self.entries.append(entry) self.entries[-1].sep = True def to_redis(self, redis): feed_key = self.get_feed_key(self.url) object_prefix = '%s:%%s' % feed_key print(object_prefix) redis.set(feed_key, 'exists') redis.set(object_prefix % 'title', self.title) redis.set(object_prefix % 'link', self.link) redis.set(object_prefix % 'url', self.url) redis.set(object_prefix % 'etag', self.etag) redis.set(object_prefix % 'modified', self.modified) redis.set(object_prefix % 'status', self.status) self.put_entries(redis, object_prefix) def put_entries(self, redis, object_prefix): for entry in reversed(self.entries): eid = hashlib.md5(entry.id.encode('utf-8')).hexdigest() entry_key = object_prefix % ('entry:%s' % eid) redis.sadd(object_prefix % 'entries', eid) entry.to_redis(redis, entry_key) @classmethod def get_feed_key(cls, url): return 'feed:%s' % hashlib.md5(url.encode('utf-8')).hexdigest() @classmethod def from_redis(cls, redis, url): feed_key = cls.get_feed_key(url) if redis.get(feed_key): object_prefix = '%s:%%s' % feed_key title = redis.get(object_prefix % 'title').decode('utf-8') link = redis.get(object_prefix % 'link').decode('utf-8') url = redis.get(object_prefix % 'url').decode('utf-8') etag = redis.get(object_prefix % 'etag').decode('utf-8') modified = redis.get(object_prefix % 'modified').decode('utf-8') status = redis.get(object_prefix % 'status').decode('utf-8') self = cls(title, link, url) entries = redis.smembers(object_prefix % 'entries') for eid in entries: eid = eid.decode('utf-8') entry_key = object_prefix % ('entry:%s' % eid) hl = Headline.from_redis(redis, entry_key) if hl is None: break else: self.add_entry(hl) if self.entries != []: self.entries[-1].sep = False self.entries.sort(key=lambda x:x.date) self.entries[-1].sep = True return self @classmethod def pull_feed(cls, url, etag=None, modified=None): print('etag is', etag, 'modified is', modified) feed = url_handler.URLHandler.handle(url, etag=etag, modified=modified) return cls.from_parsed_feed(feed, url) @classmethod def from_parsed_feed(cls, data, url): title = data.feed.title url = url link = data.feed.link etag = data.etag if hasattr(data, 'etag') else 'No Etag' modified = data.modified if hasattr(data, 'modified') else 'No Last Modified' print('parsed_feed', 'etag is', etag, 'modified is', modified) status = data.status self = cls(title, link, url, etag=etag, modified=modified, status=status) for entry in data.entries: hl = Headline(entry.title, entry.link, date=entry.published_parsed, id=entry.id) self.add_entry(hl) return self @classmethod def get_feed(cls, url, redis=None): res = None update = False newfeed = None if redis is not None: res = cls.from_redis(redis, url) if res is not None: newfeed = url_handler.URLHandler.handle(url, etag=res.etag, modified=res.modified) update = newfeed.status != 304 print('newfeed.status is', newfeed.status, 'update is', update) print('res is', res, 'update is', update, 'url is', url) if update or res is None: if update: updates = cls.from_parsed_feed(newfeed, url) object_prefix = '%s:%%s' % cls.get_feed_key(url) updates.put_entries(redis, object_prefix) print('putting updates!') updates.to_redis(redis) res = cls.from_redis(redis, url) else: data = url_handler.URLHandler.handle(url) res = cls.from_parsed_feed(data, url) res.to_redis(redis) return res class Headline(object): @property def url(self): return self._url @url.setter def url(self, url): self.urlhash = hashlib.md5(url.encode('utf-8')).hexdigest() self._url = url serialized_attributes = ['title', 'url', 'img', 'id', 'date'] def __init__(self, title, url, sep=False, img=None, id=None, date=None): for x in self.serialized_attributes: setattr(self, x, locals()[x]) self.date = list(self.date) self.sep = sep def __repr__(self): return '<%s>' % ', '.join(str(getattr(self,x)) for x in self.serialized_attributes) trans_map = dict(date=json.dumps) rtrans_map = { y:x for (x,y) in trans_map.items() } def to_redis(self, redis, entry_key): redis.set(entry_key, 'exists') object_prefix = '%s:%%s' % entry_key for x in self.serialized_attributes: redis.set(object_prefix % x, self.trans_map.get(x, lambda x:x)(getattr(self, x))) @classmethod def from_redis(cls, redis, entry_key): if redis.get(entry_key) is not None: object_prefix = '%s:%%s' % entry_key args = {} for x in cls.serialized_attributes: args[x] = redis.get(object_prefix % x).decode('utf-8') args[x] = cls.rtrans_map.get(x, lambda x:x)(args[x]) return cls(**args) @classmethod def from_rss(cls, entry): name_mapping = dict( url='link', date='published_parsed', ) self = cls(entry.title, entry.link, id=entry.id, date=entry.published_parsed) def to_json(self): return json.dumps([self.title, self.link, self.sep, self.img]) @classmethod def from_json(cls, enc): enc = json.loads(enc) self = cls(*json.loads(enc)) class Feeds(object): def __init__(self, urls, redis=None): self.feeds = list(filter(None, (Feed.get_feed(url, redis) for url in urls))) print(self.feeds) if __name__ == '__main__': import json import redis print('getting feeds . . .', end=' ') with open('blogs.json') as f: feeds = json.load(f) feeds = Feeds(feeds, redis.Redis()) print('done.')