GitList

initial

fiddlerwoaroof authored on 09/02/2015 17:35:24
Showing 5 changed files

.gitignore index 0000000..c3a3fbc
COPYING index 0000000..fc11ff2
README.md index 0000000..e69de29
TODO.md index 0000000..e69de29
rssmangle.py index 0000000..04c9d68

                 new file mode 100644
@@ -0,0 +1,6 @@
                 +bin
                 +include
                 +lib
                 +local
                 +pip-selfcheck.json
                 +.rss_framework.py.swp

COPYING

History View file @ fe39ddf

                 new file mode 100644
@@ -0,0 +1,29 @@
                 +Copyright (c) 2011 Edward Langley
                 +All rights reserved.
                 +
                 +Redistribution and use in source and binary forms, with or without
                 +modification, are permitted provided that the following conditions
                 +are met:
                 +
                 +Redistributions of source code must retain the above copyright notice,
                 +this list of conditions and the following disclaimer.
                 +
                 +Redistributions in binary form must reproduce the above copyright
                 +notice, this list of conditions and the following disclaimer in the
                 +documentation and/or other materials provided with the distribution.
                 +
                 +Neither the name of the project's author nor the names of its
                 +contributors may be used to endorse or promote products derived from
                 +this software without specific prior written permission.
                 +
                 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
                 +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
                 +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
                 +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
                 +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
                 +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
                 +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
                 +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
                 +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
                 +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
                 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

README.md

History View file @ fe39ddf

new file mode 100644

TODO.md

History View file @ fe39ddf

new file mode 100644

rssmangle.py

History View file @ fe39ddf

                 new file mode 100644
@@ -0,0 +1,97 @@
                 +#!/usr/bin/env python
                 +import datetime
                 +
                 +# http://www.feedparser.org/
                 +import feedparser
                 +# http://www.dalkescientific.com/Python/PyRSS2Gen.html
                 +import PyRSS2Gen
                 +import goose
                 +import lxml.html
                 +import os
                 +import tempfile
                 +
                 +# Get the data
                 +parsed_feed = feedparser.parse('http://www.drudgereportarchives.com/rss/popular.xml')
                 +
                 +class FullRSSItem(PyRSS2Gen.RSSItem):
                 +    def __init__(self, **kwargs):
                 +        if 'content' in kwargs:
                 +            self.content = kwargs['content']
                 +            del kwargs['content']
                 +        else:
                 +            self.content = None
                 +        PyRSS2Gen.RSSItem.__init__(self, **kwargs)
                 +
                 +    def publish_extensions(self, handler):
                 +        if self.content is not None:
                 +            PyRSS2Gen._opt_element(handler, "content:encoded", self.content)
                 +
                 +
                 +# Modify the parsed_feed data here
                 +
                 +
                 +CACHE_DIR = '/var/cache/rssmangle'
                 +if not os.path.exists(CACHE_DIR):
                 +    os.mkdir(CACHE_DIR)
                 +
                 +import urllib
                 +import hashlib
                 +fn_template = '%s#%s'
                 +def get_content(item):
                 +    sha_hasher = hashlib.sha256(item.link)
                 +    cache_key = os.path.join(CACHE_DIR, fn_template % (sha_hasher.hexdigest(),item.published))
                 +    if os.path.exists(cache_key):
                 +        with open(cache_key) as f: return f.read()
                 +    else:
                 +        with open(cache_key, 'w') as f:
                 +            g = goose.Goose()
                 +            article = g.extract(item.link)
                 +            if article.doc == None: return ''
                 +            else:
                 +                result = lxml.html.tostring(article.doc)
                 +                print >>f, result
                 +                return result
                 +
                 +
                 +items = [
                 +    FullRSSItem(
                 +        title = x.title,
                 +        link = x.link,
                 +        description = x.summary,
                 +        guid = x.link,
                 +        pubDate = datetime.datetime(
                 +            x.published_parsed[0],
                 +            x.published_parsed[1],
                 +            x.published_parsed[2],
                 +            x.published_parsed[3],
                 +            x.published_parsed[4],
                 +            x.published_parsed[5]),
                 +        content = get_content(x)
                 +        )
                 +    for x in parsed_feed.entries
                 +]
                 +
                 +# make the RSS2 object
                 +# Try to grab the title, link, language etc from the orig feed
                 +
                 +rss = PyRSS2Gen.RSS2(
                 +    title = parsed_feed['feed'].get("title"),
                 +    link = parsed_feed['feed'].get("link"),
                 +    description = parsed_feed['feed'].get("description"),
                 +
                 +    language = parsed_feed['feed'].get("language"),
                 +    copyright = parsed_feed['feed'].get("copyright"),
                 +    managingEditor = parsed_feed['feed'].get("managingEditor"),
                 +    webMaster = parsed_feed['feed'].get("webMaster"),
                 +    pubDate = parsed_feed['feed'].get("pubDate"),
                 +    lastBuildDate = parsed_feed['feed'].get("lastBuildDate"),
                 +
                 +    categories = parsed_feed['feed'].get("categories"),
                 +    generator = parsed_feed['feed'].get("generator"),
                 +    docs = parsed_feed['feed'].get("docs"),
                 +
                 +    items = items
                 +)
                 +
                 +
                 +print rss.to_xml()