git.fiddlerwoaroof.com
Browse code

initial

fiddlerwoaroof authored on 09/02/2015 17:35:24
Showing 5 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,6 @@
1
+bin
2
+include
3
+lib
4
+local
5
+pip-selfcheck.json
6
+.rss_framework.py.swp
0 7
new file mode 100644
... ...
@@ -0,0 +1,29 @@
1
+Copyright (c) 2011 Edward Langley
2
+All rights reserved.
3
+
4
+Redistribution and use in source and binary forms, with or without
5
+modification, are permitted provided that the following conditions
6
+are met:
7
+
8
+Redistributions of source code must retain the above copyright notice,
9
+this list of conditions and the following disclaimer.
10
+
11
+Redistributions in binary form must reproduce the above copyright
12
+notice, this list of conditions and the following disclaimer in the
13
+documentation and/or other materials provided with the distribution.
14
+
15
+Neither the name of the project's author nor the names of its
16
+contributors may be used to endorse or promote products derived from
17
+this software without specific prior written permission.
18
+
19
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
25
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2 32
new file mode 100644
... ...
@@ -0,0 +1,97 @@
1
+#!/usr/bin/env python
2
+import datetime
3
+
4
+# http://www.feedparser.org/
5
+import feedparser
6
+# http://www.dalkescientific.com/Python/PyRSS2Gen.html
7
+import PyRSS2Gen
8
+import goose
9
+import lxml.html
10
+import os
11
+import tempfile
12
+
13
# Download and parse the upstream feed that the rest of this script rebuilds.
FEED_URL = 'http://www.drudgereportarchives.com/rss/popular.xml'
parsed_feed = feedparser.parse(FEED_URL)
15
+
16
class FullRSSItem(PyRSS2Gen.RSSItem):
    """RSS item that can additionally carry a full article body.

    Takes the same keyword arguments as ``PyRSS2Gen.RSSItem`` plus an
    optional ``content`` keyword.  When ``content`` is not None it is
    written out as a ``content:encoded`` element.
    """

    def __init__(self, **kwargs):
        # Remove our own keyword before delegating, so the base initializer
        # only receives the arguments that were meant for it.
        self.content = kwargs.pop('content', None)
        PyRSS2Gen.RSSItem.__init__(self, **kwargs)

    def publish_extensions(self, handler):
        """Write the ``content:encoded`` element to *handler*, if any."""
        if self.content is None:
            return
        PyRSS2Gen._opt_element(handler, "content:encoded", self.content)
28
+
29
+
30
+# Modify the parsed_feed data here
31
+
32
+
33
# Directory in which extracted article bodies are cached on disk.
CACHE_DIR = '/var/cache/rssmangle'
try:
    # Create the directory (and any missing parents) directly instead of
    # checking for it first: a concurrent run that creates it between a
    # check and the mkdir must not make this one fail.
    os.makedirs(CACHE_DIR)
except OSError:
    # An already-existing directory is fine; anything else (permissions,
    # a regular file in the way, ...) is a real error and is re-raised.
    if not os.path.isdir(CACHE_DIR):
        raise
36
+
37
+import urllib
38
+import hashlib
39
# Cache file name pattern: "<sha256 hex digest of the link>#<published string>".
fn_template = '%s#%s'


def get_content(item, cache_dir=None):
    """Return the extracted article markup for a feed entry, with disk caching.

    item      -- feed entry exposing ``link`` and ``published`` attributes.
    cache_dir -- directory holding the cache files; defaults to ``CACHE_DIR``.

    On a cache hit the cached file's contents are returned.  Otherwise the
    article is fetched with goose, its document is serialized with
    ``lxml.html.tostring``, and the result is stored and returned.  When
    goose yields no document an empty entry is cached and an empty string
    is returned.
    """
    if cache_dir is None:
        cache_dir = CACHE_DIR

    # sha256 needs bytes: encode text links explicitly so a non-ASCII URL
    # does not fail in an implicit ASCII conversion.  ASCII links hash to
    # the same digest as before, so existing cache entries stay valid.
    link_bytes = item.link
    if not isinstance(link_bytes, bytes):
        link_bytes = link_bytes.encode('utf-8')
    digest = hashlib.sha256(link_bytes).hexdigest()
    cache_key = os.path.join(cache_dir, fn_template % (digest, item.published))

    if os.path.exists(cache_key):
        with open(cache_key, 'rb') as f:
            return f.read()

    # Extract *before* opening the cache file: if the fetch raises, no empty
    # file is left behind to be served as a cache hit on every later run.
    article = goose.Goose().extract(item.link)
    if article.doc is None:
        result = b''
    else:
        result = lxml.html.tostring(article.doc)

    # Store exactly what is returned (no trailing newline) so a cache hit
    # yields the same value as the original miss.
    with open(cache_key, 'wb') as f:
        f.write(result)
    return result
54
+
55
+
56
# Build one FullRSSItem per entry of the parsed feed, attaching the
# extracted article body as the item's content.
items = []
for entry in parsed_feed.entries:
    items.append(FullRSSItem(
        title=entry.title,
        link=entry.link,
        description=entry.summary,
        guid=entry.link,
        # The first six fields of published_parsed are passed positionally
        # as year, month, day, hour, minute, second.
        pubDate=datetime.datetime(*entry.published_parsed[:6]),
        content=get_content(entry),
    ))
73
+
74
# Build the output RSS2 document, carrying the channel metadata (title,
# link, language, ...) over from the original feed; fields the parsed feed
# does not provide come through as None.
feed_info = parsed_feed['feed']
rss = PyRSS2Gen.RSS2(
    title=feed_info.get("title"),
    link=feed_info.get("link"),
    description=feed_info.get("description"),

    language=feed_info.get("language"),
    copyright=feed_info.get("copyright"),
    managingEditor=feed_info.get("managingEditor"),
    webMaster=feed_info.get("webMaster"),
    pubDate=feed_info.get("pubDate"),
    lastBuildDate=feed_info.get("lastBuildDate"),

    categories=feed_info.get("categories"),
    generator=feed_info.get("generator"),
    docs=feed_info.get("docs"),

    items=items,
)

# The items emit <content:encoded>, so the "content" prefix has to be
# declared on the root <rss> element or namespace-aware parsers reject the
# output.  A copy is assigned so the class-level default attributes are not
# mutated.
# NOTE(review): relies on PyRSS2Gen writing ``rss_attrs`` onto the <rss>
# element -- confirm against the installed version.
rss_attrs = dict(rss.rss_attrs)
rss_attrs["xmlns:content"] = "http://purl.org/rss/1.0/modules/content/"
rss.rss_attrs = rss_attrs

# Single-argument parenthesized form: same output under the Python 2 print
# statement, and also valid as a Python 3 function call.
print(rss.to_xml())