Browse code
initial
fiddlerwoaroof authored on 09/02/2015 17:35:24
Showing 5 changed files
Showing 5 changed files
0 | 7 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,29 @@ |
1 |
+Copyright (c) 2011 Edward Langley |
|
2 |
+All rights reserved. |
|
3 |
+ |
|
4 |
+Redistribution and use in source and binary forms, with or without |
|
5 |
+modification, are permitted provided that the following conditions |
|
6 |
+are met: |
|
7 |
+ |
|
8 |
+Redistributions of source code must retain the above copyright notice, |
|
9 |
+this list of conditions and the following disclaimer. |
|
10 |
+ |
|
11 |
+Redistributions in binary form must reproduce the above copyright |
|
12 |
+notice, this list of conditions and the following disclaimer in the |
|
13 |
+documentation and/or other materials provided with the distribution. |
|
14 |
+ |
|
15 |
+Neither the name of the project's author nor the names of its |
|
16 |
+contributors may be used to endorse or promote products derived from |
|
17 |
+this software without specific prior written permission. |
|
18 |
+ |
|
19 |
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
20 |
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
21 |
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
|
22 |
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
23 |
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
24 |
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED |
|
25 |
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
26 |
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
|
27 |
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
|
28 |
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|
29 |
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
2 | 32 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,97 @@ |
1 |
+#!/usr/bin/env python |
|
2 |
+import datetime |
|
3 |
+ |
|
4 |
+# http://www.feedparser.org/ |
|
5 |
+import feedparser |
|
6 |
+# http://www.dalkescientific.com/Python/PyRSS2Gen.html |
|
7 |
+import PyRSS2Gen |
|
8 |
+import goose |
|
9 |
+import lxml.html |
|
10 |
+import os |
|
11 |
+import tempfile |
|
12 |
+ |
|
13 |
# Download and parse the upstream feed once, at start-up; everything below
# reads its entries and channel metadata from ``parsed_feed``.
FEED_URL = 'http://www.drudgereportarchives.com/rss/popular.xml'
parsed_feed = feedparser.parse(FEED_URL)
|
15 |
+ |
|
16 |
class FullRSSItem(PyRSS2Gen.RSSItem):
    """An RSS item that can additionally carry a full-content payload.

    Accepts every keyword argument of ``PyRSS2Gen.RSSItem`` plus an optional
    ``content`` keyword.  When ``content`` is given (and is not ``None``) it
    is written out as a ``content:encoded`` element.
    """

    def __init__(self, **kwargs):
        # ``content`` is not understood by the base class, so take it out of
        # the keyword arguments before delegating.
        self.content = kwargs.pop('content', None)
        PyRSS2Gen.RSSItem.__init__(self, **kwargs)

    def publish_extensions(self, handler):
        """Emit the ``content:encoded`` element through *handler*, if any.

        NOTE(review): presumably invoked by PyRSS2Gen while the item is being
        serialized -- confirm against the library.
        """
        if self.content is None:
            return
        PyRSS2Gen._opt_element(handler, "content:encoded", self.content)
|
28 |
+ |
|
29 |
+ |
|
30 |
+# Modify the parsed_feed data here |
|
31 |
+ |
|
32 |
+ |
|
33 |
# Directory holding one file per cached article extraction.
CACHE_DIR = '/var/cache/rssmangle'

# Create the cache directory (and any missing parents).  Attempting the
# creation and inspecting the failure avoids the race between an existence
# check and the mkdir, and the isdir() re-check refuses to continue when the
# path exists but is not a directory.
try:
    os.makedirs(CACHE_DIR)
except OSError:
    if not os.path.isdir(CACHE_DIR):
        raise
|
36 |
+ |
|
37 |
+import urllib |
|
38 |
+import hashlib |
|
39 |
# Cache file name layout: "<sha256 of the link>#<published string>".
fn_template = '%s#%s'


def get_content(item):
    """Return the extracted article markup for a feed entry, cached on disk.

    The cache file lives in ``CACHE_DIR`` and is named after the SHA-256 of
    ``item.link`` combined with ``item.published``.  On a cache hit the file
    contents are returned unchanged.  On a miss the article is extracted with
    goose, serialized with ``lxml.html.tostring``, stored and returned.

    Parameters:
        item: a feed entry exposing ``link`` and ``published`` attributes.

    Returns:
        The serialized article markup as a byte string, or an empty byte
        string when goose produced no document for the link.

    Raises:
        Whatever the extraction or the file operations raise; in that case
        no cache file is left behind for this entry.
    """
    link = item.link
    # hashlib only accepts bytes; encode text links explicitly so that a
    # non-ASCII URL does not blow up while hashing.
    if not isinstance(link, bytes):
        link = link.encode('utf-8')
    digest = hashlib.sha256(link).hexdigest()
    cache_key = os.path.join(CACHE_DIR, fn_template % (digest, item.published))

    if os.path.exists(cache_key):
        with open(cache_key, 'rb') as f:
            return f.read()

    # Extract BEFORE creating the cache file: if extraction raises, no empty
    # file is left behind to be served as "cached" content on later runs.
    article = goose.Goose().extract(item.link)
    if article.doc is None:
        # Nothing could be extracted; the empty result is still cached, as
        # before, so the link is not re-fetched on every run.
        result = b''
    else:
        result = lxml.html.tostring(article.doc)

    # Store exactly what is returned (no trailing newline) so that a cache
    # hit yields the same value as the original extraction.
    with open(cache_key, 'wb') as f:
        f.write(result)
    return result
|
54 |
+ |
|
55 |
+ |
|
56 |
def _entry_to_item(entry):
    """Build the ``FullRSSItem`` corresponding to one parsed feed entry.

    The entry's link doubles as its guid, the publication date is rebuilt
    from the first six fields of ``published_parsed``, and the full content
    comes from ``get_content``.
    """
    return FullRSSItem(
        title=entry.title,
        link=entry.link,
        description=entry.summary,
        guid=entry.link,
        pubDate=datetime.datetime(*entry.published_parsed[:6]),
        content=get_content(entry),
    )


# One output item per entry of the source feed, in the feed's own order.
items = [_entry_to_item(entry) for entry in parsed_feed.entries]
|
73 |
+ |
|
74 |
# Build the output channel, copying title, link, language and the other
# channel-level fields from the original feed.  Fields the source feed does
# not provide come through as None.
feed_info = parsed_feed['feed']

rss = PyRSS2Gen.RSS2(
    title=feed_info.get("title"),
    link=feed_info.get("link"),
    description=feed_info.get("description"),

    language=feed_info.get("language"),
    copyright=feed_info.get("copyright"),
    managingEditor=feed_info.get("managingEditor"),
    webMaster=feed_info.get("webMaster"),
    pubDate=feed_info.get("pubDate"),
    lastBuildDate=feed_info.get("lastBuildDate"),

    categories=feed_info.get("categories"),
    generator=feed_info.get("generator"),
    docs=feed_info.get("docs"),

    items=items,
)

# The items emit <content:encoded>, so the "content" prefix has to be
# declared on the <rss> root; without it the document is not
# namespace-well-formed and strict XML parsers reject it.  Work on a copy so
# the class-level attribute dict is left untouched.
rss.rss_attrs = dict(PyRSS2Gen.RSS2.rss_attrs)
rss.rss_attrs['xmlns:content'] = 'http://purl.org/rss/1.0/modules/content/'

# Write the generated feed to standard output.
print(rss.to_xml())