Browse code
Switching from urllib2 to requests
- Now the DefaultTitleGetter canonicalizes the URL via the href of the page's
<link rel="canonical"> element, if such an element exists.
Showing 5 changed files
- requirements.txt
- src/marrow/bone.py
- src/marrow/titlegetter/default.py
- src/marrow/titlegetter/nytimes.py
- src/marrow/titlegetter/titlegetter.py
... | ... |
@@ -1,12 +1,13 @@ |
1 | 1 |
Flask==0.10.1 |
2 | 2 |
Flask-Cors==2.1.0 |
3 |
-Flask-Limiter==0.8.1 |
|
4 |
-Flask-Login==0.3.0 |
|
3 |
+Flask-Limiter==0.8.5 |
|
4 |
+Flask-Login==0.3.2 |
|
5 | 5 |
Flask-OAuth==0.12 |
6 | 6 |
Flask-Security==1.7.4 |
7 |
-Flask-WTF==0.11 |
|
7 |
+Flask-WTF==0.12 |
|
8 | 8 |
lxml==3.4.4 |
9 | 9 |
psycopg2==2.6.1 |
10 | 10 |
python-dateutil==2.4.2 |
11 |
-textblob==0.9.1 |
|
12 |
-uWSGI==2.0.11 |
|
11 |
+requests==2.8.1 |
|
12 |
+textblob==0.10.0 |
|
13 |
+uWSGI==2.0.11.2 |
... | ... |
@@ -56,7 +56,7 @@ def clean_url(url): |
56 | 56 |
netloc, path = path, netloc |
57 | 57 |
return urlparse.urlunparse((scheme, netloc, path, params, query, fragment)) |
58 | 58 |
|
59 |
-def get_title(url): |
|
59 |
+def get_siteinfo(url): |
|
60 | 60 |
return config.titlegetter.get_title(url) |
61 | 61 |
|
62 | 62 |
@bone_blueprint.route('/vote/total') |
... | ... |
@@ -135,7 +135,8 @@ def submit_link(): |
135 | 135 |
if username is not None: |
136 | 136 |
url, title = obj['url'],obj['title'] |
137 | 137 |
url = clean_url(url) |
138 |
- title = get_title(url) |
|
138 |
+ title, url = get_siteinfo(url) # this makes sure that the url is the site's preferred URL |
|
139 |
+ # TODO: this might need sanity checks . . . like make sure same site? |
|
139 | 140 |
with db.cursor() as cur: |
140 | 141 |
cur.callproc('put_link', (username, url, title)) |
141 | 142 |
## This returns (link_id, user_id) |
... | ... |
@@ -1,7 +1,8 @@ |
1 |
-import lxml.html |
|
2 |
-import urllib2 |
|
3 |
-import urlparse |
|
4 | 1 |
import re |
2 |
+import urlparse |
|
3 |
+ |
|
4 |
+import lxml.html |
|
5 |
+import requests |
|
5 | 6 |
|
6 | 7 |
from textblob import TextBlob |
7 | 8 |
from textblob_aptagger import PerceptronTagger |
... | ... |
@@ -14,19 +15,22 @@ class DefaultTitleGetter(object): |
14 | 15 |
url_cleaner = re.compile('[+\-_]') |
15 | 16 |
|
16 | 17 |
def get_title(self, url): |
18 |
+ s = requests.session() |
|
17 | 19 |
scheme, netloc, path, params, query, fragment = urlparse.urlparse(url, 'http') |
18 |
- data = urllib2.urlopen(url) |
|
19 |
- content_type = data.headers['content-type'].lower() |
|
20 |
- charset = 'utf-8' |
|
21 |
- if 'charset' in content_type: |
|
22 |
- charset = content_type.partition('charset=')[-1] |
|
23 |
- data = data.read() |
|
24 |
- data = data.decode(charset) |
|
25 |
- etree = lxml.html.fromstring(data) |
|
26 |
- titleElems = etree.xpath('//title') |
|
27 |
- title = url |
|
28 |
- if titleElems != []: |
|
29 |
- title = titleElems[0].text |
|
20 |
+ data = s.get(url) |
|
21 |
+ etree = lxml.html.fromstring(data.content.decode(data.encoding)) |
|
22 |
+ |
|
23 |
+ canonicalLink = etree.xpath('//link[@rel="canonical"]/@href') |
|
24 |
+ if canonicalLink != []: |
|
25 |
+ canonicalLink = canonicalLink[0] |
|
26 |
+ data = s.get(canonicalLink) |
|
27 |
+ etree = lxml.html.fromstring(data.content.decode(data.encoding)) |
|
28 |
+ else: |
|
29 |
+ canonicalLink = url |
|
30 |
+ |
|
31 |
+ title = etree.xpath('//title/text()') |
|
32 |
+ if title != []: |
|
33 |
+ title = title[0] |
|
30 | 34 |
elif path: |
31 | 35 |
# hacky way to make a title |
32 | 36 |
path = urlparse.unquote(path) |
... | ... |
@@ -37,4 +41,4 @@ class DefaultTitleGetter(object): |
37 | 41 |
title = map(titlecase, path) |
38 | 42 |
title = u' \u2014 '.join(title) |
39 | 43 |
title = u' \u2014 '.join([title, netloc]) |
40 |
- return title |
|
44 |
+ return title, canonicalLink |
... | ... |
@@ -2,6 +2,7 @@ import urlparse |
2 | 2 |
import urllib2 |
3 | 3 |
import json |
4 | 4 |
|
5 |
+# TODO: this should use the articlesearch API, if this is actually necessary |
|
5 | 6 |
class TimesTitleGetter(object): |
6 | 7 |
api_url='http://api.nytimes.com/svc/news/v3/content.json?url=%(url)s&api-key=%(api_key)s' |
7 | 8 |
site='nytimes.com' |
... | ... |
@@ -13,4 +14,4 @@ class TimesTitleGetter(object): |
13 | 14 |
info = json.load(urllib2.urlopen(api_url)) |
14 | 15 |
title = info['results'][0]['title'] |
15 | 16 |
source = info['results'][0]['source'] |
16 |
- return u'%s \u2014 %s' % (title, source) |
|
17 |
+ return u'%s \u2014 %s' % (title, source), url |
... | ... |
@@ -1,4 +1,5 @@ |
1 | 1 |
import urllib2 |
2 |
+import requests |
|
2 | 3 |
import urlparse |
3 | 4 |
|
4 | 5 |
from .utils import memoize |
... | ... |
@@ -34,11 +35,10 @@ class TitleGetter(object): |
34 | 35 |
handler = self.getters[site] |
35 | 36 |
break |
36 | 37 |
|
37 |
- title = None |
|
38 | 38 |
try: |
39 |
- title = handler.get_title(url) |
|
40 |
- except urllib2.HTTPError: |
|
41 |
- title = self.default_handler.get_title(url) |
|
39 |
+ title, canonicalUrl = handler.get_title(url) |
|
40 |
+ except requests.exceptions.RequestException: |
|
41 |
+ title, canonicalUrl = self.default_handler.get_title(url) |
|
42 | 42 |
|
43 |
- return title.encode('utf-8') |
|
43 |
+ return title.encode('utf-8'), canonicalUrl |
|
44 | 44 |
|