git.fiddlerwoaroof.com
Browse code

Switching from urllib2 to requests

- Now the DefaultTitleGetter canonicalizes the URL via the link
rel=canonical attribute, if such an attribute exists.

fiddlerwoaroof authored on 19/10/2015 00:14:28
Showing 5 changed files
... ...
@@ -1,12 +1,13 @@
1 1
 Flask==0.10.1
2 2
 Flask-Cors==2.1.0
3
-Flask-Limiter==0.8.1
4
-Flask-Login==0.3.0
3
+Flask-Limiter==0.8.5
4
+Flask-Login==0.3.2
5 5
 Flask-OAuth==0.12
6 6
 Flask-Security==1.7.4
7
-Flask-WTF==0.11
7
+Flask-WTF==0.12
8 8
 lxml==3.4.4
9 9
 psycopg2==2.6.1
10 10
 python-dateutil==2.4.2
11
-textblob==0.9.1
12
-uWSGI==2.0.11
11
+requests==2.8.1
12
+textblob==0.10.0
13
+uWSGI==2.0.11.2
... ...
@@ -56,7 +56,7 @@ def clean_url(url):
56 56
         netloc, path = path, netloc
57 57
     return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
58 58
 
59
-def get_title(url):
59
+def get_siteinfo(url):
60 60
     return config.titlegetter.get_title(url)
61 61
 
62 62
 @bone_blueprint.route('/vote/total')
... ...
@@ -135,7 +135,8 @@ def submit_link():
135 135
     if username is not None:
136 136
         url, title = obj['url'],obj['title']
137 137
         url = clean_url(url)
138
-        title = get_title(url)
138
+        title, url = get_siteinfo(url) # this makes sure that the url is the site's preferred URL
139
+                                       #  TODO: this might need sanity checks . . . like make sure same site?
139 140
         with db.cursor() as cur:
140 141
             cur.callproc('put_link', (username, url, title))
141 142
             ## This returns (link_id, user_id)
... ...
@@ -1,7 +1,8 @@
1
-import lxml.html
2
-import urllib2
3
-import urlparse
4 1
 import re
2
+import urlparse
3
+
4
+import lxml.html
5
+import requests
5 6
 
6 7
 from textblob import TextBlob
7 8
 from textblob_aptagger import PerceptronTagger
... ...
@@ -14,19 +15,22 @@ class DefaultTitleGetter(object):
14 15
     url_cleaner = re.compile('[+\-_]')
15 16
 
16 17
     def get_title(self, url):
18
+        s = requests.session()
17 19
         scheme, netloc, path, params, query, fragment = urlparse.urlparse(url, 'http')
18
-        data = urllib2.urlopen(url)
19
-        content_type = data.headers['content-type'].lower()
20
-        charset = 'utf-8'
21
-        if 'charset' in content_type:
22
-            charset = content_type.partition('charset=')[-1]
23
-        data = data.read()
24
-        data = data.decode(charset)
25
-        etree = lxml.html.fromstring(data)
26
-        titleElems = etree.xpath('//title')
27
-        title = url
28
-        if titleElems != []:
29
-            title = titleElems[0].text
20
+        data = s.get(url)
21
+        etree = lxml.html.fromstring(data.content.decode(data.encoding))
22
+
23
+        canonicalLink = etree.xpath('//link[@rel="canonical"]/@href')
24
+        if canonicalLink != []:
25
+            canonicalLink = canonicalLink[0]
26
+            data = s.get(canonicalLink)
27
+            etree = lxml.html.fromstring(data.content.decode(data.encoding))
28
+        else:
29
+            canonicalLink = url
30
+
31
+        title = etree.xpath('//title/text()')
32
+        if title != []:
33
+            title = title[0]
30 34
         elif path:
31 35
             # hacky way to make a title
32 36
             path = urlparse.unquote(path)
... ...
@@ -37,4 +41,4 @@ class DefaultTitleGetter(object):
37 41
             title = map(titlecase, path)
38 42
             title = u' \u2014 '.join(title)
39 43
             title = u' \u2014 '.join([title, netloc])
40
-        return title
44
+        return title, canonicalLink
... ...
@@ -2,6 +2,7 @@ import urlparse
2 2
 import urllib2
3 3
 import json
4 4
 
5
+# TODO: this should use the articlesearch API, if this is actually necessary
5 6
 class TimesTitleGetter(object):
6 7
     api_url='http://api.nytimes.com/svc/news/v3/content.json?url=%(url)s&api-key=%(api_key)s'
7 8
     site='nytimes.com'
... ...
@@ -13,4 +14,4 @@ class TimesTitleGetter(object):
13 14
         info = json.load(urllib2.urlopen(api_url))
14 15
         title = info['results'][0]['title']
15 16
         source = info['results'][0]['source']
16
-        return u'%s \u2014 %s' % (title, source)
17
+        return u'%s \u2014 %s' % (title, source), url
... ...
@@ -1,4 +1,5 @@
1 1
 import urllib2
2
+import requests
2 3
 import urlparse
3 4
 
4 5
 from .utils import memoize
... ...
@@ -34,11 +35,10 @@ class TitleGetter(object):
34 35
                 handler = self.getters[site]
35 36
                 break
36 37
 
37
-        title = None
38 38
         try:
39
-            title = handler.get_title(url)
40
-        except urllib2.HTTPError:
41
-            title = self.default_handler.get_title(url)
39
+            title, canonicalUrl = handler.get_title(url)
40
+        except requests.exceptions.RequestException:
41
+            title, canonicalUrl = self.default_handler.get_title(url)
42 42
 
43
-        return title.encode('utf-8')
43
+        return title.encode('utf-8'), canonicalUrl
44 44