git.fiddlerwoaroof.com
Browse code

Switching from urllib2 to requests

- Now the DefaultTitleGetter canonicalizes the URL via the link
rel=canonical attribute, if such an attribute exists.

fiddlerwoaroof authored on 19/10/2015 00:14:28
Showing 5 changed files
... ...
@@ -1,12 +1,13 @@
1 1
 Flask==0.10.1
2 2
 Flask-Cors==2.1.0
3
-Flask-Limiter==0.8.1
4
-Flask-Login==0.3.0
3
+Flask-Limiter==0.8.5
4
+Flask-Login==0.3.2
5 5
 Flask-OAuth==0.12
6 6
 Flask-Security==1.7.4
7
-Flask-WTF==0.11
7
+Flask-WTF==0.12
8 8
 lxml==3.4.4
9 9
 psycopg2==2.6.1
10 10
 python-dateutil==2.4.2
11
-textblob==0.9.1
12
-uWSGI==2.0.11
11
+requests==2.8.1
12
+textblob==0.10.0
13
+uWSGI==2.0.11.2
... ...
@@ -56,7 +56,7 @@ def clean_url(url):
56 56
         netloc, path = path, netloc
57 57
     return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
58 58
 
59
-def get_title(url):
59
+def get_siteinfo(url):
60 60
     return config.titlegetter.get_title(url)
61 61
 
62 62
 @bone_blueprint.route('/vote/total')
... ...
@@ -135,7 +135,8 @@ def submit_link():
135 135
     if username is not None:
136 136
         url, title = obj['url'],obj['title']
137 137
         url = clean_url(url)
138
-        title = get_title(url)
138
+        title, url = get_siteinfo(url) # this makes sure that the url is the site's preferred URL
139
+                                       #  TODO: this might need sanity checks . . . like make sure same site?
139 140
         with db.cursor() as cur:
140 141
             cur.callproc('put_link', (username, url, title))
141 142
             ## This returns (link_id, user_id)
... ...
@@ -1,7 +1,8 @@
1
-import lxml.html
2
-import urllib2
3
-import urlparse
4 1
 import re
2
+import urlparse
3
+
4
+import lxml.html
5
+import requests
5 6
 
6 7
 from textblob import TextBlob
7 8
 from textblob_aptagger import PerceptronTagger
... ...
@@ -14,19 +15,22 @@ class DefaultTitleGetter(object):
14 15
     url_cleaner = re.compile('[+\-_]')
15 16
 
16 17
     def get_title(self, url):
18
+        s = requests.session()
17 19
         scheme, netloc, path, params, query, fragment = urlparse.urlparse(url, 'http')
18
-        data = urllib2.urlopen(url)
19
-        content_type = data.headers['content-type'].lower()
20
-        charset = 'utf-8'
21
-        if 'charset' in content_type:
22
-            charset = content_type.partition('charset=')[-1]
23
-        data = data.read()
24
-        data = data.decode(charset)
25
-        etree = lxml.html.fromstring(data)
26
-        titleElems = etree.xpath('//title')
27
-        title = url
28
-        if titleElems != []:
29
-            title = titleElems[0].text
20
+        data = s.get(url)
21
+        etree = lxml.html.fromstring(data.content.decode(data.encoding))
22
+
23
+        canonicalLink = etree.xpath('//link[@rel="canonical"]/@href')
24
+        if canonicalLink != []:
25
+            canonicalLink = canonicalLink[0]
26
+            data = s.get(canonicalLink)
27
+            etree = lxml.html.fromstring(data.content.decode(data.encoding))
28
+        else:
29
+            canonicalLink = url
30
+
31
+        title = etree.xpath('//title/text()')
32
+        if title != []:
33
+            title = title[0]
30 34
         elif path:
31 35
             # hacky way to make a title
32 36
             path = urlparse.unquote(path)
... ...
@@ -37,4 +41,4 @@ class DefaultTitleGetter(object):
37 41
             title = map(titlecase, path)
38 42
             title = u' \u2014 '.join(title)
39 43
             title = u' \u2014 '.join([title, netloc])
40
-        return title
44
+        return title, canonicalLink
... ...
@@ -2,6 +2,7 @@ import urlparse
2 2
 import urllib2
3 3
 import json
4 4
 
5
+# TODO: this should use the articlesearch API, if this is actually necessary
5 6
 class TimesTitleGetter(object):
6 7
     api_url='http://api.nytimes.com/svc/news/v3/content.json?url=%(url)s&api-key=%(api_key)s'
7 8
     site='nytimes.com'
... ...
@@ -13,4 +14,4 @@ class TimesTitleGetter(object):
13 14
         info = json.load(urllib2.urlopen(api_url))
14 15
         title = info['results'][0]['title']
15 16
         source = info['results'][0]['source']
16
-        return u'%s \u2014 %s' % (title, source)
17
+        return u'%s \u2014 %s' % (title, source), url
... ...
@@ -1,4 +1,5 @@
1 1
 import urllib2
2
+import requests
2 3
 import urlparse
3 4
 
4 5
 from .utils import memoize
... ...
@@ -34,11 +35,10 @@ class TitleGetter(object):
34 35
                 handler = self.getters[site]
35 36
                 break
36 37
 
37
-        title = None
38 38
         try:
39
-            title = handler.get_title(url)
40
-        except urllib2.HTTPError:
41
-            title = self.default_handler.get_title(url)
39
+            title, canonicalUrl = handler.get_title(url)
40
+        except requests.exceptions.RequestException:
41
+            title, canonicalUrl = self.default_handler.get_title(url)
42 42
 
43
-        return title.encode('utf-8')
43
+        return title.encode('utf-8'), canonicalUrl
44 44