import urllib2
import urlparse

import requests

from .utils import memoize
from .default import DefaultTitleGetter


def clean_url(url):
    """Normalise *url* and return ``(cleaned_url, netloc)``.

    The URL is parsed with ``'http'`` as the default scheme.  When the parse
    yields a path but no network location (e.g. ``example.com/page`` given
    without a scheme), the part of the path before the first ``/`` is used
    as the network location and the rest is kept as the path.

    Returns:
        A 2-tuple of the re-assembled URL string and the network location
        that was used to build it.
    """
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url, 'http')
    if path and not netloc:
        # Only the leading segment can be the network location.  Moving the
        # whole path into ``netloc`` would return a "netloc" containing
        # slashes and would make urlunparse() attach ";params" after an
        # extra "/" instead of directly after the path.
        netloc, slash, rest = path.partition('/')
        path = slash + rest
    cleaned = urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
    return cleaned, netloc


def split_netloc(netloc):
    """Yield *netloc* and each successively shorter dot-separated suffix.

    For ``u'a.b.c'`` this yields ``u'a.b.c'``, ``u'b.c'`` and ``u'c'``.
    Nothing is yielded for an empty string, and iteration stops as soon as
    the remaining suffix is empty.
    """
    tail = netloc
    while tail != u'':
        yield tail
        # Drop everything up to and including the first dot.
        tail = tail.partition(u'.')[2]


class TitleGetter(object):
    """Look up page titles, dispatching to per-site getters by host name."""

    # Registered getters keyed by their ``site`` attribute; shared by all
    # instances because it lives on the class.
    getters = {}

    @classmethod
    def add_getter(cls, getter):
        """Register *getter* under ``getter.site``, replacing any previous one."""
        cls.getters[getter.site] = getter

    # Handler used when no registered getter matches, and as the fallback
    # when the matched getter's request fails.
    default_handler = DefaultTitleGetter()

    # Storage handed to the ``memoize`` decorator below.
    # NOTE(review): presumably memoize caches get_title results here --
    # confirm against utils.memoize.
    title_cache = {}

    @memoize(title_cache)
    def get_title(self, url):
        """Return ``(title, canonical_url)`` for *url*.

        The URL is cleaned with ``clean_url``; then the most specific
        registered getter whose site matches the URL's network location, or
        one of its dot-separated suffixes, is used.  If none matches, the
        default handler is used.  If the chosen handler raises
        ``requests.exceptions.RequestException``, the default handler is
        tried instead; an exception raised by that second attempt propagates.

        Returns:
            A 2-tuple of the title encoded as UTF-8 and the canonical URL
            reported by the handler.
        """
        # The network location is re-derived from the cleaned URL below, so
        # the one returned by clean_url is not needed here.
        url, _ = clean_url(url)

        handler = self.default_handler
        for site in split_netloc(urlparse.urlparse(url).netloc):
            if site in self.getters:
                handler = self.getters[site]
                break

        try:
            title, canonical_url = handler.get_title(url)
        except requests.exceptions.RequestException:
            title, canonical_url = self.default_handler.get_title(url)
        return title.encode('utf-8'), canonical_url