# TNET

# -*- coding: utf-8 -*-
"""Render the scraped ``TNET`` discussion dump as text, JSON or a word feed.

The input file ``TNET`` (read from the current directory) holds posts
separated by ``delimiter``.  Each post is expected to start with the author's
name and to end with a footer line whose fields are separated by ``splitter``;
the first footer field is a timestamp and the last one, when numeric, is the
like count.

Output modes:
    (default)   ``beginning``, one ``div_template`` rendering per post, ``end``
    --json/-j   the post records as indented JSON
    --fwc/-w    the alphabetic words of every post, space separated

NOTE(review): this file was reviewed from a whitespace-collapsed copy in which
the three templates contained text only.  They look like HTML whose tags were
stripped; restore the original markup if it is still available.
"""
import argparse
import datetime
import json
import re
import sys

try:  # Python 3
    from html import escape as _escape
except ImportError:  # Python 2
    from cgi import escape as _escape


# Marker that separates two posts in the input file.
delimiter = u'--##--%%--##--'

# Separator between the fields of a post's footer line (" \xb7 ").
splitter = u' \xb7 '

# Timestamp that relative ages ("3 hrs", "Just now") are measured from.
REFERENCE_TIMESTAMP = 'Wed Oct 22 01:18'

# Calendar date substituted for a leading "Yesterday".
# NOTE(review): this does not line up with REFERENCE_TIMESTAMP (Oct 22);
# confirm which scrape date is intended before relying on these dates.
YESTERDAY_DATE = 'August 24'

# "<number> <unit>" at the start of a relative timestamp.
_RELATIVE_AGE = re.compile(r'(\d+) (hr|min|sec|hour)')

# Maps each unit the pattern above can capture to a relativedelta keyword.
_UNIT_TO_FIELD = {
    'hr': 'hours',
    'hour': 'hours',
    'min': 'minutes',
    'sec': 'seconds',
}

# Author names that need exactly three tokens although the generic rule in
# get_name() stops after two.
_THREE_TOKEN_NAMES = ('Christopher Michael', 'Jody Haaf')

# Second tokens after which the name ends, even though the generic rule would
# have pulled in one more token.
_NAME_ENDING_TOKENS = ('HF', 'Lng')

# Display names substituted for the names found in the dump.
_NAME_ALIASES = {
    'Peregrine Bonaventure': 'Scott Weinberg',
    'JA Escalante': 'Jehoshaphat Escalante',
}

beginning = u'''\
TNET

Post #:

'''

end = u'''\

'''

div_template = u'''\

%(name)s at %(date)s with %(likes)d likes

%(text)s

'''


def get_date(date):
    """Convert a footer timestamp into a ``datetime``.

    Absolute timestamps are handed to ``dateutil.parser.parse``.  When that
    fails, three relative forms are recognised:

    * ``Yesterday <time>``  -> ``YESTERDAY_DATE`` at ``<time>``
    * ``Just now``          -> ``REFERENCE_TIMESTAMP``
    * ``<n> hr|hour|min|sec`` -> ``REFERENCE_TIMESTAMP`` minus that amount

    Args:
        date: the timestamp text taken from a post footer.

    Returns:
        The parsed ``datetime.datetime``.

    Raises:
        TypeError, ValueError: the original parser error, re-raised when none
            of the fallbacks apply (the offending text is echoed to stderr).
    """
    # Imported here rather than at module level so that the pure helpers in
    # this file stay importable without python-dateutil.  relativedelta is
    # imported explicitly instead of relying on dateutil.parser pulling it in.
    import dateutil.parser
    import dateutil.relativedelta

    try:
        return dateutil.parser.parse(date)
    except (TypeError, ValueError):
        if date.startswith('Yesterday'):
            time_of_day = date.partition(' ')[2]
            return dateutil.parser.parse(
                '%s %s' % (YESTERDAY_DATE, time_of_day))

        reference = dateutil.parser.parse(REFERENCE_TIMESTAMP)
        if date.startswith('Just now'):
            return reference

        match = _RELATIVE_AGE.match(date)
        if match is None:
            # repr() keeps the diagnostic safe for non-ASCII text.
            sys.stderr.write('%r\n' % (date,))
            raise  # re-raise the parser error that brought us here

        amount, unit = match.groups()
        offset = dateutil.relativedelta.relativedelta(
            **{_UNIT_TO_FIELD[unit]: -int(amount)})
        return reference + offset


def get_name(line):
    """Extract the author's name from the first line of a post.

    The name is the first token plus every following token that ends with a
    period or is shorter than four characters, plus the first token that is
    neither.  A few hard-coded exceptions then adjust the result.

    Args:
        line: first line of a post, starting with the author's name.

    Returns:
        ``(name, name_length)``.  ``name_length`` is the length of the name as
        found in ``line`` (tokens joined by single spaces), taken *before* any
        alias substitution, so the caller can slice the name off the text.
    """
    tokens = line.split()

    last = 1
    while last < len(tokens) and (
            tokens[last].endswith('.') or len(tokens[last]) < 4):
        last += 1
    parts = tokens[:last + 1]

    # "<first> HF <x>" / "<first> Lng <x>": the short token ends the name.
    if len(parts) > 2 and parts[1] in _NAME_ENDING_TOKENS:
        parts = parts[:2]

    name = ' '.join(parts)
    if name in _THREE_TOKEN_NAMES:
        name = ' '.join(tokens[:3])

    name_length = len(name)
    return _NAME_ALIASES.get(name, name), name_length


class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that writes ``datetime`` objects as ISO-8601 strings."""

    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        return json.JSONEncoder.default(self, obj)


def _sentiment_scores(text):
    """Return ``(polarity, subjectivity)`` of *text* rescaled to integers.

    Polarity is mapped with ``(p + 1) / 2 * 255`` and subjectivity with
    ``s * 75 + 25``; both are rounded to the nearest integer.
    """
    # Imported here so the rest of the module loads without textblob.
    import textblob

    sentiment = textblob.TextBlob(text).sentiment
    polarity = int(round((sentiment.polarity + 1) / 2 * 255))
    subjectivity = int(round(sentiment.subjectivity * 75 + 25))
    return polarity, subjectivity


def _build_posts(chunks):
    """Parse raw post chunks into ``(records, rendered)``.

    ``records`` is a list of dicts (name, name_length, date, likes, text,
    index, classes, polarity, subjectivity); ``rendered`` holds the matching
    ``div_template`` renderings.  Chunks that are empty or whose last line
    does not contain ``'ike'`` are skipped.  Processing stops at the first
    footer whose timestamp raises ``UnicodeEncodeError``.
    """
    records = []
    rendered = []
    for chunk in chunks:
        escaped = (_escape(raw, quote=False)
                   for raw in chunk.strip().split('\n'))
        lines = [line for line in escaped if line]

        # An empty chunk (e.g. after a trailing delimiter) has no footer.
        if not lines or 'ike' not in lines[-1]:
            continue

        footer = lines[-1].split(splitter)
        try:
            date = get_date(footer[0])
        except UnicodeEncodeError:
            sys.stderr.write('**problem: %r %r\n' % (footer, footer[0]))
            sys.stderr.write('%r\n' % (lines,))
            break

        likes = int(footer[-1]) if footer[-1].isdigit() else 0
        index = len(records) + 1
        name, name_length = get_name(lines[0])

        # Everything except the footer, with the leading author name cut off.
        text = '\n'.join(lines[:-1])[name_length:].lstrip()
        polarity, subjectivity = _sentiment_scores(text)

        record = dict(
            name=name,
            name_length=name_length,
            date=date,
            likes=likes,
            text=text,
            index=index,
            classes=u'odd' if index % 2 == 1 else u'even',
            polarity=polarity,
            subjectivity=subjectivity,
        )
        records.append(record)

        fields = record.copy()
        fields['date'] = fields['date'].isoformat()
        # NOTE(review): as reviewed this join is a no-op; the separator most
        # likely contained line-break markup that was stripped from the copy.
        fields['text'] = u'\n'.join(fields['text'].split('\n'))
        rendered.append(div_template % fields)

    return records, rendered


def _write_line(text):
    """Write *text* and a newline to stdout as UTF-8 bytes."""
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout already accepts bytes.
    stream = getattr(sys.stdout, 'buffer', sys.stdout)
    stream.write(text.encode('utf-8') + b'\n')


def main(argv=None):
    """Read ``TNET`` and print it in the format selected on the command line.

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--json', '-j', action='store_true', default=False)
    parser.add_argument('--fwc', '-w', action='store_true', default=False)
    args = parser.parse_args(argv)

    with open('TNET', 'rb') as f:
        txt = f.read().decode('utf-8')
    data, out = _build_posts(txt.split(delimiter))

    if args.json:
        _write_line(json.dumps(data, cls=DateTimeEncoder, indent=2))
    elif args.fwc:
        feed = []
        for x in data:
            # NOTE(review): x['text'] already has the footer and the author
            # name removed, so this drops the last text line and another
            # len(name) + 1 leading characters.  Kept as found; confirm it is
            # intended.
            body = u'\n'.join(x['text'].split('\n')[:-1])
            body = body[len(x['name']) + 1:]
            feed.append(u' '.join(w for w in body.split() if w.isalpha()))
        if feed:
            _write_line(u' '.join(feed))
    else:
        _write_line(beginning)
        for x in out:
            _write_line(x)
        _write_line(end)


if __name__ == '__main__':
    main()