# -*- coding: utf-8 -*-
import sys
import cgi
import json
import dateutil.parser
import re
import textblob
def get_date(date):
    """Turn a timestamp string from the dump into a datetime.

    The string is first handed to ``dateutil.parser.parse``.  When that
    fails, three relative spellings are recognised:

    * ``'Yesterday ...'`` -- the text after the first space is parsed as a
      time of day on the hard-coded date ``August 24``.
    * ``'Just now...'`` -- the hard-coded reference time is returned.
    * ``'<N> hr'``, ``'<N> hour'``, ``'<N> min'``, ``'<N> sec'`` (anything
      may follow the unit) -- ``N`` units are subtracted from the
      reference time.

    Args:
        date: the timestamp text to interpret.

    Returns:
        The value produced by ``dateutil.parser.parse`` for absolute
        timestamps, otherwise the reference time, optionally shifted back.

    Raises:
        TypeError, ValueError: the original parser error, re-raised after
            the offending value has been written to stderr, when no
            fallback applies.  A failure while parsing the rebuilt
            'Yesterday' string propagates directly.
    """
    # Imported explicitly: the attribute ``dateutil.relativedelta`` is not
    # guaranteed to exist just because ``dateutil.parser`` was imported.
    from dateutil import relativedelta

    # Regex unit -> relativedelta keyword.  Every alternative the pattern
    # can capture is listed, so no "unknown unit" branch is needed.
    unit_keywords = {
        'hr': 'hours',
        'hour': 'hours',
        'min': 'minutes',
        'sec': 'seconds',
    }
    time_parser = re.compile(r'(\d+) (hr|min|sec|hour)')

    try:
        return dateutil.parser.parse(date)
    except (TypeError, ValueError):
        # Only strings can carry one of the relative spellings; anything
        # else falls through so the original parser error is re-raised
        # instead of an unrelated AttributeError.
        if hasattr(date, 'startswith'):
            if date.startswith('Yesterday'):
                # NOTE(review): 'August 24' does not line up with the
                # 'Oct 22' reference time used below -- confirm which
                # capture date this constant belongs to.
                time_of_day = date.partition(' ')[2]
                return dateutil.parser.parse('August 24 %s' % time_of_day)

            # Reference "now" for relative stamps.  No year is given, so
            # the parser fills it in from its defaults (presumably the
            # current year -- verify against the dateutil docs).
            start_date = dateutil.parser.parse('Wed Oct 22 01:18')

            if date.startswith('Just now'):
                return start_date

            match = time_parser.match(date)
            if match:
                num, unit = match.groups()
                shift = {unit_keywords[unit]: -int(num)}
                return start_date + relativedelta.relativedelta(**shift)

        # Unrecognised: echo the value for debugging, then re-raise the
        # parser's exception.  Writing non-ASCII text may itself raise
        # UnicodeEncodeError, which the calling loop catches.
        sys.stderr.write('%s\n' % (date,))
        raise
# Marker that separates one post from the next inside the dump file.
delimiter = u'--##--%%--##--'

# Separator between the fields of a post's final (date / likes) line.
splitter = u' \xb7 '

# Read the whole dump, decode it as UTF-8 and cut it into per-post chunks.
with open('TNET') as f:
    txt = f.read().decode('utf-8').split(delimiter)

# %-style template keyed by name, date, likes and text.
# NOTE(review): not referenced in the visible part of the file.
beginning = u'''\
%(name)s at %(date)s with %(likes)d likes
%(text)s
'''

# Running count of accepted posts, and the list of per-post records.
counter = 0
data = []
# Second tokens that are short enough to be skipped as "initials" by the
# scan in get_name but are in fact the last token of the name.
_SHORT_SURNAMES = ('HF', 'Lng')

# Two-token results that are really the start of a three-token name.
_THREE_TOKEN_NAMES = ('Christopher Michael', 'Jody Haaf')

# Display name as it appears in the text -> name to report instead.
_NAME_ALIASES = {
    'Peregrine Bonaventure': 'Scott Weinberg',
    'JA Escalante': 'Jehoshaphat Escalante',
}


def get_name(line):
    """Extract the author's name from the first line of a post.

    The line is split on whitespace.  The name always includes the first
    token; every following token that ends with ``'.'`` or is shorter than
    four characters is taken as a middle part, and the first token that is
    neither closes the name.  A few hard-coded exceptions then adjust the
    result, and two names are replaced by an alias.

    Args:
        line: the text line that starts with the author's name.

    Returns:
        A ``(name, name_length)`` tuple.  ``name`` is the (possibly
        aliased) author name.  ``name_length`` is the length of the name
        *before* aliasing, joined with single spaces; the caller uses it
        to slice the name off the front of the post text, so it matches
        the original line only when the name tokens there are separated
        by single spaces.  An empty line yields ``('', 0)``.
    """
    tokens = line.split()

    # Skip over initials ("Q.") and short particles ("de", "la") that
    # follow the first name; stop on the first longer token.
    name_end = 1
    while name_end < len(tokens) and (
            tokens[name_end].endswith('.') or len(tokens[name_end]) < 4):
        name_end += 1
    parts = tokens[:name_end + 1]

    # The scan runs past short surnames; cut the name back to two tokens.
    # (The original also tested for 'Haaf' here, but a four-letter token
    # stops the scan at index 1, so that test could never succeed.)
    if len(parts) > 2 and parts[1] in _SHORT_SURNAMES:
        parts = parts[:2]
    name = ' '.join(parts)

    # Known names whose second token stops the scan one token too early.
    if name in _THREE_TOKEN_NAMES:
        name = ' '.join(tokens[:3])

    # Measure before aliasing: the length must describe the text as written.
    name_length = len(name)
    name = _NAME_ALIASES.get(name, name)
    return name, name_length
for k in txt:
k = k.strip()
lines = k.split('\n')
lines = map(cgi.escape, lines)
lines = filter(None, lines)
if 'ike' in lines[-1]:
items = lines[-1].split(splitter)
try:
date = get_date(items[0])
except UnicodeEncodeError:
print >> sys.stderr, '**problem:', items, items[0]
print >> sys.stderr, lines
break
likes = 0
if items[-1].isdigit():
likes = int(items[-1])
counter += 1
name, name_length = get_name(lines[0])
class_ = u'odd' if counter % 2 == 1 else u'even'
lines = lines[:-1]
j = '\n'.join(lines)[name_length:].lstrip()
blob = textblob.TextBlob(j)
data.append(dict(
name = name,
name_length = name_length,
date = date,
likes = likes,
text = j,
index = counter,
classes = class_,
polarity = int(round(((blob.sentiment.polarity + 1)/2)*255)),
subjectivity = int(round(blob.sentiment.subjectivity * 75 + 25))
))
ntemplate = data[-1].copy()
ntemplate['date'] = ntemplate['date'].isoformat()
ntemplate['text'] = u'