# git.fiddlerwoaroof.com
# Raw Blame History
class test(HTMLParser.HTMLParser):
    """Accumulate table text and page titles from fed HTML into ``out``.

    State flags:
      * ``intable``   -- set once a ``<table>`` start tag has been seen.
      * ``titling``   -- true while inside ``<title>...</title>``.
      * ``capturing`` -- cleared by any start tag whose ``class`` is ``'m'``
        and set again by a ``<td>`` whose ``class`` is not ``'m'``.

    Text is buffered in ``cdata`` and, on every end tag seen while
    ``capturing`` is true, joined with newlines and appended to ``out``.
    """

    intable = False
    titling = False
    capturing = True
    cdata = []  # class-level default only; each instance gets its own list
    out = u''

    def __init__(self, *args, **kwargs):
        """Initialise the parser and give this instance a private buffer."""
        # Explicit base-class call (rather than super()) so this works
        # whether or not the base parser is a new-style class.
        HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
        # Without this, the first append() would mutate the list stored on
        # the class and leak buffered text into every other instance.
        self.cdata = []

    def _as_text(self, data):
        """Return *data* as text, decoding UTF-8 only if it is a byte string."""
        if isinstance(data, bytes):
            return data.decode('utf-8')
        return data

    def handle_starttag(self, tag, attrs):
        """Update the state flags for a start tag.

        tag   -- tag name as reported by the parser.
        attrs -- list of ``(name, value)`` pairs.
        """
        # Trace output kept from the original; a single parenthesised
        # expression prints the same text under the Python 2 print statement.
        print('%s %s' % (tag, attrs))
        name = tag.lower()
        css_class = dict(attrs).get('class')
        if css_class == 'm':
            self.capturing = False
        if name == 'table':
            # NOTE(review): intable is never cleared on </table> -- confirm
            # that this is intended.
            self.intable = True
        elif name == 'title':
            self.titling = True
        elif name == 'td' and css_class != 'm':
            self.capturing = True

    def handle_endtag(self, tag):
        """Leave title mode on ``</title>`` and flush the buffer into ``out``."""
        if self.titling and tag.lower() == 'title':
            self.titling = False
        if self.capturing:
            # A newline is appended for every end tag, even when nothing
            # was buffered since the previous flush.
            chunk = u'\n'.join(self.cdata)
            self.cdata = []
            self.out += u'\n' + chunk

    def handle_data(self, data):
        """Buffer non-blank table text and, inside ``<title>``, a heading line."""
        # Decoding stays inside each branch so ignored data is never decoded.
        if self.capturing and self.intable and data.strip():
            self.cdata.append(self._as_text(data))
        if self.titling:
            self.cdata.append(u'HEADING: %s' % self._as_text(data))


# Driver: feed the numbered pages 3_01.html .. 3_13.html through one parser,
# then feed 2_01.html through the same parser.
b = test()
for x in range(1, 14):
    with open('3_%02d.html' % x, 'rU') as a:
        # Reset the parser's input buffer and re-enable capturing per page.
        b.reset()
        b.capturing = True
        b.feed(a.read())
# NOTE(review): unlike the loop above, this page is fed without a reset, so
# parser state from 3_13.html carries over -- confirm that this is intended.
with open('2_01.html') as a:  # 'with' closes the handle the original leaked
    b.feed(a.read())