def _to_text(data):
    """Return *data* as text, decoding it from UTF-8 if it is a byte string.

    Text that is already unicode is returned untouched; calling ``decode`` on
    it would first encode it as ASCII and fail on any non-ASCII character.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this matches exactly
    # the values the parser produces when it is fed raw file contents.
    if isinstance(data, bytes):
        return data.decode('utf-8')
    return data


class test(HTMLParser.HTMLParser):
    """HTML parser that accumulates table text and page titles in ``out``.

    State flags:
        intable   -- set once the first ``<table>`` start tag is seen; it is
                     never cleared by this class.
        titling   -- true while inside ``<title>`` ... ``</title>``.
        capturing -- switched off by any start tag whose ``class`` attribute
                     is ``'m'`` and back on by a ``<td>`` without that class.

    Text is buffered in ``cdata`` and, on every end tag seen while
    ``capturing`` is true, joined with newlines and appended to ``out``
    (preceded by a newline).
    """

    # Immutable defaults; instances override them by plain assignment.
    intable = False
    titling = False
    capturing = True
    out = u''

    def __init__(self, *args, **kwargs):
        """Initialise the base parser and give this instance its own buffer."""
        # Explicit base call rather than super(): the Python 2 HTMLParser is
        # an old-style class.
        HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
        # Per-instance list: a class-level ``[]`` would be shared (and
        # mutated) by every parser instance.
        self.cdata = []

    def handle_starttag(self, tag, attrs):
        """Update the state flags for the start tag ``tag`` with ``attrs``."""
        # Debug trace of every start tag (same text the original printed).
        print('%s %s' % (tag, attrs))
        css_class = dict(attrs).get('class')
        name = tag.lower()
        if css_class == 'm':
            self.capturing = False
        if name == 'table':
            self.intable = True
        elif name == 'title':
            self.titling = True
        elif name == 'td' and css_class != 'm':
            self.capturing = True

    def handle_endtag(self, tag):
        """Close a title if needed and flush buffered text into ``out``."""
        if self.titling and tag.lower() == 'title':
            self.titling = False
        if self.capturing:
            # A newline is appended on every end tag, even if nothing was
            # buffered since the previous flush.
            self.out += u'\n' + u'\n'.join(self.cdata)
            self.cdata = []

    def handle_data(self, data):
        """Buffer non-blank table text, and title text tagged as a heading."""
        if self.capturing and self.intable and data.strip():
            self.cdata.append(_to_text(data))
        if self.titling:
            self.cdata.append(u'HEADING: %s' % _to_text(data))


b = test()
for x in range(1, 14):
    with open('3_%02d.html' % x, 'rU') as a:
        # Start each numbered page with a clean parser buffer and with
        # capturing re-enabled; ``out`` keeps accumulating across pages.
        b.reset()
        b.capturing = True
        b.feed(a.read())
# NOTE(review): the last page is fed without reset() and without re-enabling
# capturing, exactly as before -- confirm whether that is intentional.
with open('2_01.html') as a:
    b.feed(a.read())