try:
    import HTMLParser  # Python 2 module name
except ImportError:
    # Python 3: bind the same module-level name so that references to
    # ``HTMLParser.HTMLParser`` keep working unchanged.
    import html.parser as HTMLParser


class TMP(HTMLParser.HTMLParser):
    """Collect link/image/text groups from ``<div class="smallbox">`` elements.

    While parsing, one list is built per matching div and appended to
    ``self.links`` when a ``</div>`` is seen in the DIV state.  Each list
    holds, in document order, the ``href`` of every ``<a>`` found directly
    in the DIV state, the ``src`` of every ``<img>`` seen while inside such
    an ``<a>``, and the text accumulated inside the ``<a>`` (each data chunk
    followed by one space).

    A debug trace line is printed to stdout for every start and end tag,
    indented with one tab per current nesting level.
    """

    # Parser states.
    INI = 0  # not inside a "smallbox" div
    DIV = 1  # inside a "smallbox" div, outside any collected <a>
    OTH = 2  # inside an <a> that was opened while in the DIV state
    AST = 3  # not referenced anywhere in this class

    def __init__(self):
        # Explicit base-class call (not super()): on Python 2 the base
        # HTMLParser is an old-style class.
        HTMLParser.HTMLParser.__init__(self)
        self.links = []    # finished groups, one list per collected div
        self.__tmp = []    # group currently being assembled
        self.dbuf = ''     # text gathered inside the current <a>
        self.__state = self.INI
        self.level = 0     # start tags minus end tags; trace indentation only

    def _flush_text(self):
        """Move any buffered link text into the current group."""
        if self.dbuf:
            self.__tmp.append(self.dbuf)
            self.dbuf = ''

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on a start tag and print a trace line.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        base parser.  ``<img>`` without ``src`` and ``<a>`` without ``href``
        are ignored instead of raising ``KeyError``.
        """
        attrs = dict(attrs)
        self.level += 1
        # The trace is assembled first and printed once; the pieces are
        # space-separated exactly as the chained print statements were.
        trace = ['%stag: %r, attrs: %r, state: %r'
                 % ('\t' * self.level, tag, attrs, self.__state)]
        if (tag == 'div' and self.__state != self.DIV
                and attrs.get('class') == 'smallbox'):
            trace.append('useful div')
            self.__state = self.DIV
        elif tag == 'img' and self.__state == self.OTH and 'src' in attrs:
            trace.append('useful image')
            self.__tmp.append(attrs['src'])
        elif tag == 'a' and self.__state == self.DIV and 'href' in attrs:
            trace.append('useful link')
            self.__tmp.append(attrs['href'])
            self.__state = self.OTH
        trace.append('..done')
        print(' '.join(trace))

    def handle_endtag(self, tag):
        """Close the current link or div group and print a trace line."""
        print('%sclose tag: %r' % ('\t' * self.level, tag))
        self.level -= 1
        if tag == 'div' and self.__state == self.DIV:
            self._flush_text()
            self.links.append(self.__tmp)
            self.__tmp = []
            # Leave the div.  This must be an assignment: a comparison here
            # would keep the parser in DIV and collect links from the rest
            # of the document.
            self.__state = self.INI
        elif tag == 'a' and self.__state == self.OTH:
            self.__state = self.DIV
            self._flush_text()

    def handle_data(self, data):
        """Buffer text that appears inside a collected ``<a>`` element."""
        if self.__state == self.OTH:
            self.dbuf += data + ' '