# git.fiddlerwoaroof.com
# Raw Blame History
import HTMLParser

class TMP(HTMLParser.HTMLParser):
    """Collect link data from ``<div class="smallbox">`` elements.

    While a ``smallbox`` div is open, every ``<a href=...>`` contributes, in
    order, its ``href``, the ``src`` of each ``<img>`` inside the link, and
    finally the link's text (each text chunk followed by one space).  When a
    ``</div>`` is seen the collected values are appended to ``links`` as one
    list and the parser goes back to ignoring input until the next
    ``smallbox`` div.

    Every start and end tag is also traced to stdout, indented with one tab
    per nesting level.

    Limitation: any ``</div>`` seen while a group is open (and no link is
    open) closes the group, so a div nested inside a ``smallbox`` ends the
    group early.
    """

    # Parser states.
    INI = 0  # not inside a smallbox div
    DIV = 1  # inside a smallbox div, no link currently open
    OTH = 2  # inside a link that belongs to a smallbox div
    AST = 3  # not referenced by any method of this class

    def __init__(self):
        """Initialise the underlying parser and the empty collection state."""
        HTMLParser.HTMLParser.__init__(self)
        self.links = []  # finished groups, one list per closed div
        self.__tmp = []  # group currently being collected
        self.dbuf = ''  # text accumulated for the link currently open
        self.__state = self.INI
        self.level = 0  # nesting depth, used only to indent the trace

    def _flush_text(self):
        """Move any buffered link text into the current group."""
        if self.dbuf:
            self.__tmp.append(self.dbuf)
            self.dbuf = ''

    def handle_starttag(self, tag, attrs):
        """Trace the tag and update the collection state.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        parser.  ``<img>`` tags without ``src`` and ``<a>`` tags without
        ``href`` are ignored instead of raising ``KeyError``.
        """
        attrs = dict(attrs)
        self.level += 1
        # The trace line is assembled first and printed once; a parenthesised
        # single-argument print is valid both as a Python 2 statement and as
        # a Python 3 function call.
        trace = ['%stag: %r, attrs: %r, state: %r'
                 % ('\t' * self.level, tag, attrs, self.__state)]
        if (tag == 'div' and self.__state != self.DIV
                and attrs.get('class') == 'smallbox'):
            trace.append('useful div')
            self.__state = self.DIV
        elif tag == 'img' and self.__state == self.OTH and 'src' in attrs:
            trace.append('useful image')
            self.__tmp.append(attrs['src'])
        elif tag == 'a' and self.__state == self.DIV and 'href' in attrs:
            trace.append('useful link')
            self.__tmp.append(attrs['href'])
            self.__state = self.OTH
        trace.append('..done')
        print(' '.join(trace))

    def handle_endtag(self, tag):
        """Trace the closing tag and finish the open link or group."""
        print('%sclose tag: %r' % ('\t' * self.level, tag))
        self.level -= 1
        if tag == 'div' and self.__state == self.DIV:
            self._flush_text()
            self.links.append(self.__tmp)
            self.__tmp = []
            # Assignment, not comparison: without this reset every later
            # <a> and </div> in the document would still be collected.
            self.__state = self.INI
        elif tag == 'a' and self.__state == self.OTH:
            self.__state = self.DIV
            self._flush_text()

    def handle_data(self, data):
        """Buffer text that appears inside a collected link."""
        if self.__state == self.OTH:
            self.dbuf += data + ' '