Mega Code Archive

 
Categories / Python Tutorial / Network
 

HTML Title Retriever With Entity Support

from htmlentitydefs  import entitydefs from HTMLParser import HTMLParser import sys class TitleParser(HTMLParser):     def __init__(self):         self.title = ''         self.readingtitle = 0         HTMLParser.__init__(self)     def handle_starttag(self, tag, attrs):         if tag == 'title':             self.readingtitle = 1     def handle_data(self, data):         if self.readingtitle:             self.title += data     def handle_endtag(self, tag):         if tag == 'title':             self.readingtitle = 0     def handle_entityref(self, name):         if entitydefs.has_key(name):             self.handle_data(entitydefs[name])         else:             self.handle_data('&' + name + ';')     def handle_charref(self, name):         try:             charnum = int(name)         except ValueError:             return         if charnum < 1 or charnum > 255:             return         self.handle_data(chr(charnum))     def gettitle(self):         return self.title fd = open(sys.argv[1]) tp = TitleParser() tp.feed(fd.read()) print "Title is:", tp.gettitle()