import sgmllib class ParseAtAllCostsParser(sgmllib.SGMLParser): def reset(self): self.items = [] self.currentTag = None self.currentValue = '' self.initem = 0 sgmllib.SGMLParser.reset(self) def start_item(self, attrs): # set a flag that we're within an RSS item now self.items.append({}) self.initem = 1 def end_item(self): # OK, we're out of the RSS item self.initem = 0 def unknown_starttag(self, tag, attrs): self.currentTag = tag def unknown_endtag(self, tag): # if we're within an RSS item, save the data we've buffered if self.initem: # decode entities and strip whitespace self.currentValue = decodeEntities(self.currentValue.strip()) self.items[-1][self.currentTag] = self.currentValue self.currentValue = '' def handle_data(self, data): # buffer all text data self.currentValue += data def handle_entityref(self, data): # buffer all entities self.currentValue += '&' + data handle_charref = handle_entityref def decodeEntities(data): # in case our document *was* encoded correctly, we'll # need to decode the XML entities manually; sgmllib # will not do it for us data = data.replace('<', '<') data = data.replace('>', '>') data = data.replace('"', '"') data = data.replace(''', "'") data = data.replace('&', '&') return data if __name__ == '__main__': p = ParseAtAllCostsParser() p.feed(file('invalid.xml').read()) for rssitem in p.items: print 'title:', rssitem.get('title') print 'description:', rssitem.get('description') print 'link:', rssitem.get('link') print