import sgmllib

class ParseAtAllCostsParser(sgmllib.SGMLParser):
    """Lenient RSS item extractor built on sgmllib.

    After feeding a document, ``self.items`` holds one dictionary per
    ``<item>`` element encountered.  Each dictionary maps a child tag
    name to the entity-decoded, whitespace-stripped text collected for
    that tag.
    """

    def reset(self):
        """Clear all collected items and parsing state."""
        # one dict per <item> seen so far
        self.items = []
        # name of the most recently opened non-<item> tag
        self.currentTag = None
        # text buffered since the last end tag
        self.currentValue = ''
        # 1 while between <item> and </item>, otherwise 0
        self.initem = 0
        sgmllib.SGMLParser.reset(self)

    def start_item(self, attrs):
        """Begin a new RSS item; its fields are collected in a fresh dict."""
        self.items.append({})
        self.initem = 1

    def end_item(self):
        """Mark that the parser has left the current RSS item."""
        self.initem = 0

    def unknown_starttag(self, tag, attrs):
        """Remember the tag under which subsequent text will be stored."""
        self.currentTag = tag

    def unknown_endtag(self, tag):
        """Store the buffered text for the current tag, then clear the buffer.

        The value is only stored while inside an item.  Note that it is
        filed under ``self.currentTag`` (the last tag opened), not under
        the ``tag`` argument of the end tag itself.
        """
        if self.initem:
            # decode entities and strip whitespace
            self.currentValue = decodeEntities(self.currentValue.strip())
            self.items[-1][self.currentTag] = self.currentValue
        # the buffer is discarded even outside an item
        self.currentValue = ''

    def handle_data(self, data):
        """Buffer plain text data."""
        self.currentValue += data

    def handle_entityref(self, data):
        """Buffer a named entity reference in its original ``&name;`` form.

        sgmllib passes only the bare entity name, so the ampersand and
        the terminating semicolon have to be restored here; otherwise
        decodeEntities() would never recognise the entity.
        """
        self.currentValue += '&' + data + ';'

    def handle_charref(self, data):
        """Buffer a numeric character reference in its ``&#num;`` form.

        sgmllib passes only the digits, so the ``&#`` prefix and the
        terminating semicolon are restored here.
        """
        self.currentValue += '&#' + data + ';'

def decodeEntities(data):
    """Return *data* with the five predefined XML entities decoded.

    sgmllib does not decode entities for us, so a correctly encoded
    document has to be decoded by hand.  Only ``&lt;``, ``&gt;``,
    ``&quot;``, ``&apos;`` and ``&amp;`` are handled; anything else is
    left untouched.
    """
    # '&amp;' must stay last so that text such as '&amp;lt;' decodes to
    # the literal '&lt;' instead of being decoded a second time to '<'
    substitutions = (
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&quot;', '"'),
        ('&apos;', "'"),
        ('&amp;', '&'),
    )
    for entity, character in substitutions:
        data = data.replace(entity, character)
    return data

if __name__ == '__main__':
    p = ParseAtAllCostsParser()
    p.feed(file('invalid.xml').read())
    for rssitem in p.items:
        print 'title:', rssitem.get('title')
        print 'description:', rssitem.get('description')
        print 'link:', rssitem.get('link')
        print