"""Print per-element statistics for an XML document.

Usage: pass the path of an XML file as the first command-line argument.
For every element type the report lists how often it occurred, how much
character data it held, and tallies of its parents, children and
attributes.
"""

import functools
import os
import re
import sys
from xml.parsers import expat


class Elinfo(object):
    """Counters collected for one element type (all elements sharing a name)."""

    def __init__(self, name):
        self.name = name
        self.count = 0           # number of start tags seen for this name
        self.seen = 0            # first-seen rank, assigned by Docinfo.strt_handle
        self.minlev = 0          # shallowest depth found; valid once has_minlev is set
        self.has_minlev = False  # separate flag so depth 0 (the root) is not "unset"
        self.charcount = 0       # characters of character data received
        self.empty = 1           # cleared on the first child or character data
        self.ptab = {}           # parent element name -> occurrences under it
        self.ktab = {}           # child element name -> occurrences inside it
        self.atab = {}           # attribute name -> occurrences


class Docinfo(object):
    """Accumulates Elinfo records from expat-style parser callbacks."""

    def __init__(self):
        self.root = None      # name of the outermost element, once seen
        self.eltab = {}       # element name -> Elinfo
        self.elstack = []     # names of the currently open elements
        self.seenorder = 0    # next first-seen rank to hand out
        # Matches UTF-8 lead bytes of multi-byte sequences; only applied to
        # character data that arrives as a byte string.
        self.utf8pattern = re.compile(b"[\xc0-\xf7]")

    def strt_handle(self, name, attrs):
        """Record the start of element *name*.

        *attrs* may be a dict mapping attribute names to values, or a flat
        ``[name, value, name, value, ...]`` sequence; both forms are
        accepted.
        """
        inf = self.eltab.get(name)
        if inf is None:
            inf = Elinfo(name)
            inf.seen = self.seenorder
            self.seenorder += 1
            self.eltab[name] = inf
        inf.count += 1

        if self.elstack:
            parent = self.elstack[-1]
            inf.ptab[parent] = inf.ptab.get(parent, 0) + 1
            pinf = self.eltab[parent]
            pinf.empty = 0
            pinf.ktab[name] = pinf.ktab.get(name, 0) + 1
        else:
            self.root = name

        # Iterating a dict yields its keys; in the flat form the names sit
        # at the even indexes.
        if isinstance(attrs, dict):
            attnames = attrs
        else:
            attnames = attrs[::2]
        for attname in attnames:
            inf.atab[attname] = inf.atab.get(attname, 0) + 1

        self.elstack.append(name)

    def end_handle(self, name):
        """Record the end of the innermost open element."""
        # Guarded so an unmatched end event stays a no-op rather than raising.
        if self.elstack:
            self.elstack.pop()

    def char_handle(self, data):
        """Add the number of characters in *data* to the current element.

        Decoded text is counted with ``len``.  A byte string is treated as
        UTF-8: its length is reduced by the continuation bytes implied by
        each multi-byte lead byte.
        """
        inf = self.eltab[self.elstack[-1]]
        inf.empty = 0
        cnt = len(data)
        if isinstance(data, bytes):
            for match in self.utf8pattern.finditer(data):
                lead = ord(match.group())
                if lead < 0xE0:       # two-byte sequence
                    cnt -= 1
                elif lead < 0xF0:     # three-byte sequence
                    cnt -= 2
                else:                 # four-byte sequence
                    cnt -= 3
        inf.charcount += cnt

    def set_minlev(self, name, level):
        """Set the shallowest nesting depth of *name* and its descendants.

        *level* is the depth at which *name* was just reached.  The walk
        only continues below an element when this visit lowers its
        recorded depth, which also stops recursion on cyclic content
        models.
        """
        inf = self.eltab[name]
        if inf.has_minlev and inf.minlev <= level:
            return
        inf.has_minlev = True
        inf.minlev = level
        for kid in inf.ktab:
            self.set_minlev(kid, level + 1)


def _emit(*items):
    """Write *items* to stdout separated by single spaces, then a newline."""
    # A single pre-joined argument keeps print() valid under both the
    # Python 2 print statement and the Python 3 print function.
    print(' '.join(['%s' % (item,) for item in items]))


def showtab(label, tab, dosum):
    """Print the name -> count table *tab* under the heading *label*.

    Nothing is printed for an empty table.  When *dosum* is true and the
    table has more than one row, a total line is appended.
    """
    if not tab:
        return
    _emit('\n ', label + ':')
    total = 0
    for name in sorted(tab):
        cnt = tab[name]
        total += cnt
        _emit(' %-16s %5d' % (name, cnt))
    if dosum and len(tab) > 1:
        _emit(' =====')
        _emit(' %5d' % total)


def elcmp(a, b):
    """Compare two Elinfo records by minimum depth, then first-seen order."""
    cmpmin = a.minlev - b.minlev
    if cmpmin:
        return cmpmin
    return a.seen - b.seen


def _feed(parser, data, final):
    """Pass *data* to *parser*; return True on success, False on a parse error."""
    # A failure may be signalled either by a zero status or by ExpatError.
    try:
        return parser.Parse(data, final) != 0
    except expat.ExpatError:
        return False


def _report_parse_error(parser):
    """Print the parser's error code and the position it was raised at."""
    _emit(parser.ErrorCode, ' at line ', parser.ErrorLineNumber,
          ', column ', parser.ErrorColumnNumber, ', byte ',
          parser.ErrorByteIndex)


def main(argv=None):
    """Analyse the XML file named by ``argv[1]`` and print the report.

    Returns the process exit status: 0 on success, -1 when the document
    fails to parse, 2 when no file name was given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = os.path.basename(argv[0]) if argv else 'elinfo'
        sys.stderr.write('usage: %s xmlfile\n' % prog)
        return 2

    doc = Docinfo()
    parser = expat.ParserCreate()
    parser.StartElementHandler = doc.strt_handle
    parser.EndElementHandler = doc.end_handle
    parser.CharacterDataHandler = doc.char_handle

    # Binary mode hands the parser the raw bytes, so the document's own
    # encoding declaration is honoured; the file is closed on every path.
    with open(argv[1], 'rb') as docstream:
        while 1:
            buff = docstream.read(32000)
            if not buff:
                break
            if not _feed(parser, buff, 0):
                _report_parse_error(parser)
                return -1
    if not _feed(parser, b'', 1):
        _report_parse_error(parser)
        return -1

    doc.set_minlev(doc.root, 0)
    sortinf = sorted(doc.eltab.values(), key=functools.cmp_to_key(elcmp))
    for elinf in sortinf:
        _emit('\n================')
        _emit(elinf.name + ':', elinf.count)
        if elinf.charcount:
            # NOTE: the figure is a character count; the wording is kept
            # unchanged so existing output stays the same.
            _emit('Had', elinf.charcount, 'bytes of character data')
        if elinf.empty:
            _emit('Always empty')
        showtab('Parents', elinf.ptab, 0)
        showtab('Children', elinf.ktab, 1)
        showtab('Attributes', elinf.atab, 0)
    return 0


if __name__ == '__main__':
    sys.exit(main())