import functools
import os
import re
import sys

try:
	from xml.parsers import pyexpat
except ImportError:
	# Stock CPython ships the expat binding as xml.parsers.expat; the
	# xml.parsers.pyexpat path is only provided by the legacy PyXML package.
	from xml.parsers import expat as pyexpat

class Elinfo:
	"""Statistics gathered for one element type (all elements sharing a name).

	Attributes:
		name: element name.
		count: number of start tags counted for this name.
		minlev: nesting level assigned by Docinfo.set_minlev (0 until assigned).
		charcount: running total added by Docinfo.char_handle.
		empty: 1 until a child element or character data is seen, then 0.
		ptab: parent element name -> times this element appeared under it.
		ktab: child element name -> times that child appeared in this element.
		atab: attribute name -> times the attribute appeared on this element.
		seen: first-seen sequence number; Docinfo.strt_handle overwrites it
			when it creates the record.
	"""

	def __init__(self, name):
		self.name = name
		self.count = 0
		self.minlev = 0
		self.charcount = 0
		self.empty = 1
		# Each instance gets its own tables; they are filled in by Docinfo.
		self.ptab = {}
		self.ktab = {}
		self.atab = {}
		# Previously only attached from outside, so sorting a record that
		# was built by hand raised AttributeError in elcmp.
		self.seen = 0

class Docinfo:
	"""Collects per-element statistics from expat parser callbacks.

	strt_handle, end_handle and char_handle are meant to be installed as the
	parser's StartElementHandler, EndElementHandler and CharacterDataHandler.
	Once parsing is finished, set_minlev(root, 0) assigns every element type
	its shallowest nesting level.
	"""

	def __init__(self):
		self.root = None      # name of the outermost element, once seen
		self.eltab = {}       # element name -> Elinfo record
		self.elstack = []     # names of the currently open elements
		self.seenorder = 0    # sequence number for the next new element name
		# Matches UTF-8 lead bytes.  char_handle no longer uses it; it is
		# kept so that code reading this attribute keeps working.
		self.utf8pattern = re.compile("([\300-\367])")
		# Names already given a level by set_minlev.  Needed because a
		# minlev of 0 is both the initial value and the root's real level.
		self._levelled = set()

	def strt_handle(self, name, attrs):
		"""Record the start of element `name`.

		`attrs` may be a mapping of attribute name to value (what pyexpat
		passes by default) or a flat [name, value, name, value, ...]
		sequence (what it passes when ordered_attributes is enabled).
		"""
		inf = self.eltab.get(name)
		if inf is None:
			inf = Elinfo(name)
			inf.seen = self.seenorder
			self.seenorder += 1
			self.eltab[name] = inf
		inf.count += 1

		if self.elstack:
			parent = self.elstack[-1]
			inf.ptab[parent] = inf.ptab.get(parent, 0) + 1
			pinf = self.eltab[parent]
			pinf.empty = 0
			pinf.ktab[name] = pinf.ktab.get(name, 0) + 1
		else:
			self.root = name

		# Attribute handling: only the names are counted, never the values.
		if hasattr(attrs, 'keys'):
			attrnames = list(attrs.keys())
		else:
			attrnames = attrs[::2]
		for attrname in attrnames:
			inf.atab[attrname] = inf.atab.get(attrname, 0) + 1
		self.elstack.append(name)

	def end_handle(self, name):
		"""Record the end of the innermost open element."""
		if self.elstack:
			self.elstack.pop()

	def char_handle(self, data):
		"""Add the length of `data`, in characters, to the open element.

		Text strings are counted with len().  Byte strings are taken to be
		UTF-8 and every byte except continuation bytes (10xxxxxx) counts as
		one character.
		"""
		inf = self.eltab[self.elstack[-1]]
		inf.empty = 0
		if isinstance(data, bytes):
			cnt = sum(1 for byte in bytearray(data) if (byte & 0xC0) != 0x80)
		else:
			# Already decoded: running the lead-byte scan here would
			# mistake characters U+00C0..U+00F7 for lead bytes and
			# undercount.
			cnt = len(data)
		inf.charcount += cnt

	def set_minlev(self, name, level):
		"""Lower the level of `name` to `level`, then recurse into its children.

		An element's level only changes the first time it is reached or
		when a shallower path to it is found, which also ends the recursion
		on cyclic content models.
		"""
		inf = self.eltab[name]
		if name not in self._levelled or inf.minlev > level:
			self._levelled.add(name)
			inf.minlev = level
			newlev = level + 1
			for kid in list(inf.ktab):
				self.set_minlev(kid, newlev)

def showtab(label, tab, dosum):
	"""Print a name -> count table under the heading `label`.

	Nothing is printed when `tab` is empty.  Rows are listed in sorted name
	order.  When `dosum` is true and there is more than one row, a rule and
	the total of all counts follow the rows.
	"""
	if not tab:
		return
	# Every print call takes a single string so the output is the same
	# whether print is a statement (Python 2) or a function (Python 3).
	print('\n   ' + label + ':')

	names = sorted(tab)
	total = 0
	for name in names:
		cnt = tab[name]
		total += cnt
		print('      %-16s      %5d' % (name, cnt))

	if dosum and len(names) > 1:
		print('                            =====')
		print('                            %5d' % total)

def elcmp(a, b):
	"""Order two element records by minlev, breaking ties by seen.

	Returns a negative, zero or positive integer in the manner of a
	classic comparison function.
	"""
	# `or` yields the level difference unless it is zero, in which case the
	# first-seen difference decides.
	return (a.minlev - b.minlev) or (a.seen - b.seen)

# Script body: parse the XML file named by the first command-line argument
# and print one statistics section per element type.

def _fail_with_parse_error(xmlparser):
	"""Report the parser's current error position on stderr and exit."""
	sys.stderr.write('%s at line %d, column %d, byte %d\n' % (
		pyexpat.ErrorString(xmlparser.ErrorCode),
		xmlparser.ErrorLineNumber,
		xmlparser.ErrorColumnNumber,
		xmlparser.ErrorByteIndex))
	sys.exit(-1)


def _feed(xmlparser, data, final):
	"""Pass one chunk to the parser, exiting with a message if it is rejected."""
	try:
		status = xmlparser.Parse(data, final)
	except pyexpat.error:
		# Current pyexpat raises on malformed input rather than returning 0.
		status = 0
	if status == 0:
		_fail_with_parse_error(xmlparser)


if len(sys.argv) < 2:
	sys.exit('usage: %s file.xml' % os.path.basename(sys.argv[0]))

doc = Docinfo()

parser = pyexpat.ParserCreate()
# Ask for attributes as a flat [name, value, ...] list, the layout that
# Docinfo.strt_handle indexes by position.
parser.ordered_attributes = True
parser.StartElementHandler  = doc.strt_handle
parser.EndElementHandler    = doc.end_handle
parser.CharacterDataHandler = doc.char_handle

# Binary mode: expat does its own decoding from the raw bytes.
with open(sys.argv[1], 'rb') as docstream:
	while True:
		buff = docstream.read(32000)
		if not buff:
			break
		_feed(parser, buff, 0)

# An empty final chunk tells expat the document is complete.
_feed(parser, b'', 1)

doc.set_minlev(doc.root, 0)
sortinf = sorted(doc.eltab.values(), key=functools.cmp_to_key(elcmp))
for elinf in sortinf:
	print('\n================')
	print('%s: %s' % (elinf.name, elinf.count))
	if elinf.charcount:
		# NOTE(review): Docinfo.char_handle discounts UTF-8 continuation
		# bytes, so this figure is closer to a character count; the label
		# is left unchanged to keep the output format stable.
		print('Had %s bytes of character data' % elinf.charcount)
	if elinf.empty:
		print('Always empty')
	showtab('Parents', elinf.ptab, 0)
	showtab('Children', elinf.ktab, 1)
	showtab('Attributes', elinf.atab, 0)
