import string, sys, urllib, re, traceback, pickle, os, \
libxml2, types, libxslt
# XSLT stylesheet source used to spit out the XML version of the Word
# document. The wordtoxml class body parses this string with
# libxml2.parseDoc() and compiles it with libxslt.parseStylesheetDoc().
# NOTE(review): as it stands the literal contains only a newline, which does
# not look like a usable stylesheet -- the markup may have been lost; confirm
# against the original source.
serializexml = """
"""
# XSLT stylesheet source for the return leg of the round trip, XML back to
# Word. The xmltoword class body parses and compiles it the same way.
# NOTE(review): the content below looks like leftover fragments rather than a
# complete stylesheet -- presumably the element markup was stripped; confirm
# against the original source.
wordxml2html = """
-br-hr-meta-img-link-
<o:wrapblock>
</o:wrapblock>
--
--
<![]>
<>
--
</>
"
'
"
=
"""
class wordtoxml:
    """Convert a Word-generated HTML file into XML.

    Typical use is ``parsehtmlfile(path)`` followed by ``output()``: the file
    is read, cleaned of Word-specific markup declarations, parsed with
    libxml2's HTML parser, annotated with ``mso-style-name`` attributes and
    finally run through the class-level XSLT stylesheet.
    """

    doc = ''  # string version of original file
    htmfilename = ''
    xmlfilename = ''
    htmldoc = ''
    # stylesheet used to fix namespaces and serialize the document
    styledoc = libxml2.parseDoc(serializexml)
    style = libxslt.parseStylesheetDoc(styledoc)
    # dict containing classnames mapped to original style-names.
    # Kept at class level only as a default; __init__ gives every instance
    # its own copy so styles from one document do not leak into the next.
    stylenames = {}

    def __init__(self, doc=""):
        """Create a converter, optionally seeded with HTML text in *doc*."""
        self.doc = doc
        # Copy (rather than share) whatever defaults sit on the class.
        self.stylenames = dict(self.stylenames)

    def loadhtmlfile(self, htmlfilename):
        """Read *htmlfilename* into ``self.doc`` and remember its name."""
        self.htmfilename = htmlfilename
        htmlfile = open(htmlfilename)
        try:
            self.doc = htmlfile.read()
        finally:
            # Always release the handle, even if the read fails.
            htmlfile.close()

    def parsehtmlfile(self, htmlfilename):
        """Load, clean and parse *htmlfilename* into ``self.htmldoc``.

        After parsing, every element carrying a ``class`` attribute gets an
        additional ``mso-style-name`` attribute when a style name can be
        determined for that class (see ``getStyleName``).
        """
        self.loadhtmlfile(htmlfilename)
        # Remove mutant markup
        self.doc = self.clean()
        # Create a libxml2 XML document
        self.htmldoc = libxml2.htmlParseDoc(self.doc, None)
        # Get all style information
        styleNodes = self.htmldoc.xpathEval("//*[local-name() = 'style']")
        styles = "".join([node.serialize() for node in styleNodes])
        self.extractstyles(styles)
        # Add mso-style-name attributes where possible
        for node in self.htmldoc.xpathEval("//*[@class]"):
            msoStyle = self.getStyleName(node.prop("class"))
            if msoStyle:
                node.newProp("mso-style-name", msoStyle)

    def output(self):
        """Apply the stylesheet to the parsed document; return it serialized."""
        result = self.style.applyStylesheet(self.htmldoc, None)
        return result.serialize()

    def extractstyles(self, styles):
        """Record class-name -> style-name pairs found in CSS text *styles*.

        Only rules of the form ``p.<class> ... {mso-style-name: <name>; ...}``
        are recognised. Backslashes and double quotes are removed from the
        captured name before it is stored in ``self.stylenames``.
        """
        rule = r'p\.(\w+)[^{]*[{]mso-style-name:\s*(.+);[^}]+[}]'
        for className, styleName in re.findall(rule, styles):
            cleaned = styleName.replace('\\', '').replace('"', '')
            self.stylenames[className] = cleaned

    def getStyleName(self, className):
        """Return the style name for *className*, or None if unknown.

        Names collected by ``extractstyles`` win; otherwise a class starting
        with ``Mso`` is treated as a builtin style and returned without that
        prefix.
        """
        if className in self.stylenames:
            return self.stylenames[className]
        if className.startswith('Mso'):  # builtin style
            return className[3:]
        return None

    def clean(self, doc=None):
        """Remove Mutant Markup Declarations (MMD) from word HTML files.

        *doc*, when given and non-empty, replaces ``self.doc`` first (allowed
        for testing purposes or other hacking). The cleaned text is stored
        back on ``self.doc`` and also returned.
        """
        if doc:
            self.doc = doc
        # Hack to deal with odd code-points in word files; you may need to
        # add more of these if you encounter stuff like the wingdings font.
        # '\201' is an octal escape (0x81).
        # NOTE(review): the replacement is a literal non-ASCII character;
        # Python 2 needs a source-encoding declaration for this -- confirm
        # the intended replacement text.
        self.doc = self.doc.replace('\201', 'É')

        # Deal with MMDs.
        # NOTE(review): several literals below are empty or hold only a
        # newline in the file as received; their markup looks lost. They are
        # kept at their literal values here -- restore the intended text
        # before relying on this method.
        startComment = r'<\!--\[(.*?)\]\>'
        startCommentReplace = "\n"
        self.doc = re.sub(startComment, startCommentReplace, self.doc)

        # NOTE(review): an empty pattern matches at every position, so this
        # substitution inserts the replacement throughout the text.
        endComment = r''
        endCommentReplace = "\n"
        self.doc = re.sub(endComment, endCommentReplace, self.doc)

        startMMD = r'<\!\[(.*?)\]\>'
        startMMDReplace = r""
        self.doc = re.sub(startMMD, startMMDReplace, self.doc)

        endMMD = r'<\!\[endif\]>'
        endMMDReplace = ""
        self.doc = re.sub(endMMD, endMMDReplace, self.doc)

        # This is a rare special case, seems to be related to equations.
        # NOTE(review): with empty search and replacement strings these two
        # replace() calls leave the text unchanged.
        wrapblock = r''
        wrapblockreplace = ""
        self.doc = self.doc.replace(wrapblock, wrapblockreplace)
        endwrapblock = r''
        endwrapblockreplace = ""
        self.doc = self.doc.replace(endwrapblock, endwrapblockreplace)

        # Hide namespaces from libxml2's HTML parser by turning the colon of
        # a qualified tag name into an underscore. The pattern requires
        # exactly one word character before the colon.
        qualifiedname = r'<(/?)(\w):(\w)'
        hackedname = r'<\1\2_\3'
        self.doc = re.sub(qualifiedname, hackedname, self.doc)
        return self.doc
class xmltoword:
    """Run a parsed XML file through the ``wordxml2html`` stylesheet.

    The stylesheet is parsed and compiled once, when the class is defined;
    ``parsexmlfile`` loads a document and ``output`` transforms it.
    """

    xmldoc = ''
    styledoc = libxml2.parseDoc(wordxml2html)
    style = libxslt.parseStylesheetDoc(styledoc)

    def __init__(self):
        """Nothing to set up; state starts from the class-level defaults."""

    def parsexmlfile(self, fileName):
        """Parse the XML file *fileName* and keep the document on the instance."""
        parsed = libxml2.parseFile(fileName)
        self.xmldoc = parsed

    def output(self):
        """Apply the stylesheet to the loaded document; return it serialized."""
        transformed = self.style.applyStylesheet(self.xmldoc, None)
        return transformed.serialize()
def main(argv):
    """Convert the file named in *argv* and print the result.

    A name ending in ``.htm`` is converted with ``wordtoxml``; a name ending
    in ``.xml`` is converted with ``xmltoword``.

    *argv* is the full argument vector (program name first). Returns 0 on
    success and 2 when no file name was supplied or its extension is not one
    of the two handled above; in the latter cases a usage message is written
    to standard error instead of failing with a traceback or doing nothing.
    """
    if argv:
        prog = argv[0]
    else:
        prog = 'wordtoxml'
    usage = "usage: %s FILE.htm | FILE.xml\n" % prog

    if len(argv) < 2:
        sys.stderr.write(usage)
        return 2

    fileName = argv[1]
    if fileName.endswith('.htm'):
        conv = wordtoxml()
        conv.parsehtmlfile(fileName)
    elif fileName.endswith('.xml'):
        conv = xmltoword()
        conv.parsexmlfile(fileName)
    else:
        sys.stderr.write("%s: unsupported file type\n" % fileName)
        sys.stderr.write(usage)
        return 2

    # Single parenthesised argument: valid as a Python 2 print statement too.
    print(conv.output())
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))