"""Convert Word-exported HTML to XML and back using libxml2/libxslt.

Run as a script with one file-name argument.  A ``.htm`` file is cleaned,
parsed with libxml2's HTML parser and serialized through the
``serializexml`` stylesheet; an ``.xml`` file is transformed through the
``wordxml2html`` stylesheet.  The result is printed to standard output.
"""
import os
import pickle
import re
import string
import sys
import traceback
import types
import urllib

import libxml2
import libxslt

# Used to spit out XML version of the word document.
# NOTE(review): this stylesheet is blank in the source as received; the
# markup looks like it was stripped somewhere upstream.  Restore it from the
# original file -- presumably libxml2.parseDoc() cannot parse it as is.
serializexml = """ """

# Return leg of the round trip, XML back to Word.
# NOTE(review): same caveat as above -- only fragments of the stylesheet
# survive here; reproduced verbatim, not reconstructed.
wordxml2html = """ -br-hr-meta-img-link- &nbsp; <o:wrapblock> </o:wrapblock> -- -- <![]> <> -- </> " ' " = """


class wordtoxml:
    """Turn a Word-exported HTML file into a serialized XML document.

    Typical use: ``parsehtmlfile(path)`` followed by ``output()``.
    """

    doc = ''  # string version of original file
    htmfilename = ''
    xmlfilename = ''
    htmldoc = ''

    # Stylesheet used to fix namespaces and serialize the document.  Parsed
    # once, at class-definition time, and shared by all instances.
    styledoc = libxml2.parseDoc(serializexml)
    style = libxslt.parseStylesheetDoc(styledoc)

    # Dict containing class names mapped to original style names.  Kept as a
    # class-level default for compatibility; __init__ gives every instance
    # its own dict so styles from one document do not leak into another.
    stylenames = {}

    def __init__(self, doc=""):
        """Create a converter.

        doc -- optional initial document text; stored as ``self.doc``
               (``parsehtmlfile`` replaces it with the file contents).
        """
        self.doc = doc
        self.stylenames = {}

    def loadhtmlfile(self, htmlfilename):
        """Remember ``htmlfilename`` and read its contents into ``self.doc``."""
        self.htmfilename = htmlfilename
        # Context manager so the handle is closed even if read() raises.
        with open(htmlfilename) as handle:
            self.doc = handle.read()

    def parsehtmlfile(self, htmlfilename):
        """Load, clean and parse ``htmlfilename``.

        After this call ``self.htmldoc`` holds the libxml2 document, and
        every element whose ``class`` maps to a known style carries an extra
        ``mso-style-name`` attribute.
        """
        self.loadhtmlfile(htmlfilename)

        # Remove mutant markup
        self.doc = self.clean()

        # Create a libxml2 XML document
        self.htmldoc = libxml2.htmlParseDoc(self.doc, None)

        # Get all style information
        styleNodes = self.htmldoc.xpathEval("//*[local-name() = 'style']")
        styles = "".join([node.serialize() for node in styleNodes])
        self.extractstyles(styles)

        # Add mso-style-name attributes where possible
        for node in self.htmldoc.xpathEval("//*[@class]"):
            msoStyle = self.getStyleName(node.prop("class"))
            if msoStyle:
                node.newProp("mso-style-name", msoStyle)

    def output(self):
        """Apply the serialization stylesheet to the parsed document and
        return the serialized result."""
        result = self.style.applyStylesheet(self.htmldoc, None)
        return result.serialize()

    def extractstyles(self, styles):
        """Record ``class name -> mso-style-name`` pairs found in ``styles``.

        Only ``p.<class>`` rules containing an ``mso-style-name`` declaration
        are matched; backslashes and double quotes are removed from the
        captured style name.
        """
        pattern = r'p\.(\w+)[^{]*[{]mso-style-name:\s*(.+);[^}]+[}]'
        for className, styleName in re.findall(pattern, styles):
            self.stylenames[className] = (
                styleName.replace('\\', '').replace('"', ''))

    def getStyleName(self, className):
        """Return the style name for ``className``, or None if unknown.

        Names recorded by ``extractstyles`` win; otherwise a class starting
        with ``Mso`` is treated as a builtin style and returned without that
        prefix.
        """
        if className in self.stylenames:
            return self.stylenames[className]
        if className.startswith('Mso'):
            # builtin style
            return className[3:]
        return None

    def clean(self, doc=None):
        """Remove Mutant Markup Declarations (MMD) from word HTML files.

        doc -- optional replacement for ``self.doc``; this is allowed for
               testing purposes or other hacking.

        Returns the cleaned text, which is also left in ``self.doc``.
        """
        if doc:
            self.doc = doc

        # Hack to deal with odd code-points in word files; you may need to
        # add more of these if you encounter stuff like wingdings font.
        # NOTE(review): the replacement is a non-ASCII literal; Python 2
        # needs a source-encoding declaration (PEP 263) for it -- confirm
        # the encoding this file is saved in.
        self.doc = self.doc.replace('\201', 'É')

        # Deal with MMDs.
        # NOTE(review): several literals below look like their markup was
        # stripped before this file reached review (empty patterns, bare
        # newline replacements).  They are reproduced exactly as received
        # and must be restored from the original source.
        startComment = r'<\!--\[(.*?)\]\>'
        startCommentReplace = "\n"
        self.doc = re.sub(startComment, startCommentReplace, self.doc)

        # NOTE(review): an empty pattern matches at every position, so this
        # call inserts the replacement between all characters.
        endComment = r''
        endCommentReplace = "\n"
        self.doc = re.sub(endComment, endCommentReplace, self.doc)

        startMMD = r'<\!\[(.*?)\]\>'
        startMMDReplace = r""
        self.doc = re.sub(startMMD, startMMDReplace, self.doc)

        endMMD = r'<\!\[endif\]>'
        endMMDReplace = ""
        self.doc = re.sub(endMMD, endMMDReplace, self.doc)

        # This is a rare special case, seems to be related to equations.
        # NOTE(review): both search strings are empty as received, so these
        # two replace() calls currently change nothing.
        wrapblock = r''
        wrapblockreplace = ""
        self.doc = self.doc.replace(wrapblock, wrapblockreplace)

        endwrapblock = r''
        endwrapblockreplace = ""
        self.doc = self.doc.replace(endwrapblock, endwrapblockreplace)

        # Hide namespaces from libxml2's HTML parser by rewriting
        # "<p:name" / "</p:name" to "<p_name" / "</p_name".
        # NOTE(review): the prefix group is a single \w, so only
        # one-character prefixes are rewritten -- confirm that is intended.
        qualifiedname = r'<(/?)(\w):(\w)'
        hackedname = r'<\1\2_\3'
        self.doc = re.sub(qualifiedname, hackedname, self.doc)

        return self.doc


class xmltoword:
    """Transform an XML file back to Word HTML via ``wordxml2html``."""

    xmldoc = ''

    # Parsed once, at class-definition time, and shared by all instances.
    styledoc = libxml2.parseDoc(wordxml2html)
    style = libxslt.parseStylesheetDoc(styledoc)

    def __init__(self):
        pass

    def parsexmlfile(self, fileName):
        """Parse ``fileName`` with libxml2 and keep it as ``self.xmldoc``."""
        self.xmldoc = libxml2.parseFile(fileName)

    def output(self):
        """Apply the stylesheet to the parsed document and return the
        serialized result."""
        return self.style.applyStylesheet(self.xmldoc, None).serialize()


if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Explain the problem instead of dying with a bare IndexError.
        sys.stderr.write("usage: %s FILE.htm|FILE.xml\n" % sys.argv[0])
        sys.exit(1)

    fileName = sys.argv[1]
    # The file extension selects the direction of the conversion; any other
    # extension is ignored, as before.
    if fileName.endswith('.htm'):
        conv = wordtoxml()
        conv.parsehtmlfile(fileName)
        print(conv.output())
    elif fileName.endswith('.xml'):
        conv = xmltoword()
        conv.parsexmlfile(fileName)
        print(conv.output())