"""Escape LaTeX control characters in the text of an XML document.

Run as a script with one argument, the input XML file. The escaped
document is written to a sibling file whose name is the input name with
its extension replaced by ``-unicode.xml``.
"""

import os
import re

# Replacement for every character that LaTeX treats specially in text.
# The two control *words* end in ``{}``: without a terminator LaTeX would
# merge them with following letters (``~user`` -> ``\textasciitildeuser``,
# an undefined command) and swallow a following space.
_LATEX_ESCAPES = {
    '#': r'\#',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '_': r'\_',
    '^': r'\textasciicircum{}',
    '%': r'\%',
    '$': r'\$',
}

# One character class matching every key of _LATEX_ESCAPES.  Substituting
# in a single pass guarantees that text inserted by one replacement (for
# example the ``{}`` terminator) is never escaped again by another.
_LATEX_SPECIAL_RE = re.compile(r'[#{}~_^%$]')

# ``unicode`` only exists on Python 2; on Python 3 all text is ``str``.
try:
    _STRING_TYPES = (str, unicode)
except NameError:
    _STRING_TYPES = (str,)


def _escape_latex_control_chars_in(text):
    """Return *text* with ``# { } ~ _ ^ % $`` replaced by LaTeX escapes."""
    return _LATEX_SPECIAL_RE.sub(
        lambda match: _LATEX_ESCAPES[match.group(0)], text)


def escape_latex_control_chars(root):
    """Escape LaTeX control characters in place throughout *root*.

    Every node yielded by ``root.iter()`` has its ``text`` and ``tail``
    escaped.  The ``text`` of ``tex-math`` elements is left untouched;
    their ``tail`` lies outside the element and is escaped like any other.

    Args:
        root: An element or element tree providing ``iter()``.
    """
    for elem in root.iter():
        text = elem.text
        tail = elem.tail
        if (elem.tag != 'tex-math' and text
                and isinstance(text, _STRING_TYPES)):
            elem.text = _escape_latex_control_chars_in(text)
        if tail and isinstance(tail, _STRING_TYPES):
            elem.tail = _escape_latex_control_chars_in(tail)


def unicode_replace(in_filename, out_filename):
    """Parse *in_filename*, escape its text, and write *out_filename*.

    Args:
        in_filename: Path of the XML file to read.
        out_filename: Path the escaped XML document is written to.
    """
    # Imported here, the only place it is needed, so the string helpers
    # above remain importable where lxml is not installed.
    from lxml import etree

    # Binary mode: the XML parser and serializer deal in bytes, and the
    # ``with`` blocks make sure both files are closed (and flushed).
    with open(in_filename, 'rb') as in_file:
        in_tree = etree.parse(in_file)
    escape_latex_control_chars(in_tree)
    with open(out_filename, 'wb') as out_file:
        in_tree.write(out_file)


def out_filename_for(in_filename):
    """Return the output path for *in_filename*.

    The extension (if any) is replaced by ``-unicode.xml``; a name
    without an extension simply gets the suffix appended.  Dots in
    directory components are not mistaken for an extension separator.
    """
    return os.path.splitext(in_filename)[0] + '-unicode.xml'


if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        sys.exit('usage: %s INPUT.xml' % sys.argv[0])
    in_filename = sys.argv[1]
    out_filename = out_filename_for(in_filename)
    unicode_replace(in_filename, out_filename)