Last active
          September 27, 2025 00:57 
        
      - 
      
- 
        Save benallard/8042835 to your computer and use it in GitHub Desktop. 
Revisions
- 
        benallard revised this gist Jan 15, 2014 . 1 changed file with 30 additions and 11 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -3,6 +3,8 @@ import os import xml.parsers.expat from xml.sax.saxutils import escape from optparse import OptionParser from math import log10 # How much data we process at a time @@ -21,7 +23,11 @@ # The current file handle we are writing to cur_file = None # The format string used to introduce the index in the file to be written FMT = ".%d" # The filename we are playing with out_dir = None root = None ext = None @@ -58,7 +64,8 @@ def next_file(): cur_size = 0 # Open another file cur_idx += 1 cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt') if xml_declaration is not None: cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration)) # Start again where we stopped @@ -130,7 +137,7 @@ def char_data(data): # make the split next_file() def main(filename, output_dir): # Create a parser p = xml.parsers.expat.ParserCreate() # We want to reproduce the input, so we are interested in the order of the @@ -145,11 +152,18 @@ def main(filename): p.CharacterDataHandler = char_data global cur_file, cur_idx global out_dir, root, ext global FMT FMT = ".%%0%dd" % (int(log10(os.path.getsize(filename) / MAX_SIZE)) + 1) out_dir, filename = os.path.split(filename) if output_dir is not None: out_dir = output_dir root, ext = os.path.splitext(filename) cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt') with open(filename, 'rt') as xml_file: while True: @@ -170,10 +184,15 @@ def main(filename): print "part %d Done" % cur_idx if __name__ == "__main__": parser = OptionParser(usage="usage: %prog [options] 
XML_FILE") parser.add_option("-o", "--output-dir", help="Specify the directory where the xml files will be written" \ "(default to the same directory where the original file is)") parser.add_option("-M", "--max_size", type="int", help="Specify the size at which the files should be split (in Kb)") (options, args) = parser.parse_args() if len(args) != 1: parser.error("incorrect number of arguments") if options.max_size is not None: MAX_SIZE = options.max_size * 1024 main(args[0], options.output_dir) 
- 
        benallard revised this gist Dec 20, 2013 . 1 changed file with 2 additions and 0 deletions. There are no files selected for viewing. This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters. Original file line number Diff line number Diff line change @@ -121,6 +121,8 @@ def char_data(data): # ``escape`` is too much for us, only & and < need to be escaped there ... data = data.replace('&', '&amp;') data = data.replace('<', '&lt;') if data == '>': data = '&gt;' cur_file.write(data.encode('utf-8')) cur_size += len(data) if not wroteStart: 
- 
        benallard revised this gist Dec 20, 2013 . 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing. This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters. Original file line number Diff line number Diff line change @@ -121,7 +121,7 @@ def char_data(data): # ``escape`` is too much for us, only & and < need to be escaped there ... data = data.replace('&', '&amp;') data = data.replace('<', '&lt;') cur_file.write(data.encode('utf-8')) cur_size += len(data) if not wroteStart: # The data was outside of an element, it could be the right moment to 
- 
        benallard revised this gist Dec 20, 2013 . 1 changed file with 0 additions and 1 deletion.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,3 @@ #!/usr/bin/env python import os 
- 
        benallard revised this gist Dec 20, 2013 . 1 changed file with 21 additions and 1 deletion.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,4 @@ #!/usr/bin/env python import os @@ -25,6 +26,9 @@ root = None ext = None # The xml declaration of the file. xml_declaration = None # What was the signature of the last start element start = None @@ -56,12 +60,24 @@ def next_file(): # Open another file cur_idx += 1 cur_file = open(root + '.%d' % cur_idx + ext, 'wt') if xml_declaration is not None: cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration)) # Start again where we stopped for elem in path: start_element(*elem) # We are done 'ending' ending = False def xml_decl(version, encoding, standalone): global xml_declaration l = ['version', version, 'encoding', encoding] if standalone != -1: l.extend(['standalone', 'yes' if standalone else 'no']) xml_declaration = l cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration)) def start_element(name, attrs): """ Called by the parser when he meet a start element """ global cur_size, start @@ -103,7 +119,10 @@ def char_data(data): cur_file.write('<%s%s>' % (start[0], attrs_s(start[1]))) start = None wroteStart = True # ``escape`` is too much for us, only & and < ned to be escaped there ... 
data = data.replace('&', '&amp;') data = data.replace('<', '&lt;') cur_file.write(data) cur_size += len(data) if not wroteStart: # The data was outside of an element, it could be the right moment to @@ -119,6 +138,7 @@ def main(filename): # Set our callbacks (we are stripping comments out by not defining # callbacks for them) p.XmlDeclHandler = xml_decl p.StartElementHandler = start_element p.EndElementHandler = end_element p.CharacterDataHandler = char_data 
- 
        benallard revised this gist Dec 20, 2013 . 1 changed file with 33 additions and 4 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -25,35 +25,45 @@ root = None ext = None # What was the signature of the last start element start = None # if we are currently in the process of changing file ending = False def attrs_s(attrs): """ This generate the XML attributes from an element attribute list """ l = [''] for i in range(0,len(attrs), 2): l.append('%s="%s"' % (attrs[i], escape(attrs[i+1]))) return ' '.join(l) def next_file(): """ This makes the decision to cut the current file and starta new one """ global cur_size, ending if (not ending) and (cur_size > MAX_SIZE): # size above threshold, and not already ending global cur_file, cur_idx print "part %d Done" % cur_idx ending = True # Close the current elements for elem in reversed(path): end_element(elem[0]) # Close the file cur_file.close() # reset the size cur_size = 0 # Open another file cur_idx += 1 cur_file = open(root + '.%d' % cur_idx + ext, 'wt') # Start again where we stopped for elem in path: start_element(*elem) # We are done 'ending' ending = False def start_element(name, attrs): """ Called by the parser when he meet a start element """ global cur_size, start if start is not None: # Chaining starts after each others @@ -63,7 +73,10 @@ def start_element(name, attrs): return cur_size += len(name) + sum(len(k) for k in attrs) path.append((name, attrs)) def end_element(name): """ Caled by the parser when he meet an end element """ global cur_size global start if start is not None: @@ -79,7 +92,10 @@ def end_element(name): assert elem[0] == name cur_size += len(name) next_file() def 
char_data(data): """ Called by the parser when he meet data """ global cur_size, start wroteStart = False if start is not None: @@ -95,10 +111,14 @@ def char_data(data): next_file() def main(filename): # Create a parser p = xml.parsers.expat.ParserCreate() # We want to reproduce the input, so we are interested in the order of the # attributess p.ordered_attributes = 1 # Set our callbacks (we are stripping comments out by not defining # callbacks for them) p.StartElementHandler = start_element p.EndElementHandler = end_element p.CharacterDataHandler = char_data @@ -112,11 +132,20 @@ def main(filename): with open(filename, 'rt') as xml_file: while True: # Read a chunk chunk = xml_file.read(CHUNK_SIZE) if len(chunk) < CHUNK_SIZE: # End of file # tell the parser we're done p.Parse(chunk, 1) # exit the loop break # process the chunk p.Parse(chunk) # Don't forget to close our handle cur_file.close() print "part %d Done" % cur_idx if __name__ == "__main__": 
- 
        benallard revised this gist Dec 20, 2013 . 1 changed file with 12 additions and 4 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -56,35 +56,43 @@ def next_file(): def start_element(name, attrs): global cur_size, start if start is not None: # Chaining starts after each others cur_file.write('<%s%s>' % (start[0], attrs_s(start[1]))) start = (name, attrs) if ending: return cur_size += len(name) + sum(len(k) for k in attrs) path.append((name, attrs)) def end_element(name): global cur_size global start if start is not None: # Empty element, good, we did not wrote the start part cur_file.write('<%s%s/>' % (start[0],attrs_s(start[1]))) else: # There was some data, close it normaly cur_file.write('</%s>' % name) start = None if ending: return elem = path.pop() assert elem[0] == name cur_size += len(name) next_file() def char_data(data): global cur_size, start wroteStart = False if start is not None: # The data belong to an element, we should write the start part first cur_file.write('<%s%s>' % (start[0], attrs_s(start[1]))) start = None wroteStart = True cur_file.write(escape(data)) cur_size += len(data) if not wroteStart: # The data was outside of an element, it could be the right moment to # make the split next_file() def main(filename): 
- 
        benallard revised this gist Dec 19, 2013 . 1 changed file with 0 additions and 2 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -73,8 +73,6 @@ def end_element(name): start = None if ending: return elem = path.pop()[0] assert elem == name cur_size += len(name) 
- 
        benallard revised this gist Dec 19, 2013 . 1 changed file with 1 addition and 0 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -110,6 +110,7 @@ def main(filename): if chunk == '': break p.Parse(chunk) cur_file.close() print "part %d Done" % cur_idx if __name__ == "__main__": 
- 
        benallard created this gist Dec 19, 2013 .There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,122 @@ #!/usr/bin/env python import os import xml.parsers.expat from xml.sax.saxutils import escape # How much data we process at a time CHUNK_SIZE = 1024 * 1024 # The sequence of element leading us to the current one path = [] # How far we are in the current file cur_size = 0 # From how much should we start another file MAX_SIZE = 1024*1024 # 1Mb # The current index cur_idx = 0 # The current file handle we are writing to cur_file = None # The filename we are playing with root = None ext = None # What was the signature of the start element start = None # if we are currently in the process of changing file ending = False def attrs_s(attrs): l = [''] for i in range(0,len(attrs), 2): l.append('%s="%s"' % (attrs[i], escape(attrs[i+1]))) return ' '.join(l) def next_file(): global cur_size, ending if (not ending) and (cur_size > MAX_SIZE): global cur_file, cur_idx print "part %d Done" % cur_idx ending = True for elem in reversed(path): end_element(elem[0]) cur_file.close() cur_size = 0 cur_idx += 1 cur_file = open(root + '.%d' % cur_idx + ext, 'wt') for elem in path: start_element(*elem) ending = False def start_element(name, attrs): global cur_size, start if start is not None: cur_file.write('<%s%s>' % (start[0], attrs_s(start[1]))) start = (name, attrs) if ending: return cur_size += len(name) + sum(len(k) for k in attrs) path.append((name, attrs)) next_file() def end_element(name): global cur_size global start if start is not None: cur_file.write('<%s%s/>' % (start[0],attrs_s(start[1]))) else: cur_file.write('</%s>' % name) start = None if ending: return if 
len(path) == 1: print path elem = path.pop()[0] assert elem == name cur_size += len(name) next_file() def char_data(data): global cur_size, start if start is not None: cur_file.write('<%s%s>' % (start[0], attrs_s(start[1]))) start = None cur_file.write(escape(data)) cur_size += len(data) next_file() def main(filename): p = xml.parsers.expat.ParserCreate() p.ordered_attributes = 1 p.StartElementHandler = start_element p.EndElementHandler = end_element p.CharacterDataHandler = char_data global cur_file, cur_idx global root, ext root, ext = os.path.splitext(filename) cur_file = open(root + '.%d' % cur_idx + ext, 'wt') with open(filename, 'rt') as xml_file: while True: chunk = xml_file.read(1024*1024) if chunk == '': break p.Parse(chunk) print "part %d Done" % cur_idx if __name__ == "__main__": import sys if len(sys.argv) == 1: print "usage: %s FILENAME [MAX_SIZE]" % sys.argv[0] sys.exit() if len(sys.argv) > 2: MAX_SIZE = int(sys.argv[2]) * 1024 main(sys.argv[1])