Skip to content

Instantly share code, notes, and snippets.

@benallard
Last active September 27, 2025 00:57
Show Gist options
  • Save benallard/8042835 to your computer and use it in GitHub Desktop.
Save benallard/8042835 to your computer and use it in GitHub Desktop.

Revisions

  1. benallard revised this gist Jan 15, 2014. 1 changed file with 30 additions and 11 deletions.
    41 changes: 30 additions & 11 deletions xml_split.py
    Original file line number Diff line number Diff line change
    @@ -3,6 +3,8 @@
    import os
    import xml.parsers.expat
    from xml.sax.saxutils import escape
    from optparse import OptionParser
    from math import log10


    # How much data we process at a time
    @@ -21,7 +23,11 @@
    # The current file handle we are writing to
    cur_file = None

    # The format string used to introduce the index in the file to be written
    FMT = ".%d"

    # The filename we are playing with
    out_dir = None
    root = None
    ext = None

    @@ -58,7 +64,8 @@ def next_file():
    cur_size = 0
    # Open another file
    cur_idx += 1
    cur_file = open(root + '.%d' % cur_idx + ext, 'wt')
    cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext),
    'wt')
    if xml_declaration is not None:
    cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
    # Start again where we stopped
    @@ -130,7 +137,7 @@ def char_data(data):
    # make the split
    next_file()

    def main(filename):
    def main(filename, output_dir):
    # Create a parser
    p = xml.parsers.expat.ParserCreate()
    # We want to reproduce the input, so we are interested in the order of the
    @@ -145,11 +152,18 @@ def main(filename):
    p.CharacterDataHandler = char_data

    global cur_file, cur_idx
    global root, ext
    global out_dir, root, ext

    global FMT
    FMT = ".%%0%dd" % (int(log10(os.path.getsize(filename) / MAX_SIZE)) + 1)

    out_dir, filename = os.path.split(filename)
    if output_dir is not None:
    out_dir = output_dir

    root, ext = os.path.splitext(filename)

    cur_file = open(root + '.%d' % cur_idx + ext, 'wt')
    cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt')

    with open(filename, 'rt') as xml_file:
    while True:
    @@ -170,10 +184,15 @@ def main(filename):
    print "part %d Done" % cur_idx

    if __name__ == "__main__":
    import sys
    if len(sys.argv) == 1:
    print "usage: %s FILENAME [MAX_SIZE]" % sys.argv[0]
    sys.exit()
    if len(sys.argv) > 2:
    MAX_SIZE = int(sys.argv[2]) * 1024
    main(sys.argv[1])
    parser = OptionParser(usage="usage: %prog [options] XML_FILE")
    parser.add_option("-o", "--output-dir",
    help="Specify the directory where the xml files will be written" \
    "(default to the same directory where the original file is)")
    parser.add_option("-M", "--max_size", type="int",
    help="Specify the size at which the files should be split (in Kb)")
    (options, args) = parser.parse_args()
    if len(args) != 1:
    parser.error("incorrect number of arguments")
    if options.max_size is not None:
    MAX_SIZE = options.max_size * 1024
    main(args[0], options.output_dir)
  2. benallard revised this gist Dec 20, 2013. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions xml_split.py
    Original file line number Diff line number Diff line change
    @@ -121,6 +121,8 @@ def char_data(data):
    # ``escape`` is too much for us; only & and < need to be escaped here ...
    data = data.replace('&', '&amp;')
    data = data.replace('<', '&lt;')
    if data == '>':
    data = '&gt;'
    cur_file.write(data.encode('utf-8'))
    cur_size += len(data)
    if not wroteStart:
  3. benallard revised this gist Dec 20, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion xml_split.py
    Original file line number Diff line number Diff line change
    @@ -121,7 +121,7 @@ def char_data(data):
    # ``escape`` is too much for us; only & and < need to be escaped here ...
    data = data.replace('&', '&amp;')
    data = data.replace('<', '&lt;')
    cur_file.write(data)
    cur_file.write(data.encode('utf-8'))
    cur_size += len(data)
    if not wroteStart:
    # The data was outside of an element, it could be the right moment to
  4. benallard revised this gist Dec 20, 2013. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion xml_split.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,3 @@

    #!/usr/bin/env python

    import os
  5. benallard revised this gist Dec 20, 2013. 1 changed file with 21 additions and 1 deletion.
    22 changes: 21 additions & 1 deletion xml_split.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,4 @@

    #!/usr/bin/env python

    import os
    @@ -25,6 +26,9 @@
    root = None
    ext = None

    # The xml declaration of the file.
    xml_declaration = None

    # What was the signature of the last start element
    start = None

    @@ -56,12 +60,24 @@ def next_file():
    # Open another file
    cur_idx += 1
    cur_file = open(root + '.%d' % cur_idx + ext, 'wt')
    if xml_declaration is not None:
    cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
    # Start again where we stopped
    for elem in path:
    start_element(*elem)
    # We are done 'ending'
    ending = False


    def xml_decl(version, encoding, standalone):
        """ Called by the parser when it meets the XML declaration.

        The declaration is remembered in ``xml_declaration`` as a flat
        ``[name, value, ...]`` list (the format ``attrs_s`` expects), so that it
        can be written again at the top of every following output file, and it
        is written to the current file right away.

        ``version`` and ``encoding`` are the strings found in the declaration;
        a part that is absent from the declaration arrives as ``None`` and is
        left out.  ``standalone`` is -1 when the clause is missing, otherwise a
        true/false value.
        """
        global xml_declaration
        decl = []
        for key, value in (('version', version), ('encoding', encoding)):
            # A missing part must be skipped: passing None on to attrs_s would
            # make ``escape`` fail on e.g. ``<?xml version="1.0"?>``.
            if value is not None:
                decl.extend([key, value])
        if standalone != -1:
            decl.extend(['standalone', 'yes' if standalone else 'no'])
        xml_declaration = decl
        cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))


    def start_element(name, attrs):
    """ Called by the parser when he meet a start element """
    global cur_size, start
    @@ -103,7 +119,10 @@ def char_data(data):
    cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
    start = None
    wroteStart = True
    cur_file.write(escape(data))
    # ``escape`` is too much for us; only & and < need to be escaped here ...
    data = data.replace('&', '&amp;')
    data = data.replace('<', '&lt;')
    cur_file.write(data)
    cur_size += len(data)
    if not wroteStart:
    # The data was outside of an element, it could be the right moment to
    @@ -119,6 +138,7 @@ def main(filename):

    # Set our callbacks (we are stripping comments out by not defining
    # callbacks for them)
    p.XmlDeclHandler = xml_decl
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data
  6. benallard revised this gist Dec 20, 2013. 1 changed file with 33 additions and 4 deletions.
    37 changes: 33 additions & 4 deletions xml_split.py
    Original file line number Diff line number Diff line change
    @@ -25,35 +25,45 @@
    root = None
    ext = None

    # What was the signature of the start element
    # What was the signature of the last start element
    start = None

    # if we are currently in the process of changing file
    ending = False

    def attrs_s(attrs):
        """ Generate the XML attribute text from an element attribute list.

        ``attrs`` is a flat ``[name, value, name, value, ...]`` list.  The
        result is the empty string for an empty list; otherwise every pair is
        rendered as `` name="value"`` (with a leading space), ready to be put
        right behind the element name.
        """
        # The values are wrapped in double quotes, so '"' has to be escaped in
        # addition to the &, < and > that ``escape`` handles on its own.
        quote = {'"': '&quot;'}
        pairs = ['%s="%s"' % (attrs[i], escape(attrs[i + 1], quote))
                 for i in range(0, len(attrs), 2)]
        return ' '.join([''] + pairs)

    def next_file():
    """ Decide whether to cut the current file and start a new one.

    Nothing happens unless ``cur_size`` has grown past ``MAX_SIZE`` and no
    switch is already in progress.  Otherwise the open elements are closed,
    the output file is replaced by the next numbered one, and the same
    elements are opened again in the new file.
    """
    global cur_size, ending
    if (not ending) and (cur_size > MAX_SIZE):
    # size above threshold, and not already ending
    global cur_file, cur_idx
    print "part %d Done" % cur_idx
    # Flag the switch so that the nested handler calls below do not trigger
    # another one
    ending = True
    # Close the currently open elements, innermost first
    for elem in reversed(path):
    end_element(elem[0])
    # Close the file
    cur_file.close()
    # Reset the size counter for the new file
    cur_size = 0
    # Open another file, named after the next index
    cur_idx += 1
    cur_file = open(root + '.%d' % cur_idx + ext, 'wt')
    # Start again where we stopped: re-open the same elements, outermost first
    for elem in path:
    start_element(*elem)
    # We are done 'ending'
    ending = False

    def start_element(name, attrs):
    """ Called by the parser when he meet a start element """
    global cur_size, start
    if start is not None:
    # Chaining starts after each others
    @@ -63,7 +73,10 @@ def start_element(name, attrs):
    return
    cur_size += len(name) + sum(len(k) for k in attrs)
    path.append((name, attrs))


    def end_element(name):
    """ Caled by the parser when he meet an end element """
    global cur_size
    global start
    if start is not None:
    @@ -79,7 +92,10 @@ def end_element(name):
    assert elem[0] == name
    cur_size += len(name)
    next_file()


    def char_data(data):
    """ Called by the parser when he meet data """
    global cur_size, start
    wroteStart = False
    if start is not None:
    @@ -95,10 +111,14 @@ def char_data(data):
    next_file()

    def main(filename):

    # Create a parser
    p = xml.parsers.expat.ParserCreate()
    # We want to reproduce the input, so we are interested in the order of the
    # attributes
    p.ordered_attributes = 1

    # Set our callbacks (we are stripping comments out by not defining
    # callbacks for them)
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data
    @@ -112,11 +132,20 @@ def main(filename):

    with open(filename, 'rt') as xml_file:
    while True:
    chunk = xml_file.read(1024*1024)
    if chunk == '':
    # Read a chunk
    chunk = xml_file.read(CHUNK_SIZE)
    if len(chunk) < CHUNK_SIZE:
    # End of file
    # tell the parser we're done
    p.Parse(chunk, 1)
    # exit the loop
    break
    # process the chunk
    p.Parse(chunk)

    # Don't forget to close our handle
    cur_file.close()

    print "part %d Done" % cur_idx

    if __name__ == "__main__":
  7. benallard revised this gist Dec 20, 2013. 1 changed file with 12 additions and 4 deletions.
    16 changes: 12 additions & 4 deletions xml_split.py
    Original file line number Diff line number Diff line change
    @@ -56,35 +56,43 @@ def next_file():
    def start_element(name, attrs):
    global cur_size, start
    if start is not None:
    # Chaining starts after each others
    cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
    start = (name, attrs)
    if ending:
    return
    cur_size += len(name) + sum(len(k) for k in attrs)
    path.append((name, attrs))
    next_file()
    def end_element(name):
    global cur_size
    global start
    if start is not None:
    # Empty element: good, we have not written the start part yet
    cur_file.write('<%s%s/>' % (start[0],attrs_s(start[1])))
    else:
    # There was some data, close it normally
    cur_file.write('</%s>' % name)
    start = None
    if ending:
    return
    elem = path.pop()[0]
    assert elem == name
    elem = path.pop()
    assert elem[0] == name
    cur_size += len(name)
    next_file()
    def char_data(data):
    global cur_size, start
    wroteStart = False
    if start is not None:
    # The data belong to an element, we should write the start part first
    cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
    start = None
    wroteStart = True
    cur_file.write(escape(data))
    cur_size += len(data)
    next_file()
    if not wroteStart:
    # The data was outside of an element, it could be the right moment to
    # make the split
    next_file()

    def main(filename):

  8. benallard revised this gist Dec 19, 2013. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions xml_split.py
    Original file line number Diff line number Diff line change
    @@ -73,8 +73,6 @@ def end_element(name):
    start = None
    if ending:
    return
    if len(path) == 1:
    print path
    elem = path.pop()[0]
    assert elem == name
    cur_size += len(name)
  9. benallard revised this gist Dec 19, 2013. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions xml_split.py
    Original file line number Diff line number Diff line change
    @@ -110,6 +110,7 @@ def main(filename):
    if chunk == '':
    break
    p.Parse(chunk)
    cur_file.close()
    print "part %d Done" % cur_idx

    if __name__ == "__main__":
  10. benallard created this gist Dec 19, 2013.
    122 changes: 122 additions & 0 deletions xml_split.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,122 @@
    #!/usr/bin/env python

    import os
    import xml.parsers.expat
    from xml.sax.saxutils import escape


    # Size of the pieces in which the input file is meant to be processed
    CHUNK_SIZE = 1024 * 1024

    # Stack of the currently open elements, outermost first; each entry is the
    # (name, attrs) tuple that start_element received
    path = []

    # Running size estimate for the current output file (element names,
    # attribute strings and character data are counted); reset at each split
    cur_size = 0
    # Threshold: once cur_size exceeds this value a new output file is started
    MAX_SIZE = 1024*1024 # 1 MiB

    # Index of the current output file; it becomes part of the file name
    cur_idx = 0
    # The current file handle we are writing to
    cur_file = None

    # The input file name as split by os.path.splitext; output files are named
    # root + '.<index>' + ext
    root = None
    ext = None

    # (name, attrs) of a start tag that was seen but not written yet, or None;
    # delaying the write lets empty elements be emitted as <name/>
    start = None

    # True while next_file is closing and re-opening the elements of ``path``;
    # the element handlers then only write tags and skip their bookkeeping
    ending = False

    def attrs_s(attrs):
        """Serialize a flat attribute list into XML attribute text.

        ``attrs`` is a flat ``[name, value, name, value, ...]`` list.  Returns
        the empty string for an empty list; otherwise every pair is rendered as
        `` name="value"`` (with a leading space), ready to be put right behind
        the element name.
        """
        # The values are wrapped in double quotes, so '"' has to be escaped in
        # addition to the &, < and > that ``escape`` handles on its own.
        quote = {'"': '&quot;'}
        pairs = ['%s="%s"' % (attrs[i], escape(attrs[i + 1], quote))
                 for i in range(0, len(attrs), 2)]
        return ' '.join([''] + pairs)

    def next_file():
        """Switch to the next output file once the current one is too big.

        Does nothing while a switch is already running or while ``cur_size``
        has not gone past ``MAX_SIZE``.  Otherwise the open elements are
        closed, the next numbered file is opened and the same elements are
        opened again in it.
        """
        global cur_size, ending, cur_file, cur_idx
        if ending or cur_size <= MAX_SIZE:
            return
        print("part %d Done" % cur_idx)
        # Keep the nested handler calls below from starting another switch
        ending = True
        # Close what is open, innermost element first
        for open_name, _ in reversed(path):
            end_element(open_name)
        cur_file.close()
        cur_size = 0
        cur_idx += 1
        cur_file = open(''.join([root, '.%d' % cur_idx, ext]), 'wt')
        # Re-open the same elements in the new file, outermost first
        for open_name, open_attrs in path:
            start_element(open_name, open_attrs)
        ending = False

    def start_element(name, attrs):
        """Parser callback for a start tag.

        The tag itself is not written yet: it is parked in ``start`` so that an
        immediately following end tag can turn it into ``<name/>``.  A tag that
        was parked before is written out first.  ``attrs`` is the flat
        ``[name, value, ...]`` attribute list.
        """
        global cur_size, start
        pending = start
        if pending is not None:
            # Two start tags in a row: the earlier one is a real opening tag
            prev_name, prev_attrs = pending
            cur_file.write('<%s%s>' % (prev_name, attrs_s(prev_attrs)))
        start = (name, attrs)
        if not ending:
            # Regular parsing (not a replay by next_file): do the bookkeeping
            cur_size += len(name) + sum(map(len, attrs))
            path.append(start)
            next_file()
    def end_element(name):
        """Parser callback for an end tag.

        Writes ``<name/>`` when the start tag is still parked in ``start`` (the
        element is empty), otherwise the regular ``</name>``.  Unless the call
        comes from next_file closing elements for a file switch, the element is
        also removed from ``path``, counted in ``cur_size``, and a switch to
        the next file is considered.

        Raises AssertionError when ``name`` is not the innermost open element.
        """
        global cur_size, start
        if start is not None:
            # Empty element: the start tag was never written, emit it
            # self-closing instead
            cur_file.write('<%s%s/>' % (start[0], attrs_s(start[1])))
        else:
            cur_file.write('</%s>' % name)
        start = None
        if ending:
            # next_file is only closing tags for the switch; ``path`` must
            # stay intact so the elements can be re-opened afterwards
            return
        elem = path.pop()[0]
        assert elem == name
        cur_size += len(name)
        next_file()
    def char_data(data):
        """Parser callback for character data.

        A start tag still parked in ``start`` is written first, since the
        element turned out not to be empty.  The data is then written escaped,
        counted in ``cur_size``, and a switch to the next file is considered.
        """
        global cur_size, start
        if start is not None:
            cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
            start = None
        text = escape(data)
        if not isinstance(text, str):
            # Python 2: the parser delivers unicode, and writing non-ASCII
            # unicode to the file would raise UnicodeEncodeError.  No XML
            # declaration is written, so the output has to be UTF-8.
            text = text.encode('utf-8')
        cur_file.write(text)
        cur_size += len(data)
        next_file()

    def main(filename):
        """Split the XML file ``filename`` into numbered parts.

        The parts are written next to the input as ``root.<index>ext``, where
        ``root`` and ``ext`` come from splitting ``filename`` at its extension.

        Raises xml.parsers.expat.ExpatError when the input is not well-formed,
        including when it ends before the document is complete.
        """
        global cur_file, cur_idx
        global root, ext

        p = xml.parsers.expat.ParserCreate()
        # The attributes have to come as a flat, ordered list: that is the
        # format attrs_s expects
        p.ordered_attributes = 1

        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data

        root, ext = os.path.splitext(filename)

        cur_file = open(root + '.%d' % cur_idx + ext, 'wt')
        try:
            with open(filename, 'rt') as xml_file:
                while True:
                    chunk = xml_file.read(CHUNK_SIZE)
                    if not chunk:
                        # End of input: tell the parser, so that a truncated
                        # document is reported instead of silently accepted
                        p.Parse('', True)
                        break
                    p.Parse(chunk)
        finally:
            # next_file replaces cur_file on every split; whichever file is
            # current at this point still has to be closed
            cur_file.close()
        print("part %d Done" % cur_idx)

    if __name__ == "__main__":
        import sys
        if len(sys.argv) == 1:
            # A missing argument is an error: sys.exit with a message writes it
            # to stderr and ends with a non-zero status
            sys.exit("usage: %s FILENAME [MAX_SIZE]" % sys.argv[0])
        if len(sys.argv) > 2:
            # The optional second argument is multiplied by 1024 before it
            # replaces the default threshold
            MAX_SIZE = int(sys.argv[2]) * 1024
        main(sys.argv[1])