#!/usr/bin/env python
"""Split an XML file into several smaller part files.

The input is streamed through expat.  Every event (element start, element
end, character data) is re-serialized into the current part file and adds
to a running size estimate.  Once the estimate exceeds ``MAX_SIZE`` while at
least one element is open, the open elements are closed in the current
part, the next part is opened, and the same elements are re-opened there.

Parts are named ``<root>.<index><ext>``, where ``root`` and ``ext`` come from
``os.path.splitext`` of the input filename (``doc.xml`` -> ``doc.0.xml``,
``doc.1.xml``, ...).
"""

import io
import os
import sys
import xml.parsers.expat
from xml.sax.saxutils import escape, quoteattr

# How much data we process at a time
CHUNK_SIZE = 1024 * 1024

# The sequence of elements leading us to the current one.  Each entry is a
# (name, attrs) pair, attrs being expat's flat [key, value, key, value, ...]
# list (the parser is created with ordered_attributes = 1).
path = []

# How far we are in the current file (an estimate: names, attribute
# keys/values and character data are counted, markup punctuation is not)
cur_size = 0

# From how much should we start another file
MAX_SIZE = 1024 * 1024  # 1 MiB

# The current index
cur_idx = 0

# The current file handle we are writing to
cur_file = None

# The filename we are playing with, split into (root, ext)
root = None
ext = None

# The (name, attrs) of a start tag that has been seen but not written yet.
# Writing is deferred so that an element with no content can be emitted in
# the self-closing form <name/>.
start = None

# True while next_file() is closing/re-opening elements around a rotation
ending = False


def _open_part(idx):
    """Open part number *idx* of the current input for writing.

    The parts are written without an XML declaration, so they are always
    encoded as UTF-8 (the encoding XML parsers assume by default).
    """
    return io.open(root + '.%d' % idx + ext, 'w', encoding='utf-8')


def _flush_pending_start():
    """Write the deferred start tag, if any, as an ordinary opening tag."""
    global start
    if start is not None:
        # u'' keeps the text unicode, which io text files require on Python 2.
        cur_file.write(u'<%s%s>' % (start[0], attrs_s(start[1])))
        start = None


def attrs_s(attrs):
    """Serialize expat's ordered attribute list to XML attribute text.

    *attrs* is a flat ``[key, value, key, value, ...]`` list.  The result
    starts with a space when there is at least one attribute and is the
    empty string otherwise, so it can be appended directly after a tag name.

    Values go through ``quoteattr``, which escapes ``&``, ``<``, ``>`` and
    also copes with quote characters inside the value (plain ``escape``
    leaves ``"`` untouched and would produce malformed output).
    """
    pieces = ['']
    for i in range(0, len(attrs), 2):
        pieces.append(u'%s=%s' % (attrs[i], quoteattr(attrs[i + 1])))
    return u' '.join(pieces)


def next_file():
    """Rotate to the next part file if the current one is over MAX_SIZE.

    Does nothing while a rotation is already in progress, while the size
    estimate is within the limit, or when no element is open: rotating after
    the root element has closed would only create an empty trailing part.
    """
    global cur_size, ending, cur_file, cur_idx
    if ending or cur_size <= MAX_SIZE or not path:
        return

    print("part %d Done" % cur_idx)
    ending = True
    # Close every open element, innermost first, in the part being finished.
    for elem in reversed(path):
        end_element(elem[0])
    cur_file.close()

    cur_size = 0
    cur_idx += 1
    cur_file = _open_part(cur_idx)
    # Re-open the same elements, outermost first, in the new part.  The
    # innermost one is left pending in `start`, as after a real start event.
    for elem in path:
        start_element(*elem)
    ending = False


def start_element(name, attrs):
    """Handle an element start: defer this tag, flush the previous one.

    When called by next_file() during a rotation (``ending`` is true) the
    element is only re-serialized; `path` and the size estimate are left
    alone.
    """
    global cur_size, start
    _flush_pending_start()
    start = (name, attrs)
    if ending:
        return
    cur_size += len(name) + sum(len(k) for k in attrs)
    path.append((name, attrs))
    next_file()


def end_element(name):
    """Handle an element end.

    If the element's start tag is still pending, the element had no content
    and is written in the self-closing form; otherwise a closing tag is
    written.  During a rotation (``ending`` is true) only the markup is
    written.
    """
    global cur_size, start
    if start is not None:
        cur_file.write(u'<%s%s/>' % (start[0], attrs_s(start[1])))
    else:
        cur_file.write(u'</%s>' % name)
    start = None
    if ending:
        return
    elem = path.pop()[0]
    assert elem == name
    cur_size += len(name)
    next_file()


def char_data(data):
    """Handle character data: flush any pending start tag, write the text."""
    global cur_size
    _flush_pending_start()
    cur_file.write(escape(data))
    cur_size += len(data)
    next_file()


def main(filename, max_size=None):
    """Split *filename* into numbered part files next to it.

    Args:
        filename: path of the XML file to split.
        max_size: optional size threshold for this run; when omitted the
            module-level ``MAX_SIZE`` is used.  ``MAX_SIZE`` is restored
            before returning.

    Raises:
        xml.parsers.expat.ExpatError: if the input is not well-formed,
            including input that ends before the document is complete.
    """
    global cur_file, cur_idx, cur_size
    global root, ext, start, ending, MAX_SIZE

    # Start from a clean slate so main() can be called more than once.
    del path[:]
    cur_size = 0
    cur_idx = 0
    start = None
    ending = False

    p = xml.parsers.expat.ParserCreate()
    p.ordered_attributes = 1
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data

    previous_max = MAX_SIZE
    if max_size is not None:
        MAX_SIZE = max_size

    root, ext = os.path.splitext(filename)
    cur_file = _open_part(cur_idx)
    try:
        # Binary mode: expat decodes the bytes itself, honouring the
        # document's own encoding declaration.
        with open(filename, 'rb') as xml_file:
            while True:
                chunk = xml_file.read(CHUNK_SIZE)
                if not chunk:
                    break
                p.Parse(chunk)
        # Tell expat the input is complete so truncated documents are
        # reported instead of silently accepted.
        p.Parse(b'', True)
    finally:
        # cur_file is whichever part is current, also after rotations.
        cur_file.close()
        MAX_SIZE = previous_max
    print("part %d Done" % cur_idx)


if __name__ == "__main__":
    if len(sys.argv) == 1:
        print("usage: %s FILENAME [MAX_SIZE]" % sys.argv[0])
        sys.exit()
    if len(sys.argv) > 2:
        # The command-line value is multiplied by 1024 before use.
        MAX_SIZE = int(sys.argv[2]) * 1024
    main(sys.argv[1])