Skip to content

Instantly share code, notes, and snippets.

@bancek
Created July 29, 2016 09:49
Show Gist options
  • Select an option

  • Save bancek/7fe5fe0476e81495f99633cf91b6ee29 to your computer and use it in GitHub Desktop.

Select an option

Save bancek/7fe5fe0476e81495f99633cf91b6ee29 to your computer and use it in GitHub Desktop.

Revisions

  1. bancek created this gist Jul 29, 2016.
    143 changes: 143 additions & 0 deletions zipstream.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,143 @@
    import struct
    import zipfile
    import time
    import os
    from binascii import crc32

    def commonprefix(m):
        """Return the longest leading run shared by all sequences in ``m``.

        Only the smallest and largest items (by normal ordering) are compared,
        element by element; whatever they share at the front is shared by
        everything in between.  Works on strings as well as on lists of path
        components.  An empty ``m`` yields ``''``.
        """
        if not m:
            return ''
        lowest, highest = min(m), max(m)
        # Index of the first position where the two extremes disagree, if any.
        split_at = next(
            (idx for idx, item in enumerate(lowest) if item != highest[idx]),
            None,
        )
        if split_at is None:
            return lowest
        return lowest[:split_at]

    def relpath(path, start=os.path.curdir):
        """Return ``path`` expressed relative to the directory ``start``.

        Both arguments are made absolute first, then compared component by
        component.  Raises ValueError("no path specified") when ``path`` is
        empty.  Returns ``os.path.curdir`` when both name the same location.
        """
        if not path:
            raise ValueError("no path specified")

        def components(location):
            # Absolute path split on the separator, empty pieces dropped.
            pieces = os.path.abspath(location).split(os.path.sep)
            return [piece for piece in pieces if piece]

        start_parts = components(start)
        path_parts = components(path)

        # Count how many leading components the two paths have in common.
        shared = 0
        for ours, theirs in zip(start_parts, path_parts):
            if ours != theirs:
                break
            shared += 1

        # Climb out of what is left of ``start``, then descend into ``path``.
        climbs = [os.path.pardir] * (len(start_parts) - shared)
        rel_parts = climbs + path_parts[shared:]
        if rel_parts:
            return os.path.join(*rel_parts)
        return os.path.curdir

    class ZipBuffer(object):
        """Write-only, in-memory sink that zipfile.ZipFile can write into.

        Written chunks are queued in ``data`` until collected with
        get_and_clear(); ``pos`` counts every byte ever written, so tell()
        keeps growing even after the queue has been drained.
        """

        def __init__(self):
            self.data = []   # chunks written since the last get_and_clear()
            self.pos = 0     # total number of bytes written so far

        def write(self, data):
            """Queue one chunk and advance the running byte offset."""
            self.data.append(data)
            self.pos = self.pos + len(data)

        def tell(self):
            """Return the total number of bytes written (zipfile needs this)."""
            return self.pos

        def flush(self):
            """Do nothing; present only because zipfile calls it."""
            pass

        def get_and_clear(self):
            """Hand back the queued chunks as a list and start a fresh queue."""
            pending, self.data = self.data, []
            return pending


    class XZipFile(zipfile.ZipFile):
        """ZipFile that can add a file to the archive incrementally."""

        def write_streaming(self, zinfo_or_arcname, filename, compress_type=None):
            """Copy the file at 'filename' into the archive, one chunk at a time.

            This is a generator.  It yields None after every chunk (up to
            8 KiB) written to the underlying file object, so the caller can
            drain that object between chunks.  The entry is only registered
            in the archive once the generator has been run to exhaustion.

            'zinfo_or_arcname' is either a ZipInfo instance or the name the
            file should get inside the archive.  The data is always stored
            uncompressed and followed by a data descriptor, so the entry is
            forced to ZIP_STORED with flag bit 3 set, also on a ZipInfo
            supplied by the caller.

            'compress_type' must be left at None.

            Raises RuntimeError (on first iteration) when the archive is
            already closed or when 'compress_type' is given.
            """
            if not self.fp:
                raise RuntimeError(
                    "Attempt to write to ZIP archive that was already closed")

            if compress_type is not None:
                raise RuntimeError("Compression not supported!")

            if isinstance(zinfo_or_arcname, zipfile.ZipInfo):
                zinfo = zinfo_or_arcname
            else:
                zinfo = zipfile.ZipInfo(
                    filename=zinfo_or_arcname,
                    date_time=time.localtime(time.time())[:6])
                zinfo.external_attr = 0o600 << 16  # rw------- in the high word

            # The bytes are copied verbatim, so the header must say "stored"
            # regardless of the archive default or the caller's ZipInfo.
            zinfo.compress_type = zipfile.ZIP_STORED
            # Bit 3: CRC and sizes are not known when the header is written
            # and follow the file data in a data descriptor instead.
            zinfo.flag_bits |= 0x08

            zinfo.CRC = crc = 0
            zinfo.compress_size = 0
            zinfo.file_size = file_size = 0
            zinfo.header_offset = self.fp.tell()  # Start of header bytes
            self._writecheck(zinfo)
            self._didModify = True
            self.fp.write(zinfo.FileHeader())

            with open(filename, 'rb') as source:
                while True:
                    buf = source.read(1024 * 8)
                    if not buf:
                        break
                    file_size += len(buf)
                    crc = crc32(buf, crc) & 0xffffffff
                    self.fp.write(buf)
                    yield None

            zinfo.CRC = crc
            zinfo.file_size = file_size
            zinfo.compress_size = file_size
            # Data descriptor: CRC, compressed size, uncompressed size.
            # NOTE: "<LLL" holds 32-bit values only, so a file of 4 GiB or
            # more makes struct.pack raise struct.error here.
            self.fp.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
                                      zinfo.file_size))
            self.filelist.append(zinfo)
            self.NameToInfo[zinfo.filename] = zinfo
            # Python 3's ZipFile writes the central directory at start_dir;
            # move it past this entry so close() does not clobber the data.
            # On Python 2 this is just an unused attribute.
            self.start_dir = self.fp.tell()

    class FileGenerator(object):
        """Minimal file-like wrapper around the streamed zip of a folder.

        Exposes a ``name`` attribute (the folder's base name plus ".zip") and
        a read() method that hands out the archive chunk by chunk.
        """

        def __init__(self, src_dir, passes_filters):
            """Prepare the stream for 'src_dir'.

            'passes_filters' is passed through unchanged to _stream_folder
            together with 'src_dir'.
            """
            # Trailing slashes are stripped so basename() sees the folder name.
            self.name = os.path.basename(src_dir.rstrip('/')) + '.zip'
            self.generator = _stream_folder(src_dir, passes_filters)

        def read(self, n):
            """Return the next non-empty chunk, or b'' once the stream is done.

            'n' is accepted for file-object compatibility but ignored: chunks
            are returned in whatever size the generator produces them.
            """
            for chunk in self.generator:
                if len(chunk):
                    return chunk
            return b''

    def stream_folder(src_dir, passes_filters):
        """Return a FileGenerator built from 'src_dir' and 'passes_filters'."""
        streamer = FileGenerator(src_dir, passes_filters)
        return streamer

    def _stream_folder(src_dir, passes_filters):
        """Yield the bytes of an uncompressed zip archive of 'src_dir'.

        Walks 'src_dir' recursively and adds every file whose full path makes
        'passes_filters(path)' return a true value.  Entries are named by
        their path relative to 'src_dir', with '/' as separator.  The archive
        is written into an in-memory sink that is drained as file data is
        written, so no complete file is held in memory.
        """
        try:
            # Python 2: a unicode top directory makes os.walk return unicode
            # names (presumably the reason for the original conversion).
            text_type = unicode
        except NameError:
            # Python 3 has no ``unicode``; str is already text.
            text_type = str

        sink = ZipBuffer()
        archive = XZipFile(sink, mode='w', compression=zipfile.ZIP_STORED,
                           allowZip64=True)

        for root, _dirs, files in os.walk(text_type(src_dir)):
            for name in files:
                path = os.path.join(root, name)
                if not passes_filters(path):
                    continue
                rel_path = relpath(path, src_dir).replace(os.path.sep, '/')
                # Drain the sink each time write_streaming yields.  Bytes
                # written after its last yield (or for a file that never
                # yields) stay queued and are picked up by a later drain.
                for _ in archive.write_streaming(rel_path, path):
                    for chunk in sink.get_and_clear():
                        yield chunk

        archive.close()
        # close() generates some more data, so we yield that too
        for chunk in sink.get_and_clear():
            yield chunk