Skip to content

Instantly share code, notes, and snippets.

@christopherhesse
Created March 8, 2012 07:11
Show Gist options
  • Select an option

  • Save christopherhesse/1999345 to your computer and use it in GitHub Desktop.

Select an option

Save christopherhesse/1999345 to your computer and use it in GitHub Desktop.

Revisions

  1. christopherhesse created this gist Mar 8, 2012.
    67 changes: 67 additions & 0 deletions extract_chrome_cache_html.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,67 @@
    import gzip
    import io
    import mimetypes
    import re
    import sys
    from email import message_from_string
    from mimetools import Message
    from StringIO import StringIO

    def parse_headers(raw_headers):
        """Parse a raw HTTP response header block into a dict.

        The first line of ``raw_headers`` (the status line) is discarded and
        the remaining lines are parsed as ``Name: value`` headers.

        Returns a dict mapping lower-cased header names to their values with
        surrounding whitespace stripped.  When a header name repeats, the
        last occurrence wins.  A block that consists of a status line only
        yields an empty dict.
        """
        # partition() never fails to unpack: with no newline present the
        # header text is simply empty, where split('\n', 1) raised ValueError.
        _status_line, _newline, headers_text = raw_headers.partition('\n')

        # The email package parses the same "Name: value" syntax as the
        # long-deprecated mimetools.Message and exists on Python 2 and 3.
        message = message_from_string(headers_text)

        # dict(generator) rather than a dict comprehension keeps Python 2.6
        # compatibility.  Later pairs overwrite earlier ones, so the last
        # duplicate wins.
        return dict(
            (name.lower(), value.strip()) for name, value in message.items()
        )

    def filter_blank(lines):
        """Yield each item of ``lines`` stripped of surrounding whitespace.

        Items that are empty once stripped are skipped.
        """
        stripped_lines = (raw.strip() for raw in lines)
        for text in stripped_lines:
            if text:
                yield text

    def convert_cache_line(cache_line):
        """Decode one row of a hex dump into the characters it encodes.

        The row is split on single spaces.  The first field is skipped and
        up to sixteen following fields are read as hexadecimal byte values;
        decoding stops at the first empty field.
        """
        hex_columns = cache_line.split(' ')[1:17]
        decoded = []
        for token in hex_columns:
            if not token:
                # An empty field means the hex columns have ended.
                break
            decoded.append(chr(int(token, 16)))
        return ''.join(decoded)

    def gzip_decompress(data):
        """Return the decompressed contents of a gzip-compressed byte string.

        ``data`` must hold the complete gzip stream as bytes.  Errors raised
        by ``gzip.GzipFile`` while reading malformed input propagate to the
        caller.
        """
        # io.BytesIO is the in-memory binary buffer on both Python 2.6+ and
        # Python 3; the legacy StringIO module does not exist on Python 3.
        gzip_file = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gzip_file.read()
        finally:
            # Close explicitly instead of leaving the reader to the garbage
            # collector (try/finally keeps Python 2.6 compatibility).
            gzip_file.close()

    def decode_cache_html(cache_html):
        """Split a cache HTML page into response headers and decoded segments.

        The contents of every ``<pre>...</pre>`` element are extracted.  The
        first one is handed to ``parse_headers``; each later one that is not
        blank is treated as a hex dump and decoded row by row with
        ``convert_cache_line``.

        Returns a ``(headers, segments)`` tuple where ``segments`` holds one
        decoded string per non-blank hex dump, in document order.

        Raises IndexError when the page contains no ``<pre>`` element.
        """
        pre_blocks = re.findall('<pre>(.*?)</pre>', cache_html, re.DOTALL)
        header_block, dump_blocks = pre_blocks[0], pre_blocks[1:]

        headers = parse_headers(header_block)

        # filter_blank already drops empty rows, so every row it yields can
        # be decoded directly.
        segments = [
            ''.join(
                convert_cache_line(row)
                for row in filter_blank(dump.split('\n'))
            )
            for dump in filter_blank(dump_blocks)
        ]

        return headers, segments

    def main():
        """Decode every cache HTML dump named on the command line.

        For each input file the second decoded segment is taken as the
        response body, gunzipped when the ``content-encoding`` header is
        ``gzip``, and written to ``<filename>-decoded<extension>``.  The
        extension is guessed from the ``content-type`` header and is empty
        when the header is missing or the type is unknown.  Inputs without a
        body segment are reported on stderr and skipped.
        """
        for filename in sys.argv[1:]:
            with open(filename) as input_file:
                cache_html = input_file.read()
            headers, segments = decode_cache_html(cache_html)

            # segments[0] is usually the header + certificate, so the body
            # is expected in segments[1].
            if len(segments) < 2:
                sys.stderr.write(
                    'skipping {0}: no body segment found\n'.format(filename))
                continue
            data = segments[1]
            if headers.get('content-encoding') == 'gzip':
                data = gzip_decompress(data)

            # Drop parameters such as "; charset=utf-8" before the lookup.
            content_type = headers.get('content-type', '').split(';')[0].strip()
            # guess_extension() returns None for unknown types instead of
            # raising like guess_all_extensions(...)[0] does on an empty list.
            extension = mimetypes.guess_extension(content_type) or ''

            output_filename = filename + '-decoded' + extension
            print('writing to {0}'.format(output_filename))
            # Binary mode: the payload may be an image or other non-text
            # data, which text mode would corrupt on Windows.
            with open(output_filename, 'wb') as output_file:
                output_file.write(data)

    if __name__ == "__main__":
        main()