Created
August 7, 2015 21:30
-
-
Save Smerity/56bc6f21a8adec920ebf to your computer and use it in GitHub Desktop.
Revisions
-
Smerity created this gist
Aug 7, 2015 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,51 @@ import gzip import json import requests try: from cStringIO import StringIO except: from StringIO import StringIO # Let's fetch the Common Crawl FAQ using the CC index resp = requests.get('http://index.commoncrawl.org/CC-MAIN-2015-27-index?url=http%3A%2F%2Fcommoncrawl.org%2Ffaqs%2F&output=json') pages = [json.loads(x) for x in resp.content.strip().split('\n')] # Multiple pages may have been found - we're only interested in one page = pages[0] # If we print this, we'll see the JSON representation of the response # Most important is the file path to read and the location within the large file that the GZIP response exists print 'JSON response from index.commoncrawl.org' print '---' print page print '---' # We need to calculate the start and the end of the relevant byte range # (each WARC file is composed of many small GZIP files stuck together) offset, length = int(page['offset']), int(page['length']) offset_end = offset + length - 1 # We'll get the file via HTTPS so we don't need to worry about S3 credentials # Getting the file on S3 is equivalent however - you can request a Range prefix = 'https://aws-publicdatasets.s3.amazonaws.com/' # We can then use the Range header to ask for just this set of bytes resp = requests.get(prefix + page['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)}) # The page is stored compressed (gzip) to save space # We can extract it using the GZIP library raw_data = StringIO(resp.content) f = gzip.GzipFile(fileobj=raw_data) # What we have now is just the WARC response, formatted: data = f.read() warc, header, response = data.strip().split('\r\n\r\n', 2) # print 'WARC headers' print '---' print warc[:100] print '---' print 'HTTP headers' print '---' print header[:100] print '---' print 'HTTP response' print '---' print response[:100]