@Manouchehri
Last active January 24, 2025 13:35

Revisions

  1. Manouchehri revised this gist Feb 12, 2015. 1 changed file with 1 addition and 3 deletions.
    4 changes: 1 addition & 3 deletions acceptgzipped.py
    @@ -17,9 +17,7 @@
     response = urllib.request.urlopen(req)
     
     if response.info().get('Content-Encoding') == 'gzip':
    -    buf = io.BytesIO(response.read())
    -    f = gzip.GzipFile(fileobj=buf)
    -    pagedata = f.read()
    +    pagedata = gzip.decompress(response.read())
     elif response.info().get('Content-Encoding') == 'deflate':
         pagedata = response.read()
     elif response.info().get('Content-Encoding'):
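
    The change swaps the io.BytesIO / gzip.GzipFile pair for a single call to
    gzip.decompress(), available in the standard library since Python 3.2. A minimal
    sketch (not part of the gist) showing the two approaches return the same bytes,
    with gzip.compress() standing in for a gzip-encoded response body:

    import gzip
    import io

    payload = gzip.compress(b'<html>hello</html>')  # stand-in for response.read()

    # Pre-revision approach: wrap the body in a file object, then inflate it.
    old = gzip.GzipFile(fileobj=io.BytesIO(payload)).read()

    # Post-revision approach: one call on the raw bytes.
    new = gzip.decompress(payload)

    assert old == new == b'<html>hello</html>'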
  2. Manouchehri created this gist Feb 12, 2015.
    32 changes: 32 additions & 0 deletions acceptgzipped.py
    @@ -0,0 +1,32 @@
    +__author__ = 'David Manouchehri'
    +
    +from bs4 import BeautifulSoup
    +import urllib.request
    +import gzip
    +import io
    +
    +url = 'http://yoururlgoesherehopefullythisisntavalidurl.com/pages.html'
    +
    +headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    +           'Accept-Encoding': 'gzip, deflate',
    +           'Accept-Language': 'en-US,en;q=0.5',
    +           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'}
    +
    +
    +req = urllib.request.Request(url, headers=headers)
    +response = urllib.request.urlopen(req)
    +
    +if response.info().get('Content-Encoding') == 'gzip':
    +    buf = io.BytesIO(response.read())
    +    f = gzip.GzipFile(fileobj=buf)
    +    pagedata = f.read()
    +elif response.info().get('Content-Encoding') == 'deflate':
    +    pagedata = response.read()
    +elif response.info().get('Content-Encoding'):
    +    print('Encoding type unknown')
    +else:
    +    pagedata = response.read()
    +
    +soup = BeautifulSoup(pagedata)
    +
    +print(soup.prettify())
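
    A note on the script as created: the 'deflate' branch hands the still-compressed
    bytes to BeautifulSoup, and the unknown-encoding branch only prints a message, so
    pagedata is never assigned and the final BeautifulSoup(pagedata) call would raise
    NameError. A possible variant (an assumption, not part of the gist) that inflates
    deflate bodies with zlib and always defines pagedata, reusing the same placeholder
    URL and headers:

    from bs4 import BeautifulSoup
    import urllib.request
    import gzip
    import zlib

    url = 'http://yoururlgoesherehopefullythisisntavalidurl.com/pages.html'

    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'en-US,en;q=0.5',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'}

    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        body = response.read()
        encoding = response.info().get('Content-Encoding')

    if encoding == 'gzip':
        pagedata = gzip.decompress(body)
    elif encoding == 'deflate':
        try:
            pagedata = zlib.decompress(body)                   # zlib-wrapped deflate
        except zlib.error:
            pagedata = zlib.decompress(body, -zlib.MAX_WBITS)  # raw deflate stream
    else:
        pagedata = body  # identity, or an encoding we did not ask for

    soup = BeautifulSoup(pagedata, 'html.parser')  # explicit parser avoids bs4's warning
    print(soup.prettify())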