
@jaikishantulswani
Forked from mhmdiaa/waybackrobots.py
Created February 3, 2018 12:49

Revisions

  1. Mohammed Diaa revised this gist Jun 21, 2017. 1 changed file with 6 additions and 2 deletions.
    8 changes: 6 additions & 2 deletions in waybackrobots.py
    @@ -9,6 +9,8 @@ def robots(host):
             'https://web.archive.org/cdx/search/cdx\
     ?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest' % host)
         results = r.json()
    +    if len(results) == 0:  # might find nothing
    +        return []
         results.pop(0)  # The first item is ['timestamp', 'original']
         return results

    @@ -24,14 +26,16 @@ def getpaths(snapshot):

     if __name__ == '__main__':
         if len(sys.argv) < 2:
    -        print('Usage:\n\tpython3 waybackrobots.py <url>')
    +        print('Usage:\n\tpython3 waybackrobots.py <domain-name>')
             sys.exit()

         host = sys.argv[1]

         snapshots = robots(host)
         print('Found %s unique results' % len(snapshots))
    -    print('This is gonna take some time...')
    +    if len(snapshots) == 0:
    +        sys.exit()
    +    print('This may take some time...')
         pool = Pool(4)
         paths = pool.map(getpaths, snapshots)
         unique_paths = set()
  2. Mohammed Diaa created this gist Apr 28, 2017.
    43 changes: 43 additions & 0 deletions in waybackrobots.py
    @@ -0,0 +1,43 @@
    import requests
    import re
    import sys
    from multiprocessing.dummy import Pool


    def robots(host):
        r = requests.get(
            'https://web.archive.org/cdx/search/cdx\
    ?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest' % host)
        results = r.json()
        results.pop(0)  # The first item is ['timestamp', 'original']
        return results

    def getpaths(snapshot):
        url = 'https://web.archive.org/web/{0}/{1}'.format(snapshot[0], snapshot[1])
        robotstext = requests.get(url).text
        if 'Disallow:' in robotstext:  # verify it's actually a robots.txt file, not a 404 page
            paths = re.findall('/.*', robotstext)
            return paths
        return []


    if __name__ == '__main__':
        if len(sys.argv) < 2:
            print('Usage:\n\tpython3 waybackrobots.py <url>')
            sys.exit()

        host = sys.argv[1]

        snapshots = robots(host)
        print('Found %s unique results' % len(snapshots))
        print('This is gonna take some time...')
        pool = Pool(4)
        paths = pool.map(getpaths, snapshots)
        unique_paths = set()
        for i in paths:
            unique_paths.update(i)
        filename = '%s-robots.txt' % host
        with open(filename, 'w') as f:
            f.write('\n'.join(unique_paths))
        print('[*] Saved results to %s' % filename)
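
The CDX endpoint queried in robots() returns a JSON array whose first element is the header row ['timestamp', 'original'], which is why the script pops it before handing the remaining timestamp/URL pairs to getpaths(). A rough sketch of that shape (illustration only; the domain and timestamps below are made-up values):

    # Hypothetical example of the JSON the CDX query returns with
    # output=json&fl=timestamp,original (values are invented):
    example_cdx_response = [
        ['timestamp', 'original'],                            # header row removed by results.pop(0)
        ['20170101000000', 'http://example.com/robots.txt'],  # one archived snapshot
        ['20170315120000', 'http://example.com/robots.txt'],
    ]

Running the script as, for example, python3 waybackrobots.py example.com would then collect every path-like entry found in those archived robots.txt snapshots into example.com-robots.txt.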