Revisions
Mohammed Diaa revised this gist
Jun 21, 2017. 1 changed file with 6 additions and 2 deletions.
@@ -9,6 +9,8 @@ def robots(host):
        'https://web.archive.org/cdx/search/cdx\
?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest' % host)
    results = r.json()
    if len(results) == 0: # might find nothing
        return []
    results.pop(0) # The first item is ['timestamp', 'original']
    return results

@@ -24,14 +26,16 @@ if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage:\n\tpython3 waybackrobots.py <domain-name>')
        sys.exit()
    host = sys.argv[1]
    snapshots = robots(host)
    print('Found %s unique results' % len(snapshots))
    if len(snapshots) == 0:
        sys.exit()
    print('This may take some time...')
    pool = Pool(4)
    paths = pool.map(getpaths, snapshots)
    unique_paths = set()
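The two lines added to robots() guard against the CDX query matching nothing. A minimal sketch of the failure they prevent, assuming the empty result parses to an empty list as the new comment ("might find nothing") suggests:

    results = []     # hypothetical: no archived robots.txt snapshots for this host
    results.pop(0)   # without the new length check this raises IndexError: pop from empty list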
Mohammed Diaa created this gist
Apr 28, 2017.
import requests
import re
import sys
from multiprocessing.dummy import Pool

def robots(host):
    r = requests.get(
        'https://web.archive.org/cdx/search/cdx\
?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest' % host)
    results = r.json()
    results.pop(0) # The first item is ['timestamp', 'original']
    return results

def getpaths(snapshot):
    url = 'https://web.archive.org/web/{0}/{1}'.format(snapshot[0], snapshot[1])
    robotstext = requests.get(url).text
    if 'Disallow:' in robotstext: # verify it's actually a robots.txt file, not a 404 page
        paths = re.findall('/.*', robotstext)
        return paths
    return []

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage:\n\tpython3 waybackrobots.py <url>')
        sys.exit()
    host = sys.argv[1]
    snapshots = robots(host)
    print('Found %s unique results' % len(snapshots))
    print('This is gonna take some time...')
    pool = Pool(4)
    paths = pool.map(getpaths, snapshots)
    unique_paths = set()
    for i in paths:
        unique_paths.update(i)
    filename = '%s-robots.txt' % host
    with open(filename, 'w') as f:
        f.write('\n'.join(unique_paths))
    print('[*] Saved results to %s' % filename)
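For illustration, a rough sketch of how the two helpers behave when imported on their own (the domain and the example values are hypothetical; real output depends on what the Wayback Machine has archived):

    # Assumes the gist is saved as waybackrobots.py next to this snippet.
    from waybackrobots import robots, getpaths

    snapshots = robots('example.com')
    # Each entry is a [timestamp, original] pair from the CDX API, e.g.
    # ['20170315120000', 'http://example.com/robots.txt']  (illustrative values)

    paths = getpaths(snapshots[0])
    # Every '/...' match in that snapshot's text, e.g. ['/admin', '/private'],
    # or [] when the archived page does not contain 'Disallow:'.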