
@jaikishantulswani
Forked from mhmdiaa/waybackrobots.py
Created February 3, 2018 12:49

Revisions

  1. Mohammed Diaa revised this gist Jun 21, 2017. 1 changed file with 6 additions and 2 deletions.
    8 changes: 6 additions & 2 deletions in waybackrobots.py
    @@ -9,6 +9,8 @@ def robots(host):
             'https://web.archive.org/cdx/search/cdx\
     ?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest' % host)
         results = r.json()
    +    if len(results) == 0:  # might find nothing
    +        return []
         results.pop(0)  # The first item is ['timestamp', 'original']
         return results

    @@ -24,14 +26,16 @@ def getpaths(snapshot):

     if __name__ == '__main__':
         if len(sys.argv) < 2:
    -        print('Usage:\n\tpython3 waybackrobots.py <url>')
    +        print('Usage:\n\tpython3 waybackrobots.py <domain-name>')
             sys.exit()

         host = sys.argv[1]

         snapshots = robots(host)
         print('Found %s unique results' % len(snapshots))
    -    print('This is gonna take some time...')
    +    if len(snapshots) == 0:
    +        sys.exit()
    +    print('This may take some time...')
         pool = Pool(4)
         paths = pool.map(getpaths, snapshots)
         unique_paths = set()
  2. Mohammed Diaa created this gist Apr 28, 2017.
    43 changes: 43 additions & 0 deletions in waybackrobots.py
    @@ -0,0 +1,43 @@
    import requests
    import re
    import sys
    from multiprocessing.dummy import Pool


    def robots(host):
        r = requests.get(
            'https://web.archive.org/cdx/search/cdx\
    ?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest' % host)
        results = r.json()
        results.pop(0)  # The first item is ['timestamp', 'original']
        return results

    def getpaths(snapshot):
        url = 'https://web.archive.org/web/{0}/{1}'.format(snapshot[0], snapshot[1])
        robotstext = requests.get(url).text
        if 'Disallow:' in robotstext:  # verify it's actually a robots.txt file, not a 404 page
            paths = re.findall('/.*', robotstext)
            return paths
        return []


    if __name__ == '__main__':
        if len(sys.argv) < 2:
            print('Usage:\n\tpython3 waybackrobots.py <url>')
            sys.exit()

        host = sys.argv[1]

        snapshots = robots(host)
        print('Found %s unique results' % len(snapshots))
        print('This is gonna take some time...')
        pool = Pool(4)
        paths = pool.map(getpaths, snapshots)
        unique_paths = set()
        for i in paths:
            unique_paths.update(i)
        filename = '%s-robots.txt' % host
        with open(filename, 'w') as f:
            f.write('\n'.join(unique_paths))
        print('[*] Saved results to %s' % filename)
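
The CDX endpoint queried in robots() returns a JSON array whose first element is the header row ['timestamp', 'original'], which is why the script pops it before handing the remaining timestamp/URL pairs to getpaths(). A rough sketch of that shape (illustration only; the domain and timestamps below are made-up values):

    # Hypothetical example of the JSON the CDX query returns with
    # output=json&fl=timestamp,original (values are invented):
    example_cdx_response = [
        ['timestamp', 'original'],                            # header row removed by results.pop(0)
        ['20170101000000', 'http://example.com/robots.txt'],  # one archived snapshot
        ['20170315120000', 'http://example.com/robots.txt'],
    ]

Running the script as, for example, python3 waybackrobots.py example.com would then collect every path-like entry found in those archived robots.txt snapshots into example.com-robots.txt.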