baktun95827 · June 20, 2021 10:05 · Jun 20, 2021
diff --git a/snapshot2path.py b/snapshot2path.py
@@ -0,0 +1,40 @@
+import re
+import sys
+import os 
+
+# Substrings whose presence in a line marks that line as noise
+# (checked with a plain ``in`` test by mightBeGarbage).
+rubbish_list = [
+    "access-control-allow-credentials",
+    "{",
+    "}",
+    "chrome",
+    "data:image/png;base64",
+    "<a",
+    "zendesk",
+]
+
+def isUrl(line):
+    """Return True when ``line`` looks like it contains a URL path.
+
+    A line qualifies when it contains two slashes separated by zero or more
+    lowercase letters, digits, underscores or hyphens, e.g. ``/api/`` or the
+    ``//`` in ``http://``.
+
+    Args:
+        line: One line of text to inspect.
+
+    Returns:
+        bool: True if the pattern is found anywhere in the line, else False.
+    """
+    # The pattern itself requires a '/', so no separate membership test is
+    # needed; bool() guarantees a real False instead of an implicit None
+    # when the line has a slash but no matching segment.
+    return bool(re.search(r'/[a-z0-9_-]*/', line))
+
+def mightBeGarbage(line):
+    """Return True when ``line`` is probably not an interesting path.
+
+    A line is treated as garbage when it either
+
+    * contains a dot followed by one of the listed asset/file suffixes
+      (the match is not anchored, so ``.js`` also matches inside ``.json``), or
+    * contains any of the substrings in the module-level ``rubbish_list``.
+
+    Args:
+        line: One line of text to inspect.
+
+    Returns:
+        bool: True if the line matches either rule, otherwise False.
+    """
+    # Raw string: "\." in a normal literal is an invalid escape sequence that
+    # newer Python versions warn about; the compiled pattern is unchanged.
+    if re.search(r"\.(png|jpg|jpeg|gif|svg|bmp|ttf|avif|wav|mp4|aac|ajax|css|all|woff|js)", line):
+        return True
+    return any(word in line for word in rubbish_list)
+
+if __name__ == '__main__':
+    # Script entry point: copy every unique line of the snapshot file that
+    # looks like a URL path and is not garbage into the path-list file, which
+    # is written in the same directory as the snapshot.
+    if len(sys.argv) < 3:
+        # Fail with a readable message instead of a bare IndexError.
+        sys.exit("usage: %s <snapshot_infile> <pathlist_outfile>" % sys.argv[0])
+
+    snapshot_infile = sys.argv[1]
+    # Directory holding the (symlink-resolved) snapshot file.
+    working_dir = os.path.dirname(os.path.realpath(snapshot_infile))
+    # os.path.join gives the same result as the old string concatenation for
+    # relative names and additionally honours an absolute output path.
+    pathlist_outfile = os.path.join(working_dir, sys.argv[2])
+
+    results = set()
+    with open(snapshot_infile, 'r') as r:
+        for line in r:
+            if isUrl(line) and not mightBeGarbage(line):
+                # Normalise the line ending so a final line without a newline
+                # is neither stored twice nor glued to the next entry on output.
+                results.add(line.rstrip('\n') + '\n')
+
+    with open(pathlist_outfile, 'w') as w:
+        # Sorted so repeated runs produce identical output; plain set
+        # iteration order for strings varies between interpreter runs.
+        w.writelines(sorted(results))
+
No results found