#!/usr/bin/env python3 ''' Fetch and decode the links from a subreddit when they are encoded in base64 (until 3 pass) Installation: You need python 3.8 installed Save the pastebin as a file i.e "redscrape.py" and open a terminal where the file is located > python3 -m venv . # On Mac > python -m venv . # On Windows/Linux > . bin/activate > pip install psaw fire > python redscrape.py --help Usage examples : > python redscrape.py my_sub --after="2021-11-01" # All the links in r/my_sub from 2021-11-01 to now > python redscrape.py my_sub --after="2021-07-01" --before="2021-08-01" --domains="drive.google.com, mega.nz" # All the links in r/my_sub on July 2021 containing the domains drive.google.com and mega.nz Behaviour: A "links.json" file is generated, so that you can easily visualize links or process them with 'jq' program (See below). This file contains a list ("by_date") of the matching posts (submissions and comments) sorted by date, with the title and the links related. Another field ("by_id) contains more details as text content and, for comments, the root post title and its body. You can run the program with different params to complete your collection as the json file is reused on startup. CAUTION: You should not run the program without parameters on a complete sub as you will hammer the pushshift API Params: --sub= : Name of subreddit --after= : Start date (ex: --start="2021-09-15" ). By default it's the beginning of the sub --before= : Stop date (ex: --stop=="2021-06-03"). By default it's now --domains= : A list of domains to filter the links separated by commas. It overrides the default list (ex: --domains="drive.google.com, mega.nz") JQ Examples: jq allows you to smartly grep in a json file and has many other features: # Search Schubert in the titles of the "by_date" index > jq -r '.by_date[] | select(.title | match("schubert"; "i"))' links.json # Search Mozart in the body the the posts. > jq -r '.by_id[] | select (.body != null) | select(.body | match("mozart"; "i"))' links.json ''' from psaw import PushshiftAPI import re import base64 from urllib.parse import urlparse import json import fire import datetime import time import os # cf. https://stackoverflow.com/questions/475074/regex-to-parse-or-validate-base64-data/475217#475217 BASE64_REGEX = '(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{4})' # URL_REGEX = r'(https?:\/\/[A-Za-z0-9+\/\.\-_#?=!]+)' URL_REGEX = r'(https?:\/\/[^\s)\]]+)' URL_MARKDOWN_REGEX = r'\[.+?\]\((.+?)\)' OUTPUT_FILE='links.json' MAX_PASS = 3 ACCEPTED_DOMAINS = ['drive.google.com', 'mega.nz', 'web.archive.org', 'filecat.org', 'dropbox.com', 'terabox.com', '1fichier.com', 'youtube.com', 'youtu.be', 'transferfile.io','udrop.com', 'odrive.com', 'mirrored.to', 'mediafire.com', 'file-upload', 'dropbox.com', 'uptobox.com', 'ulozto.net', 'ufile.io', 'turbobit.net', 'udl.to' , 'store.tidal.com', 'krakenfiles.com', 'gofile.io', 'filetransfer.io', 'dropapk.to', 'drop.download', 'easyupload.io', 'dbree.org', ] # Ugly globals api = PushshiftAPI() base64_pattern = re.compile(BASE64_REGEX) url_pattern = re.compile(URL_REGEX) md_pattern = re.compile(URL_MARKDOWN_REGEX) def is_url(string): try: result = urlparse(string) return all([result.scheme, result.netloc]) except: return False def fetch_links(candidates, nb_pass=MAX_PASS, domains=ACCEPTED_DOMAINS): """ Analyse a list of tuples and return valid links as a new list of tuples. The strings are decoded recursively in base 64 in "nb_pass". The returned list is filtered and should only contains links matching the accepted domains.""" nb_pass -= 1 candidate_list = candidates # print("level", nb_pass) # print('candidates', candidates) for candidate in candidates: # print('candidate:', candidate) matches = url_pattern.findall(candidate) # print("url matches:", matches) candidate_list.extend([c.rstrip('\n') for c in matches]) # print("candidate list:", candidate_list) matches = base64_pattern.findall(candidate) candidate_list.extend([base64.b64decode(c).decode('utf-8', errors='ignore').rstrip('\n') for c in matches if len(c) >=12]) # print("base64 matches:", matches) candidate_list = list(set(candidate_list)) # print("base64 candidates:", candidate_list) new_list = [] new_list=[c for c in candidate_list if c] if not new_list: return [] if (nb_pass == 0): # new_list=[c for c in candidate_list if is_url(c[0])] # filter urls by domain tmp_list = [] for c in new_list: for accepted in domains: if c.find(accepted) >= 0 : tmp_list.append(c) break new_list = tmp_list return list(set(new_list)) else: return list(set(fetch_links(new_list, nb_pass=nb_pass, domains=domains))) def compute_sub(sub, after=None, before=None, domains = None): """ Fetch links matching domains between 2 dates """ print() print("Sub:", sub) print(f"From {after} to {before}") after=int(datetime.datetime.strptime(after, "%Y-%m-%d").timestamp()) if after else 0 before=int(datetime.datetime.strptime(before, "%Y-%m-%d").timestamp()) if before else int(datetime.datetime.now().timestamp()) accepted = domains.split(',') if domains else ACCEPTED_DOMAINS accepted = [s.strip() for s in accepted] print("Domains :", ",".join(accepted)) print() if os.path.exists(OUTPUT_FILE): with open(OUTPUT_FILE, 'r') as f: by_id = json.load(f)['by_id'] else: by_id = {} posts_dict={} print("Processing submissions. Please wait!") print() submissions = api.search_submissions(subreddit=sub, before=before, after=after) posts = (dict(id=post.id, url=post.full_link, title=post.title, date=datetime.datetime.fromtimestamp(post.created_utc).isoformat(), body=post.selftext if hasattr(post, 'selftext') else '') for post in submissions) for p in posts: # Cache the submissions for future use posts_dict[p['id']] = p # Analyse the submissions # links = fetch_links([(p['body'], p['body'])], nb_pass=MAX_PASS, domains=accepted) links = fetch_links([p['body']], nb_pass=MAX_PASS, domains=accepted) if links: print("\n", p['url']) print("links", links) p['links'] = links by_id[p["id"]] = p # print() # print("Waiting 10s ...") # To avoid some errors 429 # print() # time.sleep(10) print("Processing comments. Please wait!") comments = api.search_comments(subreddit=sub, before=before, after=after) posts = (dict(id=post.id, url="http://reddit.com"+post.permalink, date=datetime.date.fromtimestamp(post.created_utc).isoformat(), body=post.body, parent_id= post.parent_id) for post in comments) # Cache the comments for future use comments = [] for p in posts: posts_dict[p['id']] = p comments.append(p['id']) for c in comments: p = posts_dict.get(c) links = fetch_links([p['body']], nb_pass=MAX_PASS, domains=accepted) if links: p['links'] = links # Retrieve submission info for this comment root_id = p['url'].split('/')[6] root = posts_dict.get(root_id, None) # Data may be inconsistent or network errors title = root.get('title', '') if root else '' body = root.get('body', '') if root else '' p['submission_id'] = root_id p['title']=title p['submission_body'] = body # Retrieve parent info for this comment parent_id = p['parent_id'] if(parent_id != root_id): parent = posts_dict.get(parent_id, None) body = parent.get('body', '') if parent else '' p['parent_body']=body by_id[p["id"]] = p print("\n", p['url']) print("links", links) # Let's build the list sorted by date by_date = [{'id': p['id'], 'title': p['title'], 'date': p['date'],'links': p['links']} for p in by_id.values()] by_date.sort(key=lambda x: x.get('date'), reverse=True) final_list = {"by_date": by_date, "by_id": by_id} with open(OUTPUT_FILE, 'w') as f: json.dump(final_list, f, indent=4) print() print("Total :", len(by_date)) if __name__ == "__main__": fire.Fire(compute_sub)