#!/usr/bin/env python2
import os
import getpass
import urllib2
from StringIO import StringIO
import time
import datetime
import argparse

import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA
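# praw, bs4 (beautifulsoup4) and progressbar are third-party packages; the PyPI
# names are assumed from the imports, e.g. `pip install praw beautifulsoup4 progressbar`.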


def get_submissions(subreddit, count, listing):
    r = praw.Reddit(user_agent=getpass.getuser() + "'s reddit_dl")
    sr = r.get_subreddit(subreddit)
    return listing(sr, count)


def get_links(submissions):
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res


def filter_for_imgur(urls):
    res = []
    for url in urls:
        url = url.encode('ascii', 'ignore')
        if 'imgur' in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # not a direct link, so resolve it from the imgur page
                try:
                    response = urllib2.urlopen(url)
                except urllib2.HTTPError as e:
                    print url + ": \t" + str(e.code) + " " + e.msg
                    continue
                except urllib2.URLError as e:
                    print "Could not download " + url
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                soup = BeautifulSoup(response.read())
                # albums ("/a/") keep their images in #image-container,
                # single-image pages in #image
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    # imgur serves protocol-relative links ("//i.imgur.com/...")
                    res.append("http://" + link[2:])
        else:
            try:
                response = urllib2.urlopen(url)
            except urllib2.HTTPError as e:
                print url + ": \t" + str(e.code) + " " + e.msg
                continue
            except urllib2.URLError as e:
                print "Could not download " + url
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res


def get_content_type(response):
    # response.info().headers is a list of raw "Name: value\r\n" lines (Python 2)
    for header in response.info().headers:
        if header.startswith("Content-Type"):
            return header.split(":")[1].strip()


def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"


def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default=10, type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month",
                                 "con-week", "con-year", "hot", "new", "new-bydate", "new-byrising",
                                 "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()


def download_images(urls, directory):
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print directory + " could not be found"
        return 0
    for i, url in enumerate(urls):
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print url + ": \t" + str(e.code) + " " + e.msg
            continue
        except urllib2.URLError as e:
            print "Could not download " + url
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = StringIO(response.read())
            # binary mode so jpg/gif/png data is written unmodified
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.getvalue())
            actual += 1
            time.sleep(2)
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print "Could not read the following urls:"
        for url in not_read:
            print url
    return actual


def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }
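

# get_filters() maps every --category choice to the matching PRAW listing call;
# main() looks the chosen lambda up by name, fetches the submission URLs,
# keeps only direct image / imgur links, then downloads them into --output.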
def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print "Found " + str(len(urls)) + " reddit threads"
    urls = filter_for_imgur(urls)
    print "Found " + str(len(urls)) + " image links"
    actual = download_images(urls, args.output)
    print "Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory")


if __name__ == "__main__":
    main()
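
# Example invocation (this file's name is not part of the gist, so reddit_dl.py
# is assumed; the third-party packages imported above must be installed):
#
#   python2 reddit_dl.py wallpapers --count 25 --category top-week --output ./images
#
# The positional argument is the subreddit; --count/-c, --category/-t and
# --output/-o correspond to the options defined in parse_args().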