reddit_dl.py: forked from nicokoch/reddit_dl.py by @ripstick, created July 7, 2014
#!/usr/bin/env python2
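"""Download images from a subreddit.

Pulls submissions via PRAW, keeps links that point to images (resolving
imgur pages and albums to their direct image URLs), and saves them to an
output directory. Requires the praw, beautifulsoup4 and progressbar
packages on Python 2.
"""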
import os
import getpass
import urllib2
from StringIO import StringIO
import time
import datetime
import argparse
import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA
def get_submissions(subreddit, count, category_filter):
    # The user agent includes the local username so reddit can identify the client.
    r = praw.Reddit(user_agent=getpass.getuser() + "'s reddit_dl")
    sr = r.get_subreddit(subreddit)
    return category_filter(sr, count)
def get_links(submissions):
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res
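# filter_for_imgur keeps only URLs that resolve to an actual image:
# direct imgur image links (.jpg/.gif/.png) are kept as-is, other imgur
# pages and albums are fetched and scraped for their <img> tags, and
# non-imgur links are kept only when the server reports an image
# Content-Type.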
def filter_for_imgur(urls):
    res = []
    for url in urls:
        url = url.encode('ascii', 'ignore')
        if 'imgur' in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # not a direct link, so we have to scrape the page for the image URLs
                try:
                    response = urllib2.urlopen(url)
                except urllib2.HTTPError as e:
                    print url + ": \t" + str(e.code) + " " + e.msg
                    continue
                except urllib2.URLError as e:
                    print "Could not download " + url
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                soup = BeautifulSoup(response.read())
                # Albums ("/a/") keep their images in #image-container, single
                # image pages in #image.
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    # The scraped URLs are protocol-relative ("//i.imgur.com/..."),
                    # so strip the leading slashes and prepend http://.
                    res.append("http://" + link[2:])
        else:
            # Non-imgur link: keep it only if the server reports an image type.
            try:
                response = urllib2.urlopen(url)
            except urllib2.HTTPError as e:
                print url + ": \t" + str(e.code) + " " + e.msg
                continue
            except urllib2.URLError as e:
                print "Could not download " + url
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res
def get_content_type(response):
    for header in response.info().headers:
        if header.startswith("Content-Type"):
            return header.split(":")[1]
def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"
def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default=10, type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month", "con-week",
                                 "con-year", "hot", "new", "new-bydate", "new-byrising", "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()
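# download_images fetches each URL whose Content-Type is an image, writes it
# to the output directory under a timestamped filename, sleeps two seconds
# between downloads, and returns the number of files actually written.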
def download_images(urls, directory):
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print directory + " could not be found"
        return actual
    for i, url in enumerate(urls):
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print url + ": \t" + str(e.code) + " " + e.msg
            continue
        except urllib2.URLError as e:
            print "Could not download " + url
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = StringIO(response.read())
            directory = directory if directory else ""
            # Write in binary mode so image bytes are not mangled.
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.getvalue())
            actual += 1
            time.sleep(2)
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print "Could not read the following urls:"
        for url in not_read:
            print url
    return actual
def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }
def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print "Found " + str(len(urls)) + " reddit threads"
    urls = filter_for_imgur(urls)
    print "Found " + str(len(urls)) + " image links"
    actual = download_images(urls, args.output)
    print "Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory")
if __name__ == "__main__":
    main()
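# Example invocations (the subreddit names below are illustrative, not part
# of the original gist):
#
#   python2 reddit_dl.py earthporn                               # 10 top posts to the current directory
#   python2 reddit_dl.py wallpapers -c 25 -t top-week -o ./walls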