reddit_dl.py: forked from nicokoch/reddit_dl.py by @ripstick, created July 7, 2014
#!/usr/bin/env python2
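"""Download images from a subreddit.

Pulls submissions via PRAW, keeps links that point to images (resolving
imgur pages and albums to their direct image URLs), and saves them to an
output directory. Requires the praw, beautifulsoup4 and progressbar
packages on Python 2.
"""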
import os
import getpass
import urllib2
from StringIO import StringIO
import time
import datetime
import argparse
import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA
def get_submissions(subreddit, count, category_filter):
    # The user agent includes the local username so reddit can identify the client.
    r = praw.Reddit(user_agent=getpass.getuser() + "'s reddit_dl")
    sr = r.get_subreddit(subreddit)
    return category_filter(sr, count)
def get_links(submissions):
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res
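# filter_for_imgur keeps only URLs that resolve to an actual image:
# direct imgur image links (.jpg/.gif/.png) are kept as-is, other imgur
# pages and albums are fetched and scraped for their <img> tags, and
# non-imgur links are kept only when the server reports an image
# Content-Type.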
def filter_for_imgur(urls):
    res = []
    for url in urls:
        url = url.encode('ascii', 'ignore')
        if 'imgur' in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # not a direct link, so we have to scrape the page for the image URLs
                try:
                    response = urllib2.urlopen(url)
                except urllib2.HTTPError as e:
                    print url + ": \t" + str(e.code) + " " + e.msg
                    continue
                except urllib2.URLError as e:
                    print "Could not download " + url
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                soup = BeautifulSoup(response.read())
                # Albums ("/a/") keep their images in #image-container, single
                # image pages in #image.
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    # The scraped URLs are protocol-relative ("//i.imgur.com/..."),
                    # so strip the leading slashes and prepend http://.
                    res.append("http://" + link[2:])
        else:
            # Non-imgur link: keep it only if the server reports an image type.
            try:
                response = urllib2.urlopen(url)
            except urllib2.HTTPError as e:
                print url + ": \t" + str(e.code) + " " + e.msg
                continue
            except urllib2.URLError as e:
                print "Could not download " + url
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res
def get_content_type(response):
    for header in response.info().headers:
        if header.startswith("Content-Type"):
            return header.split(":")[1]
def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"
def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default=10, type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month", "con-week",
                                 "con-year", "hot", "new", "new-bydate", "new-byrising", "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()
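# download_images fetches each URL whose Content-Type is an image, writes it
# to the output directory under a timestamped filename, sleeps two seconds
# between downloads, and returns the number of files actually written.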
def download_images(urls, directory):
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print directory + " could not be found"
        return actual
    for i, url in enumerate(urls):
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print url + ": \t" + str(e.code) + " " + e.msg
            continue
        except urllib2.URLError as e:
            print "Could not download " + url
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = StringIO(response.read())
            directory = directory if directory else ""
            # Write in binary mode so image bytes are not mangled.
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.getvalue())
            actual += 1
            time.sleep(2)
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print "Could not read the following urls:"
        for url in not_read:
            print url
    return actual
def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }
def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print "Found " + str(len(urls)) + " reddit threads"
    urls = filter_for_imgur(urls)
    print "Found " + str(len(urls)) + " image links"
    actual = download_images(urls, args.output)
    print "Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory")
if __name__ == "__main__":
    main()
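# Example invocations (the subreddit names below are illustrative, not part
# of the original gist):
#
#   python2 reddit_dl.py earthporn                               # 10 top posts to the current directory
#   python2 reddit_dl.py wallpapers -c 25 -t top-week -o ./walls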