// rDump -- Dumps images from an Imgur subreddit.
//
// Dependencies:
//   go get github.com/PuerkitoBio/goquery

package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"path"
	"strconv"

	"github.com/PuerkitoBio/goquery"
)

// Magic values go here:
const (
	user_agent     string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36"
	base_url_fmt   string = "https://imgur.com/r/%s"
	next_url_fmt   string = "https://imgur.com/r/%s/new/page/%d/hit?scrolled"
	detail_url_fmt string = "https://imgur.com%s"
	ajax_url_fmt   string = "https://imgur.com/ajaxalbums/getimages/%s/hit.json?all=true"
	ajax_img_fmt   string = "https://i.imgur.com/%s%s"
	download_fmt   string = "https:%s"
	album_cutoff   int    = 8
	num_workers    int    = 5
	max_pages      int    = 10
)

// Used for parsing the AJAX endpoints:
type PostDetail struct {
	Hash      string `json:"hash"`
	Title     string `json:"title"`
	Desc      string `json:"description"`
	Width     int    `json:"width"`
	Height    int    `json:"height"`
	Size      int    `json:"size"`
	Ext       string `json:"ext"`
	Anim      bool   `json:"animated"`
	PreferVid bool   `json:"prefer_video"`
	Looping   bool   `json:"looping"`
	Timestamp string `json:"datetime"`
}

type ListData struct {
	Count  int          `json:"count"`
	Images []PostDetail `json:"images"`
}

type AJAXResponse struct {
	Data    ListData `json:"data"`
	Success bool     `json:"success"`
	Status  int      `json:"status"`
}

// GetURL builds the direct i.imgur.com URL for a post, or "" if the post is
// missing the fields we need:
func (pd PostDetail) GetURL() string {
	if pd.Hash == "" || pd.Ext == "" {
		return ""
	}
	return fmt.Sprintf(ajax_img_fmt, pd.Hash, pd.Ext)
}

// Given a subreddit name, fetches the detail-page links for every post found:
func fetchAllImageLinks(subreddit string) chan string {
	// We give this channel a buffer, just so that page changes are less likely
	// to block image workers:
	out := make(chan string, 10)
	go (func() {
		defer close(out)
		for pageLink := range urlGenerator(subreddit) {
			pageNo, linkChannel := fetchUrlList(pageLink)
			log.Printf("Entering Page #%d : %s", pageNo, pageLink)
			for detailLink := range linkChannel {
				out <- detailLink
			}
		}
	})()
	return out
}

// Given a subreddit name, returns a channel of listing-page URLs to scrape:
func urlGenerator(seed string) chan string {
	out := make(chan string)
	base := fmt.Sprintf(base_url_fmt, seed)
	go (func() {
		out <- base
		for n := 1; n < max_pages; n++ {
			out <- fmt.Sprintf(next_url_fmt, seed, n)
		}
		close(out)
	})()
	return out
}

// Performs an HTTP GET, with the correct fake headers:
func httpGET(url string) (*http.Response, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("User-Agent", user_agent)
	return http.DefaultClient.Do(request)
}

// A stupid hack so we can set our own User-Agent when fetching pages:
func buildGoQueryDocument(url string) (*goquery.Document, error) {
	resp, err := httpGET(url)
	if err != nil {
		return nil, err
	}
	return goquery.NewDocumentFromResponse(resp)
}

func extractFilename(link string) (string, error) {
	parsed, err := url.Parse(link)
	if err != nil {
		return "", err
	}
	return path.Base(parsed.Path), nil
}

// Downloads a file... unless we already have it:
func maybeDownload(link string) {
	fname, err := extractFilename(link)
	if err != nil {
		log.Printf("Cannot download [%s] : Bad link. %v", link, err)
		return
	}
	stat, err := os.Stat(fname)
	if err == nil && stat.Size() > 0 {
		log.Printf("Already have '%s'. Skipping.", fname)
		return
	}
	destFile, err := os.Create(fname)
	if err != nil {
		log.Printf("Failed to create '%s': %v", fname, err)
		return
	}
	defer destFile.Close()
	httpResp, err := httpGET(link)
	if err != nil {
		log.Printf("Couldn't download '%s': %v", fname, err)
		return
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode > 299 {
		log.Printf("Download failed for '%s': Status code: %d", fname, httpResp.StatusCode)
		return
	}
	n, err := io.Copy(destFile, httpResp.Body)
	if err != nil {
		log.Printf("Download failed for '%s': %v", fname, err)
		return
	}
	log.Printf("Download successful: '%s' (%d bytes)", fname, n)
}
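// NOTE: The scraping functions below depend on imgur's markup and unofficial
// endpoints as they looked when this was written (the "a.image-list-link"
// selector, the "div.post-image" blocks, and the /ajaxalbums/ JSON endpoint).
// If imgur changes its pages, those selectors and URL formats are the first
// things to check.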
// Parses the detail links and the data-page attribute out of a listing page:
func fetchUrlList(link string) (pageNum int, urls chan string) {
	pageNum, urls = -1, make(chan string)
	doc, err := buildGoQueryDocument(link)
	if err != nil {
		log.Printf("Failed to read URL: %s", link)
		close(urls)
		return
	}
	// We receive a single value on this channel, which is the page number:
	pageNumSent, pageNumChan := false, make(chan int)
	defer close(pageNumChan)
	go (func() {
		defer close(urls)
		doc.Find("a.image-list-link").Each(func(_ int, s *goquery.Selection) {
			page, pageExists := s.Attr("data-page")
			href, hrefExists := s.Attr("href")
			if pageExists && !pageNumSent {
				pageNo, _ := strconv.ParseInt(page, 10, 32)
				pageNumSent = true
				pageNumChan <- int(pageNo)
			}
			if hrefExists {
				urls <- href
			}
		})
		// If the page was malformed and/or had no usable content, just send back -1:
		if !pageNumSent {
			log.Printf("Page [%s] contained no usable data", link)
			pageNumChan <- -1
		}
	})()
	pageNum = <-pageNumChan
	return
}

// Fetches the AJAX album listing for a post and returns the raw JSON:
func httpAJAX(detailLink string) ([]byte, error) {
	albumId, err := extractFilename(detailLink)
	if err != nil {
		return nil, err
	}
	albumUrl := fmt.Sprintf(ajax_url_fmt, albumId)
	resp, err := httpGET(albumUrl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode > 299 {
		return nil, fmt.Errorf("bad status code: %d", resp.StatusCode)
	}
	return ioutil.ReadAll(resp.Body)
}

// Uses the AJAX endpoint to pluck every image in an album out:
func fetchAJAXUrls(detailLink string) chan string {
	out := make(chan string)
	data, err := httpAJAX(detailLink)
	if err != nil {
		close(out)
		return out
	}
	go (func() {
		defer close(out)
		parsed := AJAXResponse{}
		err = json.Unmarshal(data, &parsed)
		if err != nil {
			log.Printf("AJAX Parse failed: %v", err)
			return
		}
		for _, img := range parsed.Data.Images {
			if imgUrl := img.GetURL(); imgUrl != "" {
				out <- imgUrl
			}
		}
	})()
	return out
}

// Given the URL to a post detail page, returns the URLs to download:
func fetchDownloadUrls(detailLink string) chan string {
	out := make(chan string)
	detailUrl := fmt.Sprintf(detail_url_fmt, detailLink)
	doc, err := buildGoQueryDocument(detailUrl)
	if err != nil {
		log.Printf("Failed to read detail URL: %s", detailUrl)
		close(out)
		return out
	}
	_maybeSend := func(s string, exists bool) {
		if exists && s != "" {
			fullUrl := fmt.Sprintf(download_fmt, s)
			out <- fullUrl
		}
	}
	go (func() {
		defer close(out)
		// Albums can have TONS of pics, so use the AJAX endpoint for big ones:
		if doc.Find("div.post-image").Length() >= album_cutoff {
			log.Printf("Large album: %s", detailLink)
			for albumLink := range fetchAJAXUrls(detailLink) {
				out <- albumLink
			}
			return
		}
		// Otherwise, emit each image found on the page directly:
		doc.Find("div.post-image").Each(func(_ int, s *goquery.Selection) {
			_maybeSend(s.Find("img").Attr("src"))
			_maybeSend(s.Find("source").Attr("src"))
		})
	})()
	return out
}
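// Concurrency layout: fetchAllImageLinks feeds one shared channel of detail
// links, num_workers imageWorker goroutines drain it, and each worker's
// returned channel is closed once the shared channel is exhausted. main()
// simply blocks on those channels to wait for everything to finish.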
// Reads detail links from a channel, downloading images until the channel is
// closed. The returned channel is closed when this worker is done:
func imageWorker(urls chan string, workerName string) chan bool {
	out := make(chan bool)
	go (func() {
		defer close(out)
		log.Printf("Starting up worker: %s", workerName)
		for link := range urls {
			log.Printf("%s : Handling %s", workerName, link)
			for downloadMe := range fetchDownloadUrls(link) {
				log.Printf("%s : Found: %s", workerName, downloadMe)
				maybeDownload(downloadMe)
			}
		}
	})()
	return out
}

// Main func parses args and sets things up:
func main() {
	verbose := flag.Bool("v", false, "Verbosely log what's happening")
	flag.Parse()
	target := flag.Arg(0)
	if target == "" {
		fmt.Fprintf(os.Stderr, "usage: %s [-v] <subreddit>\n", os.Args[0])
		os.Exit(1)
	}
	if !(*verbose) {
		log.SetOutput(ioutil.Discard)
	}
	imageChan := fetchAllImageLinks(target)
	var workers [num_workers]chan bool
	for i := range workers {
		name := fmt.Sprintf("Worker[%d]", i+1)
		workers[i] = imageWorker(imageChan, name)
	}
	// Block until every worker's channel has been closed:
	for _, w := range workers {
		<-w
	}
	log.Printf("Done.")
}