// rDump -- Dumps images from an Imgur subreddit.
//
// Dependencies:
//   go get github.com/PuerkitoBio/goquery

package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"path"
	"strconv"

	"github.com/PuerkitoBio/goquery"
)

// Magic values go here:
const (
	user_agent     string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36"
	base_url_fmt   string = "https://imgur.com/r/%s"
	next_url_fmt   string = "https://imgur.com/r/%s/new/page/%d/hit?scrolled"
	detail_url_fmt string = "https://imgur.com%s"
	ajax_url_fmt   string = "https://imgur.com/ajaxalbums/getimages/%s/hit.json?all=true"
	ajax_img_fmt   string = "https://i.imgur.com/%s%s"
	download_fmt   string = "https:%s"
	album_cutoff   int    = 8
	num_workers    int    = 5
	max_pages      int    = 10
)

// Used for parsing the AJAX endpoints:
type PostDetail struct {
	Hash      string `json:"hash"`
	Title     string `json:"title"`
	Desc      string `json:"description"`
	Width     int    `json:"width"`
	Height    int    `json:"height"`
	Size      int    `json:"size"`
	Ext       string `json:"ext"`
	Anim      bool   `json:"animated"`
	PreferVid bool   `json:"prefer_video"`
	Looping   bool   `json:"looping"`
	Timestamp string `json:"datetime"`
}

type ListData struct {
	Count  int          `json:"count"`
	Images []PostDetail `json:"images"`
}

type AJAXResponse struct {
	Data    ListData `json:"data"`
	Success bool     `json:"success"`
	Status  int      `json:"status"`
}

// GetURL builds the direct i.imgur.com URL for a post, or "" if the post is
// missing the fields we need:
func (pd PostDetail) GetURL() string {
	if pd.Hash == "" || pd.Ext == "" {
		return ""
	}
	return fmt.Sprintf(ajax_img_fmt, pd.Hash, pd.Ext)
}

// Given a subreddit name, fetches the detail-page links for every post found:
func fetchAllImageLinks(subreddit string) chan string {
	// We give this channel a buffer, just so that page changes are less likely
	// to block image workers:
	out := make(chan string, 10)
	go (func() {
		defer close(out)
		for pageLink := range urlGenerator(subreddit) {
			pageNo, linkChannel := fetchUrlList(pageLink)
			log.Printf("Entering Page #%d : %s", pageNo, pageLink)
			for detailLink := range linkChannel {
				out <- detailLink
			}
		}
	})()
	return out
}

// Given a subreddit name, returns a channel of listing-page URLs to scrape:
func urlGenerator(seed string) chan string {
	out := make(chan string)
	base := fmt.Sprintf(base_url_fmt, seed)
	go (func() {
		out <- base
		for n := 1; n < max_pages; n++ {
			out <- fmt.Sprintf(next_url_fmt, seed, n)
		}
		close(out)
	})()
	return out
}

// Performs an HTTP GET, with the correct fake headers:
func httpGET(url string) (*http.Response, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("User-Agent", user_agent)
	return http.DefaultClient.Do(request)
}

// A stupid hack so we can set our own User-Agent when fetching pages:
func buildGoQueryDocument(url string) (*goquery.Document, error) {
	resp, err := httpGET(url)
	if err != nil {
		return nil, err
	}
	return goquery.NewDocumentFromResponse(resp)
}

func extractFilename(link string) (string, error) {
	parsed, err := url.Parse(link)
	if err != nil {
		return "", err
	}
	return path.Base(parsed.Path), nil
}

// Downloads a file... unless we already have it:
func maybeDownload(link string) {
	fname, err := extractFilename(link)
	if err != nil {
		log.Printf("Cannot download [%s] : Bad link. %v", link, err)
		return
	}
	stat, err := os.Stat(fname)
	if err == nil && stat.Size() > 0 {
		log.Printf("Already have '%s'. Skipping.", fname)
		return
	}
	destFile, err := os.Create(fname)
	if err != nil {
		log.Printf("Failed to create '%s': %v", fname, err)
		return
	}
	defer destFile.Close()
	httpResp, err := httpGET(link)
	if err != nil {
		log.Printf("Couldn't download '%s': %v", fname, err)
		return
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode > 299 {
		log.Printf("Download failed for '%s': Status code: %d", fname, httpResp.StatusCode)
		return
	}
	n, err := io.Copy(destFile, httpResp.Body)
	if err != nil {
		log.Printf("Download failed for '%s': %v", fname, err)
		return
	}
	log.Printf("Download successful: '%s' (%d bytes)", fname, n)
}
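// NOTE: The scraping functions below depend on imgur's markup and unofficial
// endpoints as they looked when this was written (the "a.image-list-link"
// selector, the "div.post-image" blocks, and the /ajaxalbums/ JSON endpoint).
// If imgur changes its pages, those selectors and URL formats are the first
// things to check.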
// Parses the detail links and the data-page attribute out of a listing page:
func fetchUrlList(link string) (pageNum int, urls chan string) {
	pageNum, urls = -1, make(chan string)
	doc, err := buildGoQueryDocument(link)
	if err != nil {
		log.Printf("Failed to read URL: %s", link)
		close(urls)
		return
	}
	// We receive a single value on this channel, which is the page number:
	pageNumSent, pageNumChan := false, make(chan int)
	defer close(pageNumChan)
	go (func() {
		defer close(urls)
		doc.Find("a.image-list-link").Each(func(_ int, s *goquery.Selection) {
			page, pageExists := s.Attr("data-page")
			href, hrefExists := s.Attr("href")
			if pageExists && !pageNumSent {
				pageNo, _ := strconv.ParseInt(page, 10, 32)
				pageNumSent = true
				pageNumChan <- int(pageNo)
			}
			if hrefExists {
				urls <- href
			}
		})
		// If the page was malformed and/or had no usable content, just send back -1:
		if !pageNumSent {
			log.Printf("Page [%s] contained no usable data", link)
			pageNumChan <- -1
		}
	})()
	pageNum = <-pageNumChan
	return
}

// Fetches the AJAX album listing for a post and returns the raw JSON:
func httpAJAX(detailLink string) ([]byte, error) {
	albumId, err := extractFilename(detailLink)
	if err != nil {
		return nil, err
	}
	albumUrl := fmt.Sprintf(ajax_url_fmt, albumId)
	resp, err := httpGET(albumUrl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode > 299 {
		return nil, fmt.Errorf("bad status code: %d", resp.StatusCode)
	}
	return ioutil.ReadAll(resp.Body)
}

// Uses the AJAX endpoint to pluck every image in an album out:
func fetchAJAXUrls(detailLink string) chan string {
	out := make(chan string)
	data, err := httpAJAX(detailLink)
	if err != nil {
		close(out)
		return out
	}
	go (func() {
		defer close(out)
		parsed := AJAXResponse{}
		err = json.Unmarshal(data, &parsed)
		if err != nil {
			log.Printf("AJAX Parse failed: %v", err)
			return
		}
		for _, img := range parsed.Data.Images {
			if imgUrl := img.GetURL(); imgUrl != "" {
				out <- imgUrl
			}
		}
	})()
	return out
}

// Given the URL to a post detail page, returns the URLs to download:
func fetchDownloadUrls(detailLink string) chan string {
	out := make(chan string)
	detailUrl := fmt.Sprintf(detail_url_fmt, detailLink)
	doc, err := buildGoQueryDocument(detailUrl)
	if err != nil {
		log.Printf("Failed to read detail URL: %s", detailUrl)
		close(out)
		return out
	}
	_maybeSend := func(s string, exists bool) {
		if exists && s != "" {
			fullUrl := fmt.Sprintf(download_fmt, s)
			out <- fullUrl
		}
	}
	go (func() {
		defer close(out)
		// Albums can have TONS of pics, so use the AJAX endpoint for big ones:
		if doc.Find("div.post-image").Length() >= album_cutoff {
			log.Printf("Large album: %s", detailLink)
			for albumLink := range fetchAJAXUrls(detailLink) {
				out <- albumLink
			}
			return
		}
		// Otherwise, emit each image found on the page directly:
		doc.Find("div.post-image").Each(func(_ int, s *goquery.Selection) {
			_maybeSend(s.Find("img").Attr("src"))
			_maybeSend(s.Find("source").Attr("src"))
		})
	})()
	return out
}
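// Concurrency layout: fetchAllImageLinks feeds one shared channel of detail
// links, num_workers imageWorker goroutines drain it, and each worker's
// returned channel is closed once the shared channel is exhausted. main()
// simply blocks on those channels to wait for everything to finish.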
// Reads detail links from a channel, downloading images until the channel is
// closed. The returned channel is closed when this worker is done:
func imageWorker(urls chan string, workerName string) chan bool {
	out := make(chan bool)
	go (func() {
		defer close(out)
		log.Printf("Starting up worker: %s", workerName)
		for link := range urls {
			log.Printf("%s : Handling %s", workerName, link)
			for downloadMe := range fetchDownloadUrls(link) {
				log.Printf("%s : Found: %s", workerName, downloadMe)
				maybeDownload(downloadMe)
			}
		}
	})()
	return out
}

// Main func parses args and sets things up:
func main() {
	verbose := flag.Bool("v", false, "Verbosely log what's happening")
	flag.Parse()
	target := flag.Arg(0)
	if target == "" {
		fmt.Fprintf(os.Stderr, "usage: %s [-v] <subreddit>\n", os.Args[0])
		os.Exit(1)
	}
	if !(*verbose) {
		log.SetOutput(ioutil.Discard)
	}
	imageChan := fetchAllImageLinks(target)
	var workers [num_workers]chan bool
	for i := range workers {
		name := fmt.Sprintf("Worker[%d]", i+1)
		workers[i] = imageWorker(imageChan, name)
	}
	// Block until every worker's channel has been closed:
	for _, w := range workers {
		<-w
	}
	log.Printf("Done.")
}