Last active: March 29, 2016 18:27
Revisions
raydog revised this gist
Mar 29, 2016. No changes.
raydog revised this gist
Oct 10, 2015. 1 changed file with 1 addition and 1 deletion.
The single changed line sets `max_pages` to 10 (the previous revision had left it at `1 // 0`):

```diff
@@ -31,7 +31,7 @@ const (
 	download_fmt string = "https:%s"
 	album_cutoff int    = 8
 	num_workers  int    = 5
-	max_pages    int    = 1 // 0
+	max_pages    int    = 10
 )

 // Used for parsing the AJAX endpoints:
```
raydog revised this gist
Oct 10, 2015. 1 changed file with 126 additions and 19 deletions.
This revision adds the AJAX album endpoint (the JSON structs plus `httpAJAX`/`fetchAJAXUrls`), a shared `httpGET` helper that sets the fake user-agent, an `extractFilename` helper, a fallback for malformed listing pages, and a flag-based `main` with a `-v` verbosity switch. The changed hunks, shown as they read after the revision:

```go
@@ -6,9 +6,12 @@
package main

import (
    "encoding/json"
    "flag"
    "fmt"
    "github.com/PuerkitoBio/goquery"
    "io"
    "io/ioutil"
    "log"
    "net/http"
    "net/url"

@@ -23,16 +26,44 @@ const (
    base_url_fmt   string = "https://imgur.com/r/%s"
    next_url_fmt   string = "https://imgur.com/r/%s/new/page/%d/hit?scrolled"
    detail_url_fmt string = "https://imgur.com%s"
    ajax_url_fmt   string = "https://imgur.com/ajaxalbums/getimages/%s/hit.json?all=true"
    ajax_img_fmt   string = "https://i.imgur.com/%s%s"
    download_fmt   string = "https:%s"
    album_cutoff   int    = 8
    num_workers    int    = 5
    max_pages      int    = 1 // 0
)

// Used for parsing the AJAX endpoints:
type PostDetail struct {
    Hash      string `json:"hash"`
    Title     string `json:"title"`
    Desc      string `json:"description"`
    Width     int    `json:"width"`
    Height    int    `json:"height"`
    Size      int    `json:"size"`
    Ext       string `json:"ext"`
    Anim      bool   `json:"animated"`
    PreferVid bool   `json:"prefer_video"`
    Looping   bool   `json:"looping"`
    Timestamp string `json:"datetime"`
}

type ListData struct {
    Count  int          `json:"count"`
    Images []PostDetail `json:"images"`
}

type AJAXResponse struct {
    Data    ListData `json:"data"`
    Success bool     `json:"success"`
    Status  int      `json:"status"`
}

func (pd PostDetail) GetURL() string {
    if pd.Hash == "" || pd.Ext == "" {
        return ""
    }
    return fmt.Sprintf(ajax_img_fmt, pd.Hash, pd.Ext)
}

@@ -60,42 +91,53 @@ func urlGenerator(seed string) chan string {
    base := fmt.Sprintf(base_url_fmt, seed)
    go (func() {
        out <- base
        for n := 1; n < max_pages; n++ {
            out <- fmt.Sprintf(next_url_fmt, seed, n)
        }
        close(out)
    })()
    return out
}

// Performs an HTTP GET, with the correct fake headers:
func httpGET(url string) (*http.Response, error) {
    request, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return nil, err
    }
    request.Header.Set("User-Agent", user_agent)
    return http.DefaultClient.Do(request)
}

// A stupid hack so we can manipulate our user-agent when fetching pages:
func buildGoQueryDocument(url string) (*goquery.Document, error) {
    resp, err := httpGET(url)
    if err != nil {
        return nil, err
    }
    return goquery.NewDocumentFromResponse(resp)
}

func extractFilename(link string) (string, error) {
    parsed, err := url.Parse(link)
    if err != nil {
        return "", err
    }
    return path.Base(parsed.Path), nil
}

// Download a file... Unless we already have it:
func maybeDownload(link string) {
    fname, err := extractFilename(link)
    if err != nil {
        log.Printf("Cannot download [%s] : Bad link. %v", link, err)
        return
    }

    stat, err := os.Stat(fname)
    if err == nil && stat.Size() > 0 {
        log.Printf("Already have '%s'. Skipping.", fname)

@@ -109,7 +151,7 @@ func maybeDownload(link string) {
    }
    defer destFile.Close()

    httpResp, err := httpGET(link)
    if err != nil {
        log.Printf("Couldn't download '%s': %v", fname, err)
        return

@@ -162,12 +204,69 @@ func fetchUrlList(link string) (pageNum int, urls chan string) {
                urls <- href
            }
        })

        // If page was malformed, and/or had no useable content, just send back page -1
        if !pageNumSent {
            log.Printf("Page [%s] contained no usable data", link)
            pageNumChan <- -1
        }
    })()

    pageNum = <-pageNumChan
    return
}

func httpAJAX(detailLink string) ([]byte, error) {
    albumId, err := extractFilename(detailLink)
    if err != nil {
        return nil, err
    }

    albumUrl := fmt.Sprintf(ajax_url_fmt, albumId)
    resp, err := httpGET(albumUrl)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode > 299 {
        return nil, fmt.Errorf("Bad status code: %d", resp.StatusCode)
    }

    return ioutil.ReadAll(resp.Body)
}

// Will use the AJAX endpoint to pluck all images in an album out:
func fetchAJAXUrls(detailLink string) chan string {
    out := make(chan string)

    data, err := httpAJAX(detailLink)
    if err != nil {
        close(out)
        return out
    }

    go (func() {
        defer close(out)

        parsed := AJAXResponse{}
        err = json.Unmarshal(data, &parsed)
        if err != nil {
            log.Printf("AJAX Parse failed: %v", err)
            return
        }

        for _, img := range parsed.Data.Images {
            if imgUrl := img.GetURL(); imgUrl != "" {
                out <- imgUrl
            }
        }
    })()

    return out
}

// Given the URL to a post detail page, returns the URLs to download:
func fetchDownloadUrls(detailLink string) chan string {
    out := make(chan string)

@@ -192,7 +291,10 @@ func fetchDownloadUrls(detailLink string) chan string {
        // Albums could have TONS of pics, so use AJAX if too many pics:
        if doc.Find("div.post-image").Length() >= album_cutoff {
            log.Printf("Large album: %s", detailLink)
            for linkz := range fetchAJAXUrls(detailLink) {
                out <- linkz
            }
            return
        }

@@ -220,19 +322,23 @@ func imageWorker(urls chan string, workerName string) chan bool {
                maybeDownload(downloadMe)
            }
        }
    })()
    return out
}

// Main func parses args, and sets things up:
func main() {
    verbose := flag.Bool("v", false, "Verbosely log what's happening")
    flag.Parse()
    target := flag.Arg(0)

    if !(*verbose) {
        log.SetOutput(ioutil.Discard)
    }

    imageChan := fetchAllImageLinks(target)

    var workers [num_workers]chan bool
    for i := range workers {

@@ -243,5 +349,6 @@ func main() {
    for _, w := range workers {
        _ = <-w
    }

    log.Printf("Done.")
}
```
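The new `AJAXResponse`/`ListData`/`PostDetail` structs decode the album JSON with nothing more than `encoding/json` struct tags. The stand-alone sketch below illustrates the idea with a fabricated payload shaped like those tags; it is not a captured Imgur response, and only a subset of the fields is shown:

```go
// Illustrative only: how structs like the revision's AJAXResponse / ListData /
// PostDetail decode an album listing. The JSON below is a fabricated sample
// shaped after the struct tags, not a real Imgur response.
package main

import (
    "encoding/json"
    "fmt"
    "log"
)

type PostDetail struct {
    Hash string `json:"hash"`
    Ext  string `json:"ext"`
}

type ListData struct {
    Count  int          `json:"count"`
    Images []PostDetail `json:"images"`
}

type AJAXResponse struct {
    Data    ListData `json:"data"`
    Success bool     `json:"success"`
    Status  int      `json:"status"`
}

func main() {
    sample := []byte(`{
        "data": {"count": 2, "images": [
            {"hash": "aaaaaaa", "ext": ".jpg"},
            {"hash": "bbbbbbb", "ext": ".gif"}
        ]},
        "success": true,
        "status": 200
    }`)

    var parsed AJAXResponse
    if err := json.Unmarshal(sample, &parsed); err != nil {
        log.Fatalf("AJAX parse failed: %v", err)
    }

    // Same URL construction as the gist's PostDetail.GetURL helper:
    for _, img := range parsed.Data.Images {
        fmt.Printf("https://i.imgur.com/%s%s\n", img.Hash, img.Ext)
    }
}
```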
raydog revised this gist
Oct 8, 2015. 1 changed file with 207 additions and 199 deletions.
The rewrite replaces the hand-rolled golang.org/x/net/html parsing with goquery and adds the download and worker plumbing. The file as of this revision:

```go
// rDump -- Dumps images in an Imgur sub-reddit thing
// Dependencies:
//   go get github.com/PuerkitoBio/goquery

package main

import (
    "fmt"
    "github.com/PuerkitoBio/goquery"
    "io"
    "log"
    "net/http"
    "net/url"
    "os"
    "path"
    "strconv"
)

// Magic values go here:
const (
    user_agent     string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36"
    base_url_fmt   string = "https://imgur.com/r/%s"
    next_url_fmt   string = "https://imgur.com/r/%s/new/page/%d/hit?scrolled"
    detail_url_fmt string = "https://imgur.com%s"
    download_fmt   string = "https:%s"
    album_cutoff   int    = 8
    num_workers    int    = 5
)

// Enums for the image plucking:
const (
    state_normal int = 0
    state_image  int = 1
)

// From a subreddit name, fetches all urls from that subreddit:
func fetchAllImageLinks(subreddit string) chan string {
    // We give this channel a buffer, just so that page changes are less likely to
    // block image workers:
    out := make(chan string, 10)
    go (func() {
        defer close(out)
        for link := range urlGenerator(subreddit) {
            pageNo, linkChannel := fetchUrlList(link)
            log.Printf("Entering Page #%d : %s", pageNo, link)
            for link := range linkChannel {
                out <- link
            }
        }
    })()
    return out
}

// Given a subreddit name, returns a channel of URLs to scrape:
func urlGenerator(seed string) chan string {
    out := make(chan string)
    base := fmt.Sprintf(base_url_fmt, seed)
    go (func() {
        out <- base
        for n := 1; n <= 1; n++ {
            out <- fmt.Sprintf(next_url_fmt, seed, n)
        }
        close(out)
    })()
    return out
}

// A stupid hack so we can manipulate our user-agent when fetching pages:
func buildGoQueryDocument(url string) (*goquery.Document, error) {
    request, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return nil, err
    }
    request.Header.Set("User-Agent", user_agent)
    resp, err := http.DefaultClient.Do(request)
    if err != nil {
        return nil, err
    }
    return goquery.NewDocumentFromResponse(resp)
}

// Download a file... Unless we already have it:
func maybeDownload(link string) {
    parsed, err := url.Parse(link)
    if err != nil {
        log.Printf("Cannot download [%s] : Bad link. %v", link, err)
        return
    }

    fname := path.Base(parsed.Path)
    stat, err := os.Stat(fname)
    if err == nil && stat.Size() > 0 {
        log.Printf("Already have '%s'. Skipping.", fname)
        return
    }

    destFile, err := os.Create(fname)
    if err != nil {
        log.Printf("Failed to create '%s': %v", fname, err)
        return
    }
    defer destFile.Close()

    httpResp, err := http.Get(link)
    if err != nil {
        log.Printf("Couldn't download '%s': %v", fname, err)
        return
    }
    defer httpResp.Body.Close()

    if httpResp.StatusCode > 299 {
        log.Printf("Download failed for '%s': Status code: %d", fname, httpResp.StatusCode)
        return
    }

    n, err := io.Copy(destFile, httpResp.Body)
    if err != nil {
        log.Printf("Download failed for '%s': %v", fname, err)
        return
    }

    log.Printf("Downloaded successful: '%s' (%d bytes)", fname, n)
}

// Parses images and the data-page thing out of the entry lists:
func fetchUrlList(link string) (pageNum int, urls chan string) {
    pageNum, urls = -1, make(chan string)

    doc, err := buildGoQueryDocument(link)
    if err != nil {
        log.Printf("Failed to read URL: %s", link)
        close(urls)
        return
    }

    // We recieve a single value on this, which is the page num:
    pageNumSent, pageNumChan := false, make(chan int)
    defer close(pageNumChan)

    go (func() {
        defer close(urls)
        doc.Find("a.image-list-link").Each(func(_ int, s *goquery.Selection) {
            page, pageExists := s.Attr("data-page")
            href, hrefExists := s.Attr("href")

            if pageExists && !pageNumSent {
                pageNo, _ := strconv.ParseInt(page, 10, 32)
                pageNumSent = true
                pageNumChan <- int(pageNo)
            }

            if hrefExists {
                urls <- href
            }
        })
    })()

    pageNum = <-pageNumChan
    return
}

// Given the URL to a post detail page, returns the URLs to download:
func fetchDownloadUrls(detailLink string) chan string {
    out := make(chan string)

    detailUrl := fmt.Sprintf(detail_url_fmt, detailLink)
    doc, err := buildGoQueryDocument(detailUrl)
    if err != nil {
        log.Printf("Failed to read detail URL: %s", detailUrl)
        close(out)
        return out
    }

    _maybeSend := func(s string, exists bool) {
        if exists && s != "" {
            fullUrl := fmt.Sprintf(download_fmt, s)
            out <- fullUrl
        }
    }

    go (func() {
        defer close(out)

        // Albums could have TONS of pics, so use AJAX if too many pics:
        if doc.Find("div.post-image").Length() >= album_cutoff {
            log.Printf("AJAX album: %s (TODO)", detailLink)
            return
        }

        // Else, emit a single entry:
        doc.Find("div.post-image").Each(func(_ int, s *goquery.Selection) {
            _maybeSend(s.Find("img").Attr("src"))
            _maybeSend(s.Find("source").Attr("src"))
        })
    })()

    return out
}

// Will read from a channel, downloading links until the channel dies:
func imageWorker(urls chan string, workerName string) chan bool {
    out := make(chan bool)
    go (func() {
        defer close(out)
        log.Printf("Starting up worker: %s", workerName)
        for link := range urls {
            log.Printf("%s : Handling %s", workerName, link)
            for downloadMe := range fetchDownloadUrls(link) {
                log.Printf("%s : Found: %s", workerName, downloadMe)
                maybeDownload(downloadMe)
            }
        }
        out <- true
    })()
    return out
}

// Main func parses args, and sets things up:
func main() {
    if len(os.Args) == 1 {
        log.Fatalf("Not enough arguments")
    }

    subreddit := os.Args[1]
    imageChan := fetchAllImageLinks(subreddit)

    var workers [num_workers]chan bool
    for i := range workers {
        name := fmt.Sprintf("Worker[%d]", i+1)
        workers[i] = imageWorker(imageChan, name)
    }

    for _, w := range workers {
        _ = <-w
    }

    log.Printf("Done.")
}
```
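Stripped of the scraping details, the concurrency layout in `imageWorker` and `main` above is a simple fan-out: one channel of links feeds a fixed array of workers, and `main` drains one done-channel per worker before exiting. A minimal, self-contained sketch of that pattern follows; the names and the fake link data are illustrative, not from the gist:

```go
// Minimal sketch of the fan-out/drain pattern used by imageWorker and main:
// a single producer channel, a fixed number of workers, and one "done"
// channel per worker that main drains before exiting. All data is fake.
package main

import (
    "fmt"
    "log"
)

const numWorkers = 5

func worker(links chan string, name string) chan bool {
    done := make(chan bool)
    go func() {
        defer close(done)
        for link := range links {
            log.Printf("%s : Handling %s", name, link)
        }
        done <- true
    }()
    return done
}

func main() {
    links := make(chan string, 10)
    go func() {
        defer close(links)
        for i := 0; i < 20; i++ {
            links <- fmt.Sprintf("/gallery/item-%d", i) // fabricated links
        }
    }()

    var workers [numWorkers]chan bool
    for i := range workers {
        workers[i] = worker(links, fmt.Sprintf("Worker[%d]", i+1))
    }
    for _, w := range workers {
        <-w // block until every worker has finished
    }
    log.Printf("Done.")
}
```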
raydog created this gist
Oct 7, 2015.
The initial upload parses the listing pages by hand with golang.org/x/net/html. Several functions are still incomplete in this draft, which is largely rewritten in the Oct 8 revision:

```go
// rDump -- Dumps images in an Imgur sub-reddit thing

package main

import (
    "os"
    "log"
    "fmt"
    "strings"
    "strconv"
    "golang.org/x/net/html"
    "net/http"
    "io/ioutil"
)

// Magic values go here:
const (
    base_url_fmt     string = "https://imgur.com/r/%s"
    next_url_fmt     string = "%s/new/page/%d/hit?scrolled"
    detail_url_fmt   string = "https://imgur.com%s"
    image_class_name string = "image-list-link"
    post_class_name  string = "post-image"
    num_workers      int    = 5
)

// Enums for the image plucking:
const (
    state_normal  int = 0
    state_picture int = 1
)

// From a subreddit name, fetches all urls from that subreddit:
func fetchAllImageLinks(subreddit string) chan string {
    out := make(chan string)
    go (func() {
        defer close(out)
        for url := range urlGenerator(subreddit) {
            for link := range scrapeURL(url, true) {
                out <- link
            }
        }
    })()
    return out
}

// Given a subreddit name, returns a channel of URLs to scrape:
func urlGenerator(seed string) chan string {
    out := make(chan string)
    base := fmt.Sprintf(base_url_fmt, seed)
    go (func() {
        out <- base
        for n := 1; n <= 2; n++ {
            out <- fmt.Sprintf(next_url_fmt, base, n)
        }
        close(out)
    })()
    return out
}

// Will perform an HTTP request, and return the code + content as a string:
func httpGet(url string) (body string, status int) {
    log.Printf("Fetching %s...", url)
    resp, err := http.Get(url)
    if err != nil {
        log.Printf("HTTP FAIL (%s) : %v", url, err)
        return "", 999
    }
    defer resp.Body.Close()

    bytes, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        log.Printf("HTTP READ FAIL (%s) : %v", url, err)
        return "", 999
    }

    return string(bytes), resp.StatusCode
}

// Given an html node, will try to find a val in the attribute list: (linear)
func getNodeAttr(n *html.Node, name string) string {
    for _, a := range n.Attr {
        if a.Key == name {
            return a.Val
        }
    }
    return ""
}

// Parses images out of the html + the data-page thing:
func parseListHtml(body string) (pageNum int, urls chan string) {
    pageNum, urls = -1, make(chan string)

    tree, err := html.Parse(strings.NewReader(body))
    if err != nil {
        panic(err)
    }

    // We recieve a single value on this, which is the page num:
    pageNumSent, pageNumChan := false, make(chan int)
    defer close(pageNumChan)

    // Recursive func to search for <a> links
    var _findImages func(*html.Node)
    _findImages = func(n *html.Node) {
        if n.Type == html.ElementNode && n.Data == "a" {
            elemHref := getNodeAttr(n, "href")
            elemPage := getNodeAttr(n, "data-page")
            elemClass := getNodeAttr(n, "class")

            // If correct class:
            if elemClass == image_class_name {

                // First, check out the page number:
                pageNo, err := strconv.ParseInt(elemPage, 10, 32)
                if err == nil && !pageNumSent {
                    pageNumSent = true
                    pageNumChan <- int(pageNo)
                }

                // Then emit the href:
                urls <- elemHref
            }
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            _findImages(c)
        }
    }

    // Kick off the tree traversal async, and make sure we close afterwards:
    go (func() {
        _findImages(tree)
        close(urls)
    })()

    pageNum = <-pageNumChan
    return
}

// Will extract all image detail urls from a page:
func scrapeURL(url string, pageZero bool) chan string {
    out := make(chan string)
    go (func() {
        defer close(out)

        body, status := httpGet(url)
        log.Printf("%s :: Status (%d) Content Length (%d)", url, status, len(body))
        if status > 299 {
            log.Printf("%s :: Bad Status. Skipping.", url)
            return
        }

        pageNo, urls := parseListHtml(body)
        log.Printf("GOT PAGE NO %d", pageNo)

        for url := range urls {
            out <- url
        }
    })()
    return out
}

// Check to see if a file already exists and has content:
func fileExists(path) bool {
    stat, err := os.Stat()
    return os.IsExist(err) && stat.Size() > 0;
}

// Given the URL to a post detail page, returns the URL to download:
func fetchDownloadUrl(detailLink string) string {
    detailUrl := fmt.Sprintf(detail_url_fmt, detailLink)
    data, code := httpGet(detailUrl)
    if code > 999 {
        log.Printf("%s :: Bad Status. Skipping.", url)
        return ""
    }

    var _findThing func(*html.Node, state int) string
    _findThing = func(n *html.Node, state int) string {
        if n.Type == html.ElementNode {

            // Pluck the class, if it has one:
            klass := getNodeAttr(n, "class")
            tag := n.Data

            // Switch on state:
            switch {
            case state == state_normal && tag == "div" && klass == post_class_name:
                state = state_image
                for c := n.FirstChild; c != nil; c = c.NextSibling {
                    if maybe := _findImages(c, state_image); maybe != "" {
                        return maybe
                    }
                }

            case state == state_image && tag == "a":
                return getNodeAttr(n, "href")
            }

            // Else, just recurse:
            for c := n.FirstChild; c != nil; c = c.NextSibling {
                if maybe := _findImages(c, state); maybe != "" {
                    return maybe
                }
            }
        }
    }

    // Kick off the tree traversal async, and make sure we close afterwards:
    return _findImages(tree)
        close(urls)
    })()

    pageNum = <-pageNumChan
    return
}

// Will read from a channel, downloading links until the channel dies:
func imageWorker(urls chan string, workerName string) chan bool {
    out := make(chan bool)
    go (func() {
        defer close(out)
        for link := range urls {
            log.Printf("Image fetch %s handling %s", workerName, link)
        }
        out <- true
    })()
}

// Main func parses args, and sets things up:
func main() {
    if len(os.Args) == 1 {
        log.Fatalf("Not enough arguments")
    }

    subreddit := os.Args[1]

    for detail := range fetchAllImageLinks(subreddit) {
        log.Printf("Detail Link: %s", detail)
    }
}
```