Skip to content

Instantly share code, notes, and snippets.

@yogesh-desai
Created September 26, 2017 07:57
Show Gist options
  • Save yogesh-desai/bf0d203afe2d151bbefbcc98b427a09c to your computer and use it in GitHub Desktop.
Save yogesh-desai/bf0d203afe2d151bbefbcc98b427a09c to your computer and use it in GitHub Desktop.

Revisions

  1. yogesh-desai created this gist Sep 26, 2017.
    388 changes: 388 additions & 0 deletions NewTokoChan.go
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,388 @@
    package main

    import (

    "golang.org/x/net/html"
    "net/http"
    "net/url"
    "runtime"
    "strings"
    "context"
    "bytes"
    "flag"
    "sync"
    "time"
    "fmt"
    "log"
    "os"

    // "github.com/PuerkitoBio/fetchbot"
    // "github.com/PuerkitoBio/goquery"

    cdp "github.com/knq/chromedp"
    cdpr "github.com/knq/chromedp/runner"

    )



    // Package-level state and command-line flags for the crawler.
    var (
    // baseurl is the string form of the parsed seed URL. main assigns it and
    // exploreURL only forwards links that start with it.
    baseurl string

    // Command-line flags

    // seed is the URL at which crawling starts.
    seed = flag.String("seed", "https://jeremywho.com", "seed URL")
    //seed = flag.String("seed", "https://www.tokopedia.com/", "seed URL")
    // cancelAfter, when non-zero, takes precedence over stopAfter as the
    // duration after which main signals completion.
    cancelAfter = flag.Duration("cancelafter", 0, "automatically cancel the fetchbot after a given time")
    // cancelAtURL is registered as a flag but is not read by the code shown here.
    cancelAtURL = flag.String("cancelat", "", "automatically cancel the fetchbot at a given URL")
    // stopAfter, when positive, is the duration after which main signals
    // completion (unless cancelAfter overrides it).
    stopAfter = flag.Duration("stopafter", 0, "automatically stop the fetchbot after a given time")
    // stopAtURL is registered as a flag but is not read by the code shown here.
    stopAtURL = flag.String("stopat", "", "automatically stop the fetchbot at a given URL")
    // memStats is the interval between memory-statistics reports; a value
    // that is not positive disables them.
    memStats = flag.Duration("memstats", 5 * time.Minute, "display memory statistics at a given interval")

    )

    // DoExtract is a worker loop. After an initial two-second pause it receives
    // URLs from chanURL one at a time and passes each to DoCDP. It never
    // returns.
    func DoExtract(chanURL chan string) {
        time.Sleep(2 * time.Second)

        for {
            DoCDP(<-chanURL)
        }
    }


    // main parses the flags, starts the processURL and DoExtract goroutines,
    // feeds them the seed URL and then blocks until a value arrives on the done
    // channel.
    func main() {
        flag.Parse()

        seedURL, err := url.Parse(*seed)
        check(err, "Error in parsing the seed url")
        log.Println("The URL: ", seedURL)

        // exploreURL uses baseurl to decide which links to forward.
        baseurl = seedURL.String()

        urlProcessor := make(chan string)
        done := make(chan bool)

        go processURL(urlProcessor, done)
        go DoExtract(urlProcessor)
        urlProcessor <- seedURL.String()

        // Memory statistics: one report now, one per interval in the
        // background, and a final one (after a forced GC) when main returns.
        if *memStats > 0 {
            printMemStats()
            runMemStats(*memStats)
            defer func() {
                runtime.GC()
                printMemStats()
            }()
        }

        // A positive -stopafter or -cancelafter arms a timer that signals done
        // when it fires. A non-zero -cancelafter overrides -stopafter.
        if *stopAfter > 0 || *cancelAfter > 0 {
            timeout := *stopAfter
            if *cancelAfter != 0 {
                timeout = *cancelAfter
            }

            go func() {
                <-time.After(timeout)
                fmt.Println("The given timeout has occured. Exiting...")
                done <- true
            }()
        }

        <-done
        fmt.Println("Done")

        /* Disabled reporting, kept for reference:
        if _, err := os.Stat(pwd() + "/TokoProductDetails.csv"); !os.IsNotExist(err) {
        log.Println("The output TSV file location: ", pwd() + "/TokoProductDetails.csv")
        } else {
        log.Println("Required data is not present in any of processed URLs.")
        }
        // Write the processed URLs to a file
        fmt.Println("Total no. of URLs processed: ", len(urls), "\nThe Processed URLs are in the file: ", WriteProcessedUrlsToFile(urls))
        */
    }

    // runMemStats starts a background goroutine that calls printMemStats once
    // per tick interval and returns immediately. Each call is made while
    // holding a mutex that is local to this function. The goroutine has no
    // stop condition.
    func runMemStats(tick time.Duration) {
        var mu sync.Mutex

        report := func() {
            mu.Lock()
            defer mu.Unlock()
            printMemStats()
        }

        go func() {
            for range time.Tick(tick) {
                report()
            }
        }()
    }

    // printMemStats logs a snapshot of runtime memory statistics: bytes
    // currently allocated and bytes allocated in total (both in Kb), the number
    // of completed GC cycles and the current goroutine count, framed by two
    // 72-character rules.
    func printMemStats() {
        var stats runtime.MemStats
        runtime.ReadMemStats(&stats)

        rule := strings.Repeat("=", 72)

        var report bytes.Buffer
        report.WriteString(rule + "\n")
        report.WriteString("Memory Profile:\n")
        fmt.Fprintf(&report, "\tAlloc: %d Kb\n", stats.Alloc/1024)
        fmt.Fprintf(&report, "\tTotalAlloc: %d Kb\n", stats.TotalAlloc/1024)
        fmt.Fprintf(&report, "\tNumGC: %d\n", stats.NumGC)
        fmt.Fprintf(&report, "\tGoroutines: %d\n", runtime.NumGoroutine())
        report.WriteString(rule)

        log.Println(report.String())
    }

    // processURL receives URLs from urlProcessor and starts one exploreURL
    // goroutine for every URL it has not seen before; URLs already recorded in
    // the visited set are dropped.
    //
    // When 15 seconds pass without any URL arriving, it prints the number of
    // pages explored, sends true on done and returns. Returning matters: the
    // loop would otherwise hit the idle timeout again and block forever on a
    // second send that nobody receives.
    func processURL(urlProcessor chan string, done chan bool) {
        visited := make(map[string]bool)
        for {
            select {
            case url := <-urlProcessor:
                if visited[url] {
                    continue
                }
                visited[url] = true
                go exploreURL(url, urlProcessor)
            case <-time.After(15 * time.Second):
                // The timer is re-armed on every iteration, so this is an
                // idle timeout, not an overall deadline.
                fmt.Printf("Explored %d pages\n", len(visited))
                done <- true
                return
            }
        }
    }

    // exploreURL fetches url with an HTTP GET, tokenizes the response body as
    // HTML and sends the href value of every <a> start tag that begins with
    // baseurl to urlProcessor. A failed request is printed and the page is
    // skipped.
    //
    // The sends on urlProcessor block until a receiver takes the value.
    func exploreURL(url string, urlProcessor chan string) {
        fmt.Printf("Visiting %s.\n", url)

        // http.Get goes through http.DefaultClient, which has no timeout, so a
        // server that stops responding would hang this goroutine forever. The
        // client still uses the shared default transport, so connections are
        // reused across calls.
        client := &http.Client{Timeout: 30 * time.Second}
        resp, err := client.Get(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        defer resp.Body.Close()

        z := html.NewTokenizer(resp.Body)
        for {
            switch z.Next() {
            case html.ErrorToken:
                // Reported at end of input as well as on read errors.
                return
            case html.StartTagToken:
                t := z.Token()
                if t.Data != "a" {
                    continue
                }
                for _, a := range t.Attr {
                    // Only follow links that stay under the seed URL.
                    if a.Key == "href" && strings.HasPrefix(a.Val, baseurl) {
                        urlProcessor <- a.Val
                    }
                }
            }
        }
    }


    //================================================================================
    //================================================================================
    // getProductInfo returns the chromedp task list that navigates to urlstr,
    // sleeps five seconds, waits for the element selected by sel (looked up by
    // ID) to become visible and then evaluates three JavaScript expressions:
    // the value of #product-id into pId, the value of #product-url into pUrl
    // and the outer HTML of the webyclip-widget-3 iframe body into res.
    //
    // The url parameter is accepted but not used.
    func getProductInfo(urlstr, sel string, res *[]byte, pId, pUrl, url *string) cdp.Tasks {
        const (
            productIDExpr  = "document.getElementById('product-id').value;"
            productURLExpr = "document.getElementById('product-url').value;"
            widgetHTMLExpr = "document.getElementById('webyclip-widget-3').contentWindow.document.body.outerHTML;"
        )

        tasks := cdp.Tasks{
            cdp.Navigate(urlstr),
            cdp.Sleep(5 * time.Second),
            cdp.WaitVisible(sel, cdp.ByID),
            cdp.EvaluateAsDevTools(productIDExpr, pId),
            cdp.EvaluateAsDevTools(productURLExpr, pUrl),
            cdp.EvaluateAsDevTools(widgetHTMLExpr, res),
        }
        return tasks
    }

    // isPresent returns the chromedp task list that navigates to url, sleeps 15
    // seconds and then evaluates a JavaScript snippet into res. The snippet
    // yields the child count of the webyclip-thumbnails element when that
    // element exists, and otherwise only logs '0' to the browser console.
    func isPresent(url string, res *[]byte) cdp.Tasks {
        const thumbnailCountExpr = "if (document.getElementById('webyclip-thumbnails')) {document.getElementById('webyclip-thumbnails').childElementCount;} else {console.log('0')}"

        tasks := cdp.Tasks{
            cdp.Navigate(url),
            cdp.Sleep(15 * time.Second),
            cdp.EvaluateAsDevTools(thumbnailCountExpr, res),
        }
        return tasks
    }

    //================================================================================
    // getVideoLinks extracts YouTube video IDs from thumbnail references found
    // in buf and returns the matching watch URLs joined by commas.
    //
    // buf is split on whitespace; every field containing "i.ytimg.com/vi/"
    // contributes one link whose ID is the text between that marker and the
    // next "/". The marker is located wherever it occurs in the field, so the
    // attribute's quoting and URL scheme do not affect the extracted ID.
    //
    // It returns the empty string when buf contains no thumbnail reference.
    func getVideoLinks(buf []byte) string {
        const (
            marker   = "i.ytimg.com/vi/"
            watchURL = "https://www.youtube.com/watch?v="
        )

        var links []string
        for _, field := range strings.Fields(string(buf)) {
            start := strings.Index(field, marker)
            if start < 0 {
                continue
            }

            id := field[start+len(marker):]
            if end := strings.Index(id, "/"); end >= 0 {
                id = id[:end]
            }
            links = append(links, watchURL+id)
        }

        // Join yields "" for no links, where slicing off a trailing comma
        // would panic on an empty string.
        return strings.Join(links, ",")
    }

    //========================================================================================
    // WriteToFile appends record, followed by a newline, to the file at
    // filePath. When the file does not exist yet it is created and a
    // tab-separated header line is written before the record.
    //
    // Failures are logged, not returned. Only a "does not exist" error from the
    // first open triggers file creation; any other open error is logged with
    // its cause. Write and close errors are logged as well.
    func WriteToFile(filePath, record string) {
        const header = "Product_ID\tProduct_URL\tYoutube_Video_URLs"

        content := record + "\n"

        f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND, 0644)
        switch {
        case err == nil:
            log.Println("File exists Already. Adding the data for url.")
        case os.IsNotExist(err):
            log.Println("File doesn't exists. File will be created with the headers before adding data.")
            f, err = os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0644)
            if err != nil {
                log.Println("File Open operation failed.", err)
                return
            }
            // Header and first record go out in a single append.
            content = header + "\n" + content
        default:
            log.Println("File Open operation failed.", err)
            return
        }

        if _, err := f.WriteString(content); err != nil {
            log.Println("File write operation failed.", err)
        }
        if err := f.Close(); err != nil {
            log.Println("File close operation failed.", err)
        }
    }

    //================================================================================

    // WriteProcessedUrlsToFile appends every entry of urls, one per line, to
    // ProcessedURLs.csv in the directory returned by pwd, creating the file if
    // needed, and returns the file's path.
    //
    // Open, write and close failures all go through check, so any of them ends
    // the program, where write failures used to be dropped silently.
    func WriteProcessedUrlsToFile(urls []string) string {
        filePath := pwd() + "/ProcessedURLs.csv"
        f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0644)
        check(err, "Error in file Open operation")

        // One write for the whole batch; nothing is written for an empty list.
        var writeErr error
        if len(urls) > 0 {
            _, writeErr = f.WriteString(strings.Join(urls, "\n") + "\n")
        }
        // Close before checking so the descriptor is released even when the
        // write failed and check ends the program.
        closeErr := f.Close()

        check(writeErr, "Error in file Write operation")
        check(closeErr, "Error in file Close operation")
        return filePath
    }

    //================================================================================
    // check ends the program when err is non-nil: it hands err and str to
    // log.Fatalln, which logs them and exits. A nil err is a no-op.
    func check(err error, str string) {
        if err == nil {
            return
        }
        log.Fatalln(err, str)
    }

    // pwd reports the process's current working directory, which the program
    // uses as the location of its output files. A lookup failure is passed to
    // check.
    func pwd() string {
        dir, err := os.Getwd()
        check(err, "Error in getting current workig dir.")
        return dir
    }
    //================================================================================

    // DoCDP drives one Chrome instance through a single page. It first runs the
    // isPresent tasks; when they yield no result, or a result equal to "0", it
    // logs that the page has no widget, shuts Chrome down and returns.
    // Otherwise it runs the getProductInfo tasks, shuts Chrome down, turns the
    // captured iframe HTML into video links with getVideoLinks and appends a
    // tab-separated record (product ID, product URL, links) to
    // TokoProductDetails.csv in the working directory.
    //
    // Every error is passed to check.
    func DoCDP(url string) {
        ctxt, cancel := context.WithCancel(context.Background())
        defer cancel()

        // Start Chrome with web security disabled.
        c, err := cdp.New(ctxt, cdp.WithRunnerOptions(cdpr.Flag("disable-web-security", "1")))
        check(err, "Error in creating new cdp instance")

        // stopChrome requests shutdown and then waits for Chrome to exit.
        stopChrome := func() {
            check(c.Shutdown(ctxt), "Error in shutting down chrome")
            check(c.Wait(), "Error in wait to shutdown chrome")
        }

        // Probe the page for the widget's thumbnails.
        var probe []byte
        err = c.Run(ctxt, isPresent(url, &probe))
        check(err, "Error in Run method of cdp")

        if len(probe) == 0 || bytes.EqualFold([]byte("0"), probe) {
            log.Println("No webyclip-widget-3 on page:\n ", url)
            stopChrome()
            return
        }

        // The probe found thumbnails: collect the product data.
        var (
            widgetHTML []byte
            pId, pUrl  string
        )
        err = c.Run(ctxt, getProductInfo(url, `#webyclip-widget-3`, &widgetHTML, &pId, &pUrl, &url))
        check(err, "Error in Run method of cdp")

        stopChrome()

        pLinks := getVideoLinks(widgetHTML)
        record := pId + "\t" + pUrl + "\t" + pLinks
        WriteToFile(pwd()+"/TokoProductDetails.csv", record)
    }