@raedatoui
Last active January 10, 2017 17:56

Revisions

  1. raedatoui renamed this gist Jan 10, 2017. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. raedatoui revised this gist Jan 10, 2017. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions scrapper.go
    @@ -121,23 +121,23 @@ func parse(r io.Reader, ch chan string) {
     	}
     }
     
    -func readFiles() map[string]bool {
    +func readFiles(dir string) map[string]bool {
     	foundUrls := make(map[string]bool)
     
     	// Channels
     	chUrls := make(chan string)
     	chFinished := make(chan bool)
     
     	// read files from directory
    -	files, err := ioutil.ReadDir("files")
    +	files, err := ioutil.ReadDir(dir)
     	if err != nil {
     		log.Fatal(err)
     	}
     
     	// Kick off the crawl process (concurrently)
     	for _, file := range files {
     		fmt.Println(file.Name())
    -		go readFile("files/"+file.Name(), chUrls, chFinished)
    +		go readFile(dir+file.Name(), chUrls, chFinished)
     	}
     
     	// Subscribe to both channels
    @@ -184,8 +184,8 @@ func testUrls(urls []string) map[string]bool {
     }
     
     func main() {
    -
    -	foundUrls := readFiles()
    +	directory := os.Args[1]
    +	foundUrls := readFiles(directory)
     
     	list := make([]string, len(foundUrls))
     	i := 0
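
    With this revision the directory is taken from the command line (os.Args[1]) instead of being hard-coded to "files". Note that readFile builds paths by plain concatenation (dir+file.Name()), so the argument needs a trailing separator, e.g. go run scrapper.go files/.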
  3. raedatoui revised this gist Jan 10, 2017. No changes.
  4. raedatoui revised this gist Jan 10, 2017. No changes.
  5. raedatoui created this gist Jan 10, 2017.
    212 changes: 212 additions & 0 deletions scrapper.go
    @@ -0,0 +1,212 @@
    package main
    
    import (
    	"fmt"
    	"io"
    	"io/ioutil"
    	"log"
    	"net/http"
    	"os"
    	"sort"
    	"strings"
    
    	"golang.org/x/net/html"
    )

    // Helper function to pull the href attribute from a Token
    func getHref(t html.Token) (ok bool, href string) {
    	// Iterate over all of the Token's attributes until we find an "href"
    	for _, a := range t.Attr {
    		if a.Key == "href" {
    			href = a.Val
    			ok = true
    		}
    	}
    
    	// "bare" return will return the variables (ok, href) as defined in
    	// the function definition
    	return
    }

    // Extract all http** links from a given webpage
    func crawlPage(url string, ch chan string, chFinished chan bool) {
    	resp, err := http.Get(url)
    
    	defer func() {
    		// Notify that we're done after this function
    		chFinished <- true
    	}()
    
    	if err != nil {
    		fmt.Println("ERROR: Failed to crawl \"" + url + "\"")
    		return
    	}
    
    	b := resp.Body
    	defer b.Close() // close Body when the function returns
    	parse(b, ch)
    }

    // Extract all http** links from a file
    func readFile(file string, ch chan string, chFinished chan bool) {
    	reader, err := os.Open(file)
    
    	defer func() {
    		// Notify that we're done after this function
    		chFinished <- true
    	}()
    
    	if err != nil {
    		// Log and skip this file; log.Fatal here would abort the whole
    		// run (and bypass deferred calls) over one unreadable file,
    		// making the return below unreachable.
    		log.Println(err)
    		return
    	}
    
    	defer reader.Close()
    
    	parse(reader, ch)
    }

    func testStatus(url string, ch chan string, chFinished chan bool) {
    	resp, err := http.Get(url)
    
    	defer func() {
    		// Notify that we're done after this function
    		chFinished <- true
    	}()
    
    	if err != nil {
    		fmt.Printf("ERROR: Failed to check %v %v\n", url, err)
    		return
    	}
    	defer resp.Body.Close() // release the connection
    
    	// Treat 4xx/5xx as bad; `> 400` would let 400 Bad Request through
    	if resp.StatusCode >= 400 {
    		//fmt.Printf("ERROR: Not good %v %v\n", url, resp.Status)
    		return
    	}
    
    	ch <- url
    }

    func parse(r io.Reader, ch chan string) {
    	z := html.NewTokenizer(r)
    
    	for {
    		tt := z.Next()
    
    		switch {
    		case tt == html.ErrorToken:
    			// End of the document, we're done
    			return
    		case tt == html.StartTagToken:
    			t := z.Token()
    
    			// Check if the token is an <a> tag
    			isAnchor := t.Data == "a"
    			if !isAnchor {
    				continue
    			}
    
    			// Extract the href value, if there is one
    			ok, url := getHref(t)
    			if !ok {
    				continue
    			}
    
    			// Make sure the url begins with http**
    			hasProto := strings.HasPrefix(url, "http")
    			if hasProto {
    				ch <- url
    			}
    		}
    	}
    }

    func readFiles() map[string]bool {
    	foundUrls := make(map[string]bool)
    
    	// Channels
    	chUrls := make(chan string)
    	chFinished := make(chan bool)
    
    	// read files from directory
    	files, err := ioutil.ReadDir("files")
    	if err != nil {
    		log.Fatal(err)
    	}
    
    	// Kick off the crawl process (concurrently)
    	for _, file := range files {
    		fmt.Println(file.Name())
    		go readFile("files/"+file.Name(), chUrls, chFinished)
    	}
    
    	// Subscribe to both channels
    	for c := 0; c < len(files); {
    		select {
    		case url := <-chUrls:
    			foundUrls[url] = true
    		case <-chFinished:
    			c++
    		}
    	}
    
    	close(chUrls)
    	close(chFinished)
    	// We're done! Print the results...
    	fmt.Println("\nFound", len(foundUrls), "unique urls:\n")
    	return foundUrls
    }

    func testUrls(urls []string) map[string]bool {
    	testUrls := make(chan string, 5)
    	testFinished := make(chan bool, 5)
    
    	for _, url := range urls {
    		go func(v string) {
    			testStatus(v, testUrls, testFinished)
    		}(url)
    	}
    
    	goodUrls := make(map[string]bool)
    
    	for c := 0; c < len(urls); {
    		select {
    		case url := <-testUrls:
    			goodUrls[url] = true
    		case <-testFinished:
    			c++
    		}
    	}
    	fmt.Println("\nFound", len(goodUrls), "good urls:\n")
    	close(testUrls)
    	close(testFinished)
    	return goodUrls
    }

    func main() {
    
    	foundUrls := readFiles()
    
    	list := make([]string, len(foundUrls))
    	i := 0
    	for k := range foundUrls {
    		list[i] = k
    		i++
    	}
    	sort.Strings(list)
    
    	goodUrls := testUrls(list)
    
    	list = make([]string, len(goodUrls))
    	i = 0
    	for k := range goodUrls {
    		list[i] = k
    		i++
    	}
    	sort.Strings(list)
    
    	for _, url := range list {
    		fmt.Println(url)
    	}
    }
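
    As created in this first revision, readFiles scans a hard-coded files/ directory, so the program is run from a directory containing one (after fetching the single external dependency with go get golang.org/x/net/html). For illustration, a minimal sketch of driving parse on an in-memory document; the file name, sample HTML, and exampleParse function are hypothetical, not part of the gist:

    // parse_example.go — hypothetical companion file in the same package.
    package main
    
    import (
    	"fmt"
    	"strings"
    )
    
    // exampleParse feeds an in-memory document through parse and prints
    // every absolute http(s) link it emits; the relative href is skipped.
    func exampleParse() {
    	doc := `<a href="http://example.com">a</a> <a href="/rel">b</a>`
    	ch := make(chan string, 2)
    	go func() {
    		parse(strings.NewReader(doc), ch) // returns at html.ErrorToken (EOF)
    		close(ch)
    	}()
    	for url := range ch {
    		fmt.Println(url) // prints only http://example.com
    	}
    }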