Last active
January 10, 2017 17:56
-
-
Save raedatoui/b33fac34fb24ae5ecaabd5f7b3b67e0c to your computer and use it in GitHub Desktop.
Revisions
-
raedatoui renamed this gist
Jan 10, 2017 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
raedatoui revised this gist
Jan 10, 2017 . 1 changed file with 5 additions and 5 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -121,23 +121,23 @@ func parse(r io.Reader, ch chan string) { } } func readFiles(dir string) map[string]bool{ foundUrls := make(map[string]bool) // Channels chUrls := make(chan string) chFinished := make(chan bool) // read files from directory files, err := ioutil.ReadDir(dir) if err != nil { log.Fatal(err) } // Kick off the crawl process (concurrently) for _, file := range files { fmt.Println(file.Name()) go readFile(dir+file.Name(), chUrls, chFinished) } // Subscribe to both channels @@ -184,8 +184,8 @@ func testUrls(urls []string) map[string]bool { } func main() { directory := os.Args[1] foundUrls := readFiles(directory) list := make([]string, len(foundUrls)) i := 0 -
raedatoui revised this gist
Jan 10, 2017 . No changes.There are no files selected for viewing
-
raedatoui revised this gist
Jan 10, 2017 . No changes.There are no files selected for viewing
-
raedatoui created this gist
Jan 10, 2017 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,212 @@ package main import ( "fmt" "golang.org/x/net/html" "io" "log" "net/http" "os" "sort" "strings" "io/ioutil" ) // Helper function to pull the href attribute from a Token func getHref(t html.Token) (ok bool, href string) { // Iterate over all of the Token's attributes until we find an "href" for _, a := range t.Attr { if a.Key == "href" { href = a.Val ok = true } } // "bare" return will return the variables (ok, href) as defined in // the function definition return } // Extract all http** links from a given webpage func crawlPage(url string, ch chan string, chFinished chan bool) { resp, err := http.Get(url) defer func() { // Notify that we're done after this function chFinished <- true }() if err != nil { fmt.Println("ERROR: Failed to crawl \"" + url + "\"") return } b := resp.Body defer b.Close() // close Body when the function returns parse(b, ch) } // Extract all http** links from a file func readFile(file string, ch chan string, chFinished chan bool) { reader, err := os.Open(file) defer func() { // Notify that we're done after this function chFinished <- true }() if err != nil { log.Fatal(err) return } defer reader.Close() parse(reader, ch) } func testStatus(url string, ch chan string, chFinished chan bool) { resp, err := http.Get(url) defer func() { // Notify that we're done after this function chFinished <- true }() if err != nil { fmt.Printf("ERROR: Failed to check %v %v\n", url, err) return } if resp.StatusCode > 400 { //fmt.Printf("ERROR: Not good %v %v\n", url , resp.Status) return } ch <- url } func parse(r io.Reader, ch chan string) { z := html.NewTokenizer(r) for { tt := z.Next() switch { case tt == html.ErrorToken: // End of the document, we're done return case tt == html.StartTagToken: t := z.Token() // Check if the token is an <a> tag isAnchor := t.Data == "a" if !isAnchor { continue } // Extract the href value, if there is one ok, url := getHref(t) if !ok { continue } // Make sure the url begines in http** hasProto := strings.Index(url, "http") == 0 if hasProto { ch <- url } } } } func readFiles() map[string]bool{ foundUrls := make(map[string]bool) // Channels chUrls := make(chan string) chFinished := make(chan bool) // read files from directory files, err := ioutil.ReadDir("files") if err != nil { log.Fatal(err) } // Kick off the crawl process (concurrently) for _, file := range files { fmt.Println(file.Name()) go readFile("files/"+file.Name(), chUrls, chFinished) } // Subscribe to both channels for c := 0; c < len(files); { select { case url := <-chUrls: foundUrls[url] = true case <-chFinished: c++ } } close(chUrls) close(chFinished) // We're done! Print the results... fmt.Println("\nFound", len(foundUrls), "unique urls:\n") return foundUrls } func testUrls(urls []string) map[string]bool { testUrls := make(chan string, 5) testFinished := make(chan bool, 5) for _, url := range urls { go func(v string) { testStatus(v, testUrls, testFinished) }(url) } goodUrls := make(map[string]bool) for c := 0; c < len(urls); { select { case url := <-testUrls: goodUrls[url] = true case <-testFinished: c++ } } fmt.Println("\nFound", len(goodUrls), "good urls:\n") close(testUrls) close(testFinished) return goodUrls } func main() { foundUrls := readFiles() list := make([]string, len(foundUrls)) i := 0 for k := range foundUrls { list[i] = k i++ } sort.Strings(list) goodUrls := testUrls(list) list = make([]string, len(goodUrls)) i = 0 for k := range goodUrls { list[i] = k i++ } sort.Strings(list) for _, url := range list { fmt.Println(url) } }