package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"

	"golang.org/x/net/html"
)

// getHref pulls the href attribute from a Token, if one is present.
func getHref(t html.Token) (ok bool, href string) {
	// Iterate over the Token's attributes until we find an "href".
	for _, a := range t.Attr {
		if a.Key == "href" {
			href = a.Val
			ok = true
			break
		}
	}

	// A "bare" return returns the named result variables (ok, href)
	// declared in the function signature.
	return
}

// crawlPage extracts all http(s) links from a given webpage.
// (Not called by main, which reads local files, but kept for reuse.)
func crawlPage(url string, ch chan string, chFinished chan bool) {
	resp, err := http.Get(url)

	defer func() {
		// Notify that we're done after this function returns.
		chFinished <- true
	}()

	if err != nil {
		fmt.Println("ERROR: Failed to crawl \"" + url + "\"")
		return
	}

	b := resp.Body
	defer b.Close() // close Body when the function returns

	parse(b, ch)
}

// readFile extracts all http(s) links from a file.
func readFile(file string, ch chan string, chFinished chan bool) {
	reader, err := os.Open(file)

	defer func() {
		// Notify that we're done after this function returns.
		chFinished <- true
	}()

	if err != nil {
		// Don't log.Fatal here: one unreadable file shouldn't kill the
		// whole run (and the deferred notification above must still fire).
		log.Printf("ERROR: Failed to open %q: %v", file, err)
		return
	}
	defer reader.Close()

	parse(reader, ch)
}

// testStatus reports a URL on ch if it responds without an error status.
func testStatus(url string, ch chan string, chFinished chan bool) {
	resp, err := http.Get(url)

	defer func() {
		// Notify that we're done after this function returns.
		chFinished <- true
	}()

	if err != nil {
		fmt.Printf("ERROR: Failed to check %v: %v\n", url, err)
		return
	}
	defer resp.Body.Close()

	// 4xx and 5xx responses are treated as bad URLs.
	if resp.StatusCode >= 400 {
		// fmt.Printf("ERROR: Not good %v %v\n", url, resp.Status)
		return
	}

	ch <- url
}

// parse tokenizes the HTML in r and sends every absolute http(s) link
// found in an <a> tag on ch.
func parse(r io.Reader, ch chan string) {
	z := html.NewTokenizer(r)

	for {
		tt := z.Next()

		switch tt {
		case html.ErrorToken:
			// End of the document; we're done.
			return
		case html.StartTagToken:
			t := z.Token()

			// Check if the token is an <a> tag.
			if t.Data != "a" {
				continue
			}

			// Extract the href value, if there is one.
			ok, url := getHref(t)
			if !ok {
				continue
			}

			// Make sure the url begins with http(s).
			if strings.HasPrefix(url, "http") {
				ch <- url
			}
		}
	}
}

// readFiles parses every file in dir concurrently and returns the set of
// unique urls found.
func readFiles(dir string) map[string]bool {
	foundUrls := make(map[string]bool)

	// Channels
	chUrls := make(chan string)
	chFinished := make(chan bool)

	// Read the directory listing.
	files, err := os.ReadDir(dir)
	if err != nil {
		log.Fatal(err)
	}

	// Kick off the parsing (concurrently), skipping subdirectories,
	// and count how many goroutines were actually started.
	n := 0
	for _, file := range files {
		if file.IsDir() {
			continue
		}
		fmt.Println(file.Name())
		go readFile(filepath.Join(dir, file.Name()), chUrls, chFinished)
		n++
	}

	// Subscribe to both channels until every goroutine has finished.
	for c := 0; c < n; {
		select {
		case url := <-chUrls:
			foundUrls[url] = true
		case <-chFinished:
			c++
		}
	}

	close(chUrls)
	close(chFinished)

	// We're done! Print the results...
	fmt.Printf("\nFound %d unique urls:\n\n", len(foundUrls))

	return foundUrls
}

// testUrls checks every url concurrently and returns the set that
// responded without an error status.
func testUrls(urls []string) map[string]bool {
	chUrls := make(chan string, 5)
	chFinished := make(chan bool, 5)

	for _, url := range urls {
		go func(v string) {
			testStatus(v, chUrls, chFinished)
		}(url)
	}

	goodUrls := make(map[string]bool)

	for c := 0; c < len(urls); {
		select {
		case url := <-chUrls:
			goodUrls[url] = true
		case <-chFinished:
			c++
		}
	}

	fmt.Printf("\nFound %d good urls:\n\n", len(goodUrls))

	close(chUrls)
	close(chFinished)

	return goodUrls
}

func main() {
	if len(os.Args) < 2 {
		log.Fatalf("usage: %s <directory>", os.Args[0])
	}
	directory := os.Args[1]

	foundUrls := readFiles(directory)

	list := make([]string, 0, len(foundUrls))
	for k := range foundUrls {
		list = append(list, k)
	}
	sort.Strings(list)

	goodUrls := testUrls(list)

	list = make([]string, 0, len(goodUrls))
	for k := range goodUrls {
		list = append(list, k)
	}
	sort.Strings(list)

	for _, url := range list {
		fmt.Println(url)
	}
}
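
// Usage sketch: the program expects a single argument naming a directory of
// HTML files to scan for links ("pages" below is a hypothetical example):
//
//	go run . pages
//
// It prints each file name as it is read, the count of unique urls found,
// the count that responded without an error status, and then the sorted
// list of good urls.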