Skip to content

Instantly share code, notes, and snippets.

@Axure
Created January 7, 2018 09:06
Show Gist options
  • Select an option

  • Save Axure/0197f6a26e1d2e371639da0383e7f5c5 to your computer and use it in GitHub Desktop.

Select an option

Save Axure/0197f6a26e1d2e371639da0383e7f5c5 to your computer and use it in GitHub Desktop.

Revisions

  1. Axure revised this gist Jan 7, 2018. 1 changed file with 0 additions and 3 deletions.
    3 changes: 0 additions & 3 deletions crawl.go
    Original file line number Diff line number Diff line change
    @@ -20,9 +20,6 @@ type FetchStore struct {
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(store *FetchStore, url string, depth int, fetcher Fetcher) {
    // TODO: Fetch URLs in parallel.
    // TODO: Don't fetch the same URL twice.
    // This implementation doesn't do either:
    //fmt.Println("[Got task]", url)
    if depth <= 0 {
    return
  2. Axure created this gist Jan 7, 2018.
    122 changes: 122 additions & 0 deletions crawl.go
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,122 @@
    package main

    import (
    "fmt"
    "sync"
    )

    // Fetcher is the page-retrieval abstraction that Crawl uses to load a URL.
    type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    // A non-nil err means the page could not be retrieved.
    Fetch(url string) (body string, urls []string, err error)
    }

    // FetchStore is the state shared by every Crawl goroutine of one crawl.
    // Both maps must be initialised with make before the store is handed
    // to Crawl, and the store must be passed by pointer because it
    // contains a mutex.
    type FetchStore struct {
    // v maps each successfully fetched URL to its page body.
    v map[string]string
    // done records the URLs that some Crawl call has already claimed,
    // whether or not the fetch succeeded.
    done map[string]bool
    // mux is locked by Crawl around its check-and-set of done.
    mux sync.Mutex
    }

    // Crawl uses fetcher to recursively crawl pages starting with url, to a
    // maximum of depth. The links found on a page are crawled in parallel,
    // and Crawl returns only after every crawl it started has finished.
    //
    // store carries the state shared by all goroutines of the crawl: a URL
    // is claimed in store.done before it is fetched, so it is fetched at
    // most once, and the body of every successfully fetched page is
    // recorded in store.v. Both maps must be non-nil. Fetch errors are
    // printed to standard output and end that branch of the crawl; the URL
    // stays marked as done and is not retried.
    func Crawl(store *FetchStore, url string, depth int, fetcher Fetcher) {
    	if depth <= 0 {
    		return
    	}

    	// Claim the URL while holding the lock so that two goroutines that
    	// discover the same link cannot both decide to fetch it.
    	store.mux.Lock()
    	if store.done[url] {
    		store.mux.Unlock()
    		return
    	}
    	store.done[url] = true
    	store.mux.Unlock()

    	body, urls, err := fetcher.Fetch(url)
    	if err != nil {
    		fmt.Println(err)
    		return
    	}
    	fmt.Printf("[found]: %s %q\n", url, body)

    	// store.v is written by many goroutines at once. Unsynchronised map
    	// writes are a data race and can abort the program with
    	// "concurrent map writes", so the write takes the same lock as done.
    	store.mux.Lock()
    	store.v[url] = body
    	store.mux.Unlock()

    	// Crawl every link concurrently and wait for all of them, so the
    	// caller can safely read the store once Crawl returns.
    	var wg sync.WaitGroup
    	for _, u := range urls {
    		wg.Add(1)
    		// Pass u as an argument so each goroutine gets its own copy,
    		// independent of the loop-variable semantics of the Go version.
    		go func(link string) {
    			defer wg.Done()
    			Crawl(store, link, depth-1, fetcher)
    		}(u)
    	}
    	wg.Wait()
    }

    // main crawls the canned site served by fetcher, starting at
    // http://golang.org/ with a depth limit of 4, and then prints what the
    // crawl collected.
    func main() {
    	store := FetchStore{
    		v:    make(map[string]string),
    		done: make(map[string]bool),
    	}
    	Crawl(&store, "http://golang.org/", 4, fetcher)
    	// Print the maps rather than the struct: passing store itself to
    	// Println would copy the sync.Mutex it contains, which go vet's
    	// copylocks check rejects.
    	fmt.Println("store", store.v, store.done)
    }

    // fakeFetcher is an in-memory Fetcher: it maps a URL to the canned
    // result that Fetch should return for it.
    type fakeFetcher map[string]*fakeResult

    // fakeResult is the canned outcome for one URL: the page body and the
    // links found on that page.
    type fakeResult struct {
    	body string
    	urls []string
    }

    // Fetch looks url up in the canned results. A known URL yields its body
    // and links with a nil error; an unknown URL yields an empty body, nil
    // links and a "not found" error naming the URL.
    func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    	page, known := f[url]
    	if !known {
    		return "", nil, fmt.Errorf("not found: %s", url)
    	}
    	return page.body, page.urls, nil
    }

    // fetcher is the canned four-page site used by main. Note that
    // http://golang.org/cmd/ is linked to but has no entry, so fetching it
    // fails with a "not found" error.
    var fetcher = fakeFetcher{
    	"http://golang.org/": &fakeResult{
    		body: "The Go Programming Language",
    		urls: []string{
    			"http://golang.org/pkg/",
    			"http://golang.org/cmd/",
    		},
    	},
    	"http://golang.org/pkg/": &fakeResult{
    		body: "Packages",
    		urls: []string{
    			"http://golang.org/",
    			"http://golang.org/cmd/",
    			"http://golang.org/pkg/fmt/",
    			"http://golang.org/pkg/os/",
    		},
    	},
    	"http://golang.org/pkg/fmt/": &fakeResult{
    		body: "Package fmt",
    		urls: []string{
    			"http://golang.org/",
    			"http://golang.org/pkg/",
    		},
    	},
    	"http://golang.org/pkg/os/": &fakeResult{
    		body: "Package os",
    		urls: []string{
    			"http://golang.org/",
    			"http://golang.org/pkg/",
    		},
    	},
    }