Skip to content

Instantly share code, notes, and snippets.

@imxiaohui
Forked from gmolveau/solution1.go
Created January 30, 2021 16:07
Show Gist options
  • Save imxiaohui/9d320ab83ce611f923a092954e8bac03 to your computer and use it in GitHub Desktop.
Save imxiaohui/9d320ab83ce611f923a092954e8bac03 to your computer and use it in GitHub Desktop.
Go Tour concurrency crawler exercise with goroutines https://tour.golang.org/concurrency/10
/*
inspired by https://eduardolezcano.com/a-tour-of-go-web-crawler/
and https://github.com/fgrehm/go-tour/blob/master/73-web-crawler-golang-team-solution.go
solution with a "safe" map with mutex + a sub-function crawl with WaitGroup
this solution keeps the original signature of the Crawl function and does not change the main function
but does not use or store the body of the URL
*/
package main
import (
"fmt"
"sync"
)
// SafeMap is a mutex-guarded map of URLs to the error (nil on success)
// produced when each URL was fetched.
type SafeMap struct {
	urls map[string]error
	mux  sync.Mutex
}

// Find reports whether url has already been recorded.
func (s *SafeMap) Find(url string) (found bool) {
	s.mux.Lock()
	_, found = s.urls[url]
	s.mux.Unlock()
	return
}

// AddOrUpdate records url with its fetch result err, replacing any
// earlier entry for the same url.
func (s *SafeMap) AddOrUpdate(url string, err error) {
	s.mux.Lock()
	s.urls[url] = err
	s.mux.Unlock()
}
// crawl fetches url, records the outcome in urlsFetched, and recursively
// crawls every linked URL until depth is exhausted. Every invocation must
// be preceded by wg.Add(1); the deferred Done fires on all return paths.
func crawl(url string, depth int, fetcher Fetcher, urlsFetched *SafeMap, wg *sync.WaitGroup) {
defer wg.Done()
if depth <= 0 {
return
}
// NOTE(review): Find and the AddOrUpdate below are two separate critical
// sections, so two goroutines racing on the same URL can both pass this
// check and fetch it twice. Harmless here (the map keeps one entry per
// URL), but an atomic test-and-set on SafeMap would close the window.
if urlsFetched.Find(url) {
return
}
// Mark the URL as in-flight before fetching so sibling goroutines skip it.
urlsFetched.AddOrUpdate(url, nil)
_, urls, err := fetcher.Fetch(url) // the page body is deliberately discarded
// Overwrite the placeholder with the final fetch outcome (nil or error).
urlsFetched.AddOrUpdate(url, err)
if err != nil {
return
}
// One wg.Add per child, matched by that child's deferred wg.Done.
for _, newUrl := range urls {
wg.Add(1)
go crawl(newUrl, depth-1, fetcher, urlsFetched, wg)
}
}
// Crawl starts a concurrent crawl at url, waits for the entire crawl tree
// to finish, then prints a status line for every URL visited.
func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}
	visited := SafeMap{urls: make(map[string]error)}
	var wg sync.WaitGroup
	wg.Add(1)
	go crawl(url, depth, fetcher, &visited, &wg)
	wg.Wait() // block until every crawl goroutine has called Done
	// All goroutines have exited, so the map can be read without the mutex.
	for u, e := range visited.urls {
		if e == nil {
			fmt.Printf("%v was fetched\n", u)
		} else {
			fmt.Printf("%v failed: %v\n", u, e)
		}
	}
}
// main runs the Tour of Go crawler exercise against the canned fetcher,
// starting at the Go homepage with a maximum depth of 4.
func main() {
Crawl("https://golang.org/", 4, fetcher)
}
// Fetcher abstracts page retrieval so the crawler can be exercised with
// the canned fakeFetcher defined below.
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
// err is non-nil when the URL could not be retrieved.
Fetch(url string) (body string, urls []string, err error)
}
// fakeFetcher is a Fetcher that serves canned results from an in-memory map.
type fakeFetcher map[string]*fakeResult

// fakeResult holds the canned page body and outgoing links for one URL.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks url up in the canned map; unknown URLs yield an error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	res, ok := f[url]
	if !ok {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is a populated fakeFetcher.
// The canned pages link to each other (including back-links), giving the
// crawler cycles that it must deduplicate.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
/*
inspired by https://eduardolezcano.com/a-tour-of-go-web-crawler/
+ https://github.com/fgrehm/go-tour/blob/master/73-web-crawler-golang-team-solution.go
another solution with a Crawler struct, with a "safe" map with mutex
and a Crawl method with WaitGroup
this solution keeps the original signature of the Crawl function and does not change the main function
but does not use or store the body of the URL
*/
package main
import (
"fmt"
"sync"
)
// Crawler carries the shared state for one crawl: the set of visited
// URLs, a WaitGroup counting in-flight goroutines, and the page Fetcher.
type Crawler struct {
// urls records every URL seen, mapped to its fetch error (nil on success).
urls SafeMap
// wg is incremented before each goroutine starts and decremented via
// defer inside the Crawl method.
wg sync.WaitGroup
// fetcher loads pages; populated by the top-level Crawl function.
fetcher Fetcher
}
// SafeMap wraps a map of URL -> fetch error (nil on success) behind a
// mutex so it can be shared between goroutines.
type SafeMap struct {
	m   map[string]error
	mux sync.Mutex
}

// find reports whether url is already present in the map.
func (s *SafeMap) find(url string) (found bool) {
	s.mux.Lock()
	_, found = s.m[url]
	s.mux.Unlock()
	return
}

// addOrUpdate stores err under url, replacing any previous value.
func (s *SafeMap) addOrUpdate(url string, err error) {
	s.mux.Lock()
	s.m[url] = err
	s.mux.Unlock()
}
// Crawl fetches url, records the result in c.urls, and recursively crawls
// every linked URL until depth runs out. Callers must c.wg.Add(1) before
// each invocation; the deferred Done fires on every return path.
func (c *Crawler) Crawl(url string, depth int) {
	defer c.wg.Done()
	if depth <= 0 {
		return
	}
	if c.urls.find(url) {
		return
	}
	// Mark the URL as in-flight so concurrent siblings skip it. (find and
	// addOrUpdate are separate critical sections, so a rare duplicate
	// fetch is still possible, but the map stays consistent.)
	c.urls.addOrUpdate(url, nil)
	// Bug fix: use the Fetcher stored on the Crawler rather than the
	// package-level `fetcher` global — the field was assigned by the
	// top-level Crawl function but never read.
	_, urls, err := c.fetcher.Fetch(url)
	// Record the final outcome (nil or the fetch error).
	c.urls.addOrUpdate(url, err)
	if err != nil {
		return
	}
	// One wg.Add per child goroutine, matched by its deferred wg.Done.
	for _, link := range urls {
		c.wg.Add(1)
		go c.Crawl(link, depth-1)
	}
}
// Crawl builds a Crawler seeded with fetcher, runs the crawl to the given
// depth, and prints the status of every URL visited once it completes.
func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}
	c := Crawler{
		urls:    SafeMap{m: make(map[string]error)},
		fetcher: fetcher,
	}
	c.wg.Add(1)
	go c.Crawl(url, depth)
	c.wg.Wait() // block until every goroutine has called Done
	// All workers have exited, so the map can be read without the mutex.
	for u, e := range c.urls.m {
		if e == nil {
			fmt.Printf("%v was fetched\n", u)
		} else {
			fmt.Printf("%v failed: %v\n", u, e)
		}
	}
}
// main runs the Tour of Go crawler exercise against the canned fetcher,
// starting at the Go homepage with a maximum depth of 4.
func main() {
Crawl("https://golang.org/", 4, fetcher)
}
// Fetcher abstracts page retrieval so the crawler can be exercised with
// the canned fakeFetcher defined below.
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
// err is non-nil when the URL could not be retrieved.
Fetch(url string) (body string, urls []string, err error)
}
// fakeFetcher is a Fetcher that serves canned results from an in-memory map.
type fakeFetcher map[string]*fakeResult

// fakeResult holds the canned page body and outgoing links for one URL.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks url up in the canned map; unknown URLs yield an error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	res, known := f[url]
	if !known {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is a populated fakeFetcher.
// The canned pages link to each other (including back-links), giving the
// crawler cycles that it must deduplicate.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
/*
inspired by https://eduardolezcano.com/a-tour-of-go-web-crawler/
and https://github.com/fgrehm/go-tour/blob/master/73-web-crawler-golang-team-solution.go
solution with a "safe" map with mutex
WITHOUT a WaitGroup
*/
package main
import (
"fmt"
"sync"
)
// SafeMap is a mutex-protected map of URL -> fetch error (nil on
// success), safe for use from multiple goroutines.
type SafeMap struct {
	m   map[string]error
	mux sync.Mutex
}

// Find reports whether url has already been recorded.
func (s *SafeMap) Find(url string) (found bool) {
	s.mux.Lock()
	_, found = s.m[url]
	s.mux.Unlock()
	return
}

// AddOrUpdate stores err under url, overwriting any earlier entry.
func (s *SafeMap) AddOrUpdate(url string, err error) {
	s.mux.Lock()
	s.m[url] = err
	s.mux.Unlock()
}
// Crawl fetches url, records the outcome in the package-level `fetched`
// map, and recursively crawls linked pages down to the given depth.
// Child completion is signalled one message per goroutine on the `done`
// channel instead of using a WaitGroup.
func Crawl(url string, depth int, fetcher Fetcher) {
if depth <= 0 {
return
}
// NOTE(review): Find and the AddOrUpdate below are two separate critical
// sections, so two goroutines racing on the same URL can both pass this
// check and fetch it twice — harmless here, but not fully deduplicated.
if fetched.Find(url){
return
}
// We mark the url to be loading to avoid others reloading it at the same time.
fetched.AddOrUpdate(url, nil)
// We load it concurrently.
_, urls, err := fetcher.Fetch(url)
// And update the status in a synced zone.
fetched.AddOrUpdate(url, err)
if err != nil {
return
}
// https://gobyexample.com/channel-synchronization
// When waiting for multiple goroutines to finish, you may prefer to use a WaitGroup
done := make(chan bool)
for _, u := range urls {
go func(url string) {
Crawl(url, depth-1, fetcher)
done <- true
}(u)
}
for range urls {
<-done // blocking operation
// a receive on a channel blocks until a value is sent;
// how do we know when there are no more URLs to scan?
// because this loop runs exactly as many times as the number
// of goroutines launched above.
// this works because the number of goroutines is known;
// if it were not known, a WaitGroup would be needed.
// the `<-done` here is therefore "loose" in the sense that
// it is not necessarily this iteration's URL that completed.
}
}
// fetched records every URL visited across the whole crawl, keyed to the
// error from its fetch (nil on success). It is read by main after Crawl
// returns.
var fetched = SafeMap{
m: make(map[string]error),
}
// main crawls the canned site starting at the Go homepage, then prints a
// status line for every URL recorded. Crawl blocks until all goroutines
// finish, so the unsynchronized map read afterwards is safe.
func main() {
	Crawl("https://golang.org/", 4, fetcher)
	for u, e := range fetched.m {
		if e == nil {
			fmt.Printf("%v was fetched\n", u)
		} else {
			fmt.Printf("%v failed: %v\n", u, e)
		}
	}
}
// Fetcher abstracts page retrieval so the crawler can be exercised with
// the canned fakeFetcher defined below.
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
// err is non-nil when the URL could not be retrieved.
Fetch(url string) (body string, urls []string, err error)
}
// fakeFetcher is a Fetcher that serves canned results from an in-memory map.
type fakeFetcher map[string]*fakeResult

// fakeResult holds the canned page body and outgoing links for one URL.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks url up in the canned map; unknown URLs yield an error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	page, ok := f[url]
	if !ok {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return page.body, page.urls, nil
}
// fetcher is a populated fakeFetcher.
// The canned pages link to each other (including back-links), giving the
// crawler cycles that it must deduplicate.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
/*
open question: is there a solution without a mutex-guarded map and
without a WaitGroup — for example, one using channels only?
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment