Last active: January 6, 2022 16:44
Revisions
toannd96 revised this gist
Jan 6, 2022. 1 changed file with 10 additions and 10 deletions.

This revision switches the collector from async back to synchronous crawling (commenting out colly.Async, the LimitRule, and the matching c.Wait call) and downgrades the log.Fatal(err) calls in the URL-gathering helpers to fmt.Println(err), so one failed page no longer kills the whole crawl:

```diff
@@ -110,12 +110,12 @@ func extractInfoJob(urls []string) error {
 	var job Job

 	c := colly.NewCollector(
-		colly.Async(true),
+		// colly.Async(true),
 	)
-	c.Limit(&colly.LimitRule{
-		Parallelism: 2,
-	})
+	// c.Limit(&colly.LimitRule{
+	// 	Parallelism: 2,
+	// })
 	c.SetRequestTimeout(120 * time.Second)
@@ -161,7 +161,7 @@ func extractInfoJob(urls []string) error {
 		c.Visit(url)
 	}

-	c.Wait()
+	// c.Wait()

 	return nil
 }
@@ -183,7 +183,7 @@ func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
 		// Get total page count of each url by province
 		totalPage, err := getTotalPage(urlProvince)
 		if err != nil {
-			log.Fatal(err)
+			fmt.Println(err)
 		}

 		// Merge all url pages by province
@@ -211,7 +211,7 @@ func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
 		docChild, err := getNewDocument(urlCategory)
 		if err != nil {
-			log.Fatal(err)
+			fmt.Println(err)
 		}

 		// Get all search urls by category child
@@ -222,7 +222,7 @@ func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
 		// Get total page count of each url by category child
 		totalPage, err := getTotalPage(urlCategoryChild)
 		if err != nil {
-			log.Fatal(err)
+			fmt.Println(err)
 		}

 		// Merge all url pages by category child
@@ -259,7 +259,7 @@ func getTotalPage(url string) (int, error) {
 func getNewDocument(url string) (*goquery.Document, error) {
 	resp, err := Get(url)
 	if err != nil {
-		log.Fatal(err)
+		fmt.Println(err)
 	}
 	defer resp.Body.Close()
@@ -269,7 +269,7 @@ func getNewDocument(url string) (*goquery.Document, error) {
 	doc, err := goquery.NewDocumentFromReader(resp.Body)
 	if err != nil {
-		log.Fatal(err)
+		fmt.Println(err)
 	}

 	return doc, nil
```

One caveat with the getNewDocument change: it prints the error but keeps executing, so when Get fails, resp is nil and the deferred resp.Body.Close() panics. Returning nil, err there would be the safer fix.
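For context, this is the async setup the revision disables, as a standalone sketch rather than anything from the gist itself. It assumes the gocolly/colly v1 API the gist imports; note that colly rejects a LimitRule with no DomainGlob or DomainRegexp (Limit returns ErrNoPattern), so the original rule with only Parallelism set most likely never throttled anything, and the sketch adds DomainGlob accordingly:

```go
package main

import (
	"fmt"
	"time"

	"github.com/gocolly/colly"
)

func main() {
	// Async collector: Visit queues the request instead of blocking on it.
	c := colly.NewCollector(
		colly.Async(true),
	)

	// Cap concurrent requests; DomainGlob is required for the rule to match.
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
	})
	c.SetRequestTimeout(120 * time.Second)

	c.OnResponse(func(r *colly.Response) {
		fmt.Println("got", r.Request.URL, len(r.Body), "bytes")
	})

	c.Visit("https://www.jobstreet.vn")

	// With Async(true), Wait must block until all queued requests finish,
	// otherwise main returns before any page is fetched.
	c.Wait()
}
```

Dropping async also sidesteps a genuine hazard in this gist: the OnHTML callbacks all mutate the shared job variable, which would be a data race once pages are fetched in parallel.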
toannd96 renamed this gist
Jan 6, 2022. File renamed without changes.
toannd96 revised this gist
Jan 6, 2022. 1 changed file with 1 addition and 1 deletion.

A one-line fix: the doc comment above getUrlByCategory previously referred to the function as getUrlByCategories:

```diff
@@ -196,7 +196,7 @@ func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
 	return nil
 }

-// getUrlByCategories get all search url by category
+// getUrlByCategory get all search url by category
 func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
 	defer wg.Done()
 	doc, err := getNewDocument(webPage)
```
toannd96 created this gist
Jan 6, 2022.
The gist as created, one 296-line Go file that crawls jobstreet.vn search pages and dumps the results to jobstreet.json:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/cenkalti/backoff"
	"github.com/gocolly/colly"
)

const webPage = "https://www.jobstreet.vn/t%C3%ACmvi%E1%BB%87c"

type Job struct {
	Title     string `json:"title"`
	Company   string `json:"company"`
	Location  string `json:"location"`
	Descript  string `json:"descript"`
	Url       string `json:"url"`
	Site      string `json:"site"`
	CreatedAt string `json:"created_at"`
}

type Jobs struct {
	List      []Job `json:"jobs"`
	TotalJobs int   `json:"total_jobs"`
}

const (
	maxRetry = 3 * time.Minute
)

func get(url string) (*http.Response, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	return resp, nil
}

func Get(url string) (*http.Response, error) {
	var err error
	var resp *http.Response
	bo := backoff.NewExponentialBackOff()
	bo.MaxInterval = maxRetry
	bo.MaxElapsedTime = maxRetry
	for {
		resp, err = get(url)
		if err == nil {
			break
		}
		d := bo.NextBackOff()
		if d == backoff.Stop {
			break
		}
		time.Sleep(d)
	}
	if err != nil {
		return nil, err
	}
	return resp, nil
}

func crawlJobStreet() {
	var urls []string
	pipe := make(chan string)
	done := make(chan bool)

	go func() {
		for {
			url, more := <-pipe
			if more {
				fmt.Println("Received urls", url)
				urls = append(urls, url)
				fmt.Println("Append url received to array", len(urls))
			} else {
				fmt.Println("Received all urls", len(urls))
				extractInfoJob(urls)
				done <- true
				return
			}
		}
	}()

	var wg sync.WaitGroup
	wg.Add(2)
	go getUrlByProvince(pipe, &wg)
	go getUrlByCategory(pipe, &wg)

	go func() {
		wg.Wait()
		close(pipe)
	}()

	<-done
}

func extractInfoJob(urls []string) error {
	var jobs Jobs
	var job Job

	c := colly.NewCollector(
		colly.Async(true),
	)
	c.Limit(&colly.LimitRule{
		Parallelism: 2,
	})
	c.SetRequestTimeout(120 * time.Second)

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	c.OnError(func(r *colly.Response, err error) {
		fmt.Println(err)
	})

	c.OnHTML(".jobresults .job-card", func(e *colly.HTMLElement) {
		job.Url = "https://www.jobstreet.vn" + e.ChildAttr("h3.job-title > a", "href")
		job.Title = e.ChildText("h3.job-title > a")
		job.Company = e.ChildText("span.job-company")
		job.Location = e.ChildText("span.job-location")

		c.Visit(e.Request.AbsoluteURL(job.Url))

		c.OnHTML("div[class=heading-xsmall]", func(e *colly.HTMLElement) {
			job.Site = e.ChildText("span.site")
			job.CreatedAt = e.ChildText("span.listed-date")
		})

		if job.Site == "TopCV" {
			job.Descript = ""
		} else {
			c.OnHTML("div[class=-desktop-no-padding-top]", func(e *colly.HTMLElement) {
				job.Descript = e.Text
			})
		}

		jobs.TotalJobs++
		jobs.List = append(jobs.List, job)

		dataBytes, errMarshal := json.Marshal(jobs)
		if errMarshal != nil {
			fmt.Println(errMarshal)
		}
		os.WriteFile("jobstreet.json", dataBytes, 0700)
	})

	for _, url := range urls {
		c.Visit(url)
	}

	c.Wait()

	return nil
}

// getUrlByProvince get all search url by province
func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()

	doc, err := getNewDocument(webPage)
	if err != nil {
		return err
	}

	// Get all search urls by province
	doc.Find("div[id=browse-locations] a[href]").Each(func(index int, province *goquery.Selection) {
		href, _ := province.Attr("href")
		urlProvince := fmt.Sprintf("https://www.jobstreet.vn%s", href)

		// Get total page count of each url by province
		totalPage, err := getTotalPage(urlProvince)
		if err != nil {
			log.Fatal(err)
		}

		// Merge all url pages by province
		for page := 1; page <= totalPage; page++ {
			urlProvinceByPage := fmt.Sprintf("%s?p=%d", urlProvince, page)
			pipe <- urlProvinceByPage
		}
	})

	return nil
}

// getUrlByCategories get all search url by category
func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()

	doc, err := getNewDocument(webPage)
	if err != nil {
		return err
	}

	// Get all search urls by category
	doc.Find("div[id=browse-categories] a[href]").Each(func(index int, category *goquery.Selection) {
		href, _ := category.Attr("href")
		urlCategory := fmt.Sprintf("https://www.jobstreet.vn%s", href)

		docChild, err := getNewDocument(urlCategory)
		if err != nil {
			log.Fatal(err)
		}

		// Get all search urls by category child
		docChild.Find("div[id=browse-keywords] a[href]").Each(func(index int, key *goquery.Selection) {
			href, _ := key.Attr("href")
			urlCategoryChild := fmt.Sprintf("https://www.jobstreet.vn%s", href)

			// Get total page count of each url by category child
			totalPage, err := getTotalPage(urlCategoryChild)
			if err != nil {
				log.Fatal(err)
			}

			// Merge all url pages by category child
			for page := 1; page <= totalPage; page++ {
				urlCategoryChildByPage := fmt.Sprintf("%s?p=%d", urlCategoryChild, page)
				pipe <- urlCategoryChildByPage
			}
		})
	})

	return nil
}

// getTotalPage get total page count of each url
func getTotalPage(url string) (int, error) {
	var totalPage int

	doc, err := getNewDocument(url)
	if err != nil {
		return 0, err
	}

	pageStr := doc.Find("div.search-results-count strong:last-child").Text()
	if pageStr != "" {
		totalPage, err = strconv.Atoi(pageStr)
		if err != nil {
			return 0, err
		}
	}

	return totalPage, nil
}

// getNewDocument get html document from url
func getNewDocument(url string) (*goquery.Document, error) {
	resp, err := Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		log.Fatalf("status code error: %d %s", resp.StatusCode, resp.Status)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	return doc, nil
}

func schedule(timeSchedule time.Duration, index int) {
	ticker := time.NewTicker(timeSchedule)
	go func() {
		for {
			switch index {
			case 1:
				<-ticker.C
				crawlJobStreet()
			}
		}
	}()
}

func main() {
	crawlJobStreet()

	// schedule crawler
	go schedule(24*time.Hour, 1)
}
```
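crawlJobStreet is a small fan-in pipeline: two producer goroutines (getUrlByProvince, getUrlByCategory) push URLs into one channel, a watcher goroutine closes the channel once the WaitGroup drains, and a single consumer collects everything before handing the slice to extractInfoJob. Here is the same shape as a minimal standalone sketch; the producer names and payloads are made up for illustration:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	pipe := make(chan string)
	done := make(chan bool)

	// Single consumer: collect until the channel is closed.
	go func() {
		var urls []string
		for url := range pipe {
			urls = append(urls, url)
		}
		fmt.Println("received", len(urls), "urls")
		done <- true
	}()

	// Two producers, standing in for getUrlByProvince and getUrlByCategory.
	var wg sync.WaitGroup
	wg.Add(2)
	for _, prefix := range []string{"province", "category"} {
		go func(p string) {
			defer wg.Done()
			for i := 1; i <= 3; i++ {
				pipe <- fmt.Sprintf("https://example.com/%s?p=%d", p, i)
			}
		}(prefix)
	}

	// Close the channel only after both producers finish, so the
	// consumer's range loop terminates.
	go func() {
		wg.Wait()
		close(pipe)
	}()

	<-done
}
```

Ranging over the channel does the same job as the gist's manual url, more := <-pipe check: the loop ends automatically when the channel is closed.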
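A subtlety in the retry wrapper: http.Client.Do only returns an error on transport failures, so Get retries connection errors but returns an HTTP 429 or 5xx response on the first attempt, and getNewDocument then log.Fatals on the status check. If status-aware retries are wanted, something like the following would drop in next to Get, using the gist's own get helper, maxRetry constant, and cenkalti/backoff import; getWithRetry and its retry rule are my invention, not part of the gist:

```go
// getWithRetry is a hypothetical variant of Get that also treats
// HTTP 429 and 5xx responses as retryable, not just transport errors.
func getWithRetry(url string) (*http.Response, error) {
	bo := backoff.NewExponentialBackOff()
	bo.MaxInterval = maxRetry
	bo.MaxElapsedTime = maxRetry

	for {
		resp, err := get(url)
		if err == nil && resp.StatusCode != http.StatusTooManyRequests && resp.StatusCode < 500 {
			return resp, nil // success, or a non-retryable client error
		}
		if err == nil {
			resp.Body.Close() // release the retryable response before trying again
			err = fmt.Errorf("retryable status: %s", resp.Status)
		}
		d := bo.NextBackOff()
		if d == backoff.Stop {
			return nil, err // retry budget exhausted; surface the last error
		}
		time.Sleep(d)
	}
}
```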
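Finally, a caveat before running this version as-is: main launches schedule with go and then returns, and schedule in turn wraps its ticker loop in yet another goroutine, so the process exits before the 24-hour ticker ever fires. A minimal blocking variant, reusing the gist's own crawlJobStreet:

```go
func main() {
	crawlJobStreet()

	// Run the ticker loop on the main goroutine so the process stays
	// alive between daily crawls, instead of returning immediately.
	ticker := time.NewTicker(24 * time.Hour)
	defer ticker.Stop()
	for range ticker.C {
		crawlJobStreet()
	}
}
```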