@toannd96
Last active January 6, 2022 16:44

Revisions

  1. toannd96 revised this gist Jan 6, 2022. 1 changed file with 10 additions and 10 deletions.
    20 changes: 10 additions & 10 deletions jobstreet.go
    @@ -110,12 +110,12 @@ func extractInfoJob(urls []string) error {
         var job Job
     
         c := colly.NewCollector(
    -        colly.Async(true),
    +        // colly.Async(true),
         )
     
    -    c.Limit(&colly.LimitRule{
    -        Parallelism: 2,
    -    })
    +    // c.Limit(&colly.LimitRule{
    +    //     Parallelism: 2,
    +    // })
     
         c.SetRequestTimeout(120 * time.Second)
    @@ -161,7 +161,7 @@ func extractInfoJob(urls []string) error {
             c.Visit(url)
         }
     
    -    c.Wait()
    +    // c.Wait()
     
         return nil
     }
    @@ -183,7 +183,7 @@ func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
             // Get total page count of each url by province
             totalPage, err := getTotalPage(urlProvince)
             if err != nil {
    -            log.Fatal(err)
    +            fmt.Println(err)
             }
     
             // Merge all url pages by province
    @@ -211,7 +211,7 @@ func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
     
             docChild, err := getNewDocument(urlCategory)
             if err != nil {
    -            log.Fatal(err)
    +            fmt.Println(err)
             }
     
             // Get all search urls by category child
    @@ -222,7 +222,7 @@ func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
                 // Get total page count of each url by category child
                 totalPage, err := getTotalPage(urlCategoryChild)
                 if err != nil {
    -                log.Fatal(err)
    +                fmt.Println(err)
                 }
     
                 // Merge all url pages by category child
    @@ -259,7 +259,7 @@ func getTotalPage(url string) (int, error) {
     func getNewDocument(url string) (*goquery.Document, error) {
         resp, err := Get(url)
         if err != nil {
    -        log.Fatal(err)
    +        fmt.Println(err)
         }
         defer resp.Body.Close()
    @@ -269,7 +269,7 @@ func getNewDocument(url string) (*goquery.Document, error) {
     
         doc, err := goquery.NewDocumentFromReader(resp.Body)
         if err != nil {
    -        log.Fatal(err)
    +        fmt.Println(err)
         }
     
         return doc, nil
  2. toannd96 renamed this gist Jan 6, 2022. 1 changed file with 0 additions and 0 deletions.
  3. toannd96 revised this gist Jan 6, 2022. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion crawl jobstreet use channel, colly and goquery
    @@ -196,7 +196,7 @@ func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
         return nil
     }
     
    -// getUrlByCategories get all search url by category
    +// getUrlByCategory get all search url by category
     func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
         defer wg.Done()
         doc, err := getNewDocument(webPage)
  4. toannd96 created this gist Jan 6, 2022.
    296 changes: 296 additions & 0 deletions crawl jobstreet use channel, colly and goquery
    @@ -0,0 +1,296 @@
    package main

    import (
        "encoding/json"
        "fmt"
        "log"
        "net/http"
        "os"
        "strconv"
        "sync"
        "time"

        "github.com/PuerkitoBio/goquery"
        "github.com/cenkalti/backoff"
        "github.com/gocolly/colly"
    )

    const webPage = "https://www.jobstreet.vn/t%C3%ACmvi%E1%BB%87c"

    type Job struct {
        Title     string `json:"title"`
        Company   string `json:"company"`
        Location  string `json:"location"`
        Descript  string `json:"descript"`
        Url       string `json:"url"`
        Site      string `json:"site"`
        CreatedAt string `json:"created_at"`
    }

    type Jobs struct {
        List      []Job `json:"jobs"`
        TotalJobs int   `json:"total_jobs"`
    }

    const (
        maxRetry = 3 * time.Minute
    )

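    // get issues a single HTTP GET request with a default client.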
    func get(url string) (*http.Response, error) {
        req, err := http.NewRequest("GET", url, nil)
        if err != nil {
            return nil, err
        }
        client := &http.Client{}
        resp, err := client.Do(req)
        if err != nil {
            return nil, err
        }
        return resp, nil
    }

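    // Get wraps get with exponential backoff (github.com/cenkalti/backoff),
    // capping both the retry interval and the total elapsed time at maxRetry.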
    func Get(url string) (*http.Response, error) {
        var err error
        var resp *http.Response
        bo := backoff.NewExponentialBackOff()
        bo.MaxInterval = maxRetry
        bo.MaxElapsedTime = maxRetry
        for {
            resp, err = get(url)
            if err == nil {
                break
            }
            d := bo.NextBackOff()
            if d == backoff.Stop {
                break
            }
            time.Sleep(d)
        }
        if err != nil {
            return nil, err
        }
        return resp, nil
    }

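    // crawlJobStreet fans search URLs in from two producer goroutines over a
    // single channel; once both producers finish and the channel is closed,
    // the consumer passes the collected slice to extractInfoJob.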
    func crawlJobStreet() {
        var urls []string

        pipe := make(chan string)
        done := make(chan bool)
        go func() {
            for {
                url, more := <-pipe
                if more {
                    fmt.Println("Received urls", url)
                    urls = append(urls, url)
                    fmt.Println("Append url received to array", len(urls))
                } else {
                    fmt.Println("Received all urls", len(urls))
                    extractInfoJob(urls)
                    done <- true
                    return
                }
            }
        }()

        var wg sync.WaitGroup
        wg.Add(2)
        go getUrlByProvince(pipe, &wg)
        go getUrlByCategory(pipe, &wg)

        go func() {
            wg.Wait()
            close(pipe)
        }()
        <-done
    }

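    // extractInfoJob visits every collected search page with a colly collector,
    // scrapes each job card, and rewrites jobstreet.json as it goes.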
    func extractInfoJob(urls []string) error {
        var jobs Jobs
        var job Job

        c := colly.NewCollector(
            colly.Async(true),
        )

        c.Limit(&colly.LimitRule{
            Parallelism: 2,
        })

        c.SetRequestTimeout(120 * time.Second)

        c.OnRequest(func(r *colly.Request) {
            fmt.Println("Visiting", r.URL)
        })

        c.OnError(func(r *colly.Response, err error) {
            fmt.Println(err)
        })

        c.OnHTML(".jobresults .job-card", func(e *colly.HTMLElement) {
            job.Url = "https://www.jobstreet.vn" + e.ChildAttr("h3.job-title > a", "href")
            job.Title = e.ChildText("h3.job-title > a")
            job.Company = e.ChildText("span.job-company")
            job.Location = e.ChildText("span.job-location")

            // Visit the job's detail page; the handlers registered below pull
            // the remaining fields from it.
            c.Visit(e.Request.AbsoluteURL(job.Url))
            c.OnHTML("div[class=heading-xsmall]", func(e *colly.HTMLElement) {
                job.Site = e.ChildText("span.site")
                job.CreatedAt = e.ChildText("span.listed-date")
            })

            if job.Site == "TopCV" {
                job.Descript = ""
            } else {
                c.OnHTML("div[class=-desktop-no-padding-top]", func(e *colly.HTMLElement) {
                    job.Descript = e.Text
                })
            }

            jobs.TotalJobs++
            jobs.List = append(jobs.List, job)

            // Persist progress after every card so partial results survive a crash.
            dataBytes, errMarshal := json.Marshal(jobs)
            if errMarshal != nil {
                fmt.Println(errMarshal)
            }
            os.WriteFile("jobstreet.json", dataBytes, 0700)
        })

        for _, url := range urls {
            c.Visit(url)
        }

        c.Wait()

        return nil
    }

    // getUrlByProvince get all search url by province
    func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
        defer wg.Done()

        doc, err := getNewDocument(webPage)
        if err != nil {
            return err
        }

        // Get all search urls by province
        doc.Find("div[id=browse-locations] a[href]").Each(func(index int, province *goquery.Selection) {
            href, _ := province.Attr("href")
            urlProvince := fmt.Sprintf("https://www.jobstreet.vn%s", href)

            // Get total page count of each url by province
            totalPage, err := getTotalPage(urlProvince)
            if err != nil {
                log.Fatal(err)
            }

            // Merge all url pages by province
            for page := 1; page <= totalPage; page++ {
                urlProvinceByPage := fmt.Sprintf("%s?p=%d", urlProvince, page)
                pipe <- urlProvinceByPage
            }
        })

        return nil
    }

    // getUrlByCategories get all search url by category
    func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
        defer wg.Done()
        doc, err := getNewDocument(webPage)
        if err != nil {
            return err
        }

        // Get all search urls by category
        doc.Find("div[id=browse-categories] a[href]").Each(func(index int, category *goquery.Selection) {
            href, _ := category.Attr("href")
            urlCategory := fmt.Sprintf("https://www.jobstreet.vn%s", href)

            docChild, err := getNewDocument(urlCategory)
            if err != nil {
                log.Fatal(err)
            }

            // Get all search urls by category child
            docChild.Find("div[id=browse-keywords] a[href]").Each(func(index int, key *goquery.Selection) {
                href, _ := key.Attr("href")
                urlCategoryChild := fmt.Sprintf("https://www.jobstreet.vn%s", href)

                // Get total page count of each url by category child
                totalPage, err := getTotalPage(urlCategoryChild)
                if err != nil {
                    log.Fatal(err)
                }

                // Merge all url pages by category child
                for page := 1; page <= totalPage; page++ {
                    urlCategoryChildByPage := fmt.Sprintf("%s?p=%d", urlCategoryChild, page)
                    pipe <- urlCategoryChildByPage
                }
            })
        })

        return nil
    }

    // getTotalPage get total page count of each url
    func getTotalPage(url string) (int, error) {
        var totalPage int
        doc, err := getNewDocument(url)
        if err != nil {
            return 0, err
        }

        pageStr := doc.Find("div.search-results-count strong:last-child").Text()
        if pageStr != "" {
            totalPage, err = strconv.Atoi(pageStr)
            if err != nil {
                return 0, err
            }
        }

        return totalPage, nil
    }

    // getNewDocument get html document from url
    func getNewDocument(url string) (*goquery.Document, error) {
        resp, err := Get(url)
        if err != nil {
            log.Fatal(err)
        }
        defer resp.Body.Close()

        if resp.StatusCode != 200 {
            log.Fatalf("status code error: %d %s", resp.StatusCode, resp.Status)
        }

        doc, err := goquery.NewDocumentFromReader(resp.Body)
        if err != nil {
            log.Fatal(err)
        }

        return doc, nil
    }

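    // schedule re-runs the crawler picked by index (1 = crawlJobStreet) each
    // time the ticker fires; the loop runs in its own goroutine.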
    func schedule(timeSchedule time.Duration, index int) {
        ticker := time.NewTicker(timeSchedule)
        go func() {
            for {
                switch index {
                case 1:
                    <-ticker.C
                    crawlJobStreet()
                }
            }
        }()
    }

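    // main crawls once at startup, then hands re-crawling to schedule; the
    // ticker goroutine only keeps firing while the process stays alive.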
    func main() {
        crawlJobStreet()

        // schedule crawler
        go schedule(24*time.Hour, 1)
    }