scraper.go: a concurrent web scraper, shared as a GitHub gist by @sean9999 (created October 23, 2024).

package main

import (
	"fmt"
	"net/http"
	"net/url"
	"slices"
	"strings"
	"sync"

	"golang.org/x/net/html"
)

// wg tracks every in-flight crawl goroutine so main can wait for the
// whole crawl to finish.
var wg sync.WaitGroup

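// safeSlice is a slice of strings that is safe for concurrent use.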
type safeSlice struct {
	sync.RWMutex
	data []string
}

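// Contains reports whether str is already in the slice.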
func (s *safeSlice) Contains(str string) bool {
	s.RLock()
	defer s.RUnlock()
	return slices.Contains(s.data, str)
}

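// Add appends str without checking for duplicates.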
func (s *safeSlice) Add(str string) {
	s.Lock()
	defer s.Unlock()
	s.data = append(s.data, str)
}

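// Length returns the number of entries.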
func (s *safeSlice) Length() int {
	s.RLock()
	defer s.RUnlock()
	return len(s.data)
}
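
// AddIfAbsent appends str only if it is not already present, doing the
// check and the append under a single lock so that no other goroutine
// can claim the same string in between. This is what lets the crawler
// claim a URL exactly once.
func (s *safeSlice) AddIfAbsent(str string) bool {
	s.Lock()
	defer s.Unlock()
	if slices.Contains(s.data, str) {
		return false
	}
	s.data = append(s.data, str)
	return true
}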

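// seen records every URL the crawler has claimed; results records only
// the ones that answered 200 OK.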
var seen = &safeSlice{}
var results = &safeSlice{}

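// main seeds the crawl with the root URL and blocks until every
// spawned goroutine has finished.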
func main() {
	const URL = "https://scrape-me.dreamsofcode.io"

	wg.Add(1)
	go getValidURLs(URL)
	wg.Wait()

	fmt.Println("Total URLs found:", results.Length())
	fmt.Println("Total seen URLs:", seen.Length())
}

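// getValidURLs fetches one URL, records it if it responds 200 OK, and
// spawns a goroutine for every same-host link found on the page.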
func getValidURLs(url string) {
	defer wg.Done()

	// claim the URL atomically; checking and adding in two separate
	// calls would leave a window where two goroutines fetch the same
	// page and record it twice
	if !seen.AddIfAbsent(url) {
		return
	}

	statusCode, node := fetchURL(url)

	// only pages that answer 200 OK count as results
	if statusCode != http.StatusOK {
		return
	}

	results.Add(url)

	hrefs := getHrefs(url, node)
	for _, href := range hrefs {
		wg.Add(1)
		// recursively crawl each discovered link
		go getValidURLs(href)
	}
}

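// getHrefs walks the parsed HTML tree and collects the href of every
// anchor element that points back at the same host, normalized to an
// absolute URL.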
func getHrefs(baseHost string, n *html.Node) []string {
	result := []string{}
	if n.Type == html.ElementNode && n.Data == "a" {
		for _, a := range n.Attr {
			if a.Key == "href" && isBaseHost(baseHost, a.Val) {
				parsedURL, err := url.Parse(baseHost)
				if err != nil {
					panic(err)
				}

				result = append(result, addPath(parsedURL.Scheme+"://"+parsedURL.Host, a.Val))
				break
			}
		}
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		result = append(result, getHrefs(baseHost, c)...)
	}
	return result
}

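// isBaseHost reports whether href stays on the same site: it must begin
// with a slash or with baseHost itself.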
func isBaseHost(baseHost string, href string) bool {
	return strings.HasPrefix(href, "/") || strings.HasPrefix(href, baseHost)
}

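// addPath joins a root-relative href onto baseHost, passing through
// hrefs that are already absolute.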
func addPath(baseHost string, href string) string {
	// if href is already a full URL, return it as-is
	if strings.HasPrefix(href, baseHost) {
		return href
	}

	// remove the leading slash from href and any trailing slash from
	// baseHost so the join produces exactly one separator
	href = strings.TrimPrefix(href, "/")
	baseHost = strings.TrimSuffix(baseHost, "/")

	return baseHost + "/" + href
}

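// fetchURL GETs the given URL and returns its HTTP status code along
// with the root of the parsed document tree; node is nil unless the
// response was 200 OK and its body parsed as HTML.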
func fetchURL(url string) (statusCode int, node *html.Node) {
	fmt.Println("Checking URL:", url)

	response, err := http.Get(url)
	if err != nil {
		// a dead link should not bring down the whole crawl;
		// treat it like any other unreachable page
		fmt.Println("Error fetching:", err)
		return 0, nil
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return response.StatusCode, nil
	}

	// parse the HTML directly from the response body
	node, err = html.Parse(response.Body)
	if err != nil {
		// a page that cannot be parsed is treated as invalid
		return 0, nil
	}

	return response.StatusCode, node
}
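
To try it locally (assuming Go 1.21 or later, which the slices package requires), fetch the one external dependency and run the file:

go mod init scraper
go get golang.org/x/net/html
go run scraper.go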