crawl with goquery, fixes from http://blog.csdn.net/lamphp/article/details/52452226

package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"path"
	"runtime"
	"strconv"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)
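
// Usage sketch (the flag names come from init below; the file name and the
// directory values are only examples):
//
//	go run crawl.go -img_dir /tmp/images -log_dir /var/tmp -kw Hello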

const UrlPrefix string = "http://www.aitaotu.com"

var (
	ch1 chan string
	ch2 chan string
	ch3 chan int

	img_dir string
	log_dir string
	keyword string
)
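
// The crawl is a three-stage pipeline connected by channels:
//
//	getListUrl    -> ch1: album URLs scraped from the search results page
//	parseListUrl  -> ch2: per-page URLs expanded from each album
//	downloadImage -> ch3: one value per image saved, summed in main
//
// Each stage closes its output channel once its input is exhausted, so the
// counting loop in main can terminate.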

// init registers the command-line flags and creates the pipeline channels.
func init() {
	flag.StringVar(&img_dir, "img_dir", "", "directory to save images into")
	flag.StringVar(&log_dir, "log_dir", "/var/tmp", "directory to save the log into")
	flag.StringVar(&keyword, "kw", "Hello", "keyword to search for")
	ch1 = make(chan string, 20)
	ch2 = make(chan string, 1000)
	ch3 = make(chan int, 1000)
}

// setupLog redirects the standard logger to a file under log_dir. It has to
// run after flag.Parse, otherwise -log_dir would have no effect.
func setupLog() {
	logpath := path.Join(log_dir, "crawl.log")
	logfile, err := os.OpenFile(logpath, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0777)
	if err != nil {
		log.Printf("create log %q ERR %s", logpath, err)
		os.Exit(1)
	}
	log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
	log.SetOutput(logfile)
}

func main() {
	runtime.GOMAXPROCS(runtime.NumCPU()) // the default since Go 1.5; kept for older toolchains
	flag.Parse()
	setupLog()
	if img_dir == "" || keyword == "" {
		flag.PrintDefaults()
		os.Exit(1)
	}
	// Make sure the image directory exists.
	img_dir = path.Join(img_dir, keyword)
	if fi, err := os.Stat(img_dir); err != nil || !fi.IsDir() {
		if err := os.MkdirAll(img_dir, os.ModePerm); err != nil {
			fmt.Printf("create dir %q failed\n", img_dir)
			os.Exit(1)
		}
	}
	go getListUrl()
	go parseListUrl()
	go downloadImage()
	// Sum the per-image signals until downloadImage closes ch3.
	count := 0
	for num := range ch3 {
		count = count + num
		fmt.Println("count:", count)
	}
	fmt.Println("crawl end")
}

// getListUrl fetches the search results page and feeds the URL of every
// album found in a ".picbox" block into ch1.
func getListUrl() {
	docUrl := fmt.Sprintf("%s/search/%s/", UrlPrefix, keyword)
	doc, err := goquery.NewDocument(docUrl)
	if err != nil {
		fmt.Println("err:", err)
		os.Exit(1)
	}
	doc.Find(".picbox").Each(func(i int, s *goquery.Selection) {
		text, _ := s.Find("a").Attr("href")
		list_url := UrlPrefix + text
		ch1 <- list_url
	})
	close(ch1) // no more albums; lets parseListUrl finish
}
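
// Album pages follow a "_<page>" naming scheme: an album at (illustrative
// path) /guonei/123.html has its pages at /guonei/123_1.html through
// /guonei/123_N.html, where N comes from getPageCount.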
// parseListUrl expands every album URL from ch1 into its per-page URLs and
// feeds them into ch2.
func parseListUrl() {
	suffix := ".html"
	for list_url := range ch1 {
		page_count := getPageCount(list_url)
		// TrimSuffix removes the exact suffix; TrimRight would treat ".html"
		// as a set of characters and could over-trim the path.
		prefix := strings.TrimSuffix(list_url, suffix)
		for i := 1; i <= page_count; i++ {
			img_list_url := prefix + "_" + strconv.Itoa(i) + suffix
			ch2 <- img_list_url
		}
	}
	close(ch2) // ch1 is drained; lets downloadImage finish
}
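
// The pagination bar's "末页" ("last page") link points at the highest page;
// for an (illustrative) href of /guonei/123_15.html, dropping ".html" and
// taking the digits after the last "_" yields a page count of 15.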
// getPageCount returns the total number of pages in an album.
func getPageCount(list_url string) (count int) {
	doc, err := goquery.NewDocument(list_url)
	if err != nil {
		log.Println("load page failed:", list_url, err)
		return 0
	}
	doc.Find(".pages ul li").Each(func(i int, s *goquery.Selection) {
		text := s.Find("a").Text()
		if text == "末页" { // "last page"
			last_page_url, _ := s.Find("a").Attr("href")
			prefix := strings.TrimSuffix(last_page_url, ".html")
			index := strings.LastIndex(prefix, "_")
			last_page_num := prefix[index+1:]
			page_num, _ := strconv.Atoi(last_page_num)
			count = page_num
		}
	})
	return count
}
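
// Image pages are fetched sequentially, but each image is downloaded in its
// own goroutine; the WaitGroup makes sure ch3 is closed only after the last
// saveImages call has returned.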
// downloadImage pulls every image-page URL from ch2 and downloads the images
// it links to.
func downloadImage() {
	var wg sync.WaitGroup
	for img_list_url := range ch2 {
		doc, err := goquery.NewDocument(img_list_url)
		if err != nil {
			log.Println("load page failed:", img_list_url, err)
			continue
		}
		doc.Find("#big-pic p a").Each(func(i int, s *goquery.Selection) {
			img_url, _ := s.Find("img").Attr("src")
			wg.Add(1)
			go func() {
				defer wg.Done()
				saveImages(img_url)
			}()
		})
	}
	wg.Wait()
	close(ch3) // all downloads done; lets the counting loop in main exit
}

// saveImages downloads a single image into img_dir, skipping files that
// already exist, and signals each successful save on ch3.
func saveImages(img_url string) {
	log.Printf("Get %s", img_url)
	u, err := url.Parse(img_url)
	if err != nil {
		log.Println("parse url failed:", img_url, err)
		return
	}
	tmp := strings.TrimLeft(u.Path, "/")
	tmp = strings.ToLower(strings.Replace(tmp, "/", "-", -1))
	filename := path.Join(img_dir, tmp)
	if checkExists(filename) {
		log.Printf("Exists %s", filename)
		return
	}
	response, err := http.Get(img_url)
	if err != nil {
		log.Println("get img_url failed:", err)
		return
	}
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		log.Println("get img_url failed:", img_url, response.Status)
		return
	}
	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Println("read data failed:", img_url, err)
		return
	}
	file, err := os.Create(filename)
	if err != nil {
		log.Println("create file failed:", filename, err)
		return
	}
	defer file.Close()
	if _, err := file.Write(data); err != nil {
		log.Println("write file failed:", filename, err)
		return
	}
	ch3 <- 1 // count the image only after it is fully written
}
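
// The local filename is the image's URL path, lower-cased, with "/" flattened
// to "-": an (illustrative) http://host/Images/2021/01/a.jpg is saved as
// <img_dir>/images-2021-01-a.jpg, so checkExists makes re-runs cheap.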

// checkExists reports whether a file already exists at filename.
func checkExists(filename string) bool {
	_, err := os.Stat(filename)
	return err == nil
}