Skip to content

Instantly share code, notes, and snippets.

@liut
Last active January 25, 2021 09:46
Show Gist options
  • Select an option

  • Save liut/880910c6131887716d62a7e09e6dd3cb to your computer and use it in GitHub Desktop.

Select an option

Save liut/880910c6131887716d62a7e09e6dd3cb to your computer and use it in GitHub Desktop.
package main
import (
"flag"
"fmt"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
"path"
"runtime"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
)
// UrlPrefix is the site root every crawled URL is joined onto.
const UrlPrefix string = "http://www.aitaotu.com"

var (
	ch1 chan string // album list-page URLs found by getListUrl
	ch2 chan string // paginated image-page URLs produced by parseListUrl
	ch3 chan int    // one value per saved image; main sums these as a counter

	img_dir string // directory to save images into (flag -img_dir, required)
	log_dir string // directory for the crawl log (flag -log_dir)
	keyword string // search keyword (flag -kw)
)
// init registers the command-line flags, allocates the pipeline channels,
// and redirects the standard logger into a file under log_dir.
func init() {
	flag.StringVar(&img_dir, "img_dir", "", "where is images to save")
	flag.StringVar(&log_dir, "log_dir", "/var/tmp", "where is log to save")
	flag.StringVar(&keyword, "kw", "Hello", "search for special keyword")
	ch1 = make(chan string, 20)
	ch2 = make(chan string, 1000)
	ch3 = make(chan int, 1000)

	// BUG FIX: init runs before main, so the log file used to be opened
	// with log_dir's default value — a -log_dir given on the command line
	// was silently ignored. Parsing here (main's later flag.Parse is a
	// harmless re-parse) makes the flag effective.
	flag.Parse()

	logpath := path.Join(log_dir, "crawl.log")
	// 0644 instead of 0777: a log file needs no execute or world-write bits.
	logfile, err := os.OpenFile(logpath, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0644)
	if err != nil {
		log.Printf("create log %q ERR %s", logpath, err)
		os.Exit(1)
	}
	log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
	log.SetOutput(logfile)
}
// main validates the flags, makes sure the per-keyword image directory
// exists, starts the three pipeline stages, and sums saved-image counts
// arriving on ch3.
func main() {
	runtime.GOMAXPROCS(runtime.NumCPU())
	flag.Parse()
	if img_dir == "" || keyword == "" {
		flag.PrintDefaults()
		os.Exit(1) // (dead `return` that followed os.Exit removed)
	}
	// Images are stored under <img_dir>/<keyword>; create it if missing.
	img_dir = path.Join(img_dir, keyword)
	if fi, err := os.Stat(img_dir); err != nil || !fi.IsDir() {
		if err := os.Mkdir(img_dir, os.ModePerm); err != nil {
			fmt.Printf("create dir %q failed\n", img_dir)
			os.Exit(1)
		}
	}
	go getListUrl()
	go parseListUrl()
	go downloadImage()
	// NOTE(review): no sender ever closes ch3, so this range loop — and
	// therefore the program — never terminates on its own; the "crawl end"
	// line below is unreachable. Closing ch3 cleanly would need a
	// WaitGroup over the download goroutines — confirm intended behavior.
	count := 0
	for num := range ch3 {
		count += num
		fmt.Println("count:", count)
	}
	fmt.Println("crawl end")
}
// getListUrl fetches the search-result page for keyword and sends every
// album list-page URL it finds on ch1. As the sole sender it closes ch1
// when done, so parseListUrl's range loop can terminate.
func getListUrl() {
	docUrl := fmt.Sprintf("%s/search/%s/", UrlPrefix, keyword)
	doc, err := goquery.NewDocument(docUrl)
	if err != nil {
		fmt.Println("err:", err)
		os.Exit(1)
	}
	doc.Find(".picbox").Each(func(i int, s *goquery.Selection) {
		// BUG FIX: check the ok result — the original sent UrlPrefix+""
		// downstream whenever the anchor had no href attribute.
		if href, ok := s.Find("a").Attr("href"); ok {
			ch1 <- UrlPrefix + href
		}
	})
	close(ch1)
}
// parseListUrl consumes album URLs from ch1 and, for each album, emits the
// URL of every paginated image page ("<prefix>_1.html" … "<prefix>_N.html")
// on ch2. As the sole sender it closes ch2 once ch1 is drained.
func parseListUrl() {
	const suffix = ".html"
	for list_url := range ch1 {
		page_count := getPageCount(list_url)
		// BUG FIX: strings.TrimRight(s, ".html") strips any trailing run of
		// the characters '.', 'h', 't', 'm', 'l' (e.g. "girl.html" -> "gir");
		// TrimSuffix removes exactly the literal ".html" suffix.
		prefix := strings.TrimSuffix(list_url, suffix)
		for i := 1; i <= page_count; i++ {
			ch2 <- prefix + "_" + strconv.Itoa(i) + suffix
		}
	}
	close(ch2)
}
// getPageCount returns the number of pages in the album at list_url by
// locating the "末页" ("last page") pagination link and parsing the page
// number out of its href (".../<id>_<n>.html"). It returns 0 when the page
// cannot be fetched or no such link is present.
func getPageCount(list_url string) (count int) {
	doc, err := goquery.NewDocument(list_url)
	if err != nil {
		// BUG FIX: the fetch error was silently discarded before.
		log.Println("fetch list page failed:", list_url, err)
		return 0
	}
	doc.Find(".pages ul li").Each(func(i int, s *goquery.Selection) {
		if s.Find("a").Text() != "末页" {
			return
		}
		last_page_url, ok := s.Find("a").Attr("href")
		if !ok {
			return
		}
		// BUG FIX: strings.Trim(s, ".html") strips any of the characters
		// '.', 'h', 't', 'm', 'l' from BOTH ends of the string; TrimSuffix
		// removes only the literal ".html" suffix.
		prefix := strings.TrimSuffix(last_page_url, ".html")
		// LastIndex, not Index: the page number follows the final '_',
		// and the path may contain earlier underscores.
		if index := strings.LastIndex(prefix, "_"); index >= 0 {
			if n, err := strconv.Atoi(prefix[index+1:]); err == nil {
				count = n
			}
		}
	})
	return count
}
// downloadImage consumes paginated image-page URLs from ch2, pulls the
// image source URL out of each page, and hands every image off to a
// saveImages goroutine.
func downloadImage() {
	for pageURL := range ch2 {
		doc, _ := goquery.NewDocument(pageURL)
		doc.Find("#big-pic p a").Each(func(_ int, sel *goquery.Selection) {
			src, _ := sel.Find("img").Attr("src")
			// src is a fresh variable per callback invocation, so it is
			// safe to pass straight into the goroutine.
			go saveImages(src)
		})
	}
}
// saveImages downloads the image at img_url into img_dir, deriving the
// local filename from the lower-cased URL path ("/a/b/c.jpg" -> "a-b-c.jpg").
// Already-downloaded files are skipped; 1 is sent on ch3 only after the
// file has been written successfully.
func saveImages(img_url string) {
	log.Printf("Get %s", img_url)
	u, err := url.Parse(img_url)
	if err != nil {
		log.Println("parse url failed:", img_url, err)
		return
	}
	name := strings.TrimLeft(u.Path, "/")
	name = strings.ToLower(strings.Replace(name, "/", "-", -1))
	filename := path.Join(img_dir, name)
	if checkExists(filename) {
		log.Printf("Exists %s", filename)
		return
	}
	response, err := http.Get(img_url)
	if err != nil {
		log.Println("get img_url failed:", err)
		return
	}
	defer response.Body.Close()
	// BUG FIX: the original saved the body regardless of the HTTP status,
	// happily writing server error pages to disk as "images".
	if response.StatusCode != http.StatusOK {
		log.Println("unexpected status:", img_url, response.Status)
		return
	}
	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Println("read data failed:", img_url, err)
		return
	}
	// BUG FIX: the original sent on ch3 before writing and ignored the
	// Write error; WriteFile checks create+write+close in one call and
	// the image is counted only on success.
	if err := ioutil.WriteFile(filename, data, 0644); err != nil {
		log.Println("write file failed:", filename, err)
		return
	}
	ch3 <- 1
}
// checkExists reports whether something (file or directory) already
// exists at filename.
func checkExists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
@CacheDoor
Copy link

six six six

@hzambrella
Copy link

nine nine nine

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment