Golang multithreaded crawler

This is a Golang crawler demo. It visits the homepage of a photo gallery site, crawls the images with multiple goroutines, and saves them to the local disk.
The code shows how to use a chan to coordinate the download goroutines and the goquery framework to parse the page's HTML.
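To make the waiting pattern concrete, here is a minimal, self-contained sketch of the same idea: each goroutine sends one value on a channel when it finishes, and main receives exactly that many values before exiting (the names here are illustrative, not from the project):

package main

import "fmt"

var done = make(chan int)

func worker(id int) {
	fmt.Println("worker", id, "finished")
	done <- 0 //signal completion to main
}

func main() {
	const n = 4
	for i := 0; i < n; i++ {
		go worker(i)
	}
	//receive exactly n signals, one per worker
	for i := 0; i < n; i++ {
		<-done
	}
}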

http://www.umei.cc/
Requests to this site must carry a Referer header, otherwise the server answers 404.
Capturing the browser's image requests with Wireshark reveals the Referer value to send.
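For illustration, a small probe like the following would confirm that behavior; probeReferer is a hypothetical helper, and the 404 on the missing header is the site behavior described above, not something the code guarantees:

//Sketch: request the same image with and without a Referer header.
//The site described above is expected to answer 404 when it is missing.
func probeReferer(imgURL string) {
	for _, referer := range []string{"", "http://www.umei.cc/"} {
		req, _ := http.NewRequest("GET", imgURL, nil)
		if referer != "" {
			req.Header.Set("Referer", referer)
		}
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			fmt.Println(err)
			continue
		}
		fmt.Println("Referer =", referer, "->", resp.Status)
		resp.Body.Close()
	}
}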

//Straightforward code, suitable for beginners to study
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"runtime"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

var url = "http://www.umei.cc/"

var c chan int

func main() {
	runtime.GOMAXPROCS(4)
	spider()
	//testDownLoad()
}

//URL -> document -> all image URLs -> start multithreaded download -> save locally
func spider() {
	doc, err := goquery.NewDocument(url)
	if err != nil {
		log.Fatal(err)
	}
	urls := ImageRule(doc, match)
	fmt.Println("Parsed", len(urls), "image URLs")
	c = make(chan int)
	for _, s := range urls {
		fmt.Println(s)
		go downloadImage(s)
	}
	//We could sleep for a while to give the child goroutines time to finish,
	//but that is unreliable, so the chan is used to wait for them instead
	//time.Sleep(1e9 * 10)
	for i := 0; i < len(urls); i++ {
		<-c
	}
}
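A sync.WaitGroup expresses the same wait more idiomatically than counting receives on a channel. A sketch of that alternative (it would need "sync" added to the import list, and it assumes a downloadImage variant that does not send on c):

func spiderWithWaitGroup(urls []string) {
	var wg sync.WaitGroup
	for _, s := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()
			downloadImage(u) //assumes a variant without the c <- 0 signal
		}(s)
	}
	wg.Wait()
}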

// The download function below was tested on its own
func testDownLoad() {
	var url_img = "http://i1.umei.cc/uploads/tu/201608/164/hanguomeinv.jpg"
	//var url_img = "http://t1.mmonly.cc/uploads/tu/sm/201601/19/005.jpg"
	downloadImage(url_img)
}

func match(image string) {
	fmt.Println(image)
}

func getData(url string) (reader io.Reader, err error) {
	req := buildRequest(url)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	//resp.Body is returned unclosed on purpose: deferring Close here
	//(as the original did) would invalidate the reader for the caller
	return resp.Body, nil
}

// Get all image URLs in a web page
func parseImageUrl(reader io.Reader) (res []string, err error) {

	doc, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, err
	}
	fmt.Println(doc.Url)
	ImageRule(doc, func(image string) {
		res = append(res, image)
	})
	return res, nil
}
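getData and parseImageUrl are not called from main above; together they would be used roughly like this sketch (collectImages is a hypothetical helper):

//Sketch: fetch a page and collect its image URLs.
//Note: the body returned by getData is never closed here; a fuller
//version would return an io.ReadCloser and close it after parsing.
func collectImages(pageURL string) ([]string, error) {
	reader, err := getData(pageURL)
	if err != nil {
		return nil, err
	}
	return parseImageUrl(reader)
}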

func ImageRule(doc *goquery.Document, f func(image string)) (urls []string) {
	str := make([]string, 0)
	//Only look at img tags and filter out URLs that don't match the rule
	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		url, exists := s.Attr("src")
		if exists && strings.HasSuffix(url, ".jpg") {
			f(url) //report each match through the callback
			str = append(str, url)
		}
	})
	return str
}

//Create an http request for a url
//A Wireshark capture shows the site has an anti-crawler check: the Referer header is required
func buildRequest(url string) *http.Request {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		panic(err)
	}
	//	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36")
	//	req.Header.Set("Cookie", "Hm_lvt_c605a31292b623d214d012ec2a737685=1516111586; Hm_lpvt_c605a31292b623d214d012ec2a737685=1516111613")
	//req.Header.Set("If-None-Match", "5a309bab-26057")
	req.Header.Set("Referer", "http://www.umei.cc/")
	//req.Header.Set("If-Modified-Since", "Wed, 13 Dec 2017 03:16:59 GMT")
	return req
}
// Download one picture and save it under ./images/
func downloadImage(url string) {
	//Signal completion on the channel in every case (including early
	//returns and panics), otherwise the main goroutine blocks forever
	defer func() {
		if r := recover(); r != nil {
			fmt.Println(r)
		}
		c <- 0
	}()
	fileName := getNameFromUrl(url)
	req := buildRequest(url)
	http.DefaultClient.Timeout = 10 * time.Second
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fmt.Println("failed download " + url)
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		fmt.Println("failed download " + url)
		return
	}

	fmt.Println("begin download " + fileName)
	os.MkdirAll("./images/", 0777)
	localFile, err := os.OpenFile("./images/"+fileName, os.O_CREATE|os.O_RDWR, 0777)
	if err != nil {
		fmt.Println("failed create " + fileName)
		return
	}
	defer localFile.Close()

	if _, err := io.Copy(localFile, resp.Body); err != nil {
		fmt.Println("failed save " + fileName)
	} else {
		fmt.Println("success download " + fileName)
	}
}
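Note that setting http.DefaultClient.Timeout inside every goroutine mutates shared state concurrently. A client configured once at startup avoids that; a sketch (imageClient is not part of the original code):

//A dedicated client, configured once; safe for concurrent use.
var imageClient = &http.Client{Timeout: 10 * time.Second}

downloadImage would then call imageClient.Do(req) instead of touching http.DefaultClient.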
// Check whether a directory exists
func isExist(dir string) bool {
	_, err := os.Stat(dir)
	if err == nil {
		return true
	}
	return !os.IsNotExist(err)
}

// Extract the picture's file name from its URL
func getNameFromUrl(url string) string {
	arr := strings.Split(url, "/")
	return arr[len(arr)-1]
}
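Splitting on "/" breaks when the URL carries a query string such as ?v=1. Parsing the URL and taking path.Base of its path is more robust; a sketch (it would need "path" and net/url added to the imports, the latter aliased because the package-level variable url shadows the package name):

//Sketch: derive the file name from the URL path, ignoring any query string.
//Requires: import neturl "net/url" and "path".
func getNameFromUrlSafe(rawURL string) string {
	u, err := neturl.Parse(rawURL)
	if err != nil {
		return "unknown.jpg" //fallback name; arbitrary choice for this sketch
	}
	return path.Base(u.Path)
}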
Project address: Click to open the link
goquery documentation: https://godoc.org/github.com/PuerkitoBio/goquery

Posted by phu on Thu, 30 Apr 2020 11:05:50 -0700