Golang爬虫下载资源到本地

通过Golang爬取猛男图片到本地保存。

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
)

// HttpGet fetches the page at url and returns the full response body as a
// string. The original implementation accumulated the body with
// `result += string(buf[:n])` in a read loop, which is O(n²) in the body
// size; io.ReadAll reads it in one pass.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

var path string = "D:/test/Gundam/"

// saveImg downloads the image at url and writes it to path/<index>.gif,
// then signals completion on page.
//
// Bug fix: the original sent on page only on full success; every early
// return (duplicate-image skip, http error, file-create error) skipped the
// send, so spiderPic's receive loop — which expects one send per spawned
// goroutine — blocked forever. The deferred send guarantees exactly one
// signal per call on every return path.
func saveImg(index int, url string, page chan int) {
	// Always notify the collector, regardless of how we return.
	defer func() { page <- index }()

	fmt.Println(index, "-----", url)

	// The crawled site reuses this placeholder image; skip it.
	if url == "http://images.17173.com/gd/images/ms/15019.gif" {
		return
	}

	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("http get err:", err)
		return
	}
	defer resp.Body.Close()

	picName := path + strconv.Itoa(index) + ".gif"
	f, err := os.Create(picName)
	if err != nil {
		fmt.Println("os create err:", err)
		return
	}
	defer f.Close()

	// Stream the HTTP body straight into the file; io.Copy replaces the
	// hand-rolled 4 KiB read loop (whose `err = err2` assignment was dead
	// code) and reports write errors, which f.Write's were ignored before.
	if _, err := io.Copy(f, resp.Body); err != nil {
		fmt.Println("io copy err:", err)
	}
}

// spiderPic extracts the detail-page links from the index-page HTML in
// data, scrapes each detail page for its image URL, and downloads the
// images concurrently, one goroutine per image.
func spiderPic(data string) {
	linkRe := regexp.MustCompile(`<A href="(.*?)" target=_blank><IMG`)
	alls := linkRe.FindAllStringSubmatch(data, -1)

	// Compile the detail-page patterns once, not on every loop iteration.
	picRe := regexp.MustCompile(`src="(.*?)" width="120"`)
	picReAlt := regexp.MustCompile(`src="(.*?)" width=120`)

	page := make(chan int)

	// launched counts goroutines actually started, so the receive loop
	// below waits for exactly that many signals. The original waited for
	// len(alls) and returned early on any HttpGet error, leaking every
	// already-spawned goroutine blocked on the unbuffered send.
	launched := 0
	for index, value := range alls {
		result, err := HttpGet(value[1])
		if err != nil {
			fmt.Println("HttpGet err3:", err)
			continue // skip this page but keep crawling the rest
		}

		picData := picRe.FindAllStringSubmatch(result, -1)
		if len(picData) == 0 {
			picData = picReAlt.FindAllStringSubmatch(result, -1)
		}
		if len(picData) == 0 {
			// The original indexed picData[0][1] unconditionally and
			// panicked when neither pattern matched the page.
			continue
		}

		go saveImg(index, picData[0][1], page)
		launched++
	}

	// Block until every spawned download has signalled, so main does not
	// exit while downloads are still in flight.
	for i := 0; i < launched; i++ {
		fmt.Printf("Download %d gif\n", <-page)
	}
}

// working fetches the crawler's entry page and hands its HTML off to
// spiderPic for scraping and downloading.
func working() {
	const indexURL = "http://gd.17173.com/mechanics/index.shtml"

	html, err := HttpGet(indexURL)
	if err != nil {
		fmt.Println("HttpGet err1:", err)
		return
	}

	spiderPic(html)
}

// main kicks off the crawl; working blocks until all downloads finish.
func main() {
	working()
}
posted @ 2021-12-24 19:37  小紫苏  阅读(411)  评论(0编辑  收藏  举报