【Golang】爬虫笔记


阅读目录

一、net/http

二、grequests

三、实战应用

一、net/http

net/http包提供了http客户端及服务端的实现

1.简单使用

通过http最常见的几种请求方式(GET、POST、PUT、DELETE)演示net/http使用

GET

// get sends a plain GET request to httpbin and writes the raw response body
// to standard output. Any failure aborts the demo with a panic.
func get() {
	response, err := http.Get("http://httpbin.org/get")
	if err != nil {
		panic(err)
	}
	// The result of Close is deliberately discarded.
	defer func() { _ = response.Body.Close() }()

	payload, readErr := ioutil.ReadAll(response.Body)
	if readErr != nil {
		panic(readErr)
	}
	fmt.Printf("%s", payload)
}

POST

// post sends a POST request with an empty content type and no body to
// httpbin, then writes the raw response body to standard output. Any failure
// aborts the demo with a panic.
func post() {
	const endpoint = "http://httpbin.org/post"

	response, err := http.Post(endpoint, "", nil)
	if err != nil {
		panic(err)
	}
	// The result of Close is deliberately discarded.
	defer func() { _ = response.Body.Close() }()

	payload, readErr := ioutil.ReadAll(response.Body)
	if readErr != nil {
		panic(readErr)
	}
	fmt.Printf("%s", payload)
}

PUT

// put sends a PUT request to httpbin through the default client and writes
// the raw response body to standard output. Any failure aborts the demo with
// a panic carrying the underlying error.
func put() {
	request, err := http.NewRequest(http.MethodPut, "http://httpbin.org/put", nil)
	if err != nil {
		panic(err)
	}
	resp, err := http.DefaultClient.Do(request)
	// resp is nil when Do fails, so the error must be checked before the
	// body is touched.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}

DELETE

// del sends a DELETE request to httpbin through the default client and writes
// the raw response body to standard output. Any failure aborts the demo with
// a panic carrying the underlying error.
func del() {
	request, err := http.NewRequest(http.MethodDelete, "http://httpbin.org/delete", nil)
	if err != nil {
		panic(err)
	}
	resp, err := http.DefaultClient.Do(request)
	// resp is nil when Do fails, so the error must be checked before the
	// body is touched.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}

2.请求参数

GET请求参数

// getWithParams sends a GET request whose query string (name=ero, age=18) is
// built with url.Values, then writes the raw response body to standard
// output. Any failure aborts the demo with a panic carrying the underlying
// error.
func getWithParams() {
	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get", nil)
	if err != nil {
		panic(err)
	}
	params := make(url.Values)
	params.Add("name", "ero")
	params.Add("age", "18")
	// Encode escapes the values and sorts them by key.
	request.URL.RawQuery = params.Encode()
	// Alternatively, put the query string directly in the URL:
	// request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get?name=ero&age=18", nil)

	resp, err := http.DefaultClient.Do(request)
	// resp is nil when Do fails, so the error must be checked before the
	// body is touched.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}

POST请求参数

form表单
// postForm submits name=ero and age=18 as an
// application/x-www-form-urlencoded POST body and writes the raw response
// body to standard output. Any failure, including a failed body read, aborts
// the demo with a panic.
func postForm() {
	data := make(url.Values)
	data.Add("name", "ero")
	data.Add("age", "18")
	resp, err := http.Post("http://httpbin.org/post",
		"application/x-www-form-urlencoded",
		strings.NewReader(data.Encode()))
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	// A read error means content may be truncated; surface it instead of
	// printing a partial body.
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
JSON数据
// postJson marshals an anonymous struct {name: "ero", age: 18} to JSON, posts
// it with the application/json content type and writes the raw response body
// to standard output. Any failure aborts the demo with a panic carrying the
// underlying error.
func postJson() {
	u := struct {
		Name string `json:"name"`
		Age  int    `json:"age"`
	}{
		Name: "ero",
		Age:  18,
	}
	payload, err := json.Marshal(u)
	if err != nil {
		panic(err)
	}
	resp, err := http.Post("http://httpbin.org/post", "application/json",
		bytes.NewReader(payload))
	// resp is nil when the request fails, so check before using the body.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
文件
// postFile builds a multipart/form-data body holding one plain form field and
// two files read from the working directory, prints the encoded body, posts
// it to httpbin and writes the raw response body to standard output. Any
// failure aborts the demo with a panic carrying the underlying error.
func postFile() {
	// In-memory buffer that receives the encoded multipart body.
	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)
	// Ordinary form fields can be mixed in with the file parts.
	if err := writer.WriteField("number", "123456"); err != nil {
		panic(err)
	}

	if err := attachFormFile(writer, "uploadFile1", "uploadFileName1", "uploadFileName1.txt"); err != nil {
		panic(err)
	}
	if err := attachFormFile(writer, "uploadFile2", "uploadFileName2", "uploadFileName2.txt"); err != nil {
		panic(err)
	}

	// Close writes the trailing boundary; it must run before the body is sent.
	if err := writer.Close(); err != nil {
		panic(err)
	}
	fmt.Println(body.String())

	resp, err := http.Post("http://httpbin.org/post", writer.FormDataContentType(), body)
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}

// attachFormFile adds the file at path to writer as a file part. field is the
// form field name and filename is the file name reported in the part header.
func attachFormFile(writer *multipart.Writer, field, filename, path string) error {
	part, err := writer.CreateFormFile(field, filename)
	if err != nil {
		return err
	}
	file, err := os.Open(path)
	if err != nil {
		return err
	}
	defer func() { _ = file.Close() }()
	_, err = io.Copy(part, file)
	return err
}

3.请求头

// getWithHeaders sends a GET request carrying a custom "name" header and a
// custom user agent, then writes the raw response body to standard output.
// Any failure aborts the demo with a panic carrying the underlying error.
func getWithHeaders() {
	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get", nil)
	if err != nil {
		panic(err)
	}

	request.Header.Add("name", "ero")
	request.Header.Add("user-agent", "chrome")

	resp, err := http.DefaultClient.Do(request)
	// resp is nil when Do fails, so the error must be checked before the
	// body is touched.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}

4.Cookie信息

手动保存

// manualSetCookies carries cookies from one response to the next request by
// hand. It requests the httpbin cookie-setting URL with redirects disabled,
// prints the cookies found on that response, copies them onto a second
// request to /cookies, and prints that response's cookies and body.
func manualSetCookies() {
	// Returning ErrUseLastResponse disables redirect following, so the
	// response that sets the cookies is the one handed back.
	// Transport, Jar and Timeout keep their zero values.
	stayOnFirstResponse := func(req *http.Request, via []*http.Request) error {
		return http.ErrUseLastResponse
	}
	client := &http.Client{CheckRedirect: stayOnFirstResponse}

	setReq, _ := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero&pwd=123", nil)
	setResp, err := client.Do(setReq)
	if err != nil {
		panic(err)
	}
	defer setResp.Body.Close()

	issued := setResp.Cookies()
	fmt.Printf("%s\n", issued)

	readReq, _ := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies", nil)
	for i := range issued {
		readReq.AddCookie(issued[i])
	}

	readResp, err := client.Do(readReq)
	if err != nil {
		panic(err)
	}
	defer readResp.Body.Close()
	fmt.Printf("%s\n", readResp.Cookies())

	body, _ := ioutil.ReadAll(readResp.Body)
	fmt.Printf("%s", body)
}

cookiejar自动保存

// autoSetCookies performs the cookie-setting request with a client that has a
// cookiejar attached and streams the final response body to standard output.
func autoSetCookies() {
	store, _ := cookiejar.New(nil)
	session := &http.Client{Jar: store}

	req, _ := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero&pwd=123", nil)
	reply, err := session.Do(req)
	if err != nil {
		panic(err)
	}
	defer reply.Body.Close()

	_, _ = io.Copy(os.Stdout, reply.Body)
}

cookie保存持久化

浏览器访问会自动做cookie信息持久化,但是通过cookiejar保存的cookie只支持一次会话
可以通过github.com/juju/persistent-cookiejar做持久化cookie

// NOTE(review): cookiejar2 is presumably an import alias for
// github.com/juju/persistent-cookiejar — confirm against the import block.
jar, _ := cookiejar2.New(nil)
...
// Save the jar as the very last step.
_ = jar.Save()

5.响应信息

状态码

// status prints the response's status text followed by its numeric status
// code, each on its own line.
func status(r *http.Response) {
	text, code := r.Status, r.StatusCode
	fmt.Printf("%s\n%d\n", text, code)
}

响应头

// headers prints the value of the response's "name" header on its own line.
// Header.Get yields an empty string when the header is absent.
func headers(r *http.Response) {
	fmt.Println(r.Header.Get("name"))
}

编码

// encoding detects the character encoding of the response body from its
// leading bytes and its Content-Type header, decodes the whole body with the
// detected encoding and prints the decoded text. Read failures abort the demo
// with a panic.
func encoding(r *http.Response) {
	// Number of leading bytes handed to the detector.
	const sniffLen = 1024

	reader := bufio.NewReader(r.Body)
	// Peek does not advance the read position, so the decoder below still
	// sees the body from its first byte.
	head, err := reader.Peek(sniffLen)
	// A body shorter than sniffLen yields io.EOF together with the bytes
	// that are available; that is not a failure.
	if err != nil && err != io.EOF {
		fmt.Println(err.Error())
		panic(err)
	}
	// Determine the encoding from the sniffed bytes and the declared type.
	e, _, _ := charset.DetermineEncoding(head, r.Header.Get("content-type"))
	fmt.Println(e.NewDecoder())
	// Wrap the buffered reader so that reads return decoded bytes.
	bodyReader := transform.NewReader(reader, e.NewDecoder())
	// Read the decoded content.
	content, err := ioutil.ReadAll(bodyReader)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(content))
}

6.超时时间

// timeoutTest requests an httpbin endpoint that delays its answer by ten
// seconds, using a client whose individual phases and overall exchange are
// all time-limited. A request that fails (including by timing out) aborts the
// demo with a panic; otherwise the response body is written to standard
// output.
func timeoutTest() {
	// The dialer bounds connection establishment and, unlike a bare
	// net.DialTimeout call, also honours cancellation of the request context.
	dialer := &net.Dialer{Timeout: 2 * time.Second}
	client := &http.Client{
		// The transport allows finer-grained timeouts per phase.
		Transport: &http.Transport{
			DialContext:           dialer.DialContext,
			TLSHandshakeTimeout:   5 * time.Second,
			ResponseHeaderTimeout: 5 * time.Second,
		},
		// Upper bound for the whole exchange, including reading the body.
		Timeout: 5 * time.Second,
	}
	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/delay/10", nil)
	if err != nil {
		panic(err)
	}
	response, err := client.Do(request)
	if err != nil {
		panic(err)
	}
	defer func() { _ = response.Body.Close() }()

	// Print the body's content rather than the Body value itself.
	content, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}

7.代理

// main fetches https://www.google.com through an HTTP proxy listening on
// 127.0.0.1:1080 and writes the response body to standard output. Any failure
// aborts the demo with a panic.
func main() {
	// For a SOCKS5 proxy use this URL instead:
	// proxyUrl, err := url.Parse("socks5://127.0.0.1:1080")
	proxyUrl, err := url.Parse("http://127.0.0.1:1080") // HTTP proxy
	if err != nil {
		panic(err)
	}
	t := &http.Transport{
		Proxy: http.ProxyURL(proxyUrl),
	}
	client := &http.Client{Transport: t}
	resp, err := client.Get("https://www.google.com")
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	// Print the body's content rather than the Body value itself.
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}

8.重定向

限制重定向次数

// redirectLimitTimes requests an httpbin endpoint that redirects twenty times
// with a client that gives up once more than ten earlier requests are in the
// redirect chain. A failed request, including one stopped by that limit,
// aborts the demo with a panic.
func redirectLimitTimes() {
	client := http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			// via holds the requests made so far, oldest first.
			if len(via) > 10 {
				return errors.New("redirect times > 10")
			}
			return nil
		},
	}

	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/redirect/20", nil)
	if err != nil {
		panic(err)
	}
	resp, err := client.Do(request)
	if err != nil {
		panic(err)
	}
	// The body is not needed, but it still has to be closed on success.
	_ = resp.Body.Close()
}

禁止重定向

// redirectForbidden requests the httpbin cookie-setting URL with a client
// that refuses to follow redirects, then prints the URL of the request that
// produced the returned response.
func redirectForbidden() {
	// Returning ErrUseLastResponse hands back the most recent response
	// instead of following the redirect.
	refuse := func(req *http.Request, via []*http.Request) error {
		return http.ErrUseLastResponse
	}
	noFollow := &http.Client{CheckRedirect: refuse}

	req, _ := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero", nil)

	// http.DefaultClient.Do(req) would follow the redirect to another address.
	reply, err := noFollow.Do(req)
	if err != nil {
		panic(err)
	}
	defer func() { _ = reply.Body.Close() }()

	fmt.Printf("%s", reply.Request.URL)
}

9.下载

简单下载

// downloadFile fetches url with a GET request, streams the response body into
// a newly created (or truncated) file named filename and prints the number of
// bytes written. Any failure, including a failure to close the file, aborts
// the demo with a panic.
func downloadFile(url, filename string) {
	r, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer func() { _ = r.Body.Close() }()

	f, err := os.Create(filename)
	if err != nil {
		panic(err)
	}

	written, err := io.Copy(f, r.Body)
	// Close the file in every case; a close error can signal a failed write,
	// so report it unless the copy already failed.
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		panic(err)
	}
	fmt.Println(written)
}

下载进度

// Reader wraps an io.Reader and reports progress on every Read.
type Reader struct {
	io.Reader
	// Total is the expected number of bytes; zero or negative means unknown.
	Total int64
	// CurrentLength is the number of bytes read so far.
	CurrentLength int64
}

// Read implements io.Reader. It forwards to the wrapped reader, adds the
// bytes read to CurrentLength and prints a progress line: a percentage when
// Total is known, otherwise the running byte count.
func (r *Reader) Read(p []byte) (n int, err error) {
	n, err = r.Reader.Read(p)
	r.CurrentLength += int64(n)
	if r.Total > 0 {
		// Integer math keeps two decimals, truncated rather than rounded.
		fmt.Printf("\r进度 %.2f%%\n", float64(r.CurrentLength*10000/r.Total)/100)
	} else {
		// Without a positive total no percentage exists, and dividing by a
		// zero Total would panic.
		fmt.Printf("\r已下载 %d 字节\n", r.CurrentLength)
	}
	return n, err
}

// downloadFileProgress fetches url with a GET request and streams the body
// into a newly created (or truncated) file named filename through a Reader,
// which prints progress for every chunk. It finally prints the number of
// bytes written. Any failure, including a failure to close the file, aborts
// the demo with a panic.
func downloadFileProgress(url, filename string) {
	r, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer func() { _ = r.Body.Close() }()

	f, err := os.Create(filename)
	if err != nil {
		panic(err)
	}
	reader := &Reader{
		Reader: r.Body,
		// ContentLength is -1 when the length is unknown.
		Total: r.ContentLength,
	}
	n, err := io.Copy(f, reader)
	// Close the file in every case; a close error can signal a failed write,
	// so report it unless the copy already failed.
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		panic(err)
	}
	fmt.Println(n)
}

二、grequests

语法更简单,使用过python requests包的同学肯定会很喜欢

1.安装

go get -u github.com/levigross/grequests

2.GET示例

import "github.com/levigross/grequests"

// RequestOptions lives in the grequests package, which is imported by name,
// so the type needs the package qualifier.
ro := &grequests.RequestOptions{
	Params: map[string]string{"Hello": "Goodbye"},
}
// Per the original note, a parameter already present in the URL is overridden
// by the value given in Params.
resp, err := grequests.Get("http://httpbin.org/get?Hello=11", ro)

if err != nil {
	log.Fatalln("Unable to make request: ", err)
}

fmt.Println(resp.String())

3.POST示例

resp, err := grequests.Post("http://httpbin.org/post",
    &grequests.RequestOptions{Data: map[string]string{"One": "Two"}})

if err != nil {
    log.Println("Cannot post: ", err)
}

if resp.Ok != true {
    log.Println("Request did not return OK")
}

4.POST上传文件

// Load the file that will be attached to the request.
fd, err := grequests.FileUploadFromDisk("test_files/mypassword")

if err != nil {
	log.Println("Unable to open file: ", err)
}

// Supplying Files makes grequests send a multipart MIME request; the Data
// entries are included alongside the file.
uploadOptions := &grequests.RequestOptions{
	Files: fd,
	Data:  map[string]string{"One": "Two"},
}
resp, err := grequests.Post("http://httpbin.org/post", uploadOptions)

if err != nil {
	log.Println("Unable to make request", resp.Error)
}

if !resp.Ok {
	log.Println("Request did not return OK")
}

三、实战应用

posted @ 2020-05-01 17:16  初遇ぃ  阅读(502)  评论(0)  编辑  收藏  举报
//以下两个链接最好自己保存下来,再上传到自己的博客园的“文件”选项中