【Golang】爬虫笔记
阅读目录
一、net/http
二、grequests
三、实战应用
一、net/http
net/http包提供了http客户端及服务端的实现
1.简单使用
通过http最常见的几种请求方式(GET、POST、PUT、DELETE)演示net/http使用
GET
// get sends a plain GET request to httpbin and prints the raw response body
// to stdout. A failed request or a failed body read aborts via panic.
func get() {
	resp, err := http.Get("http://httpbin.org/get")
	if err != nil {
		panic(err)
	}
	// Always close the body; the Close error is deliberately discarded
	// because nothing useful can be done with it here.
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
POST
// post sends a POST request with no content type and no body to httpbin and
// prints the raw response body to stdout. Errors abort via panic.
func post() {
	resp, err := http.Post("http://httpbin.org/post", "", nil)
	if err != nil {
		panic(err)
	}
	// Always close the body; the Close error is deliberately discarded.
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
PUT
// put sends a body-less PUT request to httpbin through http.DefaultClient and
// prints the raw response body to stdout. Errors abort via panic.
func put() {
	request, err := http.NewRequest(http.MethodPut, "http://httpbin.org/put", nil)
	if err != nil {
		panic(err)
	}
	resp, err := http.DefaultClient.Do(request)
	// resp is nil when Do fails, so the error must be checked before the
	// body is touched.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
DELETE
// del sends a body-less DELETE request to httpbin through http.DefaultClient
// and prints the raw response body to stdout. Errors abort via panic.
func del() {
	request, err := http.NewRequest(http.MethodDelete, "http://httpbin.org/delete", nil)
	if err != nil {
		panic(err)
	}
	resp, err := http.DefaultClient.Do(request)
	// resp is nil when Do fails, so the error must be checked before the
	// body is touched.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
2.请求参数
GET请求参数
// getWithParams sends a GET request to httpbin with the query parameters
// name=ero and age=18 and prints the raw response body to stdout. Errors
// abort via panic.
func getWithParams() {
	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get", nil)
	if err != nil {
		panic(err)
	}
	// Build the query string from url.Values so the values are escaped.
	params := make(url.Values)
	params.Add("name", "ero")
	params.Add("age", "18")
	request.URL.RawQuery = params.Encode()
	// Alternatively, put the query string directly into the URL:
	// request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get?name=ero&age=18", nil)

	resp, err := http.DefaultClient.Do(request)
	// resp is nil when Do fails, so the error must be checked before the
	// body is touched.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
POST请求参数
form表单
// postForm submits name=ero and age=18 to httpbin as an
// application/x-www-form-urlencoded POST body and prints the raw response
// body to stdout. Errors abort via panic.
func postForm() {
	data := make(url.Values)
	data.Add("name", "ero")
	data.Add("age", "18")

	resp, err := http.Post("http://httpbin.org/post",
		"application/x-www-form-urlencoded",
		strings.NewReader(data.Encode()))
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	// A failed read would otherwise print a truncated body without notice.
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
Json数据
// postJson marshals a small anonymous struct to JSON, posts it to httpbin
// with the application/json content type, and prints the raw response body
// to stdout. Errors abort via panic.
func postJson() {
	u := struct {
		Name string `json:"name"`
		Age  int    `json:"age"`
	}{
		Name: "ero",
		Age:  18,
	}
	payload, err := json.Marshal(u)
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://httpbin.org/post", "application/json",
		bytes.NewReader(payload))
	// resp is nil when the request fails, so check before using the body.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
文件
// postFile uploads one plain form field and two files from the working
// directory to httpbin as a multipart/form-data request. It prints the
// encoded request payload followed by the raw response body. Any error
// (missing file, encoding failure, failed request) aborts via panic.
func postFile() {
	// The multipart payload is assembled in memory before it is sent.
	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)

	// Ordinary form fields can be mixed with file parts.
	if err := writer.WriteField("number", "123456"); err != nil {
		panic(err)
	}
	// Arguments: form field name, file name reported in the part header,
	// local path of the file to read.
	if err := addFormFile(writer, "uploadFile1", "uploadFileName1", "uploadFileName1.txt"); err != nil {
		panic(err)
	}
	if err := addFormFile(writer, "uploadFile2", "uploadFileName2", "uploadFileName2.txt"); err != nil {
		panic(err)
	}
	// Close must run before the payload is used: it finishes the multipart
	// message.
	if err := writer.Close(); err != nil {
		panic(err)
	}
	fmt.Println(body.String())

	resp, err := http.Post("http://httpbin.org/post", writer.FormDataContentType(), body)
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}

// addFormFile copies the file at path into a new file part of w, using field
// as the form field name and name as the file name in the part header.
func addFormFile(w *multipart.Writer, field, name, path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer func() { _ = f.Close() }()

	part, err := w.CreateFormFile(field, name)
	if err != nil {
		return err
	}
	_, err = io.Copy(part, f)
	return err
}
3.请求头
// getWithHeaders sends a GET request to httpbin carrying a custom "name"
// header and an overridden user agent, then prints the raw response body to
// stdout. Errors abort via panic.
func getWithHeaders() {
	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get", nil)
	if err != nil {
		panic(err)
	}
	request.Header.Add("name", "ero")
	request.Header.Add("user-agent", "chrome")

	resp, err := http.DefaultClient.Do(request)
	// resp is nil when Do fails, so the error must be checked before the
	// body is touched.
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
4.Cookie信息
手动保存
// manualSetCookies shows how to carry cookies between requests by hand. It
// requests httpbin's cookie-setting endpoint without following the redirect,
// prints the cookies of that response, copies them onto a second request to
// /cookies, and prints that response's cookies and body. Errors abort via
// panic.
func manualSetCookies() {
	client := &http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			// Do not follow redirects: the first response is returned
			// as-is so its cookies can be read below.
			return http.ErrUseLastResponse
		},
	}

	firstRequest, err := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero&pwd=123", nil)
	if err != nil {
		panic(err)
	}
	resp, err := client.Do(firstRequest)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Printf("%s\n", resp.Cookies())

	secondRequest, err := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies", nil)
	if err != nil {
		panic(err)
	}
	// The client has no cookie jar, so the cookies are attached manually.
	for _, cookie := range resp.Cookies() {
		secondRequest.AddCookie(cookie)
	}
	resp2, err := client.Do(secondRequest)
	if err != nil {
		panic(err)
	}
	defer resp2.Body.Close()
	fmt.Printf("%s\n", resp2.Cookies())

	content, err := ioutil.ReadAll(resp2.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
cookiejar自动保存
// autoSetCookies performs the same cookie-setting request as the manual
// variant, but attaches a cookie jar to the client so cookies are stored and
// sent automatically. The final response body is streamed to stdout.
func autoSetCookies() {
	store, _ := cookiejar.New(nil)
	req, _ := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero&pwd=123", nil)

	c := &http.Client{Jar: store}
	resp, err := c.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Stream the body straight to stdout instead of buffering it.
	_, _ = io.Copy(os.Stdout, resp.Body)
}
cookie保存持久化
浏览器会自动把cookie持久化到磁盘;而标准库net/http/cookiejar只把cookie保存在内存中,程序退出后即丢失,因此只在一次会话内有效
可以通过github.com/juju/persistent-cookiejar做cookie持久化(下面的示例以别名cookiejar2导入该包)
jar, _ := cookiejar2.New(nil)
...
//在执行最后设置保存
_ = jar.Save()
5.响应信息
状态码
// status prints the response's status text on one line and its numeric
// status code on the next.
func status(r *http.Response) {
	text, code := r.Status, r.StatusCode
	fmt.Printf("%s\n%d\n", text, code)
}
响应头
// headers prints the value of the response's "name" header, or an empty line
// when the header is absent.
func headers(r *http.Response) {
	fmt.Println(r.Header.Get("name"))
}
编码
// encoding detects the character encoding of the response body from its
// leading bytes and the Content-Type header, decodes the body with that
// encoding, and prints the decoded text. Errors abort via panic.
func encoding(r *http.Response) {
	reader := bufio.NewReader(r.Body)
	// Peek returns the leading bytes without advancing the read position, so
	// the full body is still available for decoding afterwards.
	sample, err := reader.Peek(1024)
	// Peek also reports an error when fewer bytes than requested are
	// available; a short body is fine as long as something was read.
	if err != nil && len(sample) == 0 {
		fmt.Println(err.Error())
		panic(err)
	}
	// Detect the encoding from the sampled content plus the declared
	// content type.
	e, _, _ := charset.DetermineEncoding(sample, r.Header.Get("content-type"))
	fmt.Println(e.NewDecoder())
	// Wrap the reader so everything read from it is decoded.
	bodyReader := transform.NewReader(reader, e.NewDecoder())
	// Read the decoded content.
	content, err := ioutil.ReadAll(bodyReader)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(content))
}
6.超时时间
// timeoutTest requests an httpbin endpoint that delays its answer by ten
// seconds, using a client with a 5 second overall timeout and finer-grained
// transport timeouts. If a response does arrive, its body is printed to
// stdout. Errors (including the timeout) abort via panic.
func timeoutTest() {
	// The dialer honours the request context, so cancelling the request
	// also aborts a pending connection attempt.
	dialer := &net.Dialer{Timeout: 2 * time.Second}
	client := &http.Client{
		// The transport allows per-phase timeouts.
		Transport: &http.Transport{
			DialContext:           dialer.DialContext,
			TLSHandshakeTimeout:   5 * time.Second,
			ResponseHeaderTimeout: 5 * time.Second,
		},
		// Upper bound for the whole exchange.
		Timeout: 5 * time.Second,
	}

	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/delay/10", nil)
	if err != nil {
		panic(err)
	}
	response, err := client.Do(request)
	if err != nil {
		panic(err)
	}
	defer func() { _ = response.Body.Close() }()

	// Print the body content rather than the Body object itself.
	content, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
7.代理
// main fetches a page through a local proxy and prints the response body to
// stdout. Errors abort via panic.
func main() {
	//proxyURL, err := url.Parse("socks5://127.0.0.1:1080") // SOCKS5 proxy
	proxyURL, err := url.Parse("http://127.0.0.1:1080") // HTTP proxy
	if err != nil {
		panic(err)
	}
	client := &http.Client{
		Transport: &http.Transport{
			Proxy: http.ProxyURL(proxyURL),
		},
	}

	resp, err := client.Get("https://www.google.com")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Print the body content rather than the Body object itself.
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", content)
}
8.重定向
限制重定向次数
// redirectLimitTimes requests an httpbin endpoint that redirects twenty times
// with a client whose CheckRedirect hook gives up once more than ten requests
// have been made. Errors (including the redirect limit) abort via panic.
func redirectLimitTimes() {
	client := http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			// via holds the requests made so far.
			if len(via) > 10 {
				return errors.New("redirect times > 10")
			}
			return nil
		},
	}
	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/redirect/20", nil)
	if err != nil {
		panic(err)
	}
	resp, err := client.Do(request)
	if err != nil {
		panic(err)
	}
	// A successful response still owns an open body that must be closed.
	defer func() { _ = resp.Body.Close() }()
}
禁止重定向
// redirectForbidden requests httpbin's cookie-setting endpoint with a client
// that refuses to follow redirects, then prints the URL of the request that
// produced the returned response.
func redirectForbidden() {
	// Returning http.ErrUseLastResponse makes the client hand back the
	// redirect response itself instead of following it.
	noFollow := func(req *http.Request, via []*http.Request) error {
		return http.ErrUseLastResponse
	}
	client := &http.Client{CheckRedirect: noFollow}

	req, _ := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero", nil)
	// http.DefaultClient.Do(req) would follow the redirect to another
	// address; this client does not.
	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer func() { _ = resp.Body.Close() }()

	fmt.Printf("%s", resp.Request.URL)
}
9.下载
简单下载
// downloadFile fetches url and writes the response body to a newly created
// (or truncated) file named filename, then prints the number of bytes
// written. Errors abort via panic.
func downloadFile(url, filename string) {
	r, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer func() { _ = r.Body.Close() }()

	f, err := os.Create(filename)
	if err != nil {
		panic(err)
	}
	written, err := io.Copy(f, r.Body)
	// Close the file on both the success and the failure path; a Close
	// error is reported unless the copy already failed.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		panic(err)
	}
	fmt.Println(written)
}
下载进度
// Reader wraps an io.Reader and reports download progress as data is read.
type Reader struct {
	io.Reader
	// Total is the expected number of bytes; zero or a negative value means
	// the size is unknown and no percentage is printed.
	Total int64
	// CurrentLength is the number of bytes read so far.
	CurrentLength int64
}

// Read implements io.Reader. It delegates to the wrapped reader, adds the
// number of bytes read to CurrentLength, and prints the progress as a
// percentage of Total, truncated to two decimals.
func (r *Reader) Read(p []byte) (n int, err error) {
	n, err = r.Reader.Read(p)
	r.CurrentLength += int64(n)
	// Without a positive total a percentage cannot be computed; dividing by
	// a zero Total would panic.
	if r.Total > 0 {
		fmt.Printf("\r进度 %.2f%%\n", float64(r.CurrentLength*10000/r.Total)/100)
	}
	return n, err
}
// downloadFileProgress fetches url and writes the response body to a newly
// created (or truncated) file named filename, reading through a Reader so
// progress is printed along the way. It finally prints the number of bytes
// written. Errors abort via panic.
func downloadFileProgress(url, filename string) {
	r, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer func() { _ = r.Body.Close() }()

	f, err := os.Create(filename)
	if err != nil {
		panic(err)
	}
	// The expected size comes from the response's ContentLength field.
	reader := &Reader{
		Reader: r.Body,
		Total:  r.ContentLength,
	}
	n, err := io.Copy(f, reader)
	// Close the file on both the success and the failure path; a Close
	// error is reported unless the copy already failed.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		panic(err)
	}
	fmt.Println(n)
}
二、grequests
语法更简单,使用过python requests包的同学肯定会很喜欢
1.安装
go get -u github.com/levigross/grequests
2.GET示例
import "github.com/levigross/grequests"
// Request options carrying one query parameter. The type lives in the
// grequests package, so it must be package-qualified.
ro := &grequests.RequestOptions{
	Params: map[string]string{"Hello": "Goodbye"},
}
// Parameters already present in the URL are overridden by Params.
resp, err := grequests.Get("http://httpbin.org/get?Hello=11", ro)
if err != nil {
	log.Fatalln("Unable to make request: ", err)
}
fmt.Println(resp.String())
3.POST示例
// Post one form field and log any failure.
resp, err := grequests.Post(
	"http://httpbin.org/post",
	&grequests.RequestOptions{
		Data: map[string]string{"One": "Two"},
	},
)
if err != nil {
	log.Println("Cannot post: ", err)
}
if !resp.Ok {
	log.Println("Request did not return OK")
}
4.POST上传文件
// Load the file from disk so it can be attached to the request.
fd, err := grequests.FileUploadFromDisk("test_files/mypassword")
if err != nil {
	log.Println("Unable to open file: ", err)
}

// Supplying Files sends the request as a multipart MIME upload; Data adds an
// ordinary form field next to the file.
resp, err := grequests.Post(
	"http://httpbin.org/post",
	&grequests.RequestOptions{
		Data:  map[string]string{"One": "Two"},
		Files: fd,
	},
)
if err != nil {
	log.Println("Unable to make request", resp.Error)
}
if !resp.Ok {
	log.Println("Request did not return OK")
}