幽灵蛛(pholcus)(三):header、get、post 学习资料
转载请注明出处:http://www.cnblogs.com/SSSR/p/6349298.html
get和post参考:http://ju.outofmemory.cn/entry/96382
登录知乎:https://github.com/DeanThompson/zhihu-go
并发:http://studygolang.com/articles/5658
https://sanwen8.cn/p/5985D5k.html
分布式爬虫:https://www.v2ex.com/t/179342
爬妹子图:http://www.thinksaas.cn/topics/0/425/425080.html
User-Agent 的设置见下面的例子:每次请求时从列表中随机选择一个 User-Agent。
// This listing is the body of a Go source file. The file starts with the
// following package clause and imports (unchanged from the original post):
//
//	package http
//
//	import (
//		"encoding/xml"
//		"fmt"
//		"io/ioutil"
//		"log"
//		"math/rand"
//		"net/http"
//		"regexp"
//		"strings"
//		"time"
//	)

// atagRegExp matches a complete <a ...>text</a> element that carries an href
// attribute and whose content holds no nested tags. Matching is
// case-insensitive, and the href value may be double-quoted, single-quoted or
// unquoted.
//
// Functions prefixed with Must are expected to succeed; regexp.MustCompile
// panics if the pattern does not compile.
var atagRegExp = regexp.MustCompile(`(?i)<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>]+)[^>]*>[^<]*</a>`)

// spyTimeout bounds a single Spy request when http.DefaultClient has no
// timeout of its own configured.
const spyTimeout = 30 * time.Second

// Spy fetches url with a randomly chosen User-Agent header and prints the
// href of every anchor found in the response body.
//
// Each link is printed with a "☆" prefix when it contains "article/details/"
// and a "□" prefix otherwise, followed by the bare href on its own line.
// Failures are logged and Spy returns without printing links; a panic raised
// while processing is recovered and logged as well.
func Spy(url string) {
	defer func() {
		if rec := recover(); rec != nil {
			log.Println("[E]", rec)
		}
	}()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("[E] 创建请求%s失败:%s", url, err)
		return
	}
	req.Header.Set("User-Agent", GetRandomUserAgent())

	// Work on a copy of the default client so that its Transport, Jar and
	// redirect policy are kept, but an unresponsive server cannot block forever.
	client := *http.DefaultClient
	if client.Timeout == 0 {
		client.Timeout = spyTimeout
	}

	res, err := client.Do(req)
	if err != nil {
		log.Printf("[E] Get请求%s返回错误:%s", url, err)
		return
	}
	// Close the body for every status code, not only 200, so the connection
	// is released.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		log.Printf("[E] Get请求%s返回状态码:%d", url, res.StatusCode)
		return
	}

	bodyBytes, err := ioutil.ReadAll(res.Body)
	if err != nil {
		// Best effort: report the failure, then scan whatever was received.
		log.Printf("[E] 读取%s的响应失败:%s", url, err)
	}

	for _, a := range atagRegExp.FindAllString(string(bodyBytes), -1) {
		href, _ := GetHref(a)
		if strings.Contains(href, "article/details/") {
			fmt.Println("☆", href)
		} else {
			fmt.Println("□", href)
		}
		// urlChannel <- href
		fmt.Println(href)
	}
}

// userAgent is the pool of User-Agent header values GetRandomUserAgent picks
// from.
//
// NOTE(review): the MSIE 9.0 entry ends with "Trident/5.0," and has no closing
// parenthesis; it is kept exactly as published.
var userAgent = [...]string{
	"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
	"Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
	"Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,",
	"Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
	"Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
	"Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
	"Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
	"Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
}

// r is the random source used to pick a User-Agent, seeded once with the
// current time.
//
// NOTE(review): a *rand.Rand built with rand.New is not documented as safe for
// concurrent use; guard it if GetRandomUserAgent is called from several
// goroutines.
var r = rand.New(rand.NewSource(time.Now().UnixNano()))

// GetRandomUserAgent returns one entry of userAgent chosen at random.
func GetRandomUserAgent() string {
	return userAgent[r.Intn(len(userAgent))]
}

// GetHref extracts the href attribute and the text content from the markup of
// a single anchor element such as `<a href="/x">text</a>`.
//
// The markup is read with a lenient XML decoder (non-strict mode with the HTML
// entity and auto-close tables), so typical HTML such as a bare "&" inside a
// URL, "&nbsp;", or an unquoted attribute value is accepted. The attribute
// name is compared case-insensitively. If several elements or text runs are
// present, the last href and the last text run win. Parsing stops at the first
// decoding error or at end of input, and whatever was collected up to that
// point is returned; both results are empty when nothing was found.
func GetHref(atag string) (href, content string) {
	decoder := xml.NewDecoder(strings.NewReader(atag))
	decoder.Strict = false
	decoder.AutoClose = xml.HTMLAutoClose
	decoder.Entity = xml.HTMLEntity

	for {
		t, err := decoder.Token()
		if err != nil {
			// io.EOF or a syntax error: keep what has been gathered so far.
			break
		}
		switch token := t.(type) {
		case xml.StartElement:
			// Element start tag: look for the href attribute.
			for _, attr := range token.Attr {
				if strings.EqualFold(attr.Name.Local, "href") {
					href = attr.Value
				}
			}
		case xml.CharData:
			// Character data: the text of the element.
			content = string(token)
		}
		// End tags, comments, directives and processing instructions carry
		// nothing of interest and must not discard the values found so far.
	}
	return href, content
}
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步