golang模拟新浪微博登录
1. 基于幽灵蛛(Pholcus)开源项目编写的爬虫规则
直接贴代码,代码可以更改后用于其他爬虫项目
package pholcus_lib // 基础包 import ( // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 . "github.com/henrylee2cn/pholcus/app/spider" //必需 // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 // "github.com/henrylee2cn/pholcus/logs" // net包 // "net/http" //设置http.Header // "net/url" // 编码包 // "encoding/xml" //"encoding/json" // 字符串处理包 //"regexp" // "strconv" // "fmt" // "math" //"net/http" "strconv" "regexp" "fmt" "encoding/json" "net/url" //"strings" //"strings" "strings" "github.com/henrylee2cn/pholcus/common/goquery" //"net/http" ) ////获取unix时间 var millisecond int64 //用户名 var name string //密码 var password string //解析json结构体 type ( Info struct { Retcode int Uid string Nick string CrossDomainUrlList []string } ) func init() { FileTest.Register() millisecond = getMillisecond() name="88888888" password="8888888" name = encryptUname(name) } var FileTest = &Spider{ Name: "微博登录测试", Description: "测试 [s.weibo.com/user/]", Pausetime: 1500, Keyin: KEYIN, // Limit: LIMIT, EnableCookie: true, RuleTree: &RuleTree{ Root: func(ctx *Context) { //https://weibo.cn/ ctx.AddQueue(&request.Request{ Url: "https://login.sina.com.cn/sso/prelogin.php?entry=account&callback=sinaSSOController.preloginCallBack&su="+name+"&rsakt=mod&client=ssologin.js(v1.4.15)&_="+strconv.FormatInt(millisecond,10), Rule: "登录一", //DownloaderID:1, }) }, Trunk: map[string]*Rule{ "登录一": { ParseFunc: func(ctx *Context) { str := ctx.GetText() println("-----1-----" + str) compile1, _ := regexp.Compile("{.*}") match1 := compile1.FindString(str) fmt.Println(match1) //json str 转map var dat map[string]interface{} if err := json.Unmarshal([]byte(match1), &dat); err == nil { if err != nil{ println("转换异常!") } } servertime := dat["servertime"] servertime= strconv.FormatFloat(servertime.(float64), 'f', -1, 64) nonce:=dat["nonce"] pubkey:=dat["pubkey"] rsakv := dat["rsakv"] //加密密码 ep := encryptPassword(pubkey.(string), servertime.(string), 
nonce.(string), password) postDict := map[string]string{} postDict["entry"] = "account" postDict["gateway"] = "1" postDict["from"] = "" postDict["savestate"] = "30" postDict["qrcode_flag"] = "true" postDict["useticket"] = "0" postDict["pagerefer"] = "" postDict["vsnf"] = "1" postDict["su"] = name postDict["service"] = "account" postDict["servertime"] = servertime.(string) postDict["nonce"] = nonce.(string) postDict["pwencode"] = "rsa2" postDict["rsakv"] = rsakv.(string) postDict["sp"] = ep postDict["sr"] = "1395*822" postDict["cdult"] = "3" postDict["domain"] = "sina.com.cn" postDict["prelt"] = "170" postDict["returntype"] = "TEXT" postValues := url.Values{} for postKey, PostValue := range postDict{ postValues.Set(postKey, PostValue) } //post参数编码 postDataStr := postValues.Encode() ctx.AddQueue(&request.Request{ Url: "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)&_="+strconv.FormatInt(getMillisecond(),10), Method: "POST", EnableCookie: true, PostData: postDataStr, Rule: "登录二", //DownloaderID:1, }) }, }, "登录二": { ParseFunc: func(ctx *Context) { str := ctx.GetText() println("-----2-----" + str) var dat Info json.Unmarshal([]byte(str), &dat) //此处获取2个链接,包含普通版和移动版 //print(dat.CrossDomainUrlList[2]) ctx.AddQueue(&request.Request{ Url: dat.CrossDomainUrlList[2], Method: "GET", EnableCookie: true, Rule: "登录三", }) }, }, "登录三": { ParseFunc: func(ctx *Context) { ctx.AddQueue(&request.Request{ Url: "https://weibo.cn/", Method: "GET", EnableCookie: true, Rule: "重定向一", }) }, }, "重定向一": { ParseFunc: func(ctx *Context) { compile2, _ := regexp.Compile("[a-zA-z]+://[^\\s]*") string := compile2.FindAllString(ctx.GetText(), 2) ctx.AddQueue(&request.Request{ Url: string[1], Method: "GET", EnableCookie: true, Rule: "重定向二", }) }, }, "重定向二": { ParseFunc: func(ctx *Context) { compile2, _ := regexp.Compile("[a-zA-z]+://[^\\s]*") string := compile2.FindAllString(ctx.GetText(), 3) ctx.AddQueue(&request.Request{ Url: string[2], Method: "GET", EnableCookie: true, Rule: 
"进入首页", }) }, }, "进入首页": { ParseFunc: func(ctx *Context) { for z := 1;z<=2;z++{ ctx.AddQueue(&request.Request{ Url: "https://weibo.cn/search/user/?keyword="+ ctx.GetKeyin() + "&page=" + strconv.Itoa(z),// //, Rule: "查找微博", Method: "GET", EnableCookie: true, //PostData:"keyword=财经&suser=找人", //DownloaderID:1,smblog }) } }, }, "查找微博": { ParseFunc: func(ctx *Context) { println("---------------查找微博-------------") query := ctx.GetDom() navBox := query.Find("table") navBox.Each(func(i int, s *goquery.Selection) { str := s.Find("tr").Text() j := strings.LastIndex(str,"粉丝") z := strings.LastIndex(str,"人") //昵称 name := str[0:j] //粉丝数 fansNum := str[j+6:z] //地区 city := str[z+5:len(str)] println("name" + name) //链接 if url, ok := s.Find("table tr td a").Attr("href"); ok { ctx.AddQueue(&request.Request{ Url: "https://weibo.cn" + url, Rule: "博主首页", Temp: map[string]interface{}{ "name": name, "fansNum": fansNum, "city": city, }, }) } }) }, }, "博主首页": { ParseFunc: func(ctx *Context) { //昵称 name := ctx.GetTemp("name","").(string) //粉丝数 fansNum := ctx.GetTemp("fansNum","").(string) //地区 city := ctx.GetTemp("city","").(string) //微博数 weiboNum := ctx.GetDom().Find(".tc").Text() j := strings.LastIndex(weiboNum,"[") z := strings.LastIndex(weiboNum,"]") weiboNum = weiboNum[j+1:z] //关注数 attentionNum := ctx.GetDom().Find(".tip2 a").Eq(0).Text() j = strings.LastIndex(attentionNum,"[") z = strings.LastIndex(attentionNum,"]") attentionNum = attentionNum[j+1:z] a :=ctx.GetDom().Find(".ut a").Eq(1) if a.Text() == "加关注"{ if url, ok := ctx.GetDom().Find(".ut a").Eq(3).Attr("href"); ok { ctx.AddQueue(&request.Request{ Url: "https://weibo.cn" + url, Rule: "资料页", EnableCookie: true, Temp: map[string]interface{}{ "name": name, "fansNum": fansNum, "city": city, "weiboNum": weiboNum, "attentionNum": attentionNum, }, }) } } else{ if url, ok := ctx.GetDom().Find(".ut a").Eq(2).Attr("href"); ok { ctx.AddQueue(&request.Request{ Url: "https://weibo.cn" + url, Rule: "资料页", EnableCookie: true, Temp: 
map[string]interface{}{ "name": name, "fansNum": fansNum, "city": city, "weiboNum": weiboNum, "attentionNum": attentionNum, }, }) } } }, }, "资料页": { ItemFields: []string{ "昵称", "粉丝数", "地区", "微博数", "关注数", "标签", "详细信息", }, ParseFunc: func(ctx *Context) { //昵称 name := ctx.GetTemp("name","").(string) //粉丝数 fansNum := ctx.GetTemp("fansNum","").(string) //地区 city := ctx.GetTemp("city","").(string) //微博数 weiboNum := ctx.GetTemp("weiboNum","").(string) //关注数 attentionNum := ctx.GetTemp("attentionNum","").(string) str := ctx.GetDom().Find("div").Eq(5).Text() i := strings.LastIndex(str,"标签") z := strings.LastIndex(str,"更多") var str2,str3 string if i == -1{ str2 = "" str3 = str }else{ //标签 str2 = str[i+7:z] //详细信息 str3 = str[0:i] } ctx.Output(map[int]interface{}{ 0: name, 1: fansNum, 2: city, 3: weiboNum, 4: attentionNum, 5: str2, 6: str3, }) }, }, }, }, }
2. 相关方法(上文规则中用到的 getMillisecond、encryptUname、encryptPassword 等辅助函数)
// getMillisecond returns the current Unix time in milliseconds.
// The previous implementation divided nanoseconds by 1000 and therefore
// returned microseconds.
func getMillisecond() int64 {
	return time.Now().UnixNano() / int64(time.Millisecond)
}

// encryptUname returns the URL-safe base64 encoding of the user name.
func encryptUname(uname string) string {
	return base64.URLEncoding.EncodeToString([]byte(uname))
}

// string2big parses s as a hexadecimal number.
// When s is not valid hexadecimal it returns a zero value, because
// big.Int.SetString leaves its receiver undefined on failure.
func string2big(s string) *big.Int {
	ret, ok := new(big.Int).SetString(s, 16)
	if !ok {
		return new(big.Int)
	}
	return ret
}

// encryptPassword encrypts the password for the login form.
//
// pubkey is the hexadecimal RSA modulus; the public exponent is fixed at
// 65537 (0x10001). The plaintext is servertime + "\t" + nonce + "\n" +
// password, encrypted with RSA PKCS#1 v1.5, and the ciphertext is returned
// hex-encoded. It returns "" when encryption fails, for example because the
// modulus is invalid or too small for the message.
func encryptPassword(pubkey string, servertime string, nonce string, password string) string {
	pub := rsa.PublicKey{
		N: string2big(pubkey),
		E: 65537,
	}
	plain := servertime + "\t" + nonce + "\n" + password
	cipherText, err := rsa.EncryptPKCS1v15(rand.Reader, &pub, []byte(plain))
	if err != nil {
		// The signature has no error result, so report failure as "".
		return ""
	}
	return hex.EncodeToString(cipherText)
}
欢迎指正,交流沟通,共同进步!对您有帮助的话点下推荐~~