抓取小程序
前言:想利用小程序导航页面来提升网站的流量,于是找到了 www.xcxdh666.com 这个小程序导航网站。
分析网页
1 发现网站其实也是用异步分页请求加载数据的,所以根本用不着用 xpath 解析 html,直接分析其请求 url 即可
2 点击"加载更多"找到对应请求,发现其实就只有 pageNum、category 两个参数
3 所以直接请求该 url 并带入参数,分析其返回的 json 结果
编写代码
1 首先建立接收类型
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
/// <summary>
/// One mini-program entry as it appears inside <see cref="Result.dataList"/>.
/// Property names are lower-case on purpose so they match the JSON keys of the remote payload.
/// </summary>
public class XcxApplet
{
    /// <summary>Identifier of the entry in the remote listing.</summary>
    public int id { get; set; }

    /// <summary>Category label; may be null or empty in the payload.</summary>
    public string categoryName { get; set; }

    /// <summary>Display name of the mini-program.</summary>
    public string name { get; set; }

    /// <summary>Image path of the scan code, relative to the remote image host (presumably a QR code - confirm).</summary>
    public string saomaUrl { get; set; }

    /// <summary>Summary text of the mini-program.</summary>
    public string sum { get; set; }

    /// <summary>Image path of the logo, relative to the remote image host.</summary>
    public string logoUrl { get; set; }
}

/// <summary>
/// Envelope of one page of the remote listing response.
/// </summary>
public class Result
{
    /// <summary>
    /// Entries on this page. Starts as an empty list so that a freshly constructed
    /// <see cref="Result"/>, or a payload that omits the key, can be enumerated without a null check.
    /// </summary>
    public List<XcxApplet> dataList { get; set; } = new List<XcxApplet>();

    /// <summary>Category echoed back by the server.</summary>
    public string category { get; set; }

    /// <summary>Status code reported by the server.</summary>
    public int status { get; set; }

    /// <summary>Page number echoed back by the server.</summary>
    public int pageNum { get; set; }
}
2 封装请求页面方法
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
/// <summary>
/// Sends an HTTP POST to <paramref name="posturl"/> and returns the response body as text.
/// </summary>
/// <param name="posturl">Absolute URL to post to.</param>
/// <param name="postData">
/// Body sent as <c>application/x-www-form-urlencoded</c>; when null or empty no body is written.
/// </param>
/// <returns>
/// The response body decoded as UTF-8, or <see cref="string.Empty"/> when the URL does not yield an
/// HTTP request, no response stream is available, or any exception occurs.
/// </returns>
public static string GetPostPage(this string posturl, string postData)
{
    Encoding encoding = Encoding.UTF8;
    byte[] data = string.IsNullOrEmpty(postData) ? null : encoding.GetBytes(postData);

    try
    {
        // Configure the request.
        var request = WebRequest.Create(posturl) as HttpWebRequest;
        if (request == null)
        {
            return string.Empty;
        }

        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";

        if (data != null)
        {
            request.ContentLength = data.Length;
            using (Stream outstream = request.GetRequestStream())
            {
                outstream.Write(data, 0, data.Length);
            }
        }

        // The POST is actually sent when GetResponse() is called.
        // Response, stream and reader are all disposed so the connection is released
        // even when reading fails part-way.
        using (var response = request.GetResponse() as HttpWebResponse)
        {
            if (response == null)
            {
                return string.Empty;
            }

            using (Stream instream = response.GetResponseStream())
            {
                if (instream == null)
                {
                    return string.Empty;
                }

                using (var reader = new StreamReader(instream, encoding))
                {
                    // Body of the returned page (HTML or JSON text).
                    return reader.ReadToEnd();
                }
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: every failure is reported to the caller as an empty string.
        return string.Empty;
    }
}
3 图片url处理 思路就是要将其返回的url 请求下载到本地或者上传到自己对应的图片服务器,
我这里是用七牛云存储img的 ,这里你可以改成下载到本地 返回本地的url就好。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
/// <summary>
/// Downloads the image at <paramref name="imgurl"/> and uploads it to the Qiniu bucket,
/// returning the URL under which the stored copy is served.
/// </summary>
/// <param name="imgurl">
/// HTTP(S) URL of the source image. The text after the last '/' is used as the storage key;
/// when there is no '/', the whole string is used.
/// </param>
/// <returns>
/// <c>http://img.siyouku.cn/{key}</c> on success; an empty string when <paramref name="imgurl"/> is
/// null or empty, when it is not an HTTP URL, when the upload response carries no key, or when the
/// download or upload throws.
/// </returns>
public string QiniuUplod(string imgurl)
{
    if (string.IsNullOrEmpty(imgurl))
    {
        return "";
    }

    // NOTE(review): placeholder credentials - supply the real AccessKey/SecretKey from
    // configuration rather than committing them to source.
    var accessKey = "你的accesskey";
    var secretKey = "你的secretkey";

    // The Mac is needed to sign the upload token below.
    Mac mac = new Mac(accessKey, secretKey);
    string bucket = "siyouku";
    string saveKey = imgurl.Substring(imgurl.LastIndexOf('/') + 1);

    // Make sure AK and BUCKET are correct before use, otherwise this call throws
    // (for example error codes 612/631).
    Qiniu.Common.Config.AutoZone(accessKey, bucket, false);

    // Upload policy, see https://developer.qiniu.com/kodo/manual/put-policy
    PutPolicy putPolicy = new PutPolicy();
    // For an "overwrite" upload (replace a same-named file already in the cloud) the scope
    // would be "BUCKET:KEY". The original assigned that and immediately overwrote it with the
    // plain bucket scope, so only the effective assignment is kept.
    putPolicy.Scope = bucket;
    // Lifetime of the upload policy, which is also the lifetime of the generated token.
    putPolicy.SetExpires(3600);

    // Upload token, see https://developer.qiniu.com/kodo/manual/upload-token
    string jstr = putPolicy.ToJsonString();
    string token = Auth.CreateUploadToken(mac, jstr);

    try
    {
        var wReq = System.Net.WebRequest.Create(imgurl) as System.Net.HttpWebRequest;
        if (wReq == null)
        {
            return "";
        }

        using (var resp = wReq.GetResponse() as System.Net.HttpWebResponse)
        {
            if (resp == null)
            {
                return "";
            }

            using (var stream = resp.GetResponseStream())
            {
                // Do not use UploadManager.UploadStream here: the network stream is not seekable
                // (Stream.Length is unavailable). FormUploader or ResumableUploader must be used.
                FormUploader fu = new FormUploader();
                var result = fu.UploadStream(stream, saveKey, token);
                var uploaded = Newtonsoft.Json.JsonConvert.DeserializeObject<QiniuResult>(result.Text);

                // A rejected upload has no key; report it as a failure instead of returning a
                // URL that points at nothing.
                if (uploaded == null || uploaded.key == null)
                {
                    return "";
                }

                return $"http://img.siyouku.cn/{uploaded.key}";
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any download/upload failure is reported as an empty string.
        return "";
    }
}
4 最后是请求主体方法
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
/// <summary>
/// Crawls the paged listing at https://www.xcxdh666.com/pageList.htm, stores every entry whose
/// name is not yet known in <c>Db.Applet</c>, and re-hosts its scan-code and logo images through
/// <c>QiniuUplod</c>.
/// </summary>
/// <returns>A content result with the completion message followed by the elapsed milliseconds.</returns>
public ActionResult GetxcxList()
{
    // Number of listing pages requested (pageNum 0..53).
    const int PageCount = 54;

    // Measures how long the whole crawl takes.
    Stopwatch watch = Stopwatch.StartNew();

    // Names handled during this run. The database query below cannot see rows that were added
    // but not yet saved, so without this set two same-named entries on one page are both inserted.
    var seenNames = new System.Collections.Generic.HashSet<string>();

    for (int j = 0; j < PageCount; j++)
    {
        string url = $"https://www.xcxdh666.com/pageList.htm?pageNum={j}";
        var str = url.GetPostPage(null); // page fetched via HttpWebRequest

        // GetPostPage reports failure as an empty string; skip the page instead of
        // dereferencing a missing result or reprocessing the previous page's data.
        if (string.IsNullOrEmpty(str))
        {
            continue;
        }

        var result = str.JsonConvert<Result>(); // string deserialization extension method
        if (result == null || result.dataList == null)
        {
            continue;
        }

        foreach (var item in result.dataList)
        {
            if (item == null)
            {
                continue;
            }

            string name = item.name;
            if (!seenNames.Add(name))
            {
                continue; // already handled earlier in this run
            }

            if (Db.Applet.Any(a => a.Name == name))
            {
                continue; // already stored by a previous run
            }

            var applet = new Applet()
            {
                CategoryName = string.IsNullOrEmpty(item.categoryName) ? "其它" : item.categoryName,
                Name = name,
                SaomiaoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.saomaUrl}"),
                Summary = item.sum,
                LogoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.logoUrl}"),
                SortNum = j,
                CreateUser = "wenqing",
                CreateTime = DateTime.Now
            };
            Db.Applet.Add(applet);
        }

        // Persist once per page.
        Db.SaveChanges();
    }

    watch.Stop();
    return Content("爬取完成!本次请求总共耗时:" + watch.ElapsedMilliseconds);
}
} // closing brace carried over from the original line (presumably ends the enclosing class, whose header is not shown)
ok 到这里就全部抓取完成
这里附上 展示地址 http://siyouku.cn/Applet