爬虫软件开发要用到的代码

1.时间戳转为C#格式时间

复制代码
        /// <summary>
        /// Converts a Unix timestamp expressed in milliseconds into a local <see cref="DateTime"/>.
        /// </summary>
        /// <param name="timeStamp">
        /// Decimal string holding the number of milliseconds elapsed since 1970-01-01T00:00:00Z.
        /// A null or empty string is treated as 0 (the epoch itself).
        /// </param>
        /// <returns>The corresponding instant expressed as local time.</returns>
        /// <exception cref="FormatException"><paramref name="timeStamp"/> is not a valid integer.</exception>
        /// <exception cref="OverflowException">The value does not fit into a tick count.</exception>
        private DateTime GetTime(string timeStamp)
        {
            // Anchor the epoch in UTC and convert to local time only at the very end, so the
            // UTC offset (including daylight saving) that applies to the *target* instant is
            // used rather than the offset that was in effect on 1970-01-01.
            DateTime epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

            long milliseconds = string.IsNullOrEmpty(timeStamp)
                ? 0L
                : long.Parse(
                    timeStamp,
                    System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture);

            // One millisecond is 10,000 ticks; multiply numerically instead of appending "0000".
            long ticks = checked(milliseconds * TimeSpan.TicksPerMillisecond);
            return epochUtc.AddTicks(ticks).ToLocalTime();
        }
复制代码

2.获取验证码

复制代码
 /// <summary>
        /// Requests the captcha image at <paramref name="url"/> and stores the cookie returned
        /// with it in <c>cookieCheckCode</c>.
        /// </summary>
        /// <param name="url">
        /// Address of the captcha image. When null, empty or whitespace, the default baihe.com
        /// captcha endpoint is used with the current tick count appended.
        /// </param>
        /// <returns>The image decoded from the response bytes.</returns>
        public Image GetImg(string url)
        {
            if (string.IsNullOrWhiteSpace(url))
                url = string.Format("http://ms.baihe.com/checkcode/defaultImageService?0.{0}", DateTime.Now.Ticks);

            // Start from an empty captcha cookie on every request.
            cookieCheckCode = "";

            var request = new HttpItem();
            request.URL = url;
            request.Method = "get";
            request.Expect100Continue = false; // original author's note: use this when going through a proxy
            request.Cookie = _cookie;
            request.Timeout = 100000;
            request.ReadWriteTimeout = 30000;
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0";
            request.ContentType = "text/html";
            request.ResultType = ResultType.Byte;

            var response = http.GetHtml(request);

            if (response.Cookie != null)
            {
                string captchaCookie;
                if (response.Cookie.StartsWith("PHPSESSID"))
                {
                    // Keep only what follows the first comma when the cookie starts with PHPSESSID.
                    int firstComma = response.Cookie.IndexOf(',');
                    captchaCookie = response.Cookie.Substring(firstComma + 1);
                }
                else
                {
                    captchaCookie = response.Cookie;
                }

                cookieCheckCode += captchaCookie;
            }

            return byteArrayToImage(response.ResultByte);
        }


        /// <summary>
        /// Decodes an encoded image (PNG, JPEG, GIF, BMP, ...) held in a byte array.
        /// </summary>
        /// <param name="Bytes">The encoded image data.</param>
        /// <returns>The decoded image.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="Bytes"/> is null.</exception>
        /// <exception cref="ArgumentException">The data is not a valid image.</exception>
        private Image byteArrayToImage(byte[] Bytes)
        {
            // GDI+ requires the source stream to stay open for the whole lifetime of the Image;
            // disposing it here makes later operations on the returned image (Save, frame
            // selection, ...) fail with "A generic error occurred in GDI+". The MemoryStream
            // is therefore intentionally not wrapped in a using block - it is only a managed
            // buffer and is reclaimed by the GC together with the image.
            var ms = new MemoryStream(Bytes);
            return Image.FromStream(ms, true);
        }
复制代码

3.正则表达式的使用

复制代码
 // Collect (uid, nickname) pairs from the user links in the downloaded HTML.
            // The nickname group is lazy (.+?) so it stops at the first closing </a>; a greedy
            // (.+) would run to the last </a> on the same line and swallow the markup of any
            // following links.
            Regex re = new Regex(@"&uid=(\d+)\"" class=\""yellow\"">(.+?)</a>", RegexOptions.None);
            MatchCollection mc = re.Matches(resulthtml.Html);

            var list = new List<User>();
            foreach (Match match in mc)
            {
                // Group 1 is the numeric uid, group 2 the link text (nickname).
                var uid = match.Groups[1].Value;
                var nick = match.Groups[2].Value;
                list.Add(new User { uid = uid, nickname = nick });
            }
复制代码

4.获取时间

        /// <summary>
        /// Returns the current time as whole milliseconds elapsed since 1970-01-01 (UTC),
        /// the same quantity JavaScript's <c>Date.getTime()</c> reports.
        /// </summary>
        /// <returns>Milliseconds since the Unix epoch, with the fractional part discarded.</returns>
        public long GetTimeLikeJS()
        {
            DateTime unixEpoch = new DateTime(1970, 1, 1);
            DateTime utcNow = DateTime.UtcNow;
            TimeSpan sinceEpoch = utcNow.Subtract(unixEpoch);
            double elapsedMilliseconds = sinceEpoch.TotalMilliseconds;
            return (long)elapsedMilliseconds;
        }

5.另一线程操作主线程的控件

           // Hand the text update to this.Invoke so it runs via the control instead of the calling thread.
           this.Invoke(new Action(() =>
           {
               this.textBox1.Text = "登录成功";
           }));

 .net 2.0里的匿名委托

 // .NET 2.0 style: an anonymous method (no lambda syntax) converted to EventHandler.
 EventHandler updateButtonText = delegate
 {
     button.Text = i.ToString();
 };
 this.Invoke(updateButtonText);

另:BackgroundWorker 组件用来执行诸如数据库事务、文件下载等耗时的异步操作

6.Json对象转换

复制代码
 // The payload is a JSON array; only its first element is inspected here.
            JArray array = JArray.Parse(json);
            JToken token = array[0];

            // "total" is read as a raw token; "list" holds the individual user entries.
            JToken total = token["total"];
            JEnumerable<JToken> users = token["list"].Children();

            foreach (JToken entry in users)
            {
                // Round-trip each entry through its JSON text to get a typed UserInfo.
                string entryJson = entry.ToString();
                UserInfo user = JsonConvert.DeserializeObject<UserInfo>(entryJson);
                int age = user.age;
            }
复制代码

7.赶集网登录

复制代码
// Log in to ganji.com, then request the "my posts" page with the returned cookie as a check.
HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem()
            {
                // The user name and password come straight from text boxes, so they are
                // URL-escaped: characters such as '&', '#', '+' or spaces would otherwise
                // corrupt the query string.
                // NOTE(review): GetTime() is called without arguments here - confirm which
                // overload/helper is intended (a millisecond timestamp is presumably expected).
                URL = string.Format(
                    "https://passport.ganji.com/login.php?callback=jQuery{0}&username={1}&password={2}",
                    GetTime(),
                    System.Uri.EscapeDataString(textBox2.Text),
                    System.Uri.EscapeDataString(textBox3.Text)),
                Referer = "https://passport.ganji.com/login.php?next=/", // referring URL (optional)
            };
            // Original author's note: this header is the key part - the ganji server only accepts ajax requests.
            item.Header.Add("x-requested-with", "XMLHttpRequest");
            HttpResult result = http.GetHtml(item);

            // The response may carry no cookie at all; fall back to an empty string
            // instead of throwing a NullReferenceException.
            string cookie = (result.Cookie ?? string.Empty).Replace("path=/;", "").Replace(",", "%2c");

            // After logging in, fetch the "my posts" page to verify the session.
            item = new HttpItem()
            {
                URL = "http://www.ganji.com/vip/my_post_list.php",
                Method = "get", // optional; the original author notes GET is the default
                Cookie = cookie, // cookie string obtained from the login response
            };
            result = http.GetHtml(item);
            string html = result.Html;

            textBox1.Text = html + "\r\n";
复制代码

 

8.C# Unicode编码/解码

http://www.cnblogs.com/Rolends/archive/2011/09/22/2185276.html

复制代码
// Use this form when a JSON POST arrives garbled or the server cannot read it:
// the body is sent as UTF-8 bytes (PostdataByte) instead of as a string (Postdata).
HttpItem item = new HttpItem();
            item.URL = url; // required, e.g. "http://159.142.15.196:8089/api/Users/Post_ErpUsers"
            item.Method = "post"; // optional; the original author notes GET is the default
            item.IsToLower = false; // whether the returned HTML is converted to lower case
            item.Cookie = ""; // cookie string (optional)
            item.Referer = ""; // referring URL (optional)
            item.Timeout = 100000; // connection timeout
            item.ReadWriteTimeout = 30000; // timeout for writing the POST data
            item.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"; // browser identification
            item.ContentType = "application/json";
            item.Allowautoredirect = false; // whether 301 redirects are followed
            item.PostdataByte = Encoding.UTF8.GetBytes(json); // JSON body encoded as UTF-8
            item.PostDataType = PostDataType.Byte;
复制代码

 

9.C#解析Html组件

组件名称:HtmlAgilityPack

xpath获取元素:"//*[@class=\"dc-intro\"]/ul/li"  ,查找class="dc-intro"下ul下的li

复制代码
            // Load the software detail page and pull its fields out with XPath.
            HtmlWeb web = new HtmlWeb();
            var doc = web.Load(string.Format("{0}/soft/{1}.html", baseUrl, id));
            var node = doc.GetElementbyId("form2");
            // NOTE(review): node and the SelectNodes/SelectSingleNode results are used without
            // null checks; a page without these elements will throw here.
            var titleNode = node.SelectSingleNode("//*[@class=\"rr-title dc-title clearfix\"]/h1");
            string title = titleNode.InnerText;

            // Run each XPath query once and index into the result instead of repeating it.
            var introItems = node.SelectNodes("//*[@class=\"dc-intro\"]/ul/li");
            string kfyy = introItems[0].LastChild.InnerText; // development language
            string slsj = introItems[1].LastChild.InnerText; // date listed
            string sqxy = introItems[2].LastChild.InnerText; // license
            string czxt = introItems[3].LastChild.InnerText; // supported operating systems
            string rjpjHtml = introItems[4].InnerHtml;       // software rating markup

            // Rating level = number of "xx01.png" images. The dot is escaped so it only
            // matches a literal '.', not any character.
            var level = Regex.Matches(rjpjHtml, @"xx01\.png").Count;

            string content = node.SelectSingleNode("//*[@class=\"markdown-body entry-content\"]").InnerHtml;

            var infoLinks = node.SelectNodes("//*[@class=\"dc-info\"]/a");
            string rjsy = infoLinks[0].Attributes["href"].Value;
            string rjxz = infoLinks[1].Attributes["href"].Value;
复制代码

 10.上传文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/// <summary>
/// Posts c:\test.txt as a multipart/form-data upload, together with the form fields
/// "uid" and "hash", using the cookie stored in bbs_cookie.txt.
/// </summary>
private static void UploadFile()
{
    // Boundary token; the original author derived it from a captured request.
    const string boundary = "----WebKitFormBoundarynp7wXmB7Ntr1BcsX";
    const string crlf = "\r\n";

    HttpHelper client = new HttpHelper();
    string cookieStr = File.ReadAllText(@"E:\下载\UploadImageFile\UploadImageFile\bbs_cookie.txt");

    // Leading part: the two plain form fields followed by the headers of the file part.
    string leadingPart =
        "--" + boundary + crlf +
        "Content-Disposition: form-data; name=\"uid\"" + crlf + crlf +
        "1" + crlf +
        "--" + boundary + crlf +
        "Content-Disposition: form-data; name=\"hash\"" + crlf + crlf +
        "dd865aaa4760a2715e5c5660754f7a7f" + crlf +
        "--" + boundary + crlf +
        "Content-Disposition: form-data; name=\"Filedata\"; filename=\"test.txt\"" + crlf +
        "Content-Type: text/plain" + crlf + crlf;
    byte[] leadingBytes = Encoding.ASCII.GetBytes(leadingPart);

    // Raw content of the file being uploaded.
    byte[] fileBytes = File.ReadAllBytes(@"c:\test.txt");

    // Closing boundary that terminates the multipart body.
    byte[] closingBytes = Encoding.ASCII.GetBytes(crlf + "--" + boundary + "--" + crlf);

    // Request body = leading part + file content + closing boundary.
    byte[] requestBody = ComposeArrays(ComposeArrays(leadingBytes, fileBytes), closingBytes);

    var request = new HttpItem()
    {
        URL = "http://www.xxx.com/misc.php?mod=swfupload&operation=upload&simple=1",
        Method = "POST",
        ContentType = "multipart/form-data; boundary=" + boundary,
        PostDataType = PostDataType.Byte,
        PostEncoding = Encoding.UTF8,
        PostdataByte = requestBody,
        ResultType = ResultType.String,
        Cookie = cookieStr
    };
    var response = client.GetHtml(request);

    // The response text is read but not used any further.
    string responseHtml = response.Html;
}
 
/// <summary>
/// Concatenates two byte arrays into a newly allocated array.
/// </summary>
/// <param name="Array1">Bytes placed first in the result.</param>
/// <param name="Array2">Bytes placed directly after <paramref name="Array1"/>.</param>
/// <returns>A new array holding the bytes of Array1 followed by the bytes of Array2.</returns>
public static byte[] ComposeArrays(byte[] Array1, byte[] Array2)
{
    int firstLength = Array1.Length;
    int secondLength = Array2.Length;

    byte[] combined = new byte[firstLength + secondLength];
    System.Array.Copy(Array1, 0, combined, 0, firstLength);
    System.Array.Copy(Array2, 0, combined, firstLength, secondLength);
    return combined;
}

  

posted @   软件技术开发  阅读(1268)  评论(0编辑  收藏  举报
努力加载评论中...
点击右上角即可分享
微信分享提示