爬虫软件开发要用到的代码
1.时间戳转为C#格式时间
/// <summary>
/// Converts a Unix timestamp string into a local <see cref="DateTime"/>.
/// </summary>
/// <param name="timeStamp">
/// Timestamp as a string of decimal digits. Four zeros are appended before the text is
/// parsed as a tick count (1 tick = 100 ns), so the value is interpreted as the number
/// of milliseconds since 1970-01-01T00:00:00Z.
/// </param>
/// <returns>The corresponding instant expressed in the machine's local time zone.</returns>
/// <exception cref="FormatException">The padded text is not a valid integer.</exception>
/// <exception cref="OverflowException">The padded value does not fit in a <see cref="long"/>.</exception>
private DateTime GetTime(string timeStamp)
{
    // Do the arithmetic in UTC and convert to local time once, at the end.
    // Converting the epoch to local time first and then adding the offset gives a
    // result that is off whenever the target date's UTC offset differs from the
    // offset in effect on 1970-01-01 (daylight saving time, historical zone changes).
    DateTime epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

    // Appending "0000" scales milliseconds to ticks (10,000 ticks per millisecond).
    long ticks = long.Parse(timeStamp + "0000", System.Globalization.CultureInfo.InvariantCulture);

    return epochUtc.AddTicks(ticks).ToLocalTime();
}
2.获取验证码
/// <summary>
/// Downloads a captcha image and records the cookie returned with it in
/// <c>cookieCheckCode</c>.
/// </summary>
/// <param name="url">
/// Address of the image. When null, empty or whitespace, the baihe.com check-code
/// service URL is used with the current tick count appended to the query string.
/// </param>
/// <returns>The decoded image.</returns>
public Image GetImg(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        url = string.Format("http://ms.baihe.com/checkcode/defaultImageService?0.{0}", DateTime.Now.Ticks);
    }

    cookieCheckCode = "";
    var item = new HttpItem()
    {
        URL = url,
        //Encoding = System.Text.Encoding.GetEncoding("GBK"),
        Method = "get",
        //IsToLower = false,
        Expect100Continue = false, // use this setting when going through a proxy
        Cookie = _cookie,
        Timeout = 100000,
        ReadWriteTimeout = 30000,
        UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0",
        ContentType = "text/html",
        ResultType = ResultType.Byte
    };

    var result = http.GetHtml(item);
    if (result.Cookie != null)
    {
        // When the cookie string starts with PHPSESSID, keep only what follows the
        // first comma; otherwise keep the whole string. Ordinal comparison is used
        // because the cookie name is an identifier, not linguistic text.
        cookieCheckCode += result.Cookie.StartsWith("PHPSESSID", StringComparison.Ordinal)
            ? result.Cookie.Substring(result.Cookie.IndexOf(',') + 1)
            : result.Cookie;
    }

    return byteArrayToImage(result.ResultByte);
}

/// <summary>
/// Decodes raw image bytes into an <see cref="Image"/>.
/// </summary>
/// <param name="Bytes">Encoded image data.</param>
/// <returns>The decoded image; it keeps using the in-memory stream created here.</returns>
private Image byteArrayToImage(byte[] Bytes)
{
    // The stream is deliberately NOT disposed: GDI+ requires the source stream to stay
    // open for the whole lifetime of the Image created from it. Closing it early can
    // cause "A generic error occurred in GDI+" when the image is later drawn or saved.
    // A MemoryStream owns no unmanaged resources, so it is simply collected together
    // with the image.
    var stream = new MemoryStream(Bytes);
    return Image.FromStream(stream, true);
}
3.正则表达式的使用
// Collect (uid, nickname) pairs from links of the form
//   ...&uid=<digits>" class="yellow"><nickname></a>
// The nickname group is lazy (.+?) so that it stops at the first closing </a>.
// A greedy .+ would run to the last </a> on the same line and merge several links
// into a single match.
Regex re = new Regex(@"&uid=(\d+)\"" class=\""yellow\"">(.+?)</a>", RegexOptions.None);
MatchCollection mc = re.Matches(resulthtml.Html);
var list = new List<User>();
foreach (Match match in mc)
{
    list.Add(new User
    {
        uid = match.Groups[1].Value,      // group 1: the digits after "&uid="
        nickname = match.Groups[2].Value  // group 2: the link text
    });
}
4.获取时间
/// <summary>
/// Returns the current time as whole milliseconds elapsed since
/// 1970-01-01T00:00:00Z, the same scale JavaScript timestamps use.
/// </summary>
/// <returns>Milliseconds since the Unix epoch, truncated to an integer.</returns>
public long GetTimeLikeJS()
{
    // Read the clock in UTC directly. Going through DateTime.Now and converting back
    // is ambiguous during the repeated hour when daylight saving time ends.
    var epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
    return (long)(DateTime.UtcNow - epochUtc).TotalMilliseconds;
}
5.另一线程操作主线程的控件
// Update a control from another thread: wrap the change in a delegate and hand it to
// Invoke so that it is executed through the owning object rather than directly here.
// NOTE(review): presumably `this` is a WinForms Control/Form — confirm.
Action showLoginSucceeded = () => this.textBox1.Text = "登录成功";
this.Invoke(showLoginSucceeded);
.net 2.0里的匿名委托
// Same idea using only C# 2.0 syntax: an anonymous method (no lambda) stored in an
// EventHandler delegate and passed to Invoke.
EventHandler updateButtonText = delegate
{
    button.Text = i.ToString();
};
this.Invoke(updateButtonText);
另:BackgroundWorker 组件用来执行诸如数据库事务、文件下载等耗时的异步操作
6.Json对象转换
// Parse a JSON array and work with its first element, which is read here as an object
// carrying a "total" value and a "list" collection of user entries.
JToken firstEntry = JArray.Parse(json)[0];
var total = firstEntry["total"];

// Turn every child of "list" into a strongly typed UserInfo.
foreach (JToken userToken in firstEntry["list"].Children())
{
    UserInfo user = JsonConvert.DeserializeObject<UserInfo>(userToken.ToString());
    int age = user.age;
}
7.赶集网登录
// Log in to ganji.com through its passport endpoint, then request the
// "my posts" page with the returned cookie to check that the login worked.
HttpHelper http = new HttpHelper();
HttpItem item = new HttpItem()
{
    // User input is percent-encoded so that characters such as '&', '#', '+', '%'
    // or spaces in the user name or password cannot corrupt the query string.
    // NOTE(review): GetTime() is called without arguments and is not defined in this
    // snippet — presumably it yields a JS-style timestamp for the callback name; confirm.
    URL = string.Format(
        "https://passport.ganji.com/login.php?callback=jQuery{0}&username={1}&password={2}",
        GetTime(),
        System.Uri.EscapeDataString(textBox2.Text),
        System.Uri.EscapeDataString(textBox3.Text)),
    Referer = "https://passport.ganji.com/login.php?next=/", // referring URL, optional
};

// This header is the important part: according to the original note, the Ganji
// server only accepts AJAX requests.
item.Header.Add("x-requested-with", "XMLHttpRequest");
HttpResult result = http.GetHtml(item);

// Strip the path attribute and encode commas before reusing the cookie string.
// A missing cookie (e.g. failed request) is treated as empty instead of crashing.
string cookie = (result.Cookie ?? string.Empty).Replace("path=/;", "").Replace(",", "%2c");

// After logging in, fetch the list of the user's own posts as a test.
item = new HttpItem()
{
    URL = "http://www.ganji.com/vip/my_post_list.php",
    Method = "get",  // optional, defaults to GET
    Cookie = cookie, // cookie string, optional
};
result = http.GetHtml(item);
string html = result.Html;
textBox1.Text = html + "\r\n";
8.C# Unicode编码/解码
http://www.cnblogs.com/Rolends/archive/2011/09/22/2185276.html
// If a JSON POST arrives garbled, or the server cannot read it, send the body as raw
// UTF-8 bytes (PostdataByte together with PostDataType.Byte) instead of as a string.
var item = new HttpItem
{
    // Required. Example: "http://159.142.15.196:8089/api/Users/Post_ErpUsers"
    URL = url,
    // Optional; defaults to GET.
    Method = "post",
    // Whether the returned HTML is converted to lower case. Optional; lower-cases by default.
    IsToLower = false,
    // Cookie string. Optional.
    Cookie = "",
    // Referring URL. Optional.
    Referer = "",
    // String form of the body, not used here (not needed for GET either):
    // Postdata = json, // System.Web.HttpUtility.UrlEncode(json, Encoding.UTF8)
    // Connection timeout. Optional; defaults to 100000.
    Timeout = 100000,
    // Timeout for writing the POST data. Optional; defaults to 30000.
    ReadWriteTimeout = 30000,
    // Browser type, version and operating system reported to the server. Optional; has a default.
    UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    // Content type. Optional; has a default.
    ContentType = "application/json",
    // Whether to follow 301 redirects. Optional.
    Allowautoredirect = false,
    PostdataByte = Encoding.UTF8.GetBytes(json),
    PostDataType = PostDataType.Byte
};
9.C#解析Html组件
组件名称:HtmlAgilityPack
xpath获取元素:"//*[@class=\"dc-intro\"]/ul/li" ,查找class="dc-intro"下ul下的li
// Load a software detail page with HtmlAgilityPack and pull individual fields out of it.
HtmlWeb web = new HtmlWeb();
var doc = web.Load(string.Format("{0}/soft/{1}.html", baseUrl, id));
var node = doc.GetElementbyId("form2");

// NOTE(review): an XPath that starts with "//" is absolute, so these queries look at
// the whole document rather than only below `node`. Use ".//" if the search is meant
// to be limited to the form — confirm the intent before changing it.
var titleNode = node.SelectSingleNode("//*[@class=\"rr-title dc-title clearfix\"]/h1");
string title = titleNode.InnerText;

// Run the intro-list query once and index into the result, instead of repeating the
// same document-wide search for every field.
var introItems = node.SelectNodes("//*[@class=\"dc-intro\"]/ul/li");
string kfyy = introItems[0].LastChild.InnerText; // development language
string slsj = introItems[1].LastChild.InnerText; // date the entry was added
string sqxy = introItems[2].LastChild.InnerText; // license
string czxt = introItems[3].LastChild.InnerText; // supported operating systems
string rjpjHtml = introItems[4].InnerHtml;       // software rating (markup)

// Rating level = number of times the image name occurs in the rating markup.
// The dot is escaped so that it matches a literal '.' rather than any character.
var level = Regex.Matches(rjpjHtml, @"xx01\.png").Count;

string content = node.SelectSingleNode("//*[@class=\"markdown-body entry-content\"]").InnerHtml;

// href values of the first two links inside the "dc-info" element.
var infoLinks = node.SelectNodes("//*[@class=\"dc-info\"]/a");
string rjsy = infoLinks[0].Attributes["href"].Value;
string rjxz = infoLinks[1].Attributes["href"].Value;
10.上传文件
/// <summary>
/// Posts the file <c>c:\test.txt</c> as a hand-assembled multipart/form-data request
/// with the form fields <c>uid</c>, <c>hash</c> and <c>Filedata</c>, using the cookie
/// string read from <c>bbs_cookie.txt</c>.
/// </summary>
private static void UploadFile()
{
    HttpHelper Http = new HttpHelper();
    string cookieStr = File.ReadAllText(@"E:\下载\UploadImageFile\UploadImageFile\bbs_cookie.txt");

    // Boundary taken from a captured request.
    const string boundary = "----WebKitFormBoundarynp7wXmB7Ntr1BcsX";
    string delimiter = "--" + boundary;

    // Leading part: the two plain form fields followed by the headers of the file part.
    string head =
        delimiter + "\r\n" +
        "Content-Disposition: form-data; name=\"uid\"\r\n\r\n" +
        "1\r\n" +
        delimiter + "\r\n" +
        "Content-Disposition: form-data; name=\"hash\"\r\n\r\n" +
        "dd865aaa4760a2715e5c5660754f7a7f\r\n" +
        delimiter + "\r\n" +
        "Content-Disposition: form-data; name=\"Filedata\"; filename=\"test.txt\"\r\n" +
        "Content-Type: text/plain\r\n\r\n";
    byte[] headBytes = Encoding.ASCII.GetBytes(head);

    // File content, sent as-is.
    byte[] fileBytes = File.ReadAllBytes(@"c:\test.txt");

    // Trailing part: the closing delimiter.
    string tail = "\r\n" + delimiter + "--\r\n";
    byte[] tailBytes = Encoding.ASCII.GetBytes(tail);

    // Request body = head + file + tail.
    byte[] payload = ComposeArrays(ComposeArrays(headBytes, fileBytes), tailBytes);

    var item = new HttpItem()
    {
        URL = "http://www.xxx.com/misc.php?mod=swfupload&operation=upload&simple=1",
        Method = "POST",
        ContentType = "multipart/form-data; boundary=" + boundary,
        PostDataType = PostDataType.Byte,
        PostEncoding = Encoding.UTF8,
        PostdataByte = payload,
        ResultType = ResultType.String,
        Cookie = cookieStr
    };

    var result = Http.GetHtml(item);
    var responseHtml = result.Html;
}

/// <summary>
/// Concatenates two byte arrays into a newly allocated array.
/// </summary>
/// <param name="Array1">Bytes placed first.</param>
/// <param name="Array2">Bytes placed after <paramref name="Array1"/>.</param>
/// <returns>A new array holding the bytes of both inputs, in order.</returns>
public static byte[] ComposeArrays(byte[] Array1, byte[] Array2)
{
    int firstLength = Array1.Length;
    int secondLength = Array2.Length;

    byte[] combined = new byte[firstLength + secondLength];
    System.Buffer.BlockCopy(Array1, 0, combined, 0, firstLength);
    System.Buffer.BlockCopy(Array2, 0, combined, firstLength, secondLength);
    return combined;
}