网页登陆,网页采集基础类
昨天在博客园上看到了关于秒杀的文章,通过模拟网页数据传送的方式达到更快的网页访问操作!有些时候模拟网页访问还是蛮有用的,比如自动登陆,网页采集等等。下面的几个方法就是我用到的几个类,虽然方法比较少,但基本的要求还是能满足的。
public static class HtmlHelper { /// <summary> /// 获得基础流 /// </summary> /// <param name="uri">网址</param> /// <param name="cc">cookie容器,可以为NULL</param> /// <returns></returns> public static Stream GetBaseStream(string uri, CookieContainer cc) { try { HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri); //用指定Uri创建一个request if (cc != null) { request.CookieContainer = cc; } //浏览器欺骗 request.ContentType = "application/x-www-form-urlencoded"; request.Accept = @"application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; request.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2 ChromePlus/1.3.9.0"; HttpWebResponse response = (HttpWebResponse)request.GetResponse(); //根据创建的request得到响应response Stream responseStream = response.GetResponseStream(); //创建一个流来获得响应体 return responseStream; } catch (Exception ex) { MessageBox.Show(@"操作失败:" + ex.Message); return null; } } /// <summary> /// 获得网页 /// </summary> /// <param name="uri">网址</param> /// <param name="postDate"></param> /// <param name="cc">cookie容器,可以为null</param> /// <param name="encoding">网页编码</param> /// <returns></returns> public static string GetHtmlString(string uri, string postDate, CookieContainer cc, Encoding encoding) { try { HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri); request.ContentType = "application/x-www-form-urlencoded"; request.AllowAutoRedirect = true; request.Accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2 ChromePlus/1.3.9.0"; request.CookieContainer = cc; //设置request产生cookie的容器 if (postDate != null) { request.Method = "Post"; byte[] byterequest = Encoding.UTF8.GetBytes(postDate); request.ContentLength = byterequest.Length; using (Stream stream = request.GetRequestStream()) { stream.Write(byterequest, 0, byterequest.Length); } } using (HttpWebResponse response = (HttpWebResponse)request.GetResponse()) { using (Stream responsestream = response.GetResponseStream()) { StreamReader sr = new StreamReader(responsestream, encoding); string html = sr.ReadToEnd(); return html; } } } catch (Exception ex) { MessageBox.Show(@"发生错误:" + ex.Message); return null; } } /// <summary> /// 从字符串中返回匹配多个的集合值(网页抽取特定部分有效) /// </summary> /// <param name="start">开始html tag</param> /// <param name="end">结束html tag</param> /// <param name="html">html</param> /// <returns></returns> public static List<string> GetStrings(string start, string end, string html) { List<string> list = new List<string>(); try { string pattern = string.Format("{0}(?<g>(.|[\r\n])+?){1}", start, end);//匹配URL的模式,并分组 //理解这个正则 MatchCollection mc = Regex.Matches(html, pattern);//满足pattern的匹配集合 if (mc.Count != 0) { foreach (Match match in mc) { GroupCollection gc = match.Groups; list.Add(gc["g"].Value); } } } catch { } return list; } /// <summary> /// 中文url编码 /// </summary> /// <param name="buffer"></param> /// <returns></returns> public static string UrlEncode(string buffer) { byte[] bty = Encoding.Default.GetBytes(buffer); StringBuilder builder = new StringBuilder(); for (int i = 0; i < bty.Length; i++) { builder.Append("%"); builder.Append(bty[i].ToString("x2")); } return builder.ToString(); } }