正则表达式相关:C# 抓取网页类(获取网页中所有信息)
类的代码:
using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;

/// <summary>
/// Downloads a single web page and exposes its HTML, title, plain text and links.
/// The download happens in the constructor; a page that could not be fetched is
/// reported through <see cref="IsGood"/> instead of an exception.
/// </summary>
public class WebPage
{
    #region Private members

    // Responses whose declared Content-Length exceeds this (1 << 22 bytes) are not downloaded.
    private const int MaxContentLength = 1 << 22;

    // More than this many U+FFFD characters after decoding means the sniffed charset did not fit the bytes.
    private const int MaxReplacementChars = 100;

    // File extensions that are rejected without issuing a request.
    private static readonly string[] SkippedExtensions = new string[] { ".rar", ".dat", ".msi" };

    private static readonly Regex TitleRegex = new Regex(
        @"<title[^>]*>(?<title>.*?)</title[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex AnchorRegex = new Regex(
        @"<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // The group is named URL (not url): group names are case-sensitive and getLinks reads Groups["URL"].
    private static readonly Regex FrameRegex = new Regex(
        @"<[i]*frame[^><]+src=(""|')?(?<URL>([^>""'\s)])+)(""|')?[^>]*>", RegexOptions.IgnoreCase);

    // Entity forms come before the bare characters so "&nbsp;" is removed whole instead of leaving "nbsp;".
    private static readonly Regex LinkTextNoiseRegex = new Regex(
        @"(<[^>]+>)|\s|&nbsp;|&amp;|&quot;|&|""", RegexOptions.IgnoreCase);

    private static readonly Regex ScriptRegex = new Regex(
        @"<script[^>]*>.*?</script[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex StyleRegex = new Regex(
        @"<style[^>]*>.*?</style[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex SelectRegex = new Regex(
        @"<select[^>]*>.*?</select[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex AnchorBlockRegex = new Regex(
        @"<a[^>]*>.*?</a[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Strips tags and non-breaking-space entities; ordinary spaces are kept so words stay separated.
    private static readonly Regex TagRegex = new Regex(@"(<[^>]+?>)|&nbsp;", RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRegex = new Regex(@"(\s)+");

    // Accepts both content="text/html; charset=xxx" and the short <meta charset="xxx"> form.
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"charset\s*=\s*[""']?\s*(?<cding>[\w\-.:]+)", RegexOptions.IgnoreCase);

    private Uri m_uri;            // page address (replaced by the response URI after a successful fetch)
    private List<Link> m_links;   // links found on this page
    private bool m_linksParsed;   // true once m_html has been scanned for links
    private string m_title;       // page title
    private string m_html;        // raw HTML
    private string m_outstr;      // plain text extracted from the HTML
    private bool m_good;          // whether the page was fetched successfully
    private int m_pagesize;       // length of m_html in characters

    // One cookie container per host, shared by every WebPage instance.
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    #endregion

    #region Properties

    /// <summary>
    /// Absolute URL of this page (read-only). Empty when the URL given to the constructor could not be parsed.
    /// </summary>
    public string URL
    {
        get
        {
            return m_uri == null ? "" : m_uri.AbsoluteUri;
        }
    }

    /// <summary>
    /// Title of this page (read-only), taken from the first title element; empty when there is none.
    /// </summary>
    public string Title
    {
        get
        {
            if (string.IsNullOrEmpty(m_title))
            {
                Match mc = TitleRegex.Match(M_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// Raw HTML of this page, including tags. Never null.
    /// </summary>
    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>
    /// All links found on this page (read-only).
    /// </summary>
    public List<Link> Links
    {
        get
        {
            return getLinks();
        }
    }

    /// <summary>
    /// The complete plain text of this page (read-only).
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                getContext(int.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>
    /// Size of this page: the number of characters in the downloaded HTML.
    /// </summary>
    public int PageSize
    {
        get
        {
            return m_pagesize;
        }
    }

    /// <summary>
    /// All links on this page that point to the page's own host, over http or https.
    /// </summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // The host is escaped so its dots match literally, and the trailing group stops
            // "example.com" from also matching "example.com.other.net".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?:[:/?#]|$)";
            return getSpecialLinksByUrl(pattern, int.MaxValue);
        }
    }

    /// <summary>
    /// Whether this page was downloaded successfully and is usable.
    /// </summary>
    public bool IsGood
    {
        get
        {
            return m_good;
        }
    }

    /// <summary>
    /// Host name of the site this page belongs to. Empty when the URL could not be parsed.
    /// </summary>
    public string Host
    {
        get
        {
            return m_uri == null ? "" : m_uri.Host;
        }
    }

    #endregion

    /// <summary>
    /// Extracts link information from the HTML: anchors first, then frame/iframe sources.
    /// The HTML is scanned once; later calls return the cached list.
    /// </summary>
    /// <returns>The list of links found on the page.</returns>
    private List<Link> getLinks()
    {
        if (m_linksParsed)
        {
            return m_links;
        }
        m_linksParsed = true;

        Regex[] patterns = new Regex[] { AnchorRegex, FrameRegex };
        for (int i = 0; i < patterns.Length; i++)
        {
            bool isAnchor = (i == 0);
            for (Match match = patterns[i].Match(M_html); match.Success; match = match.NextMatch())
            {
                try
                {
                    // Resolve relative references against the page URL.
                    string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri);

                    // Only anchors carry link text; it is captured by the group named "title".
                    string text = "";
                    if (isAnchor)
                    {
                        text = LinkTextNoiseRegex.Replace(match.Groups["title"].Value, "");
                    }

                    Link link = new Link();
                    link.Text = text;
                    link.NavigateUrl = url;
                    m_links.Add(link);
                }
                catch (UriFormatException ex)
                {
                    // A malformed href must not abort the scan; report it and move on.
                    Console.WriteLine(ex.Message);
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Extracts plain text from a piece of HTML and returns its first characters.
    /// The full extracted text is cached in m_outstr on the first call, so later calls
    /// ignore <paramref name="instr"/> and <paramref name="withLink"/>.
    /// </summary>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstN">Maximum number of characters to return; negative values are treated as 0.</param>
    /// <param name="withLink">Whether text inside anchor elements is kept.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            string text = instr ?? "";
            text = ScriptRegex.Replace(text, "");
            text = StyleRegex.Replace(text, "");
            text = SelectRegex.Replace(text, "");
            if (!withLink)
            {
                text = AnchorBlockRegex.Replace(text, "");
            }
            text = TagRegex.Replace(text, "");
            text = WhitespaceRegex.Replace(text, " ");
            m_outstr = text;
        }

        if (firstN < 0)
        {
            firstN = 0;
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Regular expression, applied with Multiline and IgnoreCase.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="byUrl">true to test the link URL, false to test the link text.</param>
    /// <returns>The matching links, in page order.</returns>
    private List<Link> FilterLinks(string pattern, int count, bool byUrl)
    {
        List<Link> matches = new List<Link>();
        // Built once rather than once per link.
        Regex filter = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        foreach (Link link in getLinks())
        {
            if (matches.Count >= count)
            {
                break;
            }
            string candidate = byUrl ? link.NavigateUrl : link.Text;
            if (filter.Match(candidate).Success)
            {
                matches.Add(link);
            }
        }
        return matches;
    }

    #region Public methods

    /// <summary>
    /// Returns the first characters of the page's plain text, including link text.
    /// </summary>
    /// <param name="firstN">Maximum number of characters.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(M_html, firstN, true);
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return FilterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links of this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return FilterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the page's plain text with a regular expression (by 何问起).
    /// </summary>
    /// <param name="pattern">Regular expression; the value of its first capture group is returned.</param>
    /// <returns>The text of capture group 1 of the first match, or an empty string when nothing matches.</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "")
        {
            getContext(int.MaxValue);
        }
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Construction

    /// <summary>
    /// Reads a stream to its end and returns the bytes.
    /// </summary>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes bytes through a StreamReader, as the page was read before, rather than Encoding.GetString.
    /// </summary>
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Looks up an encoding by name; returns null when the name is empty or not supported.
    /// </summary>
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Extracts the charset parameter of a Content-Type header value; null when absent or unknown.
    /// </summary>
    private static Encoding EncodingFromContentType(string contentType)
    {
        const string key = "charset=";
        int ix = contentType.IndexOf(key, StringComparison.OrdinalIgnoreCase);
        if (ix < 0)
        {
            return null;
        }
        string name = contentType.Substring(ix + key.Length);
        int end = name.IndexOf(';');
        if (end >= 0)
        {
            name = name.Substring(0, end);
        }
        return TryGetEncoding(name.Trim().Trim('"', '\''));
    }

    /// <summary>
    /// Decodes a page whose HTTP header named no usable charset, using the charset declared in the markup.
    /// </summary>
    private static string DecodeWithSniffedCharset(byte[] raw)
    {
        // First pass with the default encoding, only to find the charset declaration.
        string fallbackText = Decode(raw, Encoding.Default);
        Match declaration = MetaCharsetRegex.Match(fallbackText);
        Encoding declared = declaration.Success ? TryGetEncoding(declaration.Groups["cding"].Value) : null;
        if (declared == null)
        {
            return fallbackText;
        }

        // Decode the original bytes directly, avoiding a lossy re-encoding of the first-pass text.
        string text = Decode(raw, declared);
        int replacements = 0;
        foreach (char c in text)
        {
            if (c == '\uFFFD')
            {
                replacements++;
            }
        }
        return replacements > MaxReplacementChars ? fallbackText : text;
    }

    /// <summary>
    /// Returns true when the path ends with one of the skipped extensions (ordinal, case-insensitive).
    /// </summary>
    private static bool HasSkippedExtension(string path)
    {
        foreach (string extension in SkippedExtensions)
        {
            if (path.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Downloads the page at <paramref name="_url"/> and fills the fields.
    /// Never throws: any failure leaves the page empty with IsGood == false.
    /// </summary>
    /// <param name="_url">Absolute http/https URL.</param>
    private void Init(string _url)
    {
        // Give every field a usable value first so a failure below never leaves null members behind.
        m_uri = null;
        m_links = new List<Link>();
        m_linksParsed = false;
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;

        try
        {
            m_uri = new Uri(_url);
            if (HasSkippedExtension(m_uri.AbsolutePath))
            {
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            CookieContainer cookies;
            lock (webcookies)
            {
                if (!webcookies.TryGetValue(m_uri.Host, out cookies))
                {
                    cookies = new CookieContainer();
                    webcookies[m_uri.Host] = cookies;
                }
            }
            rqst.CookieContainer = cookies;

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxContentLength)
                {
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = ReadAllBytes(sm);
                }

                // Some pages additionally need HttpUtility.HtmlDecode here; it is intentionally not applied.
                Encoding headerEncoding = EncodingFromContentType(contentType);
                m_html = headerEncoding != null ? Decode(raw, headerEncoding) : DecodeWithSniffedCharset(raw);

                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
                m_good = true;
            }
        }
        catch (Exception)
        {
            // Best effort by design: the constructor must not throw. Discard any partial
            // result so the failure is visible through IsGood instead of being swallowed.
            m_html = "";
            m_pagesize = 0;
            m_good = false;
        }
    }

    /// <summary>
    /// Creates the page object and downloads the page immediately.
    /// </summary>
    /// <param name="_url">Absolute URL of the page; percent-escapes are unescaped before parsing.</param>
    public WebPage(string _url)
    {
        // NOTE(review): unescaping the whole URL also turns %26, %3F or %23 into URL
        // delimiters, which can change its meaning. Behaviour is kept as it was - confirm
        // whether callers rely on it.
        try
        {
            _url = Uri.UnescapeDataString(_url);
        }
        catch (Exception)
        {
            // Fall back to the URL exactly as given.
        }
        Init(_url);
    }

    #endregion
}
调用:
// Usage example (by 何问起). The page is downloaded inside the constructor.
WebPage webInfo = new WebPage("http://hovertree.net/");
if (webInfo.IsGood)
{
    // A bare "webInfo.Context;" is not a valid C# statement, so the values are assigned to locals.
    string text = webInfo.Context; // all page content with the HTML tags stripped
    string html = webInfo.M_html;  // page content including the HTML tags
}