HTML代码处理
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Holds the data extracted from one web page and offers static helpers for
/// downloading HTML, stripping style/script/comment noise, and extracting the
/// title, publish date, links and images.
/// </summary>
public class WebPage
{
    #region Property
    /// <summary>
    /// Page address.
    /// </summary>
    public string URL { get; set; }
    /// <summary>
    /// Host name taken from <see cref="URL"/>.
    /// </summary>
    public string Host { get; set; }
    /// <summary>
    /// HTML exactly as it was passed to the constructor.
    /// </summary>
    public string OriginalHTML { get; set; }
    /// <summary>
    /// HTML after style blocks, script blocks and comments have been removed.
    /// </summary>
    public string HTML { get; set; }
    /// <summary>
    /// Title.
    /// </summary>
    public string Title { get; set; }
    /// <summary>
    /// Source.
    /// </summary>
    public string Source { get; set; }
    /// <summary>
    /// Publish date.
    /// </summary>
    public DateTime? PublishDate { get; set; }
    /// <summary>
    /// Author.
    /// </summary>
    public string Author { get; set; }
    /// <summary>
    /// Abstract.
    /// </summary>
    public string Abstract { get; set; }
    /// <summary>
    /// Keywords.
    /// </summary>
    public string KeyWord { get; set; }
    /// <summary>
    /// Main content.
    /// </summary>
    public string Content { get; set; }
    #endregion

    #region Constructor
    /// <summary>
    /// Creates an empty page.
    /// </summary>
    public WebPage() { }

    /// <summary>
    /// Creates a page from an HTML document: stores the raw HTML, the cleaned
    /// HTML (see <see cref="ClearHTML"/>) and the title (see <see cref="GetTitle"/>).
    /// </summary>
    /// <param name="html">HTML document.</param>
    public WebPage(string html) : this()
    {
        // Keep the untouched input so callers can still reach the raw markup.
        this.OriginalHTML = html;
        // Strip style/script/comments.
        this.HTML = ClearHTML(html);
        // Title is extracted from the cleaned markup.
        this.Title = GetTitle(this.HTML);
    }

    /// <summary>
    /// Creates a page from an HTML document and records its address and host.
    /// </summary>
    /// <param name="html">HTML document.</param>
    /// <param name="url">Page address; must be an absolute URI because it is parsed with <see cref="Uri"/>.</param>
    public WebPage(string html, string url) : this(html)
    {
        this.URL = url;
        this.Host = new Uri(url).Host;
    }
    #endregion

    #region public methods

    /// <summary>
    /// Downloads a page with an HTTP GET request. "http://" is prepended when
    /// <paramref name="url"/> does not start with "http://" or "https://".
    /// </summary>
    /// <param name="url">Page address, with or without scheme.</param>
    /// <param name="cookie">Optional value for the Cookie request header.</param>
    /// <param name="throwError">When true, failures are rethrown; otherwise an empty string is returned.</param>
    /// <param name="charset">Optional encoding name; when empty the encoding is detected from the document.</param>
    /// <returns>The decoded HTML, or an empty string on failure when <paramref name="throwError"/> is false.</returns>
    public static string GetHTML(string url, string cookie = "", bool throwError = false, string charset = "")
    {
        Uri uri;
        try
        {
            uri = new Uri(EnsureHttpScheme(url));
        }
        catch (Exception)
        {
            // An unusable address is reported the same way as a failed request.
            if (throwError)
            {
                throw;
            }
            return "";
        }
        return GetHTML(uri, cookie, throwError, charset);
    }

    /// <summary>
    /// Downloads a page with an HTTP GET request.
    /// </summary>
    /// <param name="uri">Page address.</param>
    /// <param name="cookie">Optional value for the Cookie request header.</param>
    /// <param name="throwError">When true, failures are rethrown; otherwise an empty string is returned.</param>
    /// <param name="charset">Optional encoding name; when empty the encoding is detected from the document.</param>
    /// <returns>The decoded HTML, or an empty string on failure when <paramref name="throwError"/> is false.</returns>
    public static string GetHTML(Uri uri, string cookie = "", bool throwError = false, string charset = "")
    {
        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
            req.Method = "GET";
            req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*";
            req.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)";
            req.Timeout = 60 * 1000;
            if (!string.IsNullOrEmpty(cookie))
            {
                req.Headers.Add("Cookie", cookie);
            }

            return DecodeHtml(ReadResponseBytes(req), charset);
        }
        catch (Exception)
        {
            if (throwError)
            {
                // Bare rethrow keeps the original stack trace.
                throw;
            }
            return "";
        }
    }

    /// <summary>
    /// Sends an HTTP POST request whose body is the query string of
    /// <paramref name="url"/> (everything after the first '?').
    /// "http://" is prepended when the address has no http/https scheme.
    /// </summary>
    /// <param name="url">Target address; its query string is used as the form body.</param>
    /// <param name="cookie">Optional value for the Cookie request header.</param>
    /// <param name="throwError">When true, failures are rethrown; otherwise an empty string is returned.</param>
    /// <param name="chatset">Optional encoding name for the response; when empty the encoding is detected from the document.</param>
    /// <returns>The decoded response, or an empty string on failure when <paramref name="throwError"/> is false.</returns>
    public static string GetPostHTML(string url, string cookie = "", bool throwError = false, string chatset = "")
    {
        Uri uri;
        string postdata;
        try
        {
            string address = EnsureHttpScheme(url);
            // Split only at the first '?' so a literal '?' inside the data is kept.
            int queryStart = address.IndexOf('?');
            postdata = queryStart >= 0 ? address.Substring(queryStart + 1) : "";
            uri = new Uri(address);
        }
        catch (Exception)
        {
            if (throwError)
            {
                throw;
            }
            return "";
        }
        return GetPostHTML(uri, postdata, cookie, throwError, chatset);
    }

    /// <summary>
    /// Sends an HTTP POST request with a form-urlencoded body written in gb2312.
    /// </summary>
    /// <param name="uri">Target address.</param>
    /// <param name="postdata">Request body.</param>
    /// <param name="cookie">Optional value for the Cookie request header.</param>
    /// <param name="throwError">When true, failures are rethrown; otherwise an empty string is returned.</param>
    /// <param name="chatset">Optional encoding name for the response; when empty the encoding is detected from the document.</param>
    /// <returns>The decoded response, or an empty string on failure when <paramref name="throwError"/> is false.</returns>
    public static string GetPostHTML(Uri uri, string postdata, string cookie = "", bool throwError = false, string chatset = "")
    {
        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
            req.Method = "POST";
            req.ContentType = "application/x-www-form-urlencoded";
            if (!string.IsNullOrEmpty(cookie))
            {
                req.Headers.Add("Cookie", cookie);
            }
            using (Stream stream = req.GetRequestStream())
            using (StreamWriter sw = new StreamWriter(stream, Encoding.GetEncoding("gb2312")))
            {
                sw.Write(postdata);
            }

            return DecodeHtml(ReadResponseBytes(req), chatset);
        }
        catch (Exception)
        {
            if (throwError)
            {
                // Bare rethrow keeps the original stack trace.
                throw;
            }
            return "";
        }
    }

    /// <summary>
    /// Removes style blocks, script blocks and HTML comments.
    /// </summary>
    /// <param name="html">HTML to clean.</param>
    /// <returns>The cleaned HTML; an empty string when <paramref name="html"/> is null or empty.</returns>
    public static string ClearHTML(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }
        // Filter expressions: style, script, comments.
        string[] filter = { @"(?is)<style.*?>.*?</style>",
                            @"(?is)<script.*?>.*?</script>",
                            @"(?is)<!--.*?-->" };
        foreach (string reg in filter)
        {
            html = Regex.Replace(html, reg, "");
        }
        return html;
    }

    /// <summary>
    /// Lower-cases tag names (the text from '&lt;' up to the first whitespace or '&gt;').
    /// Attributes and text content are left untouched.
    /// </summary>
    /// <param name="html">HTML to normalize.</param>
    /// <returns>The HTML with lower-cased tag names; an empty string when <paramref name="html"/> is null or empty.</returns>
    public static string Html2Lower(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }
        // Single pass; invariant casing avoids culture surprises such as the Turkish dotless i.
        return Regex.Replace(
            html,
            @"(?<tag><[^\s>]+\s)|(?<tag><[^\s>]+>)",
            m => m.Value.ToLowerInvariant());
    }

    /// <summary>
    /// Gets the title from the title and h1 tags. When the h1 text is contained
    /// in the title text, the h1 text is returned instead.
    /// </summary>
    /// <param name="html">HTML document.</param>
    /// <returns>The title with markup removed, or an empty string when none is found.</returns>
    public static string GetTitle(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }

        string titleFilter = @"<title>[\s\S]*?</title>",
               h1Filter = @"<h1.*?>.*?</h1>",
               clearFilter = @"<.*?>";

        string title = "", h1 = "";
        // Text of the title tag.
        Match match = Regex.Match(html, titleFilter, RegexOptions.IgnoreCase);
        if (match.Success)
        {
            title = Regex.Replace(match.Groups[0].Value, clearFilter, "");
        }
        // Text of the first h1 tag.
        match = Regex.Match(html, h1Filter, RegexOptions.IgnoreCase);
        if (match.Success)
        {
            h1 = Regex.Replace(match.Groups[0].Value, clearFilter, "").Replace("\t", "");
        }
        // Prefer h1 when it is part of the title.
        if (!String.IsNullOrEmpty(h1) && !String.IsNullOrEmpty(title) && title.IndexOf(h1) != -1)
        {
            title = h1;
        }
        return title;
    }

    /// <summary>
    /// Gets the raw inner text of the title tag.
    /// </summary>
    /// <param name="html">HTML document.</param>
    /// <returns>The title tag content, or an empty string when there is none.</returns>
    public static string GetHeadTitle(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }
        Match match = Regex.Match(html, @"<title>([\s\S]*?)</title>", RegexOptions.IgnoreCase);
        return match.Success ? match.Groups[1].Value : "";
    }

    /// <summary>
    /// Gets the href value of the base tag.
    /// </summary>
    /// <param name="html">HTML document.</param>
    /// <returns>The base href, or an empty string when there is none.</returns>
    public static string GetHeadBase(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }
        return Regex.Match(html, @"<base.*?href=[""|'']?([a-zA-Z0-9-:/.]+)[""|''].*?>", RegexOptions.IgnoreCase).Groups[1].Value;
    }

    /// <summary>
    /// Finds the first date in the text of the document. Recognized shapes are
    /// yyyy-MM-dd / yy-MM-dd (with '-' or '/'), and yyyy年M月d日, each optionally
    /// followed by HH:mm.
    /// </summary>
    /// <param name="html">HTML document.</param>
    /// <returns>
    /// The parsed date, or 1900-01-01 when no date is found or the matched text
    /// cannot be parsed with the current culture.
    /// </returns>
    public static DateTime GetPublishDate(string html)
    {
        DateTime result = new DateTime(1900, 1, 1);
        if (string.IsNullOrEmpty(html))
        {
            return result;
        }

        // Strip tags so markup does not interfere with date detection.
        string text = Regex.Replace(html, "(?is)<.*?>", "");
        Match match = Regex.Match(text, @"((\d{4}|\d{2})(\-|\/)\d{1,2}\3\d{1,2})(\s?\d{2}:\d{2})?|(\d{4}年\d{1,2}月\d{1,2}日)(\s?\d{2}:\d{2})?", RegexOptions.IgnoreCase);
        if (!match.Success)
        {
            return result;
        }

        // The whole match is the date text (date plus optional time).
        string dateStr = match.Value;
        // Chinese dates: turn the year/month markers into '/' and the day marker into a space.
        if (dateStr.Contains("年"))
        {
            dateStr = dateStr.Replace('年', '/').Replace('月', '/').Replace('日', ' ');
        }

        DateTime parsed;
        if (DateTime.TryParse(dateStr, out parsed))
        {
            result = parsed;
        }
        return result;
    }

    /// <summary>
    /// Turns the address of an anchor into an absolute address.
    /// </summary>
    /// <param name="url">Address of the page that contains the link.</param>
    /// <param name="href">Link target as written in the page.</param>
    /// <param name="baseStr">Optional base address; when set it is prefixed to a non-absolute <paramref name="href"/>.</param>
    /// <returns>The absolute address, or <paramref name="href"/> unchanged when it is empty or cannot be resolved.</returns>
    public static string ReplaceAhref(string url, string href, string baseStr = "")
    {
        if (string.IsNullOrEmpty(href))
        {
            return href;
        }
        try
        {
            return ResolveUrl(url, href, baseStr);
        }
        catch (Exception)
        {
            // Best effort: an unresolvable link is handed back as written.
            return href;
        }
    }

    /// <summary>
    /// Resolves the address of an image and replaces it with a base64 data URI.
    /// </summary>
    /// <param name="url">Address of the page that contains the image.</param>
    /// <param name="src">Image address as written in the page.</param>
    /// <param name="baseStr">Optional base address; when set it is prefixed to a non-absolute <paramref name="src"/>.</param>
    /// <returns>
    /// The result of <see cref="ImgToBase64String"/> for the resolved address, or
    /// <paramref name="src"/> unchanged when it is empty or cannot be resolved.
    /// </returns>
    public static string ReplaceImgSrc(string url, string src, string baseStr = "")
    {
        if (string.IsNullOrEmpty(src))
        {
            return src;
        }
        try
        {
            // A base address without scheme still needs one before it can be requested.
            string value = EnsureHttpScheme(ResolveUrl(url, src, baseStr));
            return WebPage.ImgToBase64String(value);
        }
        catch (Exception)
        {
            // Best effort: an unresolvable image keeps its original address.
            return src;
        }
    }

    /// <summary>
    /// Tells whether an address starts with "http:" or "https:" (case-insensitive).
    /// </summary>
    /// <param name="url">Address to test.</param>
    /// <returns>True for http/https addresses; false otherwise, including null and empty input.</returns>
    public static bool IsAbsolutePath(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return false;
        }
        return url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Downloads an image and converts it to a base64 data URI.
    /// </summary>
    /// <param name="imageUrl">Absolute image address.</param>
    /// <returns>A "data:image/...;base64,..." string, or <paramref name="imageUrl"/> unchanged when anything fails.</returns>
    public static string ImgToBase64String(string imageUrl)
    {
        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(imageUrl);
            req.Accept = "image/*";

            // The response stream has to stay open while the image is in use,
            // so everything is disposed together at the end.
            using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
            using (Stream responseStream = res.GetResponseStream())
            using (Image img = Image.FromStream(responseStream))
            using (MemoryStream ms = new MemoryStream())
            {
                string imgFormat = GetImageFormat(img);
                img.Save(ms, img.RawFormat);
                return "data:image/" + imgFormat + ";base64," + Convert.ToBase64String(ms.ToArray());
            }
        }
        catch (Exception)
        {
            // Best effort: keep the plain address when the image cannot be embedded.
            return imageUrl;
        }
    }

    /// <summary>
    /// Maps the raw format of an image to the subtype used in the data URI.
    /// Unknown formats fall back to "png".
    /// </summary>
    private static string GetImageFormat(Image img)
    {
        ImageFormat raw = img.RawFormat;
        if (raw.Equals(ImageFormat.Bmp))
        {
            return "bmp";
        }
        if (raw.Equals(ImageFormat.Emf))
        {
            return "emf";
        }
        if (raw.Equals(ImageFormat.Exif))
        {
            return "exif";
        }
        if (raw.Equals(ImageFormat.Gif))
        {
            return "gif";
        }
        if (raw.Equals(ImageFormat.Icon))
        {
            return "icon";
        }
        if (raw.Equals(ImageFormat.Jpeg))
        {
            return "jpeg";
        }
        if (raw.Equals(ImageFormat.MemoryBmp))
        {
            return "memorybmp";
        }
        if (raw.Equals(ImageFormat.Tiff))
        {
            return "tiff";
        }
        if (raw.Equals(ImageFormat.Wmf))
        {
            return "wmf";
        }
        return "png";
    }

    /// <summary>
    /// Prepends "http://" unless the address already starts with "http://" or "https://".
    /// </summary>
    private static string EnsureHttpScheme(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }
        bool hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        return hasScheme ? url : "http://" + url;
    }

    /// <summary>
    /// Sends the request and returns the complete response body, disposing the response.
    /// </summary>
    private static byte[] ReadResponseBytes(HttpWebRequest request)
    {
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            stream.CopyTo(buffer);
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes a response body. An explicit charset wins; otherwise the bytes are
    /// decoded as UTF-8 and decoded again when a meta tag declares a non-UTF charset.
    /// </summary>
    private static string DecodeHtml(byte[] bytes, string charset)
    {
        if (!string.IsNullOrEmpty(charset))
        {
            return Encoding.GetEncoding(charset).GetString(bytes, 0, bytes.Length);
        }

        string html = Encoding.UTF8.GetString(bytes, 0, bytes.Length);
        string declared = Regex.Match(html, @"<meta.*?charset=""?([a-z0-9-]+)\b", RegexOptions.IgnoreCase).Groups[1].Value;
        // A charset was declared and it is not a UTF variant: decode once more.
        if (declared != "" && declared.IndexOf("utf", StringComparison.OrdinalIgnoreCase) == -1)
        {
            try
            {
                html = Encoding.GetEncoding(declared).GetString(bytes, 0, bytes.Length);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 text.
            }
            catch (NotSupportedException)
            {
                // Charset not available on this platform: keep the UTF-8 text.
            }
        }
        return html;
    }

    /// <summary>
    /// Resolves a reference found in a page against the base address (when given)
    /// or against the page address.
    /// </summary>
    private static string ResolveUrl(string url, string reference, string baseStr)
    {
        bool isAbsolute = IsAbsolutePath(reference);
        if (!string.IsNullOrEmpty(baseStr) && !isAbsolute)
        {
            return baseStr + reference;
        }
        if (isAbsolute)
        {
            return reference;
        }
        // Uri resolution covers "/root", "sibling", "../up" and "//host/path" references.
        Uri pageUri = new Uri(EnsureHttpScheme(url));
        return new Uri(pageUri, reference).AbsoluteUri;
    }
    #endregion
}