HTML代码处理

  /// <summary>
  /// Holds a web page's HTML together with metadata extracted from it, and offers
  /// static helpers for downloading pages, stripping markup, extracting the title
  /// and publish date, resolving relative links and inlining images as data URIs.
  /// </summary>
  public class WebPage
  {
      #region Property
      /// <summary>
      /// Page address exactly as passed to the constructor.
      /// </summary>
      public string URL { get; set; }
      /// <summary>
      /// Host name parsed from <see cref="URL"/>.
      /// </summary>
      public string Host { get; set; }
      /// <summary>
      /// Raw HTML as passed to the constructor.
      /// </summary>
      public string OriginalHTML { get; set; }
      /// <summary>
      /// HTML with style blocks, script blocks and comments removed.
      /// </summary>
      public string HTML { get; set; }
      /// <summary>
      /// Page title.
      /// </summary>
      public string Title { get; set; }
      /// <summary>
      /// Source of the article.
      /// </summary>
      public string Source { get; set; }
      /// <summary>
      /// Publish date.
      /// </summary>
      public DateTime? PublishDate { get; set; }
      /// <summary>
      /// Author.
      /// </summary>
      public string Author { get; set; }
      /// <summary>
      /// Abstract / summary.
      /// </summary>
      public string Abstract { get; set; }
      /// <summary>
      /// Keywords.
      /// </summary>
      public string KeyWord { get; set; }
      /// <summary>
      /// Main body text.
      /// </summary>
      public string Content { get; set; }
      #endregion

      #region Constructor
      /// <summary>
      /// Creates an empty page.
      /// </summary>
      public WebPage() { }

      /// <summary>
      /// Creates a page from an HTML document: keeps the raw markup, stores the
      /// cleaned markup in <see cref="HTML"/> and extracts <see cref="Title"/>.
      /// </summary>
      /// <param name="html">HTML document; null or empty is tolerated.</param>
      public WebPage(string html) : this()
      {
          this.OriginalHTML = html;
          // Strip style/script/comments before any extraction runs.
          this.HTML = ClearHTML(html);
          this.Title = GetTitle(this.HTML);
      }

      /// <summary>
      /// Creates a page from an HTML document and the address it was loaded from.
      /// </summary>
      /// <param name="html">HTML document.</param>
      /// <param name="url">Page address; "http://" is assumed when the scheme is missing.</param>
      /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
      /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid address.</exception>
      public WebPage(string html, string url) : this(html)
      {
          this.URL = url;
          this.Host = new Uri(EnsureHttpScheme(url)).Host;
      }
      #endregion

      #region public methods

      /// <summary>
      /// Downloads a page with an HTTP GET request.
      /// </summary>
      /// <param name="url">Page address; "http://" is prepended when it does not start with "http".</param>
      /// <param name="cookie">Optional raw Cookie header value.</param>
      /// <param name="throwError">When true, request failures are rethrown; otherwise "" is returned.</param>
      /// <param name="charset">Encoding name to decode with; when empty the charset is detected from the page.</param>
      /// <returns>The decoded page text, or "" on failure when <paramref name="throwError"/> is false.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
      /// <exception cref="UriFormatException"><paramref name="url"/> cannot be parsed (thrown regardless of <paramref name="throwError"/>).</exception>
      public static string GetHTML(string url, string cookie = "", bool throwError = false, string charset = "")
      {
          return GetHTML(new Uri(EnsureHttpScheme(url)), cookie, throwError, charset);
      }

      /// <summary>
      /// Downloads a page with an HTTP GET request.
      /// </summary>
      /// <param name="uri">Page address.</param>
      /// <param name="cookie">Optional raw Cookie header value.</param>
      /// <param name="throwError">When true, failures are rethrown; otherwise "" is returned.</param>
      /// <param name="charset">Encoding name to decode with; when empty the charset is detected from the page.</param>
      /// <returns>The decoded page text, or "" on failure when <paramref name="throwError"/> is false.</returns>
      public static string GetHTML(Uri uri, string cookie = "", bool throwError = false, string charset = "")
      {
          try
          {
              HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
              req.Method = "GET";
              req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*";
              req.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)";
              req.Timeout = 60 * 1000;
              if (!string.IsNullOrEmpty(cookie))
              {
                  req.Headers.Add("Cookie", cookie);
              }

              return DecodeHtml(ReadResponseBytes(req), charset);
          }
          catch (Exception)
          {
              if (throwError)
              {
                  // Bare rethrow keeps the original stack trace.
                  throw;
              }
              return "";
          }
      }

      /// <summary>
      /// Sends an HTTP POST whose body is the query string of <paramref name="url"/>
      /// (everything after the first '?'). The request is sent to the full URL.
      /// </summary>
      /// <param name="url">Target address; "http://" is prepended when it does not start with "http".</param>
      /// <param name="cookie">Optional raw Cookie header value.</param>
      /// <param name="throwError">When true, request failures are rethrown; otherwise "" is returned.</param>
      /// <param name="chatset">Encoding name to decode the response with; when empty the charset is detected from the page.</param>
      /// <returns>The decoded response text, or "" on failure when <paramref name="throwError"/> is false.</returns>
      public static string GetPostHTML(string url, string cookie = "", bool throwError = false, string chatset = "")
      {
          string target = EnsureHttpScheme(url);
          // Cut at the FIRST '?' so that a '?' inside the data is not lost.
          int queryStart = target.IndexOf('?');
          string postdata = queryStart >= 0 ? target.Substring(queryStart + 1) : "";
          return GetPostHTML(new Uri(target), postdata, cookie, throwError, chatset);
      }

      /// <summary>
      /// Sends an HTTP POST with a form-urlencoded body written in gb2312.
      /// </summary>
      /// <param name="uri">Target address.</param>
      /// <param name="postdata">Request body.</param>
      /// <param name="cookie">Optional raw Cookie header value.</param>
      /// <param name="throwError">When true, failures are rethrown; otherwise "" is returned.</param>
      /// <param name="chatset">Encoding name to decode the response with; when empty the charset is detected from the page.</param>
      /// <returns>The decoded response text, or "" on failure when <paramref name="throwError"/> is false.</returns>
      public static string GetPostHTML(Uri uri, string postdata, string cookie = "", bool throwError = false, string chatset = "")
      {
          try
          {
              HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
              req.Method = "POST";
              req.ContentType = "application/x-www-form-urlencoded";
              if (!string.IsNullOrEmpty(cookie))
              {
                  req.Headers.Add("Cookie", cookie);
              }
              using (Stream requestStream = req.GetRequestStream())
              using (StreamWriter writer = new StreamWriter(requestStream, Encoding.GetEncoding("gb2312")))
              {
                  writer.Write(postdata);
              }

              return DecodeHtml(ReadResponseBytes(req), chatset);
          }
          catch (Exception)
          {
              if (throwError)
              {
                  // Bare rethrow keeps the original stack trace.
                  throw;
              }
              return "";
          }
      }

      /// <summary>
      /// Removes style blocks, script blocks and HTML comments (in that order).
      /// </summary>
      /// <param name="html">HTML text.</param>
      /// <returns>The cleaned HTML; null or empty input is returned unchanged.</returns>
      public static string ClearHTML(string html)
      {
          if (string.IsNullOrEmpty(html))
          {
              return html;
          }

          string[] filters =
          {
              @"(?is)<style.*?>.*?</style>",
              @"(?is)<script.*?>.*?</script>",
              @"(?is)<!--.*?-->"
          };
          foreach (string pattern in filters)
          {
              html = Regex.Replace(html, pattern, "");
          }
          return html;
      }

      /// <summary>
      /// Lower-cases tag names (the text from '&lt;' up to the first whitespace or '&gt;').
      /// Attributes and content are left untouched.
      /// </summary>
      /// <param name="html">HTML text.</param>
      /// <returns>The HTML with lower-cased tag names; null or empty input is returned unchanged.</returns>
      public static string Html2Lower(string html)
      {
          if (string.IsNullOrEmpty(html))
          {
              return html;
          }

          // Invariant casing: culture-sensitive lowering would corrupt tag names
          // under some cultures (e.g. Turkish 'I').
          return Regex.Replace(
              html,
              @"(?<tag><[^\s>]+\s)|(?<tag><[^\s>]+>)",
              m => m.Value.ToLowerInvariant());
      }

      /// <summary>
      /// Gets the page title from the title tag; when the first h1 text is
      /// contained in that title, the h1 text is returned instead.
      /// </summary>
      /// <param name="html">HTML text.</param>
      /// <returns>The title, or "" when none is found or the input is null/empty.</returns>
      public static string GetTitle(string html)
      {
          if (string.IsNullOrEmpty(html))
          {
              return "";
          }

          const string tagPattern = @"<.*?>";
          string title = "";
          string h1 = "";

          Match titleMatch = Regex.Match(html, @"<title>[\s\S]*?</title>", RegexOptions.IgnoreCase);
          if (titleMatch.Success)
          {
              title = Regex.Replace(titleMatch.Value, tagPattern, "");
          }

          Match h1Match = Regex.Match(html, @"<h1.*?>.*?</h1>", RegexOptions.IgnoreCase);
          if (h1Match.Success)
          {
              h1 = Regex.Replace(h1Match.Value, tagPattern, "").Replace("\t", "");
          }

          // Prefer the h1 text when the title merely wraps it (e.g. "Headline - Site").
          // Ordinal search: a linguistic search can mis-handle control characters.
          if (h1.Length > 0 && title.Length > 0 && title.IndexOf(h1, StringComparison.Ordinal) >= 0)
          {
              return h1;
          }
          return title;
      }

      /// <summary>
      /// Gets the raw inner text of the title tag.
      /// </summary>
      /// <param name="html">HTML text.</param>
      /// <returns>The title text, or "" when absent or the input is null/empty.</returns>
      public static string GetHeadTitle(string html)
      {
          if (string.IsNullOrEmpty(html))
          {
              return "";
          }

          Match match = Regex.Match(html, @"<title>([\s\S]*?)</title>", RegexOptions.IgnoreCase);
          return match.Success ? match.Groups[1].Value : "";
      }

      /// <summary>
      /// Gets the href value of the base tag.
      /// </summary>
      /// <param name="html">HTML text.</param>
      /// <returns>The base address, or "" when absent or the input is null/empty.</returns>
      public static string GetHeadBase(string html)
      {
          if (string.IsNullOrEmpty(html))
          {
              return "";
          }

          return Regex.Match(html, @"<base.*?href=[""|'']?([a-zA-Z0-9-:/.]+)[""|''].*?>", RegexOptions.IgnoreCase).Groups[1].Value;
      }

      /// <summary>
      /// Extracts the first date found in the page text. Recognised forms are
      /// yyyy-M-d, yyyy/M/d (also with a two-digit year) and yyyy年M月d日, each
      /// optionally followed by HH:mm.
      /// </summary>
      /// <param name="html">HTML text.</param>
      /// <returns>
      /// The parsed date, or 1900-01-01 when no date is found, the text cannot be
      /// parsed, or the input is null/empty.
      /// </returns>
      public static DateTime GetPublishDate(string html)
      {
          DateTime result = new DateTime(1900, 1, 1);
          if (string.IsNullOrEmpty(html))
          {
              return result;
          }

          // Drop the tags first so markup cannot split or imitate a date.
          string text = Regex.Replace(html, "(?is)<.*?>", "");
          Match match = Regex.Match(text, @"((\d{4}|\d{2})(\-|\/)\d{1,2}\3\d{1,2})(\s?\d{2}:\d{2})?|(\d{4}年\d{1,2}月\d{1,2}日)(\s?\d{2}:\d{2})?", RegexOptions.IgnoreCase);
          if (!match.Success)
          {
              return result;
          }

          string dateStr = match.Value;
          // Chinese dates: turn "2017年6月9日" into "2017/6/9 " so the parser accepts it.
          if (dateStr.Contains("年"))
          {
              dateStr = dateStr.Replace('年', '/').Replace('月', '/').Replace('日', ' ');
          }

          // Same current-culture parsing rules as Convert.ToDateTime, without exceptions.
          DateTime parsed;
          if (DateTime.TryParse(dateStr, out parsed))
          {
              result = parsed;
          }
          return result;
      }

      /// <summary>
      /// Turns the address of an anchor into a full address.
      /// </summary>
      /// <param name="url">Address of the page containing the link.</param>
      /// <param name="href">Link target as written in the page.</param>
      /// <param name="baseStr">Optional base prefix; when set it is prepended to every link that is not absolute.</param>
      /// <returns>The full address, or <paramref name="href"/> unchanged when it cannot be resolved.</returns>
      public static string ReplaceAhref(string url, string href, string baseStr = "")
      {
          string resolved;
          return TryResolveUrl(url, href, baseStr, out resolved) ? resolved : href;
      }

      /// <summary>
      /// Resolves an image address and replaces it with a base64 data URI.
      /// </summary>
      /// <param name="url">Address of the page containing the image.</param>
      /// <param name="src">Image address as written in the page.</param>
      /// <param name="baseStr">Optional base prefix; when set it is prepended to every address that is not absolute.</param>
      /// <returns>
      /// The data URI; the resolved address when the image cannot be downloaded;
      /// or <paramref name="src"/> unchanged when the address cannot be resolved.
      /// </returns>
      public static string ReplaceImgSrc(string url, string src, string baseStr = "")
      {
          string resolved;
          if (!TryResolveUrl(url, src, baseStr, out resolved))
          {
              return src;
          }

          // A base prefix may lack the scheme.
          if (!resolved.StartsWith("http", StringComparison.Ordinal))
          {
              resolved = "http://" + resolved;
          }
          return WebPage.ImgToBase64String(resolved);
      }

      /// <summary>
      /// Tells whether an address starts with "http:" or "https:" (case-sensitive).
      /// </summary>
      /// <param name="url">Address to test; null or empty yields false.</param>
      /// <returns>True when the address carries an http/https scheme.</returns>
      public static bool IsAbsolutePath(string url)
      {
          if (string.IsNullOrEmpty(url))
          {
              return false;
          }
          return url.StartsWith("http:", StringComparison.Ordinal)
              || url.StartsWith("https:", StringComparison.Ordinal);
      }

      /// <summary>
      /// Downloads an image and returns it as a "data:image/...;base64," URI.
      /// </summary>
      /// <param name="imageUrl">Absolute image address.</param>
      /// <returns>The data URI, or <paramref name="imageUrl"/> unchanged when download or decoding fails.</returns>
      public static string ImgToBase64String(string imageUrl)
      {
          try
          {
              HttpWebRequest req = (HttpWebRequest)WebRequest.Create(imageUrl);
              req.Accept = "image/*";

              // The source stream must stay open for as long as the Image is used.
              using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
              using (Stream body = res.GetResponseStream())
              using (Image img = Image.FromStream(body))
              using (MemoryStream buffer = new MemoryStream())
              {
                  string imgFormat = GetImageFormat(img);
                  img.Save(buffer, img.RawFormat);
                  return "data:image/" + imgFormat + ";base64," + Convert.ToBase64String(buffer.ToArray());
              }
          }
          catch (Exception)
          {
              // Best effort by design: any failure leaves the plain address in place.
              return imageUrl;
          }
      }

      /// <summary>
      /// Maps the raw format of an image to the subtype used in its data URI.
      /// Unknown formats are reported as "png".
      /// </summary>
      private static string GetImageFormat(Image img)
      {
          ImageFormat raw = img.RawFormat;

          if (raw.Equals(ImageFormat.Bmp)) { return "bmp"; }
          if (raw.Equals(ImageFormat.Emf)) { return "emf"; }
          if (raw.Equals(ImageFormat.Exif)) { return "exif"; }
          if (raw.Equals(ImageFormat.Gif)) { return "gif"; }
          if (raw.Equals(ImageFormat.Icon)) { return "x-icon"; }
          if (raw.Equals(ImageFormat.Jpeg)) { return "jpeg"; }
          if (raw.Equals(ImageFormat.MemoryBmp)) { return "memorybmp"; }
          if (raw.Equals(ImageFormat.Png)) { return "png"; }
          if (raw.Equals(ImageFormat.Tiff)) { return "tiff"; }
          if (raw.Equals(ImageFormat.Wmf)) { return "wmf"; }
          return "png";
      }
      #endregion

      #region private helpers

      /// <summary>
      /// Prepends "http://" to an address that does not start with "http".
      /// </summary>
      private static string EnsureHttpScheme(string url)
      {
          if (url == null)
          {
              throw new ArgumentNullException("url");
          }
          return url.StartsWith("http", StringComparison.Ordinal) ? url : "http://" + url;
      }

      /// <summary>
      /// Sends the request and reads the whole response body, releasing the
      /// response and its stream afterwards.
      /// </summary>
      private static byte[] ReadResponseBytes(HttpWebRequest request)
      {
          using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
          using (Stream body = response.GetResponseStream())
          using (MemoryStream buffer = new MemoryStream())
          {
              body.CopyTo(buffer);
              return buffer.ToArray();
          }
      }

      /// <summary>
      /// Decodes page bytes. With an explicit charset that encoding is used.
      /// Otherwise the bytes are read as UTF-8 and, when a meta tag declares a
      /// charset whose name does not contain "utf", decoded again with it.
      /// </summary>
      private static string DecodeHtml(byte[] bytes, string charset)
      {
          if (!string.IsNullOrEmpty(charset))
          {
              return Encoding.GetEncoding(charset).GetString(bytes, 0, bytes.Length);
          }

          string html = Encoding.UTF8.GetString(bytes, 0, bytes.Length);
          string declared = Regex.Match(html, @"<meta.*?charset=""?([a-z0-9-]+)\b", RegexOptions.IgnoreCase).Groups[1].Value;
          if (declared.Length > 0 && declared.IndexOf("utf", StringComparison.OrdinalIgnoreCase) < 0)
          {
              try
              {
                  html = Encoding.GetEncoding(declared).GetString(bytes, 0, bytes.Length);
              }
              catch (ArgumentException)
              {
                  // Unknown charset name in the page: keep the UTF-8 text.
              }
              catch (NotSupportedException)
              {
                  // Encoding not available on this platform: keep the UTF-8 text.
              }
          }
          return html;
      }

      /// <summary>
      /// Shared link resolution for <see cref="ReplaceAhref"/> and <see cref="ReplaceImgSrc"/>.
      /// </summary>
      /// <returns>False when the link cannot be resolved (empty link, unusable page address).</returns>
      private static bool TryResolveUrl(string url, string link, string baseStr, out string resolved)
      {
          resolved = null;

          // An explicit base prefix wins for every link that is not absolute.
          if (!string.IsNullOrEmpty(baseStr) && !IsAbsolutePath(link))
          {
              resolved = baseStr + link;
              return true;
          }
          if (string.IsNullOrEmpty(link))
          {
              return false;
          }
          if (link.StartsWith("http", StringComparison.Ordinal))
          {
              resolved = link;
              return true;
          }
          if (string.IsNullOrEmpty(url))
          {
              return false;
          }

          string pageUrl = EnsureHttpScheme(url);
          Uri pageUri;
          if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out pageUri))
          {
              return false;
          }

          // Protocol-relative link ("//host/path"): only the scheme is missing.
          if (link.StartsWith("//", StringComparison.Ordinal))
          {
              resolved = pageUri.Scheme + ":" + link;
              return true;
          }

          // Root-relative link: the port is always written out, as callers have always received it.
          if (link[0] == '/')
          {
              resolved = pageUri.Scheme + ":" + "//" + pageUri.Host + ":" + pageUri.Port + link;
              return true;
          }

          // Document-relative link: replace the last path segment of the page address.
          // Query and fragment are cut first so a '/' inside them cannot be mistaken
          // for a path separator.
          string pagePath = pageUrl;
          int cut = pagePath.IndexOfAny(new char[] { '?', '#' });
          if (cut >= 0)
          {
              pagePath = pagePath.Substring(0, cut);
          }
          int schemeEnd = pagePath.IndexOf("://", StringComparison.Ordinal);
          int authorityStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
          int lastSlash = pagePath.LastIndexOf('/');

          if (lastSlash < authorityStart)
          {
              // Host-only address ("http://example.com"): there is no segment to replace.
              resolved = pagePath + "/" + link;
          }
          else
          {
              resolved = pagePath.Substring(0, lastSlash + 1) + link;
          }
          return true;
      }
      #endregion
  }
View Code

 

posted @ 2017-06-09 09:59  iceexx  阅读(254)  评论(0编辑  收藏  举报