.NET HTML代码正则解析

  1         /// <summary>
  2         /// 功能描述:正则取得HTML中所有图片的 URL
  3         /// </summary>
  4         /// <param name="sHtmlText">HTML代码</param>
  5         /// <returns>图片的URL列表【href】</returns>
  6         public static List<string> GetHtmlImageUrlList(string sHtmlText)
  7         {
  8             List<string> imgList = new List<string>();
  9             // 定义正则表达式用来匹配 img 标签
 10             Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
 11 
 12             // 搜索匹配的字符串
 13             MatchCollection matches = regImg.Matches(sHtmlText);
 14             int i = 0;
 15             string[] sUrlList = new string[matches.Count];
 16 
 17             // 取得匹配项列表
 18             foreach (Match match in matches)
 19             {
 20                imgList.Add(match.Groups["imgUrl"].Value);
 21             }
 22             return imgList;
 23         }
 24 
 25         /// <summary>
 26         /// 功能描述:正则取得HTML中所有锚标签
 27         /// </summary>
 28         /// <param name="sHtmlText">HTML代码</param>
 29         /// <returns>所有的锚点标签【A】</returns>
 30         public static List<string> GetHtmlAnchorlList(string sHtmlText)
 31         {
 32             List<string> achorList = new List<string>();
 33             //定义正则表达式用来匹配锚点
 34             Regex regAchor = new Regex(@"<a\sname=""(.+?)</a>",RegexOptions.Multiline);
 35             // 搜索匹配的字符串
 36             MatchCollection matches = regAchor.Matches(sHtmlText);
 37              int i = 0;
 38             string[] sUrlList = new string[matches.Count];
 39 
 40             // 取得匹配项列表
 41             foreach (Match match in matches)
 42             {
 43                 //去除HTML中的标签,只获得纯文本
 44                 achorList.Add(GetTextNoHtml(match.Groups[1].Value));
 45             }
 46             return achorList;
 47 
 48         }
 49 
 50         /// <summary>
 51         ///  功能描述:正则表达式获取HTML所有的文本,不需要HTML标签
 52         ///  最强功能提示:可以自动生成文章摘要
 53         /// </summary>
 54         /// <param name="sHtmlText">HTML代码</param>
 55         /// <param name="length">提取文本的长度</param>
 56         /// <returns>提取后的纯文本数据</returns>
 57         public static string GetTextNoHtml(string sHtmlText, int length = 0)
 58         {
 59 
 60             //删除脚本
 61             sHtmlText = Regex.Replace(sHtmlText, @"<script[^>]+?>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
 62             //删除HTML
 63             sHtmlText = Regex.Replace(sHtmlText, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
 64             sHtmlText = Regex.Replace(sHtmlText, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
 65             sHtmlText = Regex.Replace(sHtmlText, @"-->", "", RegexOptions.IgnoreCase);
 66             sHtmlText = Regex.Replace(sHtmlText, @"<!--.*", "", RegexOptions.IgnoreCase);
 67             sHtmlText = Regex.Replace(sHtmlText, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
 68             sHtmlText = Regex.Replace(sHtmlText, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
 69             sHtmlText = Regex.Replace(sHtmlText, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
 70             sHtmlText = Regex.Replace(sHtmlText, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
 71             sHtmlText = Regex.Replace(sHtmlText, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
 72             sHtmlText = Regex.Replace(sHtmlText, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
 73             sHtmlText = Regex.Replace(sHtmlText, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
 74             sHtmlText = Regex.Replace(sHtmlText, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
 75             sHtmlText = Regex.Replace(sHtmlText, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
 76             sHtmlText = Regex.Replace(sHtmlText, @"&#(\d+);", "", RegexOptions.IgnoreCase);
 77             sHtmlText = sHtmlText.Replace("\"", "");
 78             sHtmlText = Regex.Replace(sHtmlText, @"//\(function\(\)[\s\S]+?}\)\(\);", "", RegexOptions.IgnoreCase);
 79             sHtmlText = sHtmlText.Replace("<", "");
 80             sHtmlText = sHtmlText.Replace(">", "");
 81             sHtmlText = sHtmlText.Replace("\r\n", "");
 82 
 83             if (length > 0 && sHtmlText.Length > length)
 84                 return sHtmlText.Substring(0, length);
 85 
 86             return sHtmlText;
 87         }
 88 
 89         /// <summary>
 90         /// 功能描述:正则表达式获取分页HTML中文本包含或者不包含指定内容的A标签的第一个A标签
 91         /// 实例: <a href="">曼码科技</a>,提取包换曼码科技,则此A标签可以获取到;如果不想包换曼码科技,则此标签无需获取到。
 92         /// (可以升级:集合版)
 93         /// </summary>
 94         /// <param name="html">要处理的HTML</param>
 95         /// <param name="text">包含的指定内容</param>
 96         /// <param name="iscontain">是否包含</param>
 97         /// <returns>返回第一符合规则的A标签</returns>
 98         public static string GetHtmlPageSelectA(string html,string text,bool iscontain)
 99         {
100             string url = string.Empty;
101             Regex reg = new Regex(@"<a href=""(?<url>.*?)""(.*?)>(?<text>.*?)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
102             MatchCollection mctitleList = reg.Matches(html);
103             if (mctitleList.Count > 0)
104             {
105                 foreach (Match m in mctitleList)
106                 {
107                     string str = m.Groups["text"].Value.ToString().RemoveHtml();
108                     if (iscontain)
109                     {
110                         if (str.Trim().Contains(text))
111                         {
112                             url = m.Groups["url"].Value.ToString().Trim();
113                             break;
114                         }
115                     }
116                     else
117                     {
118                         if (str.Trim().Equals(text))
119                         {
120                             url = m.Groups["url"].Value.ToString().Trim();
121                             break;
122                         }
123                     }
124                 }
125             }
126             return url;
127         }

 

posted @ 2018-09-25 15:16  咖啡漩涡  阅读(527)  评论(0编辑  收藏  举报