.NET HTML代码正则解析
// Matches an <img> tag and captures the (optionally quoted) src value in the "imgUrl" group.
private static readonly Regex ImgTagRegex = new Regex(
    @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
    RegexOptions.IgnoreCase);

// Matches <a name="...">...</a> on a single line (case-sensitive); group 1 starts right
// after the opening quote of the name attribute and runs up to the closing </a>.
// RegexOptions.Multiline is kept from the original; the pattern has no ^/$ anchors.
private static readonly Regex NamedAnchorRegex = new Regex(
    @"<a\sname=""(.+?)</a>",
    RegexOptions.Multiline);

// Matches <a href="...">...</a> where href is the first attribute; captures "url" and "text".
private static readonly Regex HrefAnchorRegex = new Regex(
    @"<a href=""(?<url>.*?)""(.*?)>(?<text>.*?)</a>",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);

// Ordered (pattern, replacement) pairs used by GetTextNoHtml to decode common HTML entities.
// The order is significant: the replacements are applied one after another.
private static readonly string[,] EntityReplacements =
{
    { @"&(quot|#34);", "\"" },
    { @"&(amp|#38);", "&" },
    { @"&(lt|#60);", "<" },
    { @"&(gt|#62);", ">" },
    { @"&(nbsp|#160);", " " },
    { @"&(iexcl|#161);", "\u00A1" },
    { @"&(cent|#162);", "\u00A2" },
    { @"&(pound|#163);", "\u00A3" },
    { @"&(copy|#169);", "\u00A9" }
};

/// <summary>
/// Extracts the <c>src</c> value of every <c>&lt;img&gt;</c> tag found in the given HTML.
/// </summary>
/// <param name="sHtmlText">HTML markup to scan.</param>
/// <returns>
/// The captured image URLs in document order. The list is empty when the input is
/// null or empty, or when no img tag matches.
/// </returns>
public static List<string> GetHtmlImageUrlList(string sHtmlText)
{
    List<string> imgList = new List<string>();
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return imgList;
    }

    foreach (Match match in ImgTagRegex.Matches(sHtmlText))
    {
        imgList.Add(match.Groups["imgUrl"].Value);
    }

    return imgList;
}

/// <summary>
/// Collects every named anchor (<c>&lt;a name="..."&gt;...&lt;/a&gt;</c>) found in the given HTML.
/// </summary>
/// <param name="sHtmlText">HTML markup to scan.</param>
/// <returns>
/// One entry per matched anchor, cleaned with <see cref="GetTextNoHtml"/>. The list is
/// empty when the input is null or empty, or when nothing matches.
/// </returns>
/// <remarks>
/// The captured span begins inside the <c>name</c> attribute, so each entry is the
/// attribute value followed by the anchor's inner text with quotes and angle brackets
/// stripped (for example <c>&lt;a name="top"&gt;Top&lt;/a&gt;</c> yields <c>topTop</c>).
/// This existing behavior is preserved for callers.
/// </remarks>
public static List<string> GetHtmlAnchorlList(string sHtmlText)
{
    List<string> achorList = new List<string>();
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return achorList;
    }

    foreach (Match match in NamedAnchorRegex.Matches(sHtmlText))
    {
        // Strip any markup from the captured span so only plain text is kept.
        achorList.Add(GetTextNoHtml(match.Groups[1].Value));
    }

    return achorList;
}

/// <summary>
/// Reduces HTML markup to plain text; can be used to build a short article summary.
/// </summary>
/// <param name="sHtmlText">HTML markup to clean.</param>
/// <param name="length">
/// Maximum number of characters to return. Values less than or equal to zero mean
/// "no limit".
/// </param>
/// <returns>
/// The text with script blocks, tags, comment markers and numeric entities removed and
/// a handful of named entities decoded. Double quotes, angle brackets and CRLF pairs are
/// stripped from the final result. Returns an empty string for null or empty input.
/// </returns>
public static string GetTextNoHtml(string sHtmlText, int length = 0)
{
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return string.Empty;
    }

    // Remove script blocks. [^>]* (instead of the former [^>]+?) also covers a bare
    // <script> tag without attributes, whose body previously leaked into the output.
    sHtmlText = Regex.Replace(sHtmlText, @"<script\b[^>]*>[\s\S]*?</script\s*>", "", RegexOptions.IgnoreCase);

    // Remove remaining tags.
    sHtmlText = Regex.Replace(sHtmlText, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove a line break together with the whitespace that follows it.
    sHtmlText = Regex.Replace(sHtmlText, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove leftover comment delimiters.
    sHtmlText = Regex.Replace(sHtmlText, @"-->", "", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the supported entities, in table order.
    for (int k = 0; k < EntityReplacements.GetLength(0); k++)
    {
        sHtmlText = Regex.Replace(
            sHtmlText,
            EntityReplacements[k, 0],
            EntityReplacements[k, 1],
            RegexOptions.IgnoreCase);
    }

    // Drop any other numeric character reference.
    sHtmlText = Regex.Replace(sHtmlText, @"&#(\d+);", "", RegexOptions.IgnoreCase);
    sHtmlText = sHtmlText.Replace("\"", "");

    // Remove commented-out immediately-invoked function snippets: //(function(){ ... })();
    sHtmlText = Regex.Replace(sHtmlText, @"//\(function\(\)[\s\S]+?}\)\(\);", "", RegexOptions.IgnoreCase);

    sHtmlText = sHtmlText.Replace("<", "");
    sHtmlText = sHtmlText.Replace(">", "");
    sHtmlText = sHtmlText.Replace("\r\n", "");

    if (length > 0 && sHtmlText.Length > length)
    {
        return sHtmlText.Substring(0, length);
    }

    return sHtmlText;
}

/// <summary>
/// Finds the first <c>&lt;a href="..."&gt;</c> tag whose text matches the given content
/// and returns its href value (useful for locating pagination links).
/// </summary>
/// <param name="html">HTML markup to scan.</param>
/// <param name="text">Text to look for in the link's inner text.</param>
/// <param name="iscontain">
/// <c>true</c> to accept a link whose trimmed text contains <paramref name="text"/>;
/// <c>false</c> to accept only a link whose trimmed text equals <paramref name="text"/>.
/// </param>
/// <returns>
/// The trimmed href of the first matching link, or an empty string when
/// <paramref name="html"/> is null or empty, <paramref name="text"/> is null, or no
/// link matches.
/// </returns>
/// <remarks>
/// The link text is cleaned with the <c>RemoveHtml</c> string extension, which is defined
/// outside this section. Only anchors written with <c>href</c> as the first attribute
/// (<c>&lt;a href="</c>) are considered.
/// </remarks>
public static string GetHtmlPageSelectA(string html, string text, bool iscontain)
{
    if (string.IsNullOrEmpty(html) || text == null)
    {
        return string.Empty;
    }

    foreach (Match m in HrefAnchorRegex.Matches(html))
    {
        string linkText = m.Groups["text"].Value.RemoveHtml().Trim();
        bool isHit = iscontain ? linkText.Contains(text) : linkText.Equals(text);
        if (isHit)
        {
            return m.Groups["url"].Value.Trim();
        }
    }

    return string.Empty;
}