匹配html的正则表达式
Code
// Capture the inner HTML of <div id="cabotage">: everything up to the first "</div>"
// that is directly followed by CRLF and "<script>". Singleline lets '.' match newlines.
Regex regex = new Regex("<div id=\"cabotage\">(?<content>.*?)</div>\r\n<script>", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline);
// The named group (?<content>.*?) holds the HTML between the two markers.
string str = regex.Match(strcontent).Groups["content"].Value;
// Drop every <script>...</script> element. The quantifier is lazy so that markup sitting
// between two separate script elements is preserved instead of being swallowed.
str = Regex.Replace(str, @"<script[\s\S]+?</script *>", "", RegexOptions.IgnoreCase);
// Remove every tag except <table>, <tr>, <td> and their closing tags. The \b stops tags
// that merely start with those letters (e.g. <track>) from being kept by accident.
str = Regex.Replace(str, @"<(?!/?(?:table|tr|td)\b)[^<>]*?>", "", RegexOptions.IgnoreCase);
// Strip attributes from the remaining opening tags: <td class="x"> becomes <td>.
str = Regex.Replace(str, @"<(\w+)\s*[^<>]*?>", "<$1>", RegexOptions.IgnoreCase);
// Remove non-breaking spaces, both the &nbsp; entity and the raw U+00A0 character.
// Ordinary spaces are kept so that words in the cell text do not run together.
str = Regex.Replace(str, @"&nbsp;|\u00A0", "", RegexOptions.IgnoreCase);
// Trim leading/trailing whitespace and wrap the result in exactly one <PaNode> element.
// (An end-anchored "\s*$" replace can match twice and emit the closing tag twice.)
str = "<PaNode>" + str.Trim() + "</PaNode>";
/// <summary>
/// Removes common active content from an HTML fragment: script and iframe elements,
/// frameset blocks, script-scheme href prefixes, and inline on... event attributes.
/// The same approach can be extended with further patterns to strip other markup.
/// </summary>
/// <remarks>
/// This is a best-effort regular-expression filter, not a complete HTML sanitizer.
/// </remarks>
/// <param name="html">The HTML text to filter. May be null or empty.</param>
/// <returns>The filtered HTML; null or empty input is returned unchanged.</returns>
public string wipeScript(string html)
{
    // Nothing to filter; also avoids the ArgumentNullException Regex.Replace throws on null.
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    System.Text.RegularExpressions.RegexOptions options = System.Text.RegularExpressions.RegexOptions.IgnoreCase;

    // Remove each <script>...</script> element. Lazy quantifier: with a greedy one, all
    // content between the first opening and the last closing tag would be deleted.
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<script[\s\S]+?</script *>", "", options);

    // Remove the " href=...script:" prefix (javascript:, vbscript:, ...). The match is
    // confined to a single tag ([^<>]) so unrelated markup between links is untouched.
    html = System.Text.RegularExpressions.Regex.Replace(html, @" href *= *[^<>]*?script *:", "", options);

    // Neutralise inline event handlers (onclick=, onload=, ...) by renaming the attribute.
    // Only the attribute name is matched; the original pattern ran to the last '=' in the input.
    html = System.Text.RegularExpressions.Regex.Replace(html, @" on\w+\s*=", " _disibledevent=", options);

    // Remove each <iframe>...</iframe> element (lazy, for the same reason as scripts).
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<iframe[\s\S]+?</iframe *>", "", options);

    // Remove frameset blocks. Kept greedy on purpose: framesets nest, and a lazy match
    // would stop at the first inner </frameset> and leave trailing <frame> tags behind.
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<frameset[\s\S]+</frameset *>", "", options);

    return html;
}
// Capture the inner HTML of <div id="cabotage">: everything up to the first "</div>"
// that is directly followed by CRLF and "<script>". Singleline lets '.' match newlines.
Regex regex = new Regex("<div id=\"cabotage\">(?<content>.*?)</div>\r\n<script>", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline);
// The named group (?<content>.*?) holds the HTML between the two markers.
string str = regex.Match(strcontent).Groups["content"].Value;
// Drop every <script>...</script> element. The quantifier is lazy so that markup sitting
// between two separate script elements is preserved instead of being swallowed.
str = Regex.Replace(str, @"<script[\s\S]+?</script *>", "", RegexOptions.IgnoreCase);
// Remove every tag except <table>, <tr>, <td> and their closing tags. The \b stops tags
// that merely start with those letters (e.g. <track>) from being kept by accident.
str = Regex.Replace(str, @"<(?!/?(?:table|tr|td)\b)[^<>]*?>", "", RegexOptions.IgnoreCase);
// Strip attributes from the remaining opening tags: <td class="x"> becomes <td>.
str = Regex.Replace(str, @"<(\w+)\s*[^<>]*?>", "<$1>", RegexOptions.IgnoreCase);
// Remove non-breaking spaces, both the &nbsp; entity and the raw U+00A0 character.
// Ordinary spaces are kept so that words in the cell text do not run together.
str = Regex.Replace(str, @"&nbsp;|\u00A0", "", RegexOptions.IgnoreCase);
// Trim leading/trailing whitespace and wrap the result in exactly one <PaNode> element.
// (An end-anchored "\s*$" replace can match twice and emit the closing tag twice.)
str = "<PaNode>" + str.Trim() + "</PaNode>";
/// <summary>
/// Removes common active content from an HTML fragment: script and iframe elements,
/// frameset blocks, script-scheme href prefixes, and inline on... event attributes.
/// The same approach can be extended with further patterns to strip other markup.
/// </summary>
/// <remarks>
/// This is a best-effort regular-expression filter, not a complete HTML sanitizer.
/// </remarks>
/// <param name="html">The HTML text to filter. May be null or empty.</param>
/// <returns>The filtered HTML; null or empty input is returned unchanged.</returns>
public string wipeScript(string html)
{
    // Nothing to filter; also avoids the ArgumentNullException Regex.Replace throws on null.
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    System.Text.RegularExpressions.RegexOptions options = System.Text.RegularExpressions.RegexOptions.IgnoreCase;

    // Remove each <script>...</script> element. Lazy quantifier: with a greedy one, all
    // content between the first opening and the last closing tag would be deleted.
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<script[\s\S]+?</script *>", "", options);

    // Remove the " href=...script:" prefix (javascript:, vbscript:, ...). The match is
    // confined to a single tag ([^<>]) so unrelated markup between links is untouched.
    html = System.Text.RegularExpressions.Regex.Replace(html, @" href *= *[^<>]*?script *:", "", options);

    // Neutralise inline event handlers (onclick=, onload=, ...) by renaming the attribute.
    // Only the attribute name is matched, so the replacement cannot run on to a later '='.
    html = System.Text.RegularExpressions.Regex.Replace(html, @" on\w+\s*=", " _disibledevent=", options);

    // Remove each <iframe>...</iframe> element (lazy, for the same reason as scripts).
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<iframe[\s\S]+?</iframe *>", "", options);

    // Remove frameset blocks. Kept greedy on purpose: framesets nest, and a lazy match
    // would stop at the first inner </frameset> and leave trailing <frame> tags behind.
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<frameset[\s\S]+</frameset *>", "", options);

    return html;
}