一、后台抓取代码
// Download the page at `url` with a browser-like User-Agent and decode the
// response body using `encoding`. The decoded text ends up in `html`.
System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
string html;
// The using blocks release the connection, stream and reader even when
// GetResponse/ReadToEnd throws, and dispose them in the correct order
// (reader first, then the underlying stream, then the response).
using (System.Net.WebResponse response = request.GetResponse())
using (System.IO.Stream resStream = response.GetResponseStream())
using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, encoding))
{
    html = sr.ReadToEnd();
}
// Download the raw bytes of `PageUrl` using the current user's default
// credentials, then decode them into `Content`.
Byte[] pageData;
// WebClient is IDisposable; dispose it as soon as the download completes.
using (System.Net.WebClient wc = new System.Net.WebClient())
{
    wc.Credentials = System.Net.CredentialCache.DefaultCredentials;
    pageData = wc.DownloadData(PageUrl);
}
// NOTE(review): Encoding.Default depends on the runtime/OS configuration; if the
// page declares a specific charset, decode with that encoding instead.
string Content = System.Text.Encoding.Default.GetString(pageData);
// Fetch http://www.baidu.com with a GET request and write the response body
// straight into the current ASP.NET response, then end the response.
try
{
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create("http://www.baidu.com");
    request.Method = WebRequestMethods.Http.Get;
    string data;
    // Dispose the response and reader even if reading the body fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream()))
    {
        data = reader.ReadToEnd();
    }
    HttpContext.Current.Response.Write(data);
    HttpContext.Current.Response.End();
}
catch (System.Exception ex)
{
    // Still best-effort (the failure is not propagated), but record it instead
    // of swallowing it silently so a broken fetch can be diagnosed.
    System.Diagnostics.Trace.TraceWarning("Fetching http://www.baidu.com failed: " + ex.Message);
}
// Print the text content of every <span ... id="s" ...>...</span> found in `html`.
// The lookbehind/lookahead keep the surrounding tags out of the match, and
// (?i) makes the whole pattern case-insensitive.
Regex reg = new Regex(@"(?i)(?<=<span.*?id=""s"".*?>)[^<]+(?=</span>)");
MatchCollection mc = reg.Matches(html);
for (int index = 0; index < mc.Count; index++)
{
    // Match.Value is the text of group 0, i.e. the whole match.
    string spanText = mc[index].Value;
    Console.WriteLine(spanText);
}
二、正则应用
/// <summary>
/// Removes every HTML tag (anything of the form &lt;...&gt;) from the input.
/// </summary>
/// <param name="str">The text to strip; may be null.</param>
/// <returns>
/// The input with all tags removed. Null, empty or whitespace-only input is
/// returned unchanged.
/// </returns>
public static string delHtml(string str)
{
    // Nothing to strip from null/blank input. (The previous check compared the
    // trimmed string with " ", which can never be equal.)
    if (str == null || str.Trim().Length == 0)
    {
        return str;
    }

    // The pattern must not contain literal spaces around the angle brackets,
    // otherwise ordinary tags such as <p class="x"> are never matched.
    return Regex.Replace(str, "<[^>]+>", "");
}
/// <summary>
/// Removes a specific HTML tag from a string.
/// </summary>
/// <param name="str">The text to process; null or empty is returned unchanged.</param>
/// <param name="tag">
/// The tag name to remove (e.g. "script"). It is inserted into the regular
/// expression as-is, so an alternation such as "b|i" is possible; it is not
/// escaped. Null or blank means "remove nothing".
/// </param>
/// <param name="isContent">
/// true to remove the element together with its content; false to remove only
/// the opening and closing tags and keep the content.
/// </param>
/// <returns>The text with the requested tag removed (matching is case-insensitive).</returns>
public static string delTag(string str, string tag, bool isContent)
{
    if (string.IsNullOrEmpty(str) || tag == null || tag.Trim().Length == 0)
    {
        return str;
    }

    // \b after the tag name prevents tag "b" from also matching <body>, <br>, ...
    if (isContent) // remove the element and everything inside it
    {
        string elementPattern = string.Format(@"<({0})\b[^>]*>[\s\S]*?</\1>", tag);
        return Regex.Replace(str, elementPattern, "", RegexOptions.IgnoreCase);
    }

    // Remove only the opening/closing tags. The trailing '>' is optional so an
    // unterminated tag at the end of the text is removed as well.
    string tagPattern = string.Format(@"</?(?:{0})\b[^>]*>?", tag);
    return Regex.Replace(str, tagPattern, "", RegexOptions.IgnoreCase);
}
// Link-matching regular expressions.
// Alternative pattern: an anchor with a single-quoted href, followed by a
// space and a bracketed suffix, e.g. <a href='u'>text</a> [note].
String regexa = "<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]";
// Find every <a href=...>...</a> in htmlstring. The "url" group captures
// everything between "href=" and the first '>' (including any quotes), and
// the "content" group captures the anchor's inner text.
MatchCollection mc = Regex.Matches(htmlstring, @"<a\s+href=(?<url>.+?)>(?<content>.+?)</a>");
foreach (Match m in mc)
{
    // url and content are assumed to be declared by the surrounding code.
    url = m.Groups["url"].Value;

    content = m.Groups["content"].Value;
}
其中htmlstring 为输入代码
图片 src[^>]*[^/].(?:jpg|bmp|gif)(?:\"|\')
中文 ^([\u4e00-\u9fa5]+|[a-zA-Z0-9]+)$
网址 "\<a.+?href=['""](?!http\:\/\/)(?!mailto\:)(?<foundAnchor>[^'"">]+?)[^>]*?\>"
匹配中文字符的正则表达式: [\u4e00-\u9fa5]
匹配双字节字符(包括汉字在内):[^\x00-\xff]
匹配空行的正则表达式:\n[\s| ]*\r
匹配HTML标记的正则表达式:/<(.*)>.*<\/\1>|<(.*) \/>/
匹配首尾空格的正则表达式:(^\s*)|(\s*$)(像vbscript那样的trim函数)
匹配Email地址的正则表达式:\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*
匹配网址URL的正则表达式:http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?
平衡组的一个最常见的应用就是匹配HTML,下面这个例子可以匹配嵌套的<div>标签:<div[^>]*>[^<>]*(((?'Open'<div[^>]*>)[^<>]*)+((?'-Open'</div>)[^<>]*)+)*(?(Open)(?!))</div>
| 正则表达式 | 说明 |
|---|---|
| `/^\s*$/` | 匹配空行。 |
| `/\d{2}-\d{5}/` | 匹配由两位数字、一个连字符再加五位数字组成的 ID 号。 |
| `/<\s*(\S+)(\s[^>]*)?>[\s\S]*<\s*\/\1\s*>/` | 匹配 HTML 标记。 |