HttpWebRequest及正则表达式
近日做了一个小功能:采集某个网页的内容,并提取其中所有链接的地址和标题。
其中用到了HttpWebRequest和正则表达式,代码备忘如下:
//WebClient wc = new WebClient();
//NetworkCredential nc = new NetworkCredential("用户名", "密码", "域名");
//wc.Credentials = nc;
//Response.Write(Server.HtmlEncode(wc.DownloadString("地址")));
// Downloads the page at the configured address with the given network
// credentials, extracts every <a href="...">title</a> link from the HTML,
// and writes up to maxLinks of them (URL and title) to the response.
HttpWebRequest req = (HttpWebRequest)WebRequest.Create("地址");
req.Credentials = new NetworkCredential("用户名", "密码", "域名");
req.Method = "GET";
try {
    // GetResponse() blocks just like BeginGetResponse + WaitOne + EndGetResponse did,
    // but runs inside the try so connection failures reach the catch below.
    // The using statements release the response, stream and reader on every path.
    using (HttpWebResponse response1 = (HttpWebResponse)req.GetResponse())
    using (System.IO.Stream stream = response1.GetResponseStream())
    using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"))) {
        // Lazy quantifiers (+?) keep each match inside a single <a> element.
        // The quote class is ['""]: inside [...] a '|' is a literal character,
        // not alternation, so it must not be listed as a delimiter.
        string pattern = @"<a(?:\s*?)href=['""](?<url>[\s\S]+?)['""]>(?<title>[\s\S]+?)</a>";
        System.Text.RegularExpressions.MatchCollection matchs = System.Text.RegularExpressions.Regex.Matches(reader.ReadToEnd(), pattern);
        if (matchs.Count <= 0) {
            Response.Write("没有匹配项");
        }
        else {
            // Show at most maxLinks entries, and never index past the matches actually found.
            const int maxLinks = 50;
            int shown = System.Math.Min(maxLinks, matchs.Count);
            for (int i = 0; i < shown; i++) {
                // The URL and title come from a remote page: HTML-encode them before
                // writing them into our own response.
                string url = System.Net.WebUtility.HtmlEncode(matchs[i].Groups["url"].Value);
                string title = System.Net.WebUtility.HtmlEncode(matchs[i].Groups["title"].Value);
                Response.Write("链接:" + url + "___名称:" + title + "<br />");
            }
        }
    }
}
catch (System.Exception ex) {
    // Best-effort page: report the failure text (encoded) instead of crashing.
    Response.Write(System.Net.WebUtility.HtmlEncode(ex.Message));
}
//NetworkCredential nc = new NetworkCredential("用户名", "密码", "域名");
//wc.Credentials = nc;
//Response.Write(Server.HtmlEncode(wc.DownloadString("地址")));
// Downloads the page at the configured address with the given network
// credentials, extracts every <a href="...">title</a> link from the HTML,
// and writes up to maxLinks of them (URL and title) to the response.
HttpWebRequest req = (HttpWebRequest)WebRequest.Create("地址");
req.Credentials = new NetworkCredential("用户名", "密码", "域名");
req.Method = "GET";
try {
    // GetResponse() blocks just like BeginGetResponse + WaitOne + EndGetResponse did,
    // but runs inside the try so connection failures reach the catch below.
    // The using statements release the response, stream and reader on every path.
    using (HttpWebResponse response1 = (HttpWebResponse)req.GetResponse())
    using (System.IO.Stream stream = response1.GetResponseStream())
    using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"))) {
        // Lazy quantifiers (+?) keep each match inside a single <a> element.
        // The quote class is ['""]: inside [...] a '|' is a literal character,
        // not alternation, so it must not be listed as a delimiter.
        string pattern = @"<a(?:\s*?)href=['""](?<url>[\s\S]+?)['""]>(?<title>[\s\S]+?)</a>";
        System.Text.RegularExpressions.MatchCollection matchs = System.Text.RegularExpressions.Regex.Matches(reader.ReadToEnd(), pattern);
        if (matchs.Count <= 0) {
            Response.Write("没有匹配项");
        }
        else {
            // Show at most maxLinks entries, and never index past the matches actually found.
            const int maxLinks = 50;
            int shown = System.Math.Min(maxLinks, matchs.Count);
            for (int i = 0; i < shown; i++) {
                // The URL and title come from a remote page: HTML-encode them before
                // writing them into our own response.
                string url = System.Net.WebUtility.HtmlEncode(matchs[i].Groups["url"].Value);
                string title = System.Net.WebUtility.HtmlEncode(matchs[i].Groups["title"].Value);
                Response.Write("链接:" + url + "___名称:" + title + "<br />");
            }
        }
    }
}
catch (System.Exception ex) {
    // Best-effort page: report the failure text (encoded) instead of crashing.
    Response.Write(System.Net.WebUtility.HtmlEncode(ex.Message));
}
这其中,正则表达式让我迷糊了一会儿:一开始没有使用惰性匹配(+?),贪婪的 [\s\S]+ 会从第一个链接一直匹配到最后一个 </a>,导致每次都只能匹配到一条信息……
<h3>
心静似高山流水不动,心清若巫峰雾气不沾。
</h3>
<h3>
心静似高山流水不动,心清若巫峰雾气不沾。
</h3>