正则抓取网页所有href和src
根据抓取到的页面内容，用正则表达式匹配页面中的 href 和 src。
// User-Agent header value sent with every upstream request.
string UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:29.0) Gecko/20100101 Firefox/29.0";

// Content-Type header of the most recent successful upstream response.
// Written by getVerificationCode; not applied to the outgoing response in the visible code.
string ContentType = "";

// Base address of the upstream site; every incoming request URL is resolved against it.
Uri strReqUrl = new Uri("http://m.lhrb.ufstone.net/");

/// <summary>
/// Handles every incoming request by fetching the same path from the upstream site
/// (<see cref="strReqUrl"/>), decoding the body as gb2312, passing it through
/// <c>GetHtmlUrl</c>, writing the result to the response and ending the request.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
protected void Application_BeginRequest(object sender, EventArgs e)
{
    // TryCreate avoids an unhandled UriFormatException on a malformed raw URL.
    // IsBaseOf rejects raw URLs such as "//other-host/path" that would resolve
    // away from the configured upstream and turn this handler into an open proxy.
    Uri u;
    if (!Uri.TryCreate(strReqUrl, Request.RawUrl, out u) || !strReqUrl.IsBaseOf(u))
    {
        Response.StatusCode = 400;
        Response.End();
        return;
    }

    byte[] b = getVerificationCode(u);
    if (b == null)
    {
        // The upstream download failed; report a gateway error instead of
        // passing null to Encoding.GetString (which throws ArgumentNullException).
        Response.StatusCode = 502;
        Response.End();
        return;
    }

    StringBuilder strHtml = new StringBuilder(Encoding.GetEncoding("gb2312").GetString(b));
    GetHtmlUrl(ref strHtml);
    Response.Write(strHtml.ToString());
    Response.End();
}

/// <summary>
/// Downloads the raw bytes at <paramref name="url"/> using browser-like request headers
/// and records the response's Content-Type header in <see cref="ContentType"/>.
/// </summary>
/// <param name="url">Absolute address to download.</param>
/// <returns>The response body, or <c>null</c> when <paramref name="url"/> is null or the download fails.</returns>
public byte[] getVerificationCode(Uri url)
{
    if (url == null)
    {
        return null;
    }

    // WebClient is IDisposable; the using block releases it on every path.
    using (WebClient MyWebClient = new WebClient())
    {
        MyWebClient.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        MyWebClient.Headers.Add("Accept-Language", " zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
        MyWebClient.Headers.Add("User-Agent", this.UserAgent);

        // NOTE(review): DefaultCredentials offers the application's logon credentials to the
        // upstream server if it asks for authentication - confirm this is intended for an external site.
        MyWebClient.Credentials = CredentialCache.DefaultCredentials;

        try
        {
            byte[] pageData = MyWebClient.DownloadData(url.AbsoluteUri);
            ContentType = MyWebClient.ResponseHeaders["Content-Type"];
            return pageData;
        }
        catch (WebException)
        {
            // Network/HTTP failure: callers treat null as "download failed".
            return null;
        }
    }
}
/// <summary>
/// Scans the HTML text for <c>src</c> and <c>href</c> attributes and gathers each matched
/// attribute (name and value) into a local, newline-separated buffer.
/// </summary>
/// <param name="strHtml">
/// HTML to scan. It is only read here: the step that would rewrite the matched URLs
/// inside it is disabled, so the builder is left unchanged.
/// </param>
void GetHtmlUrl(ref StringBuilder strHtml)
{
    // Matches src=... / href=... where the value is either double-quoted
    // or an unquoted run of non-whitespace characters.
    const string pattern = "(src|href)\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))";
    Regex attributeRegex = new Regex(pattern, RegexOptions.None);

    StringBuilder found = new StringBuilder();
    foreach (Match hit in attributeRegex.Matches(strHtml.ToString()))
    {
        // Record the whole matched attribute text, one per line.
        found.Append(hit.Value).Append("\n");

        // Disabled rewrite step, kept for reference: resolve the match against the
        // upstream base address and replace it in strHtml with a site-relative path.
        // Uri u = new Uri(strReqUrl, hit.Value.Replace("\"", "").Replace("'", ""));
        // strHtml.Replace(hit.Value, @"/" + u.ToString().Replace(strReqUrl.ToString(), ""));
    }
}