通过正则表达式提取网页
1.先看需求吧,我们需要设计一个文本框,然后输入网址后从该网址上下载HTML源码
设想一下,我们可以通过 System.Net 中的类(如 WebClient / HttpWebRequest)来下载网页源码,代码如下:
/// <summary>
/// Finds a <c>charset=value</c> declaration inside HTML markup. The value may be
/// unquoted (<c>content="text/html; charset=gb2312"</c>) or quoted
/// (<c>&lt;meta charset="utf-8"&gt;</c>); only the encoding name is captured.
/// </summary>
private static readonly Regex MetaCharsetRegex = new Regex(
    @"charset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9][\w.:\-]*)",
    RegexOptions.IgnoreCase);

/// <summary>
/// Downloads the page at <paramref name="uri"/> and returns its HTML source as text.
/// </summary>
/// <remarks>
/// The page is requested once. The text encoding is chosen in this order:
/// the charset parameter of the Content-Type header, then the response's
/// <c>CharacterSet</c>, then UTF-8. If the markup itself declares a different
/// (and known) charset, the bytes are decoded again with that charset.
/// </remarks>
/// <param name="uri">Absolute address of the page to download.</param>
/// <returns>
/// The decoded HTML, or <see cref="string.Empty"/> when the response is not
/// <c>text/html</c> or when any error occurs.
/// </returns>
private string GetWebPage(string uri)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

        // Let the framework undo gzip/deflate transfer compression so the bytes
        // read below are always the raw document. Content-Encoding describes
        // compression, not a character set, so it must never reach Encoding.GetEncoding.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        byte[] rawHtml;
        Encoding pageEncoding;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            string contentType = response.ContentType ?? string.Empty;
            string mediaType = contentType.Split(';')[0].Trim();
            if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
            {
                // Only HTML documents are of interest; skip the body entirely.
                return string.Empty;
            }

            pageEncoding = TryResolveEncoding(ParseCharsetParameter(contentType))
                ?? TryResolveEncoding(response.CharacterSet)
                ?? Encoding.UTF8;

            using (System.IO.Stream body = response.GetResponseStream())
            {
                rawHtml = ReadResponseBytes(body);
            }
        }

        string html = pageEncoding.GetString(rawHtml);

        // The markup may declare its own charset; as in the original flow it wins
        // over the HTTP headers when it names a different, recognised encoding.
        Match declaration = MetaCharsetRegex.Match(html);
        if (declaration.Success)
        {
            Encoding declared = TryResolveEncoding(declaration.Groups["charset"].Value);
            if (declared != null && declared.CodePage != pageEncoding.CodePage)
            {
                html = declared.GetString(rawHtml);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive an empty string for any
        // failure (bad URI, network error, non-HTTP scheme, ...).
        return string.Empty;
    }
}

/// <summary>Returns the value of the <c>charset</c> parameter of a Content-Type header, or an empty string.</summary>
private static string ParseCharsetParameter(string contentType)
{
    foreach (string part in contentType.Split(';'))
    {
        int equalsAt = part.IndexOf('=');
        if (equalsAt < 0)
        {
            continue;
        }

        string parameterName = part.Substring(0, equalsAt).Trim();
        if (string.Equals(parameterName, "charset", StringComparison.OrdinalIgnoreCase))
        {
            // The value may be quoted, e.g. charset="utf-8".
            return part.Substring(equalsAt + 1).Trim().Trim('"', '\'');
        }
    }

    return string.Empty;
}

/// <summary>Maps an encoding name to an <see cref="Encoding"/>; returns null for empty or unknown names.</summary>
private static Encoding TryResolveEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        // Unknown or malformed charset name: let the caller fall back.
        return null;
    }
}

/// <summary>Reads <paramref name="source"/> to its end and returns everything that was read.</summary>
private static byte[] ReadResponseBytes(System.IO.Stream source)
{
    using (System.IO.MemoryStream buffer = new System.IO.MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }

        return buffer.ToArray();
    }
}
当提取出 HTML 源码后,接下来需要对源码进行预处理:
// Remove every <p> and <br> tag in any form: opening, closing or self-closing,
// any letter case, with optional whitespace or attributes (e.g. <br />, <P class="x">).
// The lookahead keeps longer tag names such as <pre> or <param> untouched.
// NOTE(review): presumably these tags are dropped because they are often left
// unclosed and would confuse the tag-matching regex used later; confirm.
Html = Regex.Replace(Html, @"</?(?:br|p)(?=[\s/>])[^>]*>", string.Empty, RegexOptions.IgnoreCase);
由于我们需要在TreeView中完整的将网页的数据显示出来,首先在页面上创建TreeView,然后创建节点
/// <summary>
/// Builds a tree node titled <paramref name="Titles"/> whose children are the HTML
/// tags found in <paramref name="inputString"/>; each child is built recursively
/// from that tag's inner HTML.
/// </summary>
/// <param name="inputString">HTML fragment to scan for tags.</param>
/// <param name="Titles">Text shown on the returned node.</param>
/// <returns>
/// The populated node. If an exception occurs it is reported in a message box and
/// the node is returned with whatever had been added up to that point.
/// </returns>
private TreeNode populateTagNode(string inputString, string Titles)
{
    TreeNode htmlTagNode = new TreeNode();
    try
    {
        // Extract the tags of this nesting level with the regular expression.
        MatchCollection matchesFound = regex.Matches(inputString);
        htmlTagNode.Text = Titles;

        foreach (Match matchMade in matchesFound)
        {
            // Running total of matched tags; the counter is declared outside this snippet.
            intMathchesMade++;

            // Groups are read by name so the code keeps working if the pattern is reordered.
            string tagName = matchMade.Groups["outertag"].Value;
            string attributes = matchMade.Groups["attributes"].Value.Trim();
            string tagLabel = "<" + tagName + " " + attributes + ">";

            TreeNode htmlSubTagNode = populateTagNode(matchMade.Groups["innerhtml"].Value, tagLabel);
            htmlTagNode.Nodes.Add(htmlSubTagNode);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("Error:" + ex.Message);
    }

    return htmlTagNode;
}
正则表示式
// Pattern overview:
//   outertag   - tag name: letters plus an optional digit (h1..h6)
//   attributes - everything between the tag name and the closing '>'
//   innerhtml  - optional content up to the matching </outertag>, made of
//                paired inner tags, lone tags, or plain text
// The attributes group is matched exactly once: the former "(...)*" wrapper nested
// one quantifier inside another, which can backtrack exponentially on a tag that
// has no closing '>'.
// NOTE(review): the old wrapper also appeared to leave the capture empty after its
// final zero-length iteration, so attributes may only now show up in node labels; confirm.
private static readonly Regex regex = new Regex(
    "<(?<outertag>[a-z]+[\\d]?)(?<attributes>[^>]*)>" +
    "(?<innerhtml>(<(?<innertag>[a-z]+[\\d]?)[^>]*>.*?</\\k<innertag>>|" +
    "<[a-z]+[\\d]?[^>]*>|(?>[^>]*))*(?=</\\k<outertag>>))?",
    RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.ExplicitCapture | RegexOptions.Singleline);