C#用正则表达式 获取网页源代码标签的属性或值
1.由 URL 获取网页源代码:
using System.Web;
using System.IO;
using System.Net;

/// <summary>
/// Requests <paramref name="PageUrl"/> and reads the entire response body as UTF-8 text.
/// </summary>
/// <param name="PageUrl">URL of the page whose source should be downloaded.</param>
private void GetHtmlinfo(string PageUrl)
{
    WebRequest request = WebRequest.Create(PageUrl);

    // The using statements release the response, its stream and the reader even when
    // GetResponseStream or ReadToEnd throws; the original closed them only on success
    // and never closed the response itself.
    using (WebResponse response = request.GetResponse())
    using (Stream resStream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(resStream, System.Text.Encoding.UTF8))
    {
        // Full page source. The snippet stops here; use htmlinfo inside this block.
        string htmlinfo = sr.ReadToEnd();
    }
}
2.获取标签中的值:
using System.Text.RegularExpressions;

/// <summary>
/// Returns the text found between an opening tag and its closing tag in a string.
/// </summary>
/// <param name="str">The text to search (for example, HTML source).</param>
/// <param name="title1">
/// Opening-tag fragment that follows '&lt;'. It is inserted into the regular expression
/// unescaped, so it is interpreted as regex syntax.
/// </param>
/// <param name="title2">Name of the closing tag; also inserted unescaped.</param>
/// <returns>
/// The text of the first match (case-insensitive), or an empty string when nothing matches.
/// The captured text cannot contain '&lt;', so nested tags are not returned.
/// </returns>
public static string GetTitleContent(string str, string title1, string title2)
{
    // The original pattern was "</ {1}>" with a literal space, which never matches a normal
    // "</tag>". Optional whitespace accepts "</tag>" as well as the old "</ tag>" form.
    string pattern = string.Format("<{0}[^>]*?>(?<Text>[^<]*)</\\s*{1}\\s*>", title1, title2);

    Match match = Regex.Match(str, pattern, RegexOptions.IgnoreCase);

    // Groups["Text"].Value is an empty string when the match failed.
    return match.Groups["Text"].Value;
}
Example:
HTML 源文件:<span class="t1_tx">现排名:<b class="color1">20</b>
Parameter: title1 = @"span class=""t1_tx"">现排名:<b class=""color1""";
title2 = "b";
3.获取标签中的属性:
/// <summary>
/// Returns the value of an attribute on the first matching tag in a string.
/// </summary>
/// <param name="str">The text to search (for example, HTML source).</param>
/// <param name="title">Tag name; inserted into the regular expression unescaped.</param>
/// <param name="attrib">Attribute name; inserted into the regular expression unescaped.</param>
/// <returns>
/// The attribute value without surrounding quotes, or an empty string when nothing matches.
/// </returns>
/// <remarks>
/// This method has the same name and parameter types as the tag-text method shown earlier
/// in this article; rename one of them before placing both in the same class.
/// </remarks>
public static string GetTitleContent(string str, string title, string attrib)
{
    // Three alternatives share the group name "url": double-quoted, single-quoted and
    // unquoted values. The original single class [^'"\s>]+ rejected quoted values that
    // contain whitespace (e.g. class="a b"). '>' stays excluded so a match never runs
    // past the end of the tag.
    string pattern = string.Format(
        "<{0}[^>]*?{1}=(?:\"(?<url>[^\">]*)\"|'(?<url>[^'>]*)'|(?<url>[^'\"\\s>]+))[^>]*>",
        title,
        attrib);

    Match match = Regex.Match(str, pattern, RegexOptions.IgnoreCase);

    // Groups["url"].Value is an empty string when the match failed.
    return match.Groups["url"].Value;
}