/// <summary>
/// Sends an HTTP GET request to <paramref name="url"/> and returns the response body as text.
/// The downloaded text is also written to the debug output between two separator lines.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The text content of the response.</returns>
public string HttpGetText(string url)
{
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
    request.Method = "GET";
    request.ContentType = @"application/x-www-form-urlencoded";
    request.Accept = @"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    // Headers.Add takes the header VALUE only; repeating the header name inside the value
    // would send "Accept-Language: Accept-Language:zh-CN,...".
    request.Headers.Add(HttpRequestHeader.AcceptLanguage, @"zh-CN,zh;q=0.8");
    request.UserAgent = @"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0";
    // `proxy` is a member declared outside this excerpt.
    request.Proxy = proxy;

    string text;
    // Dispose the response, its stream and the reader even when reading fails,
    // so the underlying connection is always released.
    using (var response = request.GetResponse())
    using (var stream = response.GetResponseStream())
    using (var reader = new StreamReader(stream))
    {
        text = reader.ReadToEnd();
    }

    string separator = new string('=', 20);
    System.Diagnostics.Debug.WriteLine(separator);
    System.Diagnostics.Debug.WriteLine(text);
    System.Diagnostics.Debug.WriteLine(separator);
    return text;
}

/// <summary>
/// Container for the proxy entries extracted from the downloaded page.
/// </summary>
public class IPs
{
    /// <summary>Proxy entries collected so far.</summary>
    public List<proxy> items = new List<proxy>();

    /// <summary>
    /// One table row of the proxy list.
    /// </summary>
    public class proxy
    {
        /// <summary>Text of the row's IP cell.</summary>
        public string ip;

        /// <summary>Port number parsed from the row's port cell.</summary>
        public int port;

        /// <summary>Text of the link inside the row's address cell.</summary>
        public string address;

        /// <summary>Number taken from the row's <c>width:NN%</c> style.</summary>
        public int speed;

        /// <summary>Uptime in minutes.</summary>
        public int life;

        /// <summary>Timestamp parsed from the row's check-time cell.</summary>
        public DateTime check_time;
    }
}

/// <summary>
/// Matches one row of the proxy table and captures the ip, port, address, speed, life and
/// check_time cells. Built once instead of on every click.
/// </summary>
private static readonly Regex ProxyRowRegex = new Regex(
    @"<tr.*?>\s*?<td.*?>.*?</td>\s*?<td.*?>(?<ip>.*?)</td>\s*?<td.*?>(?<port>.*?)</td>\s*?<td.*?>\s*?<a.*?>(?<address>.*?)</a>\s*?</td>.*?width:(?<speed>.*?)%.*?<td>(?<life>.*?)</td>.*?<td>(?<check_time>.*?)</td>.*?</tr>",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);

/// <summary>
/// Downloads the proxy list page, cuts out the <c>ip_list</c> table, extracts every row with
/// a regular expression, and adds each successfully parsed row to both <c>listView1</c> and
/// <c>IPaddress.items</c>. Rows that cannot be converted are logged and skipped.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    const string tableOpen = "<table id=\"ip_list\">";
    const string tableClose = "</table>";

    var html = HttpGetText("http://www.xicidaili.com/nt");

    // Look for the closing tag only AFTER the opening tag; searching from index 0 could hit
    // an earlier table and produce a wrong (or negative) length.
    int tableStart = html.IndexOf(tableOpen, StringComparison.Ordinal);
    int tableEnd = tableStart < 0 ? -1 : html.IndexOf(tableClose, tableStart, StringComparison.Ordinal);
    if (tableEnd < 0)
    {
        // The page layout changed or the download returned something else; nothing to parse.
        LogAdd("未找到IP列表表格");
        return;
    }

    string tableHtml = html.Substring(tableStart, tableEnd - tableStart + tableClose.Length);
    var rows = ProxyRowRegex.Matches(tableHtml);

    listView1.BeginUpdate();
    try
    {
        foreach (System.Text.RegularExpressions.Match item in rows)
        {
            try
            {
                string ipText = item.Groups["ip"].Value;
                string portText = item.Groups["port"].Value;
                string addressText = item.Groups["address"].Value;
                string speedText = item.Groups["speed"].Value;
                string lifeText = item.Groups["life"].Value;
                string checkTimeText = item.Groups["check_time"].Value;

                // Convert everything first so a bad row leaves no partial state behind.
                var ip = new IPs.proxy
                {
                    ip = ipText,
                    port = Convert.ToInt32(portText),
                    address = addressText,
                    speed = Convert.ToInt32(speedText),
                    life = conv(lifeText),
                    check_time = Convert.ToDateTime(checkTimeText),
                };

                // The list view shows the raw cell texts, in column order.
                var lvi = new ListViewItem(new[] { ipText, portText, addressText, speedText, lifeText, checkTimeText });

                listView1.Items.Add(lvi);
                IPaddress.items.Add(ip);
            }
            catch (Exception ex) when (ex is FormatException || ex is OverflowException)
            {
                // Only conversion failures are expected here; anything else is a real bug
                // and should not be hidden.
                LogAdd("转换IP地址信息出错 " + item.Value);
            }
        }
    }
    finally
    {
        // Always resume painting, even if an unexpected exception escapes the loop.
        listView1.EndUpdate();
    }

    // Converts an uptime text such as "3天", "5小时" or "20分钟" to minutes.
    // Text without a known unit is treated as a plain number of minutes.
    int conv(string life)
    {
        int minutesPerUnit = 1;
        if (life.Contains("天"))
        {
            minutesPerUnit = 60 * 24;
            life = life.Replace("天", "");
        }
        else if (life.Contains("分钟"))
        {
            life = life.Replace("分钟", "");
        }
        else if (life.Contains("小时"))
        {
            minutesPerUnit = 60;
            life = life.Replace("小时", "");
        }
        return Convert.ToInt32(life) * minutesPerUnit;
    }
}
关键代码就是下载指定网页的HTML,然后用正则表达式把其中的代理IP信息提取出来。
本来想着直接将HTML转换为XML来解析,谁知该网页写得不标准,转换不成功。
只好用正则表达式来查找了,效果不错~
代码运行环境: vs2017
当然老版本也可以:局部函数是C# 7.0(VS2017)才支持的语法,只需将 button1_Click 里的局部函数 conv 移到方法外部、改为普通的私有方法即可。
效果图:
关键代码部分:
// Download the page that lists the proxies.
var html = HttpGetText("http://www.xicidaili.com/nt");

// Cut out the <table id="ip_list"> element. The closing tag is searched only after the
// opening tag, and a missing marker yields an empty string instead of making Substring throw.
int i1 = html.IndexOf("<table id=\"ip_list\">", StringComparison.Ordinal);
int i2 = i1 < 0 ? -1 : html.IndexOf("</table>", i1, StringComparison.Ordinal);
string ip_list = i2 < 0 ? string.Empty : html.Substring(i1, i2 - i1 + "</table>".Length);

// One match per table row; the named groups capture the individual cells.
var find = new Regex(@"<tr.*?>\s*?<td.*?>.*?</td>\s*?<td.*?>(?<ip>.*?)</td>\s*?<td.*?>(?<port>.*?)</td>\s*?<td.*?>\s*?<a.*?>(?<address>.*?)</a>\s*?</td>.*?width:(?<speed>.*?)%.*?<td>(?<life>.*?)</td>.*?<td>(?<check_time>.*?)</td>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
var ips = find.Matches(ip_list);
正则表达式推荐一个网址及学习工具:
http://deerchao.net/tutorials/regex/regex.htm#charclass
我本人也记不住正则表达式的语法,需要用的时候现查。