// Scraping page information with regular expressions (正则抓取页面信息).
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;
namespace CollectingInformation
{
/// <summary>
/// Main form of the collector. One button downloads the page whose URL is typed
/// into <c>textBox1</c>, extracts a title / user name / image source triple with
/// regular expressions and appends it to an XML store; the other button renders
/// the stored records into an HTML table.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Name of the root element of the XML store.</summary>
    private const string RootElementName = "InfoBy58";

    /// <summary>Name of the element that holds one scraped record.</summary>
    private const string RecordElementName = "UserInfo";

    /// <summary>Charset assumed when the response does not declare a usable one.</summary>
    private const string FallbackCharset = "gb2312";

    // Both expressions are compiled once and shared; constructing a
    // RegexOptions.Compiled regex on every click is needlessly expensive.

    /// <summary>Selects the page fragment from the first &lt;h1&gt; up to the next &lt;h2&gt;.</summary>
    private static readonly Regex HeadingSectionRegex = new Regex(
        @"<h1>([\S\s]*?)<h2>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Captures, in order: the &lt;h1&gt; text, the value following <c>username:</c>
    /// and the <c>src</c> of the first following &lt;img&gt; tag.
    /// </summary>
    private static readonly Regex UserInfoRegex = new Regex(
        @"<h1>(.*)</h1>[\S\s]*?username:'(.*)'[\S\s]*?<img src='(.*)'[\S\s]*?",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>Location of the XML store, next to the executable.</summary>
    private readonly string XMLPath = Path.Combine(Application.StartupPath, "58.xml");

    /// <summary>Location of the exported HTML report, next to the executable.</summary>
    private readonly string HTMLPath = Path.Combine(Application.StartupPath, "58.html");

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Downloads the page named in <c>textBox1</c> and, when exactly one record is
    /// recognised in its first heading section, appends that record to the XML store.
    /// Any failure is reported to the user in a message box.
    /// </summary>
    /// <param name="sender">The control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    private void btnOK_Click(object sender, EventArgs e)
    {
        string pagePath = textBox1.Text.Trim();
        try
        {
            EnsureXmlStoreExists();

            string pageSource = DownloadPage(pagePath);

            MatchCollection sections = HeadingSectionRegex.Matches(pageSource);
            if (sections.Count == 0)
            {
                return;
            }

            // Only the first heading section is inspected, and it is stored only
            // when it yields exactly one match.
            MatchCollection records = UserInfoRegex.Matches(sections[0].Value);
            if (records.Count != 1)
            {
                return;
            }

            GroupCollection groups = records[0].Groups;
            XDocument document = XDocument.Load(XMLPath);
            XElement store = GetStoreRoot(document);
            store.Add(new XElement(RecordElementName,
                new XElement("Title", groups[1].Value),
                new XElement("Name", groups[2].Value),
                // "Tel" holds the captured <img> source, not a literal number.
                new XElement("Tel", groups[3].Value)));
            document.Save(XMLPath);
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.Message);
        }
    }

    /// <summary>
    /// Renders every record of the XML store into an HTML table and writes it to
    /// <see cref="HTMLPath"/>, replacing any previous export.
    /// Any failure is reported to the user in a message box.
    /// </summary>
    /// <param name="sender">The control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    private void btnExport_Click(object sender, EventArgs e)
    {
        try
        {
            XElement store = GetStoreRoot(XDocument.Load(XMLPath));

            StringBuilder html = new StringBuilder();
            html.Append("<html>");
            // The file is written as UTF-8; declare it so the Chinese headers render.
            html.Append("<head><meta charset=\"utf-8\"/></head>");
            html.Append("<body>");
            html.Append("<table border=\"1\">");
            html.Append("<tr>");
            html.Append("<th>标题</th>");
            html.Append("<th>联系人</th>");
            html.Append("<th>电话</th>");
            html.Append("</tr>");

            // Records live under the root element, not directly under the document.
            foreach (XElement item in store.Elements(RecordElementName))
            {
                // The explicit string conversion yields null for a missing child
                // instead of throwing; HtmlEscape maps null to an empty string.
                string title = (string)item.Element("Title");
                string name = (string)item.Element("Name");
                string tel = (string)item.Element("Tel");

                html.Append("<tr>");
                html.Append("<td>").Append(HtmlEscape(title)).Append("</td>");
                html.Append("<td>").Append(HtmlEscape(name)).Append("</td>");
                html.Append("<td><img src='").Append(HtmlEscape(tel)).Append("'/></td>");
                html.Append("</tr>");
            }

            html.Append("</table>");
            html.Append("</body></html>");

            // Creates or truncates the file and always closes the handle
            // (UTF-8 without a byte order mark).
            File.WriteAllText(HTMLPath, html.ToString());
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.Message);
        }
    }

    /// <summary>Creates an empty XML store when none exists yet.</summary>
    private void EnsureXmlStoreExists()
    {
        if (!File.Exists(XMLPath))
        {
            new XElement(RootElementName).Save(XMLPath);
        }
    }

    /// <summary>Returns the root element of the XML store.</summary>
    /// <param name="document">The loaded store document.</param>
    /// <returns>The <c>InfoBy58</c> element.</returns>
    /// <exception cref="InvalidDataException">The document has no <c>InfoBy58</c> root.</exception>
    private static XElement GetStoreRoot(XDocument document)
    {
        XElement store = document.Element(RootElementName);
        if (store == null)
        {
            throw new InvalidDataException(
                "The XML store does not contain the expected <" + RootElementName + "> root element.");
        }
        return store;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and decodes the body with the charset
    /// announced in the response's content type (see <see cref="ResolveEncoding"/>).
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>The decoded response body.</returns>
    private static string DownloadPage(string url)
    {
        WebRequest request = WebRequest.Create(url);
        // WebResponse is used directly so that non-HTTP schemes do not end in a
        // null dereference; the response, stream and reader are all disposed.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, ResolveEncoding(response.ContentType)))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Picks the text encoding for a response from its Content-Type header value.
    /// </summary>
    /// <param name="contentType">Raw Content-Type value; may be null or empty.</param>
    /// <returns>
    /// The encoding named by the <c>charset</c> parameter; otherwise gb2312;
    /// otherwise UTF-8 when gb2312 is not available either.
    /// </returns>
    private static Encoding ResolveEncoding(string contentType)
    {
        string charset = null;
        if (!string.IsNullOrEmpty(contentType))
        {
            foreach (string part in contentType.Split(';'))
            {
                string parameter = part.Trim();
                if (parameter.StartsWith("charset=", StringComparison.OrdinalIgnoreCase))
                {
                    // The value may be quoted, e.g. charset="utf-8".
                    charset = parameter.Substring("charset=".Length).Trim().Trim('"', '\'');
                    break;
                }
            }
        }

        foreach (string candidate in new[] { charset, FallbackCharset })
        {
            if (string.IsNullOrEmpty(candidate))
            {
                continue;
            }
            try
            {
                return Encoding.GetEncoding(candidate);
            }
            catch (ArgumentException)
            {
                // Unknown encoding name: deliberately fall through to the next candidate.
            }
        }
        return Encoding.UTF8;
    }

    /// <summary>
    /// Escapes the characters that are significant in HTML text and in quoted
    /// attribute values. The stored values come from downloaded pages and must
    /// not be emitted verbatim.
    /// </summary>
    /// <param name="value">Raw text; may be null.</param>
    /// <returns>The escaped text, or an empty string for null or empty input.</returns>
    private static string HtmlEscape(string value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }
        // '&' must be replaced first so the entities added below are not re-escaped.
        return value
            .Replace("&", "&amp;")
            .Replace("<", "&lt;")
            .Replace(">", "&gt;")
            .Replace("\"", "&quot;")
            .Replace("'", "&#39;");
    }
}
}