正则抓取页面信息

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;

namespace CollectingInformation
{
    /// <summary>
    /// Main form of the collector. It downloads a page, extracts a title, a user name and an
    /// image source with regular expressions, appends them to an XML store, and can export
    /// the accumulated records as an HTML table.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Name of the root element of the XML store.</summary>
        private const string RootElementName = "InfoBy58";

        /// <summary>Charset used when the response does not declare a usable one.</summary>
        private const string DefaultCharset = "gb2312";

        /// <summary>Isolates the part of the page between the first &lt;h1&gt; and the following &lt;h2&gt;.</summary>
        private static readonly Regex HeaderSectionRegex = new Regex(
            "<h1>" + @"([\S\s]*?)" + "<h2>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Captures, in order: the &lt;h1&gt; text, the quoted value after "username:" and the
        /// src of the following &lt;img&gt;. "@all" is a placeholder for "any characters, lazily".
        /// </summary>
        private static readonly Regex UserInfoRegex = new Regex(
            "<h1>(.*)</h1>@allusername:'(.*)'@all<img src='(.*)'@all".Replace("@all", @"[\S\s]*?"),
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        public Form1()
        {
            InitializeComponent();
        }

        // Both files live next to the executable.
        private string XMLPath = Path.Combine(Application.StartupPath, "58.xml");
        private string HTMLPath = Path.Combine(Application.StartupPath, "58.html");

        /// <summary>
        /// Downloads the page whose address is typed in <c>textBox1</c> and, when exactly one
        /// record can be extracted from it, appends that record to the XML store.
        /// Any failure is reported to the user in a message box.
        /// </summary>
        private void btnOK_Click(object sender, EventArgs e)
        {
            string pagePath = textBox1.Text.Trim();
            try
            {
                EnsureXmlStoreExists();

                // Fetch the page content.
                string strSource = DownloadPage(pagePath);

                // Narrow the search to the first <h1> ... <h2> section.
                Match section = HeaderSectionRegex.Match(strSource);
                if (!section.Success)
                {
                    return;
                }

                // Only store the record when the section yields exactly one match.
                MatchCollection matchs = UserInfoRegex.Matches(section.Value);
                if (matchs.Count != 1)
                {
                    return;
                }

                GroupCollection groups = matchs[0].Groups;
                XDocument root = LoadStore();
                root.Element(RootElementName).Add(
                    new XElement("UserInfo",
                        new XElement("Title", groups[1].Value),
                        new XElement("Name", groups[2].Value),
                        // "Tel" holds the src of the captured <img>; the export renders it as an image.
                        new XElement("Tel", groups[3].Value)));
                root.Save(XMLPath);
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
        }

        /// <summary>
        /// Writes every stored record to the HTML file as a three-column table
        /// (title, contact, phone image). Any failure is reported in a message box.
        /// </summary>
        private void btnExport_Click(object sender, EventArgs e)
        {
            try
            {
                XElement xele = LoadStore().Element(RootElementName);

                StringBuilder strBuilder = new StringBuilder();
                strBuilder.Append("<html>");
                // The file is written as UTF-8; declare it so the Chinese headers render correctly.
                strBuilder.Append("<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/></head>");
                strBuilder.Append("<body>");
                strBuilder.Append("<table border=\"1\">");
                strBuilder.Append("<tr>");
                strBuilder.Append("<th>标题</th>");
                strBuilder.Append("<th>联系人</th>");
                strBuilder.Append("<th>电话</th>");
                strBuilder.Append("</tr>");

                // The records are children of the root element, not of the document itself.
                foreach (XElement item in xele.Elements("UserInfo"))
                {
                    strBuilder.Append("<tr>");
                    strBuilder.Append("<td>" + EncodedChild(item, "Title") + "</td>");
                    strBuilder.Append("<td>" + EncodedChild(item, "Name") + "</td>");
                    strBuilder.Append("<td><img src='" + EncodedChild(item, "Tel") + "'/></td>");
                    strBuilder.Append("</tr>");
                }
                strBuilder.Append("</table>");
                strBuilder.Append("</body></html>");

                // Creates the file or truncates an existing one, and releases the handle when done.
                File.WriteAllText(HTMLPath, strBuilder.ToString());
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
        }

        /// <summary>Creates an empty XML store when none exists yet.</summary>
        private void EnsureXmlStoreExists()
        {
            if (!File.Exists(XMLPath))
            {
                new XElement(RootElementName).Save(XMLPath);
            }
        }

        /// <summary>Loads the XML store and verifies that it has the expected root element.</summary>
        /// <exception cref="InvalidDataException">The file has no InfoBy58 root element.</exception>
        private XDocument LoadStore()
        {
            XDocument doc = XDocument.Load(XMLPath);
            if (doc.Element(RootElementName) == null)
            {
                throw new InvalidDataException(
                    XMLPath + " does not contain an <" + RootElementName + "> root element.");
            }
            return doc;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and returns its body as text, decoded with the
        /// charset announced by the response (or the default charset when none is usable).
        /// </summary>
        private static string DownloadPage(string url)
        {
            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, ResolveEncoding(response.ContentType)))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Picks the text encoding named by the "charset" parameter of a Content-Type value.
        /// Falls back to the default charset when the parameter is missing or names an
        /// encoding this runtime does not know.
        /// </summary>
        private static Encoding ResolveEncoding(string contentType)
        {
            string charset = null;
            if (!string.IsNullOrEmpty(contentType))
            {
                foreach (string part in contentType.Split(';'))
                {
                    string[] pair = part.Split('=');
                    if (pair.Length == 2
                        && pair[0].Trim().Equals("charset", StringComparison.OrdinalIgnoreCase))
                    {
                        // The value may be quoted, e.g. charset="utf-8".
                        charset = pair[1].Trim().Trim('"', '\'');
                        break;
                    }
                }
            }

            if (!string.IsNullOrEmpty(charset))
            {
                try
                {
                    return Encoding.GetEncoding(charset);
                }
                catch (ArgumentException)
                {
                    // Unknown charset name: use the default below instead of aborting the download.
                }
            }
            return Encoding.GetEncoding(DefaultCharset);
        }

        /// <summary>
        /// Returns the HTML-encoded text of the child element <paramref name="name"/>,
        /// or an empty string when the record has no such child.
        /// </summary>
        private static string EncodedChild(XElement record, string name)
        {
            XElement child = record.Element(name);
            return child == null ? string.Empty : WebUtility.HtmlEncode(child.Value);
        }
    }
}

posted @ 2013-08-27 18:50  Johan-Choi  阅读(250)  评论(0)  编辑  收藏  举报