C#远程抓取网页

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Diagnostics;

namespace frmTest
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        private void Form1_Load(object sender, EventArgs e)
        {
            //初始化
            txtFilePath.Text = Application.StartupPath;//初始化保存目录为应用程序所在的目录
            txtWeb.Text = "http://www.cfchina.cn";
            cboEnCode.SelectedIndex = 0;
            timer1.Enabled = false;
            lblResult.Text = "";
            initToolTip();
        }


        private void btnSelect_Click(object sender, EventArgs e)
        {
            //选择文件的保存路径
            if (folderBrowserDlg.ShowDialog()==DialogResult.OK) {
                txtFilePath.Text = folderBrowserDlg.SelectedPath;
            }
        }

        private void btnStart_Click(object sender, EventArgs e)
        {

            lblResult.Text = "正在初始化...";
            getUrl();
            timer1.Enabled = true;//启动定时器
        }

        //抓取网页并保存
        private void getUrl() {
            try
            {
                string fileName;
                fileName = txtFilePath.Text + "\\" + DateTime.Now.ToString().Replace(" ", "").Replace(":", "").Replace("-", "") + ".html";
                WebClient mywebclient = new WebClient();

                //从指定网址下载数据
                lblResult.Text = "正在尝试从指定网址下载数据...";
                byte[] pagedata = mywebclient.DownloadData(txtWeb.Text);
                string pagehtml = "";

                //设置编码
                if (cboEnCode.SelectedItem.ToString() == "GB2312")
                {
                    pagehtml = Encoding.Default.GetString(pagedata);
                }
                else
                {
                    pagehtml = Encoding.UTF8.GetString(pagedata);
                }              

                using (StreamWriter sw = new StreamWriter(fileName))
                {
                    textBox1.Text = pagehtml;
                    textBox1.Text =  textBox1.Text.Replace("gb2312", "utf-8");//默认抓取下的内容就是utf-8编码的,没这一行,抓取下来的网页无法正常显示
                    //很奇怪用pagehtml = pagehtml.Repalce("gb2312","utf-8")没用????可能pagehtml内部已经是utf-8了,根本找不到"gb2312"这几个字
                    sw.WriteLine(textBox1.Text);                  
                    lst1.Items.Add(fileName);                  
                }
            }
            catch (WebException webEx)
            {
                lblResult.Text = "错误:" + webEx.Message.ToString();

            }
        }

        //定时器事件
        private void timer1_Tick(object sender, EventArgs e)
        {          
            timer1.Interval = Convert.ToInt16(txtTime.Text) * 1000;
            getUrl();
        }

        private void btnEnd_Click(object sender, EventArgs e)
        {
            lblResult.Text = "已停止";
            timer1.Enabled = false;//关闭定时器
            delmyFile();//删除生成的网页
            lst1.Items.Clear();
        }

        //双击打开生成的网页
        private void lst1_DoubleClick(object sender, EventArgs e)
        {
            if (lst1.Items.Count == 0)
            {
                return;
            }
            else
            {
                using (Process p = new Process())
                {
                    p.StartInfo.FileName = "IEXPLORE.EXE";
                    p.StartInfo.Arguments = lst1.SelectedItem.ToString();
                    p.Start();
                }
            }
        }

 

        //删除生成的网页
        private void delmyFile() {           
            string filename;
            int i = 0;
            for (i = 0; i <= lst1.Items.Count - 1; i++) {
                filename = lst1.Items[i].ToString();
                if (File.Exists(filename)){
                        File.Delete(filename);
                }
            }
        }


        //初始化toolTip
        private void initToolTip()
        {
            toolTip1.SetToolTip(lst1, "抓取并保存后的网页名称列表\n双击可直接用IE打开抓取下来的网页");
            toolTip1.SetToolTip(btnEnd, "停止网页抓取,并删除抓取下来的网页");
            toolTip1.SetToolTip(btnStart, "开始抓取指定的网页");
            toolTip1.SetToolTip(btnSelect, "选择保存的文件路径");
            toolTip1.SetToolTip(txtTime, "定时器间隔秒数,即每隔多少秒抓取一次网页");
            toolTip1.SetToolTip(lblTime, "定时器间隔秒数,即每隔多少秒抓取一次网页");
            toolTip1.SetToolTip(textBox1, "显示抓取下来的网页代码内容");
            toolTip1.SetToolTip(txtWeb, "输入要抓取的网址,必须以http://开头");
            toolTip1.SetToolTip(cboEnCode, "选取网页的保存编码,必须与网站源代码一致,否则抓取下来后可能显示不正常");
            toolTip1.SetToolTip(txtFilePath, "输入文件保存目录,抓下来的网页将保存在这里");
            toolTip1.SetToolTip(btnPause, "暂时抓取");
        }


        //抓取暂停
        private void btnPause_Click(object sender, EventArgs e)
        {
            timer1.Enabled = false;
        }
    }
}

posted @ 2007-04-17 11:00  海底的鱼  阅读(2654)  评论(3)  编辑  收藏  举报