悉野小楼

导航

抓取源码爱好者所有网页特效例子并保存到本地

提取 http://www.codefans.net/jscss/code/1866.shtml 等类似网页中运行区块的 html 代码, 并保存到本地。

应该能把“源码爱好者 » 网页特效代码”下面的子目录都抓下来。我在自己的机器上运行了 15 分钟, 抓到了十几个子目录, 共 4M 多。

用到了正则表达式,文件读取保存, 多线程

 是用vs2005写的, winform

form1.cs

代码
/**************************************************************************************************************
 * 本程序多线程从特定网页中提取一块内容
 * 具体从http://www.codefans.net/jscss/code/1866.shtml提取中间演示textarea内的html文本
 * 从(网页特效代码)->(详细分类)->html网页中textarea内容
 * 
 * 程序内使用了1.多线程 2.正则表达式 3.web文件读取 4.本地文件保存及编码问题
 * 
 * 
 * 
 * 线程挂起没实现 好像用ThreadPool可以暂停纯种线程
 * 
 * 
 * 
 * 
 * 
 **************************************************************************************************************
*/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;

namespace dig
{
    
public partial class form1 : Form
    {
        
// Worker thread that downloads the pages; stays null until btnStart_Click creates it.
        private Thread getFileThread;

        // Captured when the form object is constructed; timer1_Tick subtracts it from the current time.
        private DateTime startTime = DateTime.Now;

        // URL currently being processed (assigned in CreateHtmlPage, displayed by timer1_Tick).
        private string strCurUrl = string.Empty;

        // Location of the most recently saved page, or a failure message (assigned in CreateHtmlPage).
        private string strSaveUrl = string.Empty;

        /// <summary>
        /// Creates the form and builds the designer-generated controls.
        /// </summary>
        public form1()
        {
            InitializeComponent();
        }
        
        
// http://www.codefans.net/jscss/code/1866.shtml
        private void btnStart_Click(object sender, EventArgs e)
        {
            
if (getFileThread == null)
            {
                getFileThread 
= new Thread(new ThreadStart(GetFileAndSave));//新建一个线程
                getFileThread.Start();//线程开始
            }
        }

        
/// <summary>
        /// Worker-thread entry point: builds the page URL for every id from 0 to 9998
        /// and passes each one to <see cref="CreateHtmlPage"/> in turn.
        /// </summary>
        private void GetFileAndSave()
        {
            // The upper bound is exclusive, so the last page requested is 9998.shtml.
            for (int i = 0; i < 9999; ++i)
            {
                // Ids are used as-is (no zero padding), e.g. .../code/1866.shtml
                CreateHtmlPage(@"http://www.codefans.net/jscss/code/" + i + ".shtml");
            }
        }

        
/// <summary>
        /// Downloads one page, extracts the contents of its textarea element, decodes the
        /// HTML entities in it and saves the result through <see cref="SaveFile"/>.
        /// The file name comes from the page title and the folder name from the first link
        /// text found right after the "网页特效代码" marker.
        /// </summary>
        /// <param name="strUrl">Absolute URL of the page to download.</param>
        /// <remarks>
        /// Failures (network errors, pages without the expected markup, file errors) do not
        /// propagate: the URL is appended to c:\error.txt and a failure message is stored in
        /// strSaveUrl. A thread abort is the exception to that rule and is passed on.
        /// </remarks>
        private void CreateHtmlPage(string strUrl)
        {
            try
            {
                // URL being processed; timer1_Tick shows it on the form.
                strCurUrl = strUrl;

                // Download the page. The using blocks release the connection even when
                // reading fails part-way.
                // NOTE(review): Encoding.Default is the machine's ANSI code page; this only
                // decodes the page correctly where that matches the site's encoding
                // (presumably gb2312, which is what the files are saved as) - confirm.
                string pageHtml;
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
                {
                    pageHtml = reader.ReadToEnd();
                }

                // File name: the page title without its "_源码爱好者" suffix.
                string strFileName = Regex.Match(pageHtml, @"(?<=(<title>)).*(?=_源码爱好者</title>)").Value;

                // Raw textarea contents. The capture is greedy, so it runs up to the last
                // closing textarea tag on the page.
                Match codeMatch = Regex.Match(pageHtml, @"(?<=(<textarea.*?>))([\w\W]*)(?=</textarea>)");

                int markerIndex = pageHtml.IndexOf("网页特效代码", StringComparison.Ordinal);

                // A page missing any of the three pieces is not a code sample page:
                // report it instead of saving an empty or nameless file.
                if (strFileName.Length == 0 || !codeMatch.Success || markerIndex < 0)
                {
                    ReportFailedUrl(strUrl);
                    return;
                }

                // Folder name: link text inside the (at most) 100 characters that start at
                // the marker. The length is clamped so a marker near the end of the page
                // cannot push Substring out of range.
                int sliceLength = Math.Min(100, pageHtml.Length - markerIndex);
                string breadcrumb = pageHtml.Substring(markerIndex, sliceLength);
                string strFloderName = Regex.Match(breadcrumb, @"(?<=(<a.*>)).*(?=</a>)", RegexOptions.IgnoreCase).Value;

                // Decode the entities used to embed the sample in the textarea.
                // "&amp;" goes last so that "&amp;lt;" becomes the literal text "&lt;" and not "<".
                string code = codeMatch.Value
                    .Replace("&quot;", "\"")
                    .Replace("&lt;", "<")
                    .Replace("&gt;", ">")
                    .Replace("&amp;", "&");

                // Write the file and remember where it went for the status display.
                string path = SaveFile(code, strFileName, strFloderName);
                strSaveUrl = path + "\\" + strFileName + ".html";
            }
            catch (ThreadAbortException)
            {
                // Raised when the worker is aborted (Stop button / form closing). That is
                // not a download failure, so it must not end up in the error log.
                throw;
            }
            catch (Exception)
            {
                ReportFailedUrl(strUrl);
            }
        }

        /// <summary>
        /// Records a page that could not be saved: sets the failure message shown on the
        /// form and appends the URL, one per line, to c:\error.txt.
        /// </summary>
        /// <param name="strUrl">URL of the page that failed.</param>
        private void ReportFailedUrl(string strUrl)
        {
            this.strSaveUrl = "读取远程url失败, 未能保存";

            try
            {
                using (StreamWriter sw = new StreamWriter(@"c:\error.txt", true, System.Text.Encoding.GetEncoding("gb2312")))
                {
                    sw.WriteLine(strUrl);
                }
            }
            catch (IOException)
            {
                // The log is best-effort: an unwritable log file must not take down the
                // worker thread, and the failure message above is already set.
            }
            catch (UnauthorizedAccessException)
            {
                // Same as above: no permission to write the log file.
            }
        }

        
/// <summary>
        /// Writes <paramref name="str"/> as gb2312 text to
        /// c:\网页特效代码\[strFloderName]\[strFileName].html, creating the folder when needed.
        /// </summary>
        /// <param name="str">Text to write.</param>
        /// <param name="strFileName">File name without the ".html" extension.</param>
        /// <param name="strFloderName">Sub-folder under c:\网页特效代码\ that receives the file.</param>
        /// <returns>The folder the file was written to (no trailing backslash is added).</returns>
        private static string SaveFile(string str, string strFileName, string strFloderName)
        {
            string path = @"c:\" + @"网页特效代码\" + strFloderName;

            // CreateDirectory does nothing when the folder already exists, so no
            // separate Exists check is needed.
            Directory.CreateDirectory(path);

            // Overwrite rather than append: appending would duplicate the page content
            // inside the same .html file every time the program is run again.
            // The using block closes the file even if the write throws.
            using (StreamWriter sw = new StreamWriter(path + "\\" + strFileName + ".html", false, System.Text.Encoding.GetEncoding("gb2312")))
            {
                sw.Write(str);
            }

            return path;
        }

        
/// <summary>
        /// Timer tick handler: refreshes the clock, the elapsed-seconds counter and the two
        /// status text boxes from the values the worker thread last stored.
        /// </summary>
        /// <param name="sender">Timer that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void timer1_Tick(object sender, EventArgs e)
        {
            // One reading of the clock feeds both labels so they cannot disagree.
            DateTime now = DateTime.Now;
            this.lblTime.Text = now.ToString();

            // TotalSeconds is the whole elapsed time; the Seconds property is only the
            // 0-59 component and would wrap back to 0 every minute.
            TimeSpan span = now.Subtract(startTime);
            this.lblTimeElapsed.Text = ((long)span.TotalSeconds).ToString();

            this.txtUrl.Text = strCurUrl;
            this.txtSaveUrl.Text = strSaveUrl;
        }

        
/// <summary>
        /// Stop button click handler: aborts the download worker if one has been created.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnStop_Click(object sender, EventArgs e)
        {
            // Stop can be clicked before Start, when no worker exists yet.
            if (getFileThread == null)
            {
                return;
            }

            getFileThread.Abort(); // end the thread

            // Drop the reference so that a later Start click creates a fresh worker.
            getFileThread = null;
        }

        
// Pause button click handler. Pausing is not implemented: the only statement in the
// body is commented out, so clicking Pause currently has no effect.
private void btnPause_Click(object sender, EventArgs e)
        {
            
//getFileThread.Suspend();// suspend the thread (left disabled)
        }
        
/// <summary>
        /// FormClosed handler: aborts the download worker, if one was ever created, when the form closes.
        /// </summary>
        /// <param name="sender">Form that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void form1_FormClosed(object sender, FormClosedEventArgs e)
        {
            // Nothing to abort when Start was never clicked.
            if (getFileThread == null)
            {
                return;
            }

            getFileThread.Abort();
        }
    }
}

form1.designer.cs
代码
namespace dig
{
    
partial class form1
    {
        
/// <summary>
        /// Required designer variable.
        /// </summary>
        private System.ComponentModel.IContainer components = null;

        /// <summary>
        /// Cleans up any resources being used.
        /// </summary>
        /// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                // The component container is only released on an explicit dispose.
                if (components != null)
                {
                    components.Dispose();
                }
            }

            base.Dispose(disposing);
        }

        
#region Windows 窗体设计器生成的代码

        
/// <summary>
        /// Required method for Designer support - do not modify
        /// the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            this.components = new System.ComponentModel.Container();
            this.btnStart = new System.Windows.Forms.Button();
            this.txtUrl = new System.Windows.Forms.TextBox();
            this.label1 = new System.Windows.Forms.Label();
            this.label2 = new System.Windows.Forms.Label();
            this.txtSaveUrl = new System.Windows.Forms.TextBox();
            this.timer1 = new System.Windows.Forms.Timer(this.components);
            this.label3 = new System.Windows.Forms.Label();
            this.lblTime = new System.Windows.Forms.Label();
            this.label4 = new System.Windows.Forms.Label();
            this.lblTimeElapsed = new System.Windows.Forms.Label();
            this.btnStop = new System.Windows.Forms.Button();
            this.btnPause = new System.Windows.Forms.Button();
            this.SuspendLayout();
            // 
            // btnStart
            // 
            this.btnStart.Location = new System.Drawing.Point(97, 263);
            this.btnStart.Name = "btnStart";
            this.btnStart.Size = new System.Drawing.Size(75, 23);
            this.btnStart.TabIndex = 0;
            this.btnStart.Text = "&Start";
            this.btnStart.UseVisualStyleBackColor = true;
            this.btnStart.Click += new System.EventHandler(this.btnStart_Click);
            // 
            // txtUrl
            // 
            this.txtUrl.Location = new System.Drawing.Point(95, 20);
            this.txtUrl.Name = "txtUrl";
            this.txtUrl.ReadOnly = true;
            this.txtUrl.Size = new System.Drawing.Size(326, 21);
            this.txtUrl.TabIndex = 1;
            this.txtUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
            // 
            // label1
            // 
            this.label1.AutoSize = true;
            this.label1.Location = new System.Drawing.Point(30, 23);
            this.label1.Name = "label1";
            this.label1.Size = new System.Drawing.Size(59, 12);
            this.label1.TabIndex = 2;
            this.label1.Text = "远程文件:";
            // 
            // label2
            // 
            this.label2.AutoSize = true;
            this.label2.Location = new System.Drawing.Point(30, 82);
            this.label2.Name = "label2";
            this.label2.Size = new System.Drawing.Size(59, 12);
            this.label2.TabIndex = 3;
            this.label2.Text = "保存位置:";
            // 
            // txtSaveUrl
            // 
            this.txtSaveUrl.Location = new System.Drawing.Point(97, 79);
            this.txtSaveUrl.Name = "txtSaveUrl";
            this.txtSaveUrl.ReadOnly = true;
            this.txtSaveUrl.Size = new System.Drawing.Size(326, 21);
            this.txtSaveUrl.TabIndex = 1;
            this.txtSaveUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
            // 
            // timer1
            // 
            this.timer1.Enabled = true;
            this.timer1.Interval = 1000;
            this.timer1.Tick += new System.EventHandler(this.timer1_Tick);
            // 
            // label3
            // 
            this.label3.AutoSize = true;
            this.label3.Location = new System.Drawing.Point(30, 141);
            this.label3.Name = "label3";
            this.label3.Size = new System.Drawing.Size(59, 12);
            this.label3.TabIndex = 4;
            this.label3.Text = "当前时间:";
            // 
            // lblTime
            // 
            this.lblTime.AutoSize = true;
            this.lblTime.Location = new System.Drawing.Point(95, 141);
            this.lblTime.Name = "lblTime";
            this.lblTime.Size = new System.Drawing.Size(35, 12);
            this.lblTime.TabIndex = 5;
            this.lblTime.Text = "12:00";
            // 
            // label4
            // 
            this.label4.AutoSize = true;
            this.label4.Location = new System.Drawing.Point(30, 200);
            this.label4.Name = "label4";
            this.label4.Size = new System.Drawing.Size(59, 12);
            this.label4.TabIndex = 4;
            this.label4.Text = "共用时间:";
            // 
            // lblTimeElapsed
            // 
            this.lblTimeElapsed.AutoSize = true;
            this.lblTimeElapsed.Location = new System.Drawing.Point(95, 201);
            this.lblTimeElapsed.Name = "lblTimeElapsed";
            this.lblTimeElapsed.Size = new System.Drawing.Size(11, 12);
            this.lblTimeElapsed.TabIndex = 5;
            this.lblTimeElapsed.Text = "0";
            // 
            // btnStop
            // 
            this.btnStop.Location = new System.Drawing.Point(290, 263);
            this.btnStop.Name = "btnStop";
            this.btnStop.Size = new System.Drawing.Size(75, 23);
            this.btnStop.TabIndex = 0;
            this.btnStop.Text = "S&top";
            this.btnStop.UseVisualStyleBackColor = true;
            this.btnStop.Click += new System.EventHandler(this.btnStop_Click);
            // 
            // btnPause
            // 
            this.btnPause.Location = new System.Drawing.Point(194, 263);
            this.btnPause.Name = "btnPause";
            this.btnPause.Size = new System.Drawing.Size(75, 23);
            this.btnPause.TabIndex = 0;
            this.btnPause.Text = "&Pause";
            this.btnPause.UseVisualStyleBackColor = true;
            this.btnPause.Click += new System.EventHandler(this.btnPause_Click);
            // 
            // form1
            // 
            this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 12F);
            this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
            this.ClientSize = new System.Drawing.Size(442, 313);
            this.Controls.Add(this.lblTimeElapsed);
            this.Controls.Add(this.lblTime);
            this.Controls.Add(this.label4);
            this.Controls.Add(this.label3);
            this.Controls.Add(this.label2);
            this.Controls.Add(this.label1);
            this.Controls.Add(this.txtSaveUrl);
            this.Controls.Add(this.txtUrl);
            this.Controls.Add(this.btnStop);
            this.Controls.Add(this.btnPause);
            this.Controls.Add(this.btnStart);
            this.Name = "form1";
            this.Text = "提取网页";
            this.FormClosed += new System.Windows.Forms.FormClosedEventHandler(this.form1_FormClosed);
            this.ResumeLayout(false);
            this.PerformLayout();

        }

        
#endregion

        
// Command buttons (wired to btnStart_Click / btnPause_Click / btnStop_Click in InitializeComponent).
        private System.Windows.Forms.Button btnStart;
        private System.Windows.Forms.Button btnPause;
        private System.Windows.Forms.Button btnStop;

        // Read-only text boxes: current remote URL and last save location.
        private System.Windows.Forms.TextBox txtUrl;
        private System.Windows.Forms.TextBox txtSaveUrl;

        // Static captions.
        private System.Windows.Forms.Label label1;
        private System.Windows.Forms.Label label2;
        private System.Windows.Forms.Label label3;
        private System.Windows.Forms.Label label4;

        // Labels rewritten on every timer tick: wall-clock time and elapsed time.
        private System.Windows.Forms.Label lblTime;
        private System.Windows.Forms.Label lblTimeElapsed;

        // One-second timer that drives timer1_Tick.
        private System.Windows.Forms.Timer timer1;
    }
}


 源码下载

posted on 2010-06-27 21:12  悉野  阅读(724)  评论(0)  编辑  收藏  举报