信息采集

     如果自己想运营一个小网站,但是里面的信息自己一个人无法及时更新,我们可以通过采集机器人实时自动地更新。例如现在的小说阅读,基本上所有的小说都是在起点连载,那么多的盗版站都是通过后台定期爬取的,一般更新时差只有几个小时。.net采集使用WebRequest发起请求,而其中的难点就是权限的验证和html分析。

 

html分析就是从get的page里面找出规律,有两种办法:

1. 就是好好的分析html字符串,使用split,indexof,lastindexof,substring等操作。

2. 由于还没有直接解析html的工具,可以借助开源项目先将html转化成xml,再用xpath查询(推荐)。

 

一、登录验证

因为我的目标站登录时需要验证码和问题提示,所以登录步骤就没做(分析验证码的图片比较难做)。但我们可以人工登录后,用跟踪器从ie里面得到认证过的cookie字符串。后来发现如果一直使用相同的cookie,一段时间之后就会过期,可能是网站做了防止程序盗用资源的处理。难道这样就没有办法了吗?分析发现,cookie在不断地变化,那我们也在代码中动态维护cookie。

 下面通过http响应头的Set-Cookie更新cookie容器:

代码
 string setck = rep.Headers["Set-Cookie"];

        
if (setck!=null&&setck.Trim() != "")
        {
            
string[] cks = setck.Split(new string[] { ";""/," }, StringSplitOptions.RemoveEmptyEntries);
            
foreach (string ck in cks)
            {
                
int aa = ck.IndexOf("=");
                
string name = ck.Substring(0, aa).Trim();
                
string cValue = ck.Substring(aa + 1).Trim();
                
if (name.ToLower() != "expires" && name.ToLower() != "path")
                {
                    MyCookie[] my 
= LiCookie.ToArray();
                    
for (int i = 0; i < my.Length; i++)
                    {
                        
if (my[i].name == name)
                        {
                            LiCookie.Remove(my[i]);
                        }
                    }
                    LiCookie.Add(
new MyCookie(name, cValue));
                }

            }
         

        }

 

下面是get page的类:

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Net;
using System.IO;
using System.Collections.Generic;
using System.IO.Compression;
using System.Data;
using System.Data.OleDb;

/// <summary>
/// A single cookie kept as a plain name/value pair of strings.
/// </summary>
public class MyCookie
{
    private string _cookieName;
    private string _cookieValue;

    /// <summary>
    /// Creates a cookie entry.
    /// </summary>
    /// <param name="name">Cookie name.</param>
    /// <param name="cValue">Cookie value.</param>
    public MyCookie(string name, string cValue)
    {
        _cookieName = name;
        _cookieValue = cValue;
    }

    /// <summary>
    /// Cookie name.
    /// </summary>
    public string name
    {
        get { return _cookieName; }
        set { _cookieName = value; }
    }

    /// <summary>
    /// Cookie value.
    /// </summary>
    public string cValue
    {
        get { return _cookieValue; }
        set { _cookieValue = value; }
    }
}
public class Rebot
{
    
/// <summary>
    /// Builds the initial cookie list from the ';'-separated "name=value" string
    /// returned by <see cref="cookiestr"/>.
    /// </summary>
    public Rebot()
    {
        LiCookie = new List<MyCookie>();

        // Read the property once; it may yield null or "" when nothing is stored.
        string stored = cookiestr;
        if (string.IsNullOrEmpty(stored))
        {
            return;
        }

        foreach (string ck in stored.Split(new char[] { ';' }))
        {
            int aa = ck.IndexOf('=');
            if (aa < 0)
            {
                // No "=" in this fragment (e.g. the blank left by a trailing ';');
                // Substring(0, -1) would throw here.
                continue;
            }

            string name = ck.Substring(0, aa);
            if (name.Trim().Length == 0)
            {
                // A fragment such as "=value" has no usable cookie name.
                continue;
            }

            // Name and value are stored untrimmed, as before; GetCookie trims them on use.
            LiCookie.Add(new MyCookie(name, ck.Substring(aa + 1)));
        }
    }
    
// Cookies sent with each request; filled by the constructor and updated by GetWebString.
    private List<MyCookie> LiCookie;

    // Last value read by the cookiestr getter.
    private string _cookiestr;

    /// <summary>
    /// Cookie string read from the config table. Every access runs the query again and
    /// returns column index 1 of the row it yields; if the query yields no row, the
    /// previously read value (initially null) is returned.
    /// </summary>
    public string cookiestr
    {
        get
        {
            Ncuhome.Helper.AccessDbHelper a = new Ncuhome.Helper.AccessDbHelper();
            OleDbDataReader r = a.olExcuteReader("select top 1 * from  config ");
            try
            {
                while (r.Read())
                {
                    _cookiestr = r.GetString(1);
                }
            }
            finally
            {
                // Close the reader even when Read/GetString throws, so it is not leaked.
                r.Close();
            }

            return _cookiestr;
        }
    }
        
        
//这个是cookies,我放在了数据库:) " M2g_smile=1D1; cnzz_eid=17120817-1256550191-; ltime=1256699837662; rtime=1; M2g_visitedfid=17D2D14D18D10; M2g_auth=yyI%2BXwaJmnE%2BCbbBer9qkU9bCxLSBE6O5BJYBS%2Be%2FBGkPKCABewuNEsZkfM4NlfRDUvdv5nPcQ; M2g_cookietime=2592000; M2g_sid=p5J856; 
    private CookieContainer GetCookie(string strurl)
    {
        Uri url = new Uri(strurl);
        CookieContainer cc = new CookieContainer();
        
foreach (MyCookie m in LiCookie)
        {
            cc.Add(new Cookie(m.name.Trim(), m.cValue.Trim(), "/", url.Host));
          
//  HttpContext.Current.Response.Write(m.name + "=" + m.cValue + "<br>");
        }
       
// HttpContext.Current.Response.Write("================<br>");
        return cc;
    }

    
/// <summary>
    /// Downloads the page at <paramref name="strurl"/> and returns its body decoded as gb2312.
    /// The cookies in LiCookie are sent with the request, and cookies announced by the
    /// response's Set-Cookie header are merged back into LiCookie before the body is read.
    /// </summary>
    /// <param name="strurl">Absolute URL of the page to fetch.</param>
    /// <returns>The response body as text.</returns>
    public string GetWebString(string strurl)
    {
        Uri url = new Uri(strurl);
        HttpWebRequest web = (HttpWebRequest)WebRequest.Create(url);
        web.Method = "get";
        web.KeepAlive = true;
        web.Timeout = 10000;
        web.AllowAutoRedirect = true;
        // The request used to advertise "Accept-Encoding: deflate" by hand while the body
        // was read without decompression, so a compressed reply decoded to garbage.
        // Letting the framework negotiate and undo the compression fixes that.
        web.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        web.Headers.Add("UA-CPU: x86");
        web.Headers.Add("Accept-Language: zh-cn");
        web.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.5.21022; InfoPath.2; .NET CLR 3.5.30729; .NET CLR 3.0.30618; .NET CLR 4.0.20506; .NET CLR 1.1.4322)";
        web.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-ms-application, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
        web.CookieContainer = GetCookie(strurl);

        // Dispose the response, stream and reader on every path; otherwise a failed read
        // leaves the connection open.
        using (HttpWebResponse rep = (HttpWebResponse)web.GetResponse())
        {
            MergeResponseCookies(rep.Headers["Set-Cookie"]);

            using (Stream str = rep.GetResponseStream())
            using (StreamReader r = new StreamReader(str, System.Text.Encoding.GetEncoding("gb2312")))
            {
                return r.ReadToEnd();
            }
        }
    }

    // Merges the cookies found in a raw Set-Cookie header value into LiCookie,
    // replacing entries that have the same (case-sensitive) name.
    private void MergeResponseCookies(string setck)
    {
        if (setck == null || setck.Trim() == "")
        {
            return;
        }

        // Split on ";" (attribute separator) and on "/," (the end of "path=/," where the
        // next cookie begins). A bare "," is not used as a delimiter, so the comma inside
        // an "expires=Wed, 28-Oct-2009 ..." date does not cut the value in two.
        string[] cks = setck.Split(new string[] { ";", "/," }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string ck in cks)
        {
            int aa = ck.IndexOf('=');
            if (aa <= 0)
            {
                // Fragments without "name=" (e.g. "HttpOnly", "secure", blanks) carry no
                // cookie value; Substring(0, -1) would throw on them.
                continue;
            }

            string name = ck.Substring(0, aa).Trim();
            string cValue = ck.Substring(aa + 1).Trim();
            if (name.Length == 0 || IsCookieAttribute(name))
            {
                continue;
            }

            LiCookie.RemoveAll(delegate(MyCookie old) { return old.name == name; });
            LiCookie.Add(new MyCookie(name, cValue));
        }
    }

    // True for Set-Cookie attribute names, which describe a cookie and are not cookies themselves.
    private static bool IsCookieAttribute(string name)
    {
        return string.Equals(name, "expires", StringComparison.OrdinalIgnoreCase)
            || string.Equals(name, "path", StringComparison.OrdinalIgnoreCase)
            || string.Equals(name, "domain", StringComparison.OrdinalIgnoreCase)
            || string.Equals(name, "max-age", StringComparison.OrdinalIgnoreCase);
    }
    

 

下面是html 解析,具体页面具体分析。一般的话都是分页式的,先得到列表的url,再去get content;

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Collections.Generic;
/// <summary>
/// One entry of a thread list page: the link text and the link target.
/// </summary>
public class post
{
    private string _title;
    private string _ctarget;

    /// <summary>
    /// Creates a list entry.
    /// </summary>
    /// <param name="title">Link text of the thread.</param>
    /// <param name="ctarget">Value of the link's href attribute.</param>
    public post(string title, string ctarget)
    {
        this.tilte = title;
        this.ctarget = ctarget;
    }

    /// <summary>
    /// Thread title. The misspelled property name is part of the public interface and is kept.
    /// </summary>
    public string tilte
    {
        get { return _title; }
        set { _title = value; }
    }

    /// <summary>
    /// Link target exactly as it appeared in the href attribute (HTML entities are not decoded).
    /// </summary>
    public string ctarget
    {
        get { return _ctarget; }
        set { _ctarget = value; }
    }
}

/// <summary>
/// String-based extraction helpers for list and content pages. Both helpers first remove
/// every run of two spaces and lower-case the whole page, so the text they return is lower-case.
/// </summary>
public class HtmlAnalysis
{
    public HtmlAnalysis()
    {
    }

    /// <summary>
    /// Extracts the post body from a content page.
    /// </summary>
    /// <param name="HtmlStr">Raw page HTML; null or empty yields an empty result.</param>
    /// <returns>
    /// For the last segment (segments are delimited by the "ad_thread4_0" div) that contains a
    /// "postmessage" div: the segment text up to and including its first closing div tag.
    /// Empty string when nothing matches.
    /// </returns>
    public static string GetContentFromHtml(string HtmlStr)
    {
        if (string.IsNullOrEmpty(HtmlStr))
        {
            return "";
        }

        string pageContent = "";

        // Invariant lower-casing keeps the markers matchable under every thread culture
        // (a Turkish culture lower-cases "I" to a dotless i, for example).
        HtmlStr = HtmlStr.Replace("  ", "").ToLowerInvariant();

        string[] segments = HtmlStr.Split(new string[] { "<div id=\"ad_thread4_0\"></div>" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string s in segments)
        {
            if (!s.Contains("<div id=\"postmessage"))
            {
                continue;
            }

            int st = s.IndexOf("</div>", StringComparison.Ordinal);
            if (st != -1)
            {
                pageContent = s.Substring(0, st) + "</div>";
            }
        }

        return pageContent;
    }

    /// <summary>
    /// Extracts the thread links from a list page.
    /// </summary>
    /// <param name="HtmlStr">Raw page HTML; null or empty yields an empty list.</param>
    /// <returns>
    /// One <see cref="post"/> per "normalthread" block that has a "folder" cell and exactly one
    /// "thread" span. Title and target are empty strings when the anchor cannot be located.
    /// </returns>
    public static List<post> GetListFromHtml(string HtmlStr)
    {
        List<post> p = new List<post>();
        if (string.IsNullOrEmpty(HtmlStr))
        {
            return p;
        }

        HtmlStr = HtmlStr.Replace("  ", "").ToLowerInvariant();

        string[] blocks = HtmlStr.Split(new string[] { "<tbody id=\"normalthread" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string s in blocks)
        {
            if (!s.Contains("<td class=\"folder\">"))
            {
                continue;
            }

            string[] s2 = s.Split(new string[] { "<span id=\"thread" }, StringSplitOptions.RemoveEmptyEntries);
            if (s2.Length != 2)
            {
                continue;
            }

            string st = s2[1];
            string title = "";
            string starget = "";

            int i = st.IndexOf("<a href=\"", StringComparison.Ordinal);
            if (i != -1)
            {
                // The href value starts right after the 9 characters of <a href=" .
                // Searching from there, rather than 15 characters in, handles short
                // hrefs and cannot start past the end of the string.
                int hrefStart = i + 9;
                int m = st.IndexOf("\"", hrefStart, StringComparison.Ordinal);
                if (m != -1)
                {
                    starget = st.Substring(hrefStart, m - hrefStart);
                }

                // The link text sits between the "> that ends the opening tag and the
                // closing </a> that follows it.
                int j = st.IndexOf("\">", i + 5, StringComparison.Ordinal);
                if (j != -1)
                {
                    int k = st.IndexOf("</a>", j + 2, StringComparison.Ordinal);
                    if (k != -1)
                    {
                        title = st.Substring(j + 2, k - j - 2);
                    }
                }
            }

            p.Add(new post(title, starget));
        }

        return p;
    }
}

} 

下面是采集功能,可以写成定时器,定时执行更新内容:先循环分页,再循环列表,最后得到内容。

using System;

using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Net;
using System.IO;
using System.Collections.Generic;
using System.Data;
using System.Data.OleDb;

/// <summary>
/// Admin page whose Button2 handler fetches the list pages of every checked GridView1 row
/// and stores the posts that are not yet in the article table.
/// </summary>
public partial class admin_reflash : AdminPageBase
{
    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// For each checked row: reads the category id (cell 1), board name (cell 2) and URL
    /// format (cell 3), then walks the list pages from TextBox1 down to TextBox2, fetches
    /// every listed post and inserts it unless its id is already stored. A progress log is
    /// written to Label1.
    /// </summary>
    protected void Button2_Click(object sender, EventArgs e)
    {
        System.Text.StringBuilder s = new System.Text.StringBuilder();
        Rebot r = new Rebot();

        for (int i = 0; i < GridView1.Rows.Count; i++)
        {
            if (!((CheckBox)GridView1.Rows[i].FindControl("CheckBox5")).Checked)
            {
                continue;
            }

            int CateId = Convert.ToInt32(GridView1.Rows[i].Cells[1].Text);
            Response.Write(CateId);

            string webhost = Gethost();    // site host read from config
            string url = webhost + GridView1.Rows[i].Cells[3].Text.Replace("&amp;", "&"); // URL format; page number is appended
            string boardName = GridView1.Rows[i].Cells[2].Text;

            // Stop on non-numeric page numbers. Previously the label was set and the same
            // conversion then ran again outside the try block and threw.
            int Getcount;   // first (highest) page to fetch
            int miniStart;  // last (lowest) page to fetch
            if (!int.TryParse(TextBox1.Text, out Getcount) || !int.TryParse(TextBox2.Text, out miniStart))
            {
                Label1.Text = "页码要数字";
                return;
            }

            s.Append("更新:" + boardName + url + "===========================<br>");

            // Whether the current page has already been retried. This is kept separate from
            // any per-post counting so that each page gets exactly one retry.
            bool retried = false;
            for (int j = Getcount; j >= miniStart; j--)
            {
                string url2 = url + j.ToString();
                s.Append("" + j.ToString() + "页(" + url2 + ")==============<br>");

                string content = "";
                try
                {
                    content = r.GetWebString(url2);
                    s.Append("获取" + url2 + "成功!!");
                }
                catch (Exception ex)
                {
                    System.Threading.Thread.Sleep(10000);
                    s.Append("获取" + url2 + ex.Message);

                    if (!retried)
                    {
                        // Cancel the loop's j-- so the same page is requested once more.
                        retried = true;
                        j++;
                    }
                    else
                    {
                        // Give up on this page; the next page gets its own retry.
                        retried = false;
                    }

                    r = new Rebot(); // start again with the stored cookies
                    continue;
                }
                retried = false;

                List<post> p = HtmlAnalysis.GetListFromHtml(content);
                s.Append(p.Count.ToString() + "条记录");

                foreach (post p1 in p)
                {
                    try
                    {
                        // The href comes straight from HTML, so decode the entity before
                        // using it as a URL.
                        string turl = webhost + p1.ctarget.Replace("&amp;", "&");

                        // The id parse and the duplicate check are inside the try so that one
                        // malformed link is logged and skipped instead of aborting the run.
                        int postId = Convert.ToInt32(GetpostId(turl));
                        if (checkoldpostid(postId))
                        {
                            s.Append(p1.ctarget + "==" + p1.tilte + "重复!!</br>");
                            continue;
                        }

                        string tcontent = HtmlAnalysis.GetContentFromHtml(r.GetWebString(turl));
                        if (tcontent.Trim().Length < 10)
                        {
                            s.Append(p1.ctarget + "==" + p1.tilte + "no content!!</br>");
                            continue;
                        }

                        insertdata(p1.tilte, tcontent, CateId, postId, DateTime.Now);
                        s.Append(p1.ctarget + "==" + p1.tilte + "sueccess!!</br>");
                    }
                    catch (Exception ex)
                    {
                        s.Append(p1.ctarget + "==" + p1.tilte + "error!!" + ex.Message + "</br>");
                    }
                }
            }
        }

        Label1.Text = s.ToString();
    }

    /// <summary>
    /// Reads the host setting: column index 0 of the row returned by the config query.
    /// </summary>
    /// <returns>The configured host, or an empty string when the query yields no row.</returns>
    private string Gethost()
    {
        string s = "";
        Ncuhome.Helper.AccessDbHelper a = new Ncuhome.Helper.AccessDbHelper();
        OleDbDataReader r = a.olExcuteReader("select top 1 * from  config ");
        try
        {
            while (r.Read())
            {
                s = r.GetString(0);
            }
        }
        finally
        {
            // The reader was never closed before.
            r.Close();
        }

        return s;
    }

    /// <summary>
    /// Extracts the value of the "?tid=" query parameter from a URL.
    /// </summary>
    /// <param name="Url">URL to inspect; it is lower-cased before searching.</param>
    /// <returns>
    /// The text between "?tid=" and the next "&amp;" (or the end of the URL);
    /// an empty string when "?tid=" is absent.
    /// </returns>
    private string GetpostId(string Url)
    {
        Url = Url.ToLowerInvariant();

        int i = Url.IndexOf("?tid=", StringComparison.Ordinal);
        if (i == -1)
        {
            return "";
        }

        int start = i + 5;
        int j = Url.IndexOf("&", start, StringComparison.Ordinal);
        if (j == -1)
        {
            // tid is the last parameter: take everything to the end instead of throwing.
            return Url.Substring(start);
        }

        return Url.Substring(start, j - start);
    }

    /// <summary>
    /// Inserts one article row.
    /// </summary>
    /// <param name="Art_Title">Article title.</param>
    /// <param name="Art_Content">Article body HTML.</param>
    /// <param name="Cate_Id">Target category id.</param>
    /// <param name="Arti_oldId">Post id on the source site, used for duplicate detection.</param>
    /// <param name="time">Creation time; stored via ToString().</param>
    /// <returns>The value returned by olExcuteNonequry (presumably the affected-row count; confirm against AccessDbHelper).</returns>
    public int insertdata(string Art_Title, string Art_Content, int Cate_Id, int Arti_oldId, DateTime time)
    {
        Ncuhome.Helper.AccessDbHelper a = new Ncuhome.Helper.AccessDbHelper();

        // Keep the array in the same order as the column list of the statement below.
        OleDbParameter[] param = new OleDbParameter[5];
        param[0] = new OleDbParameter("@arti_title", Art_Title);
        param[1] = new OleDbParameter("@arti_content", Art_Content);
        param[2] = new OleDbParameter("@arti_cateid", Cate_Id);
        param[3] = new OleDbParameter("@arti_createtime", time.ToString());
        param[4] = new OleDbParameter("@Arti_oldId", Arti_oldId);

        return a.olExcuteNonequry("insert into article (arti_title,arti_content,arti_cateid,arti_createtime,Arti_oldId) values (@arti_title,@arti_content,@arti_cateid,@arti_createtime,@Arti_oldId)", param);
    }

    /// <summary>
    /// Checks whether an article with the given source-site post id is already stored.
    /// </summary>
    /// <param name="oldid">Post id on the source site.</param>
    /// <returns>True when at least one matching row exists.</returns>
    public bool checkoldpostid(int oldid)
    {
        Ncuhome.Helper.AccessDbHelper a = new Ncuhome.Helper.AccessDbHelper();
        OleDbDataReader r = a.olExcuteReader("select arti_title from article where Arti_oldId=" + oldid.ToString());
        try
        {
            return r.Read();
        }
        finally
        {
            r.Close();
        }
    }
}

 

      这样虽然可以很好地做到信息采集,但是需要为每一个网站单独制作一套解析,当目的站改版,我们也需要及时地跟着改版。这样很不灵活,而且相当费精力。应该可以写出一套智能的分析方法,因为大多数的文章系统、论坛系统的页面布局基本相同,以后有机会的话一定做一个。

 

posted @ 2009-11-07 14:57  LittlePeng  阅读(2905)  评论(0)  编辑  收藏  举报