Information Collection
If you run a small website by yourself, you cannot keep all of its content fresh by hand, but a collection robot can update it automatically in near real time. Online fiction is a good example: almost every novel is serialized on Qidian (起点), and the many pirate sites simply crawl it in the background on a schedule, usually lagging only a few hours behind. In .NET, collection is done with WebRequest; the hard parts are authentication and HTML analysis.
HTML analysis means finding patterns in the fetched page. There are two approaches:
1. Work the HTML string directly with Split, IndexOf, LastIndexOf, Substring, and so on.
2. Since the base class library has no direct HTML parser, use one of the open-source projects that turn HTML into an XML-like DOM and query it with XPath (recommended); see the sketch below.
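A minimal sketch of approach 2, assuming the open-source HTML Agility Pack library (it parses even sloppy HTML into a DOM that answers XPath queries); the XPath expression here is hypothetical and must be adapted to the target page:

using HtmlAgilityPack;

public class XPathSketch
{
    public static string ExtractPost(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html); // tolerant of malformed HTML

        // Hypothetical selector: grab the first post-body div.
        HtmlNode node = doc.DocumentNode.SelectSingleNode("//div[starts-with(@id,'postmessage')]");
        return node == null ? "" : node.InnerHtml;
    }
}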
1. Login Authentication
Because my target site asks for a CAPTCHA at login, I did not automate the login step (decoding the CAPTCHA image is hard). Instead we can log in manually and use a traffic sniffer to capture the authenticated cookie string from IE. It then turned out that a fixed cookie expires after a while; the site probably guards against automated reuse. Is that a dead end? No: analysis shows the cookies keep changing, so our code must maintain them dynamically as well.
Below, the cookie container is updated from the Set-Cookie field of the HTTP response headers:
if (setck != null && setck.Trim() != "")
{
    // HttpWebResponse folds multiple Set-Cookie headers into one string;
    // the "/," separator splits at the comma that follows a "path=/" attribute.
    string[] cks = setck.Split(new string[] { ";", "/," }, StringSplitOptions.RemoveEmptyEntries);
    foreach (string ck in cks)
    {
        int aa = ck.IndexOf("=");
        string name = ck.Substring(0, aa).Trim();
        string cValue = ck.Substring(aa + 1).Trim();
        // Skip cookie attributes; keep only real name=value pairs.
        if (name.ToLower() != "expires" && name.ToLower() != "path")
        {
            // Replace any existing cookie with the same name, then store the new value.
            MyCookie[] my = LiCookie.ToArray();
            for (int i = 0; i < my.Length; i++)
            {
                if (my[i].name == name)
                {
                    LiCookie.Remove(my[i]);
                }
            }
            LiCookie.Add(new MyCookie(name, cValue));
        }
    }
}
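Incidentally, .NET can do part of this bookkeeping itself: when a single CookieContainer instance is shared across requests, HttpWebRequest writes received Set-Cookie values back into it automatically. A minimal sketch of that alternative (the URL is hypothetical):

CookieContainer shared = new CookieContainer();
HttpWebRequest req = (HttpWebRequest)WebRequest.Create("http://example.com/list.aspx"); // hypothetical URL
req.CookieContainer = shared;  // cookies go out from, and come back into, 'shared'
using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
{
    // read the page; 'shared' now holds any refreshed cookies for the next request
}

The manual parsing above still has a point here: it keeps the raw cookie string in a form that can be persisted back to the database between runs.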
Below is the page-fetching class:
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Data.OleDb;
/// <summary>
/// A simple name/value cookie pair.
/// </summary>
public class MyCookie
{
    public MyCookie(string name, string cValue)
    {
        this.name = name;
        this.cValue = cValue;
    }

    private string _name;
    public string name
    {
        get { return _name; }
        set { _name = value; }
    }

    private string _cValue;
    public string cValue
    {
        get { return _cValue; }
        set { _cValue = value; }
    }
}
/// <summary>
/// Rebot: fetches pages while carrying and refreshing the authenticated cookies.
/// </summary>
public class Rebot
{
    public Rebot()
    {
        // Seed the in-memory cookie list from the stored cookie string.
        LiCookie = new List<MyCookie>();
        string[] cks = cookiestr.Split(new char[] { ';' });
        foreach (string ck in cks)
        {
            int aa = ck.IndexOf("=");
            string name = ck.Substring(0, aa);
            string cValue = ck.Substring(aa + 1);
            LiCookie.Add(new MyCookie(name, cValue));
        }
    }

    private List<MyCookie> LiCookie;

    // The cookie string captured from a manual login; I keep it in the database :)
    // e.g. "M2g_smile=1D1; cnzz_eid=17120817-1256550191-; ltime=1256699837662; rtime=1; M2g_visitedfid=17D2D14D18D10; M2g_auth=yyI%2BXwaJmnE%2BCbbBer9qkU9bCxLSBE6O5BJYBS%2Be%2FBGkPKCABewuNEsZkfM4NlfRDUvdv5nPcQ; M2g_cookietime=2592000; M2g_sid=p5J856;"
    private string _cookiestr;
    public string cookiestr
    {
        get
        {
            Ncuhome.Helper.AccessDbHelper a = new Ncuhome.Helper.AccessDbHelper();
            OleDbDataReader r = a.olExcuteReader("select top 1 * from config ");
            while (r.Read())
            {
                _cookiestr = r.GetString(1);
            }
            r.Close();
            return _cookiestr;
        }
    }
    /// <summary>
    /// Builds a CookieContainer for the target host from the maintained cookie list.
    /// </summary>
    private CookieContainer GetCookie(string strurl)
    {
        Uri url = new Uri(strurl);
        CookieContainer cc = new CookieContainer();
        foreach (MyCookie m in LiCookie)
        {
            cc.Add(new Cookie(m.name.Trim(), m.cValue.Trim(), "/", url.Host));
        }
        return cc;
    }
    /// <summary>
    /// Fetches the HTML of a page, refreshing the cookies from the response.
    /// </summary>
    /// <param name="strurl">URL of the page to fetch</param>
    /// <returns>the page HTML</returns>
    public string GetWebString(string strurl)
    {
        Uri url = new Uri(strurl);
        HttpWebRequest web = (HttpWebRequest)WebRequest.Create(url);
        web.Method = "GET";
        web.KeepAlive = true;
        web.Timeout = 10000;
        web.AllowAutoRedirect = true;
        // Advertise deflate and let the framework decompress the response transparently.
        web.AutomaticDecompression = DecompressionMethods.Deflate;
        web.Headers.Add("UA-CPU: x86");
        web.Headers.Add("Accept-Language: zh-cn");
        web.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.5.21022; InfoPath.2; .NET CLR 3.5.30729; .NET CLR 3.0.30618; .NET CLR 4.0.20506; .NET CLR 1.1.4322)";
        web.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-ms-application, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
        //web.Referer = url.ToString();
        web.CookieContainer = GetCookie(strurl);
        HttpWebResponse rep = (HttpWebResponse)web.GetResponse();

        // Pick up refreshed authentication cookies from the response headers.
        string setck = rep.Headers["Set-Cookie"];
        if (setck != null && setck.Trim() != "")
        {
            // Same update logic as shown earlier: parse Set-Cookie and
            // replace any stored cookies that have changed.
            string[] cks = setck.Split(new string[] { ";", "/," }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string ck in cks)
            {
                int aa = ck.IndexOf("=");
                string name = ck.Substring(0, aa).Trim();
                string cValue = ck.Substring(aa + 1).Trim();
                if (name.ToLower() != "expires" && name.ToLower() != "path")
                {
                    MyCookie[] my = LiCookie.ToArray();
                    for (int i = 0; i < my.Length; i++)
                    {
                        if (my[i].name == name)
                        {
                            LiCookie.Remove(my[i]);
                        }
                    }
                    LiCookie.Add(new MyCookie(name, cValue));
                }
            }
        }
        Stream str = rep.GetResponseStream();
        StreamReader r = new StreamReader(str, System.Text.Encoding.GetEncoding("gb2312"));
        // (A hand-rolled gzip decompression pass used to live here;
        // AutomaticDecompression above makes it unnecessary.)
        string page = r.ReadToEnd();
        r.Close();
        r.Dispose();
        str.Close();
        rep.Close();
        return page;
    }
}
Next comes the HTML parsing; each page needs its own analysis. List pages are usually paginated, so first collect the item URLs from the lists, then fetch each content page.
using System;
using System.Collections.Generic;
public class post
{
    public post(string title, string ctarget)
    {
        this.title = title;
        this.ctarget = ctarget;
    }

    private string _title;
    public string title
    {
        get { return _title; }
        set { _title = value; }
    }

    private string _ctarget;
    public string ctarget
    {
        get { return _ctarget; }
        set { _ctarget = value; }
    }
}
/// <summary>
/// String-splitting HTML analysis for the target forum's pages.
/// </summary>
public class HtmlAnalysis
{
    public static string GetContentFromHtml(string HtmlStr)
    {
        string pageContent = "";
        // Normalize for matching; note this also strips spaces from the extracted text.
        HtmlStr = HtmlStr.Replace(" ", "");
        HtmlStr = HtmlStr.ToLower();
        // The ad placeholder div marks the boundary of each post block.
        string[] str1 = HtmlStr.Split(new string[] { "<div id=\"ad_thread4_0\"></div>" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string s in str1)
        {
            if (s.Contains("<div id=\"postmessage"))
            {
                int st = s.IndexOf("</div>");
                if (st != -1)
                    pageContent = s.Substring(0, st) + "</div>";
            }
        }
        return pageContent;
    }
    public static List<post> GetListFromHtml(string HtmlStr)
    {
        List<post> p = new List<post>();
        HtmlStr = HtmlStr.Replace(" ", "");
        HtmlStr = HtmlStr.ToLower();
        // Each thread row starts with <tbody id="normalthread...">.
        string[] str = HtmlStr.Split(new string[] { "<tbody id=\"normalthread" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string s in str)
        {
            if (s.Contains("<td class=\"folder\">"))
            {
                string st = "";
                string[] s2 = s.Split(new string[] { "<span id=\"thread" }, StringSplitOptions.RemoveEmptyEntries);
                if (s2.Length == 2)
                    st = s2[1];
                else
                    continue;

                // Pull the href and the anchor text out of <a href="...">title</a>.
                string title = "";
                string starget = "";
                int i = st.IndexOf("<a href=\"");
                int j = 0, m = 0;
                if (i != -1)
                {
                    m = st.IndexOf("\"", i + 15); // closing quote of the href
                    j = st.IndexOf("\">", i + 5); // end of the opening tag
                }
                int k = st.IndexOf("</a>");
                if (i != -1 && m != -1 && m > i)
                    starget = st.Substring(i + 9, m - i - 9);
                if (j != -1 && k != -1 && k > j)
                    title = st.Substring(j + 2, k - j - 2);
                p.Add(new post(title, starget));
            }
        }
        return p;
    }
}
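A minimal sketch tying the fetch and parse steps together (the URLs are hypothetical):

Rebot bot = new Rebot(); // loads the stored cookie string from the config table
string listHtml = bot.GetWebString("http://example.com/forum-17-1.html"); // hypothetical list page
foreach (post item in HtmlAnalysis.GetListFromHtml(listHtml))
{
    string body = HtmlAnalysis.GetContentFromHtml(bot.GetWebString("http://example.com/" + item.ctarget));
    // store item.title and body ...
}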
Finally, the collection step itself. It can be driven by a timer for scheduled updates (a sketch follows after the code): loop over the list pages, then over each list's items, then fetch the content.
using System;
using System.Collections.Generic;
using System.Data.OleDb;
using System.Web.UI.WebControls;
public partial class admin_reflash : AdminPageBase
{
    protected void Page_Load(object sender, EventArgs e)
    {
    }

    protected void Button2_Click(object sender, EventArgs e)
    {
        string s = "";
        Rebot r = new Rebot();
        for (int i = 0; i < GridView1.Rows.Count; i++)
        {
            if (((CheckBox)GridView1.Rows[i].FindControl("CheckBox5")).Checked)
            {
                int CateId = Convert.ToInt32(GridView1.Rows[i].Cells[1].Text);
                Response.Write(CateId);
                string webhost = Gethost(); // site host
                string url = webhost + GridView1.Rows[i].Cells[3].Text.Replace("&amp;", "&"); // URL format
                string boardName = GridView1.Rows[i].Cells[2].Text;
                try
                {
                    Convert.ToInt32(TextBox1.Text);
                    Convert.ToInt32(TextBox2.Text);
                }
                catch
                {
                    Label1.Text = "Page numbers must be numeric";
                    return;
                }
                int Getcount = Convert.ToInt32(TextBox1.Text); // number of list pages to pull
                int miniStart = Convert.ToInt32(TextBox2.Text);
                s += "Updating: " + boardName + url + "===========================<br>";
                int flag = 0; // has the current page already been retried?
                for (int j = Getcount; j >= miniStart; j--)
                {
                    string url2 = url + j.ToString();
                    s += "Page " + j.ToString() + " (" + url2 + ")==============<br>";
                    string content = "";
                    try
                    {
                        content = r.GetWebString(url2);
                        s += "Fetched " + url2 + " OK!!";
                    }
                    catch (Exception ex)
                    {
                        System.Threading.Thread.Sleep(10000);
                        s += "Fetching " + url2 + " failed: " + ex.Message;
                        // Retry a failed page once.
                        if (flag == 0)
                            j++;
                        flag = 1;
                        r = new Rebot(); // reset the cookies
                        continue;
                    }
                    flag = 0;
                    List<post> p = HtmlAnalysis.GetListFromHtml(content);
                    s += p.Count.ToString() + " items";
                    foreach (post p1 in p)
                    {
                        string turl = webhost + p1.ctarget;
                        int postId = Convert.ToInt32(GetpostId(turl));
                        // Skip posts that were collected in an earlier run.
                        if (checkoldpostid(postId))
                        {
                            s += p1.ctarget + "==" + p1.title + " duplicate!!</br>";
                            continue;
                        }
                        try
                        {
                            string tcontent = HtmlAnalysis.GetContentFromHtml(r.GetWebString(turl));
                            if (tcontent.Trim().Length < 10)
                            {
                                s += p1.ctarget + "==" + p1.title + " no content!!</br>";
                                continue;
                            }
                            insertdata(p1.title, tcontent, CateId, postId, DateTime.Now);
                            s += p1.ctarget + "==" + p1.title + " success!!</br>";
                        }
                        catch (Exception ex)
                        {
                            s += p1.ctarget + "==" + p1.title + " error!!" + ex.Message + "</br>";
                        }
                    }
                }
            }
        }
        Label1.Text = s;
    }
    /// <summary>
    /// Reads the site host from the system settings.
    /// </summary>
    private string Gethost()
    {
        string s = "";
        Ncuhome.Helper.AccessDbHelper a = new Ncuhome.Helper.AccessDbHelper();
        OleDbDataReader r = a.olExcuteReader("select top 1 * from config ");
        while (r.Read())
        {
            s = r.GetString(0);
        }
        r.Close();
        return s;
    }
    private string GetpostId(string Url)
    {
        // The source post id sits between "?tid=" and the next "&".
        Url = Url.ToLower();
        int i = Url.IndexOf("?tid=");
        int j = Url.IndexOf("&", i + 5);
        return j == -1 ? Url.Substring(i + 5) : Url.Substring(i + 5, j - i - 5);
    }
    public int insertdata(string Art_Title, string Art_Content, int Cate_Id, int Arti_oldId, DateTime time)
    {
        Ncuhome.Helper.AccessDbHelper a = new Ncuhome.Helper.AccessDbHelper();
        // OleDb parameters are positional: their order must match the value list in the SQL.
        OleDbParameter[] param = new OleDbParameter[5];
        param[0] = new OleDbParameter("@arti_title", Art_Title);
        param[1] = new OleDbParameter("@arti_content", Art_Content);
        param[2] = new OleDbParameter("@arti_cateid", Cate_Id);
        param[3] = new OleDbParameter("@arti_createtime", time.ToString());
        param[4] = new OleDbParameter("@Arti_oldId", Arti_oldId);
        return a.olExcuteNonequry("insert into article (arti_title,arti_content,arti_cateid,arti_createtime,Arti_oldId) values (@arti_title,@arti_content,@arti_cateid,@arti_createtime,@Arti_oldId)", param);
    }
    public bool checkoldpostid(int oldid)
    {
        // True if a post with this source id has already been saved.
        Ncuhome.Helper.AccessDbHelper a = new Ncuhome.Helper.AccessDbHelper();
        OleDbDataReader r = a.olExcuteReader("select arti_title from article where Arti_oldId=" + oldid.ToString());
        bool re = r.Read();
        r.Close();
        return re;
    }
}
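As mentioned above, the collection step can also run on a schedule instead of behind a button click. A minimal sketch, assuming a hypothetical RunCollection() wrapper around the fetch/parse/insert loop and a hypothetical 30-minute interval:

using System;
using System.Timers;

public static class CollectionScheduler
{
    private static Timer timer;

    public static void Start()
    {
        timer = new Timer(30 * 60 * 1000); // hypothetical: run every 30 minutes
        timer.Elapsed += delegate { RunCollection(); };
        timer.AutoReset = true;
        timer.Start();
    }

    private static void RunCollection()
    {
        // hypothetical wrapper: fetch the list pages, parse them, insert new posts
    }
}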
This gets the collection job done, but each target site needs its own hand-built parser, and whenever the target site is redesigned the parser must be reworked immediately. That is inflexible and eats a lot of effort. It should be possible to write a smarter, generic analyzer, since most article and forum systems share broadly similar page layouts; given the chance, I will definitely build one.