C#/winform采集百度hi文章
/// <summary>
/// Main window of the crawler: downloads the article list pages of one Baidu Hi blog,
/// then every article, and writes the results out as XML files.
/// </summary>
public partial class Form1 : Form
{
    // Worker thread that runs the crawl so the window stays responsive.
    Thread newth;

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Starts the crawl on a worker thread. Clicks are ignored while a crawl is still running.
    /// </summary>
    private void buttonGo_Click(object sender, EventArgs e)
    {
        if (newth != null && newth.IsAlive)
        {
            return;
        }

        newth = new Thread(new ThreadStart(doit));
        // Background thread: closing the window must not leave the process alive.
        newth.IsBackground = true;
        newth.Start();
    }

    /// <summary>
    /// Worker-thread entry point. Crawls every list page and every article, writes one XML
    /// file per article plus one summary file, and reports progress to the UI.
    /// Network and file errors are reported in the debug text box instead of being left
    /// unhandled on the worker thread (which would terminate the process).
    /// </summary>
    void doit()
    {
        try
        {
            DataTable dt = CreateArticleTable();  // every article found on the list pages
            DataTable dt2 = CreateArticleTable(); // scratch table holding the article currently being exported

            // The first list page is http://hi.baidu.com/306759613/blog/index/0
            string url = "http://hi.baidu.com/306759613/blog/index/";
            // Base address of the individual article pages.
            string arcurl = "http://hi.baidu.com/306759613/blog/item/";

            // Find the page count from the pagination links on page 0.
            string firstPage = DownloadPage(url + 0);
            html mytml = new html(firstPage);
            List<string> regpsl = mytml.getElementsByRegex(@"/blog/index/[\d]+");
            int lastPage = 0; // stays 0 when the blog has a single page and no pagination links
            foreach (string a in regpsl)
            {
                int number;
                if (int.TryParse(a.Replace("/blog/index/", ""), out number) && number > lastPage)
                {
                    lastPage = number;
                }
            }
            // Pages are numbered from 0, so the count is the highest page number plus one.
            int pagecount = lastPage + 1;

            ReportProgress(0);
            for (int i = 0; i < pagecount; i++)
            {
                // Page 0 has already been downloaded above; do not fetch it twice.
                string pageHtml = (i == 0) ? firstPage : DownloadPage(url + i);
                mytml = new html(pageHtml);
                DataTable dti = mytml.getAritcleTable(); // articles listed on this page
                ReportProgress(i * 100 / pagecount);

                for (int j = 0; j < dti.Rows.Count; j++)
                {
                    DataRow article = dti.Rows[j];
                    dt.Rows.Add(article.ItemArray);

                    string body = ExtractArticleBody(DownloadPage(arcurl + article[1] + ".html"));
                    // Same columns as the list row, with the summary replaced by the full article body.
                    dt2.Rows.Add(new object[] { article[0], article[1], body, article[3], article[4] });

                    // Each page owns an equal share of the bar; articles advance within that share,
                    // so the value can never exceed 100.
                    ReportProgress((i * 100 + j * 100 / dti.Rows.Count) / pagecount);

                    writeXML(dt2, "f:\\p\\" + article[1] + ".xml");
                    dt2.Rows.Clear();
                }
            }

            // Write the summary before binding so the table is not modified after the grid owns it.
            writeXML(dt, "f:\\p\\Articel.xml");

            RunOnUiThread(() =>
            {
                this.progressBar1.Value = this.progressBar1.Maximum;
                this.label1.Text = this.progressBar1.Value + "%";
                this.dataGridView1.DataSource = dt;
                textBoxDebug.Text = textBoxDebug.Text + "写入完毕\r\n";
            });
        }
        catch (WebException ex)
        {
            AppendDebug("采集失败: " + ex.Message + "\r\n");
        }
        catch (IOException ex)
        {
            AppendDebug("采集失败: " + ex.Message + "\r\n");
        }
        catch (UnauthorizedAccessException ex)
        {
            AppendDebug("采集失败: " + ex.Message + "\r\n");
        }
    }

    /// <summary>
    /// Creates an empty table with the five article columns used throughout the crawler.
    /// </summary>
    private static DataTable CreateArticleTable()
    {
        DataTable table = new DataTable();
        table.Columns.Add(new DataColumn("title"));       // title
        table.Columns.Add(new DataColumn("link"));        // article id taken from the link
        table.Columns.Add(new DataColumn("description")); // summary or full body
        table.Columns.Add(new DataColumn("pubDate"));     // publication time
        table.Columns.Add(new DataColumn("category"));    // category
        return table;
    }

    /// <summary>
    /// Downloads one page and decodes it as gb2312. The response and reader are always released.
    /// </summary>
    /// <param name="address">Absolute http address of the page.</param>
    /// <returns>The complete page text.</returns>
    private static string DownloadPage(string address)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
        request.Timeout = 3000; // milliseconds
        using (WebResponse response = request.GetResponse())
        using (StreamReader reader = new StreamReader(
            response.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Returns the text between the blog_text container's opening tag and the next "&lt;/div".
    /// An article whose body itself contains a div is therefore cut at that inner closing tag.
    /// </summary>
    /// <param name="pageHtml">Full HTML of an article page.</param>
    /// <returns>The body markup, or an empty string when the container is not present.</returns>
    private static string ExtractArticleBody(string pageHtml)
    {
        const string marker = "<div id=\"blog_text\" class=\"cnt\">";
        int markerAt = pageHtml.IndexOf(marker, StringComparison.Ordinal);
        if (markerAt < 0)
        {
            return string.Empty;
        }

        int start = markerAt + marker.Length;
        int end = pageHtml.IndexOf("</div", start, StringComparison.Ordinal);
        if (end < 0)
        {
            // Unterminated container: keep whatever follows the opening tag.
            return pageHtml.Substring(start);
        }
        return pageHtml.Substring(start, end - start);
    }

    /// <summary>
    /// Shows a percentage on the progress bar and its label, clamped to the bar's range.
    /// </summary>
    private void ReportProgress(int percent)
    {
        RunOnUiThread(() =>
        {
            int clamped = Math.Max(this.progressBar1.Minimum, Math.Min(this.progressBar1.Maximum, percent));
            this.progressBar1.Value = clamped;
            this.label1.Text = this.progressBar1.Value + "%";
        });
    }

    /// <summary>
    /// Appends text to the debug text box.
    /// </summary>
    private void AppendDebug(string text)
    {
        RunOnUiThread(() =>
        {
            textBoxDebug.Text = textBoxDebug.Text + text;
        });
    }

    /// <summary>
    /// Runs an action on the thread that owns this form. Controls may only be touched from
    /// that thread, so the worker routes every UI update through here.
    /// </summary>
    private void RunOnUiThread(MethodInvoker action)
    {
        if (this.IsDisposed || !this.IsHandleCreated)
        {
            return; // the window is gone; there is nothing left to update
        }

        try
        {
            if (this.InvokeRequired)
            {
                this.Invoke(action);
            }
            else
            {
                action();
            }
        }
        catch (InvalidOperationException)
        {
            // The window was closed between the check above and the call
            // (ObjectDisposedException derives from InvalidOperationException).
        }
    }

    /// <summary>
    /// Writes a table to an XML file (UTF-8), one element per row and one CDATA-wrapped
    /// child element per column. The table is renamed to "articels", which is used as the
    /// row element name.
    /// </summary>
    /// <param name="dt">Table to export.</param>
    /// <param name="fileName">Destination file; its directory is created when missing.</param>
    public void writeXML(DataTable dt, string fileName)
    {
        if (dt == null)
        {
            throw new ArgumentNullException("dt");
        }
        if (fileName == null)
        {
            throw new ArgumentNullException("fileName");
        }

        dt.TableName = "articels";

        System.Text.StringBuilder xml = new System.Text.StringBuilder();
        xml.Append("<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n");
        xml.Append("<?xml-stylesheet href=\"t.xsl\" type=\"text/xsl\"?>\r\n");
        xml.Append("<root>\r\n");
        foreach (DataRow row in dt.Rows)
        {
            xml.Append("<").Append(dt.TableName).Append(">\r\n");
            foreach (DataColumn column in dt.Columns)
            {
                // "]]>" inside the value would close the CDATA section early; split it
                // across two sections so the document stays well-formed.
                string value = Convert.ToString(row[column]).Replace("]]>", "]]]]><![CDATA[>");
                xml.Append("<").Append(column.ColumnName).Append(">\r\n<![CDATA[\r\n");
                xml.Append(value).Append("\r\n");
                xml.Append("]]>\r\n</").Append(column.ColumnName).Append(">\r\n");
            }
            xml.Append("</").Append(dt.TableName).Append(">\r\n");
        }
        xml.Append("</root>\r\n");

        string directory = Path.GetDirectoryName(fileName);
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }

        // Saved as UTF-8 to match the encoding declared in the XML prolog.
        using (StreamWriter w = new StreamWriter(fileName, false, System.Text.Encoding.UTF8))
        {
            w.Write(xml.ToString());
        }
    }
}
/// <summary>
/// Minimal string-scanning helper for one downloaded blog page. It does no real HTML
/// parsing; it only looks for fixed markers in the page text.
/// </summary>
class html
{
    // Raw text of the page this instance works on.
    string htmltext = string.Empty;

    /// <summary>
    /// Wraps the text of one page.
    /// </summary>
    /// <param name="htmltext">Page text; null is treated as an empty page.</param>
    public html(string htmltext)
    {
        this.htmltext = htmltext ?? string.Empty;
    }

    /// <summary>
    /// Returns the text of every match of a regular expression in the page, in document order.
    /// </summary>
    /// <param name="pattern">Regular expression to search for.</param>
    /// <returns>The matched strings; empty when nothing matches.</returns>
    public List<string> getElementsByRegex(string pattern)
    {
        List<string> found = new List<string>();
        foreach (System.Text.RegularExpressions.Match match in
            System.Text.RegularExpressions.Regex.Matches(htmltext, pattern))
        {
            found.Add(match.Value);
        }
        return found;
    }

    /// <summary>
    /// Extracts the article list from the page.
    /// </summary>
    /// <returns>
    /// A table with the columns title, link, description, pubDate and category, one row per
    /// article. The table is empty when the list region or its articles cannot be found;
    /// parsing stops at the first article whose blocks are incomplete.
    /// </returns>
    public DataTable getAritcleTable()
    {
        DataTable dt = new DataTable();
        dt.Columns.Add(new DataColumn("title"));
        dt.Columns.Add(new DataColumn("link"));
        dt.Columns.Add(new DataColumn("description"));
        dt.Columns.Add(new DataColumn("pubDate"));
        dt.Columns.Add(new DataColumn("category"));

        // The article list lies between these two markers.
        int start = htmltext.IndexOf("div id=\"m_blog\" class=\"modbox\">", StringComparison.Ordinal);
        int end = htmltext.IndexOf("<div id=\"mod_artclg\" class=\"mod\">", StringComparison.Ordinal);
        if (start < 0 || end <= start)
        {
            return dt;
        }
        string htm = htmltext.Substring(start, end - start);

        int position = 0; // where the search for the next block resumes
        while (true)
        {
            // Each article is a run of four blocks: tit, date, cnt, opt.
            string heading = ReadDiv(htm, "tit", ref position);
            if (heading == null)
            {
                break; // no more articles
            }
            string pubDate = ReadDiv(htm, "date", ref position);    // publication date
            string description = ReadDiv(htm, "cnt", ref position); // summary text
            string footer = ReadDiv(htm, "opt", ref position);
            if (pubDate == null || description == null || footer == null)
            {
                break; // truncated entry at the end of the region
            }

            // Title: text between the end of the anchor's opening tag and its closing tag.
            string title = Between(heading, ">", "</a>");
            // Link: first quoted attribute value, reduced to the file name without extension.
            string href = Between(heading, "\"", "\"");
            string link = (href == null) ? null : Between(href, "item/", ".html");
            if (title == null || link == null)
            {
                continue; // heading does not have the expected shape; skip this entry
            }

            dt.Rows.Add(new string[] { title, link, description, pubDate, ReadCategory(footer) });
        }
        return dt;
    }

    /// <summary>
    /// Reads the content of the next &lt;div class="..."&gt; block at or after
    /// <paramref name="position"/> and moves the position past its closing tag.
    /// Returns null, leaving the position untouched, when no complete block is found.
    /// </summary>
    private static string ReadDiv(string text, string cssClass, ref int position)
    {
        string open = "<div class=\"" + cssClass + "\">";
        int openAt = text.IndexOf(open, position, StringComparison.Ordinal);
        if (openAt < 0)
        {
            return null;
        }

        int contentStart = openAt + open.Length;
        int closeAt = text.IndexOf("</div>", contentStart, StringComparison.Ordinal);
        if (closeAt < 0)
        {
            return null;
        }

        position = closeAt + "</div>".Length;
        return text.Substring(contentStart, closeAt - contentStart);
    }

    /// <summary>
    /// Returns the text between the first occurrence of <paramref name="open"/> and the next
    /// occurrence of <paramref name="close"/> after it, or null when either is missing.
    /// </summary>
    private static string Between(string text, string open, string close)
    {
        int openAt = text.IndexOf(open, StringComparison.Ordinal);
        if (openAt < 0)
        {
            return null;
        }

        int from = openAt + open.Length;
        int closeAt = text.IndexOf(close, from, StringComparison.Ordinal);
        if (closeAt < 0)
        {
            return null;
        }
        return text.Substring(from, closeAt - from);
    }

    /// <summary>
    /// Extracts the category from an "opt" block: the text after the first colon up to the
    /// first closing anchor tag. Without a colon, everything before the closing anchor tag
    /// is returned; without a closing anchor tag, an empty string.
    /// </summary>
    private static string ReadCategory(string footer)
    {
        int anchorEnd = footer.IndexOf("</a>", StringComparison.Ordinal);
        if (anchorEnd < 0)
        {
            return string.Empty;
        }

        int colon = footer.IndexOf(":", StringComparison.Ordinal);
        int from = (colon >= 0 && colon < anchorEnd) ? colon + 1 : 0;
        return footer.Substring(from, anchorEnd - from);
    }
}