利用字符串截取的方法抓取所需内容
*注意*:截取前先查看下载到的 HTML 源码里是否包含用来定位的标记(例如 `</div>\r\n\r\n\r\n<!--正文内容 -->`),如果不包含,IndexOf 会返回 -1,Substring 就会报错!并且截取出来的内容里 div 要成对匹配,否则会影响页面样式。
用 while(html.IndexOf("<li>")!=-1){} 循环依次取出里面的每个标题;每循环一次执行 html=html.Substring(html.IndexOf("</a></li>")+"</a></li>".Length); 把已处理的那个 li 从 html 中去掉。
/// <summary>
/// Downloads the Hexun bank-news index page, extracts the title and URL of every
/// list item between the "temp01" and "listdh" div markers, fetches each article
/// through <c>GetContentHXWeb</c> and binds the collected articles to
/// <c>RepeaterBankInfo</c>.
/// </summary>
/// <param name="sender">Control that raised the click event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Linkbutton9_Click(object sender, System.EventArgs e)
{
    const string listStart = "<div class=\"temp01\">";
    const string listEnd = "<div class=\"listdh\">";
    const string spanEnd = "</span>";
    const string itemEnd = "</a></li>";
    const string titleMark = "blank\">";

    string html;
    // WebClient is IDisposable; release it as soon as the page bytes are decoded.
    using (WebClient wc = new WebClient())
    {
        byte[] b = wc.DownloadData("http://bank.hexun.com/lczx/");
        html = Encoding.GetEncoding("gb2312").GetString(b);
    }

    // Keep only the markup between the two list markers. If either marker is
    // missing the page layout has changed; bind an empty result instead of
    // letting Substring throw on a -1 index.
    int listFrom = html.IndexOf(listStart, StringComparison.Ordinal);
    int listTo = -1;
    if (listFrom >= 0)
    {
        listFrom += listStart.Length;
        listTo = html.IndexOf(listEnd, listFrom, StringComparison.Ordinal);
    }

    if (listTo >= 0)
    {
        html = html.Substring(listFrom, listTo - listFrom);
    }
    else
    {
        html = string.Empty;
    }

    while (html.IndexOf("<li>", StringComparison.Ordinal) != -1)
    {
        int itemClose = html.IndexOf(itemEnd, StringComparison.Ordinal);
        if (itemClose < 0)
        {
            // An unterminated trailing item cannot be parsed; stop here.
            break;
        }

        // The link markup sits between the first "</span>" and the "</a></li>"
        // that closes the item.
        string temp = null;
        int spanClose = html.IndexOf(spanEnd, StringComparison.Ordinal);
        if (spanClose >= 0 && spanClose + spanEnd.Length <= itemClose)
        {
            temp = html.Substring(spanClose + spanEnd.Length, itemClose - spanClose - spanEnd.Length);
        }

        // Drop the item just examined so the next pass sees the following <li>.
        html = html.Substring(itemClose + itemEnd.Length);

        if (temp == null)
        {
            continue;
        }

        int titleAt = temp.IndexOf(titleMark, StringComparison.Ordinal);
        int urlFrom = temp.IndexOf("http", StringComparison.Ordinal);
        int urlTo = -1;
        if (urlFrom >= 0)
        {
            // Look for the "html" extension only after the URL start so the
            // computed length can never be negative.
            urlTo = temp.IndexOf("html", urlFrom, StringComparison.Ordinal);
        }

        if (titleAt < 0 || urlTo < 0)
        {
            // Item does not have the expected link shape; skip it.
            continue;
        }

        string title = temp.Substring(titleAt + titleMark.Length);
        string url = temp.Substring(urlFrom, urlTo + 4 - urlFrom); // +4 keeps the "html" suffix

        Components.Entity.Article at = GetContentHXWeb(url, title, "235");
        if (at != null)
        {
            arr1.Add(at);
        }
    }

    RepeaterBankInfo.DataSource = arr1;
    RepeaterBankInfo.DataBind();
    rowsCount = arr1.Count;
}
// Scrapes the content of a single Hexun news article.
/// <summary>
/// Downloads the page at <paramref name="url"/>, cuts out the article body that
/// follows the "artibody" div, removes the "recommended reading" section when
/// present, and wraps the result in a new article entity.
/// </summary>
/// <param name="url">Address of the article page to download.</param>
/// <param name="title">Title stored on the returned article.</param>
/// <param name="type">Class id as text; converted with <c>Convert.ToInt32</c>.</param>
/// <returns>
/// The populated article, or <c>null</c> when the body markers cannot be found
/// or the body contains the "下一页" (next page) text.
/// </returns>
protected Components.Entity.Article GetContentHXWeb(string url, string title, string type)
{
    const string bodyStart = "<div class=\"concent\" id=\"artibody\">";
    const string bodyEnd = "\r\n\r\n\r\n\r\n</div>\r\n\r\n\r\n<!--正文内容 -->";
    const string promoStart = "<!--新增加推荐阅读 -->";
    const string promoEnd = "<!--新增加推荐阅读 结束-->";

    Components.Entity.Article at = null;

    // Download the page; WebClient is IDisposable, so release it right away.
    string htmlContent;
    using (System.Net.WebClient wc = new System.Net.WebClient())
    {
        byte[] bt = wc.DownloadData(url);
        // Decode explicitly instead of with the machine-dependent Encoding.Default.
        // NOTE(review): assumes article pages use the same gb2312 encoding as the
        // index page read in Linkbutton9_Click — confirm.
        htmlContent = System.Text.Encoding.GetEncoding("gb2312").GetString(bt);
    }

    // Locate the article body. A missing marker means the layout is not the one
    // this scraper understands, so report "no article" rather than throwing.
    int bodyFrom = htmlContent.IndexOf(bodyStart, StringComparison.Ordinal);
    if (bodyFrom < 0)
    {
        return null;
    }

    bodyFrom += bodyStart.Length;
    int bodyTo = htmlContent.IndexOf(bodyEnd, bodyFrom, StringComparison.Ordinal);
    if (bodyTo < 0)
    {
        return null;
    }

    htmlContent = htmlContent.Substring(bodyFrom, bodyTo - bodyFrom);

    // Cut out the "recommended reading" section: keep what precedes its opening
    // marker and everything from its closing marker onward (the closing marker
    // comment itself is retained). If the closing marker is absent the content
    // is left untouched.
    int promoFrom = htmlContent.IndexOf(promoStart, StringComparison.Ordinal);
    if (promoFrom != -1)
    {
        int promoTo = htmlContent.IndexOf(promoEnd, promoFrom, StringComparison.Ordinal);
        if (promoTo != -1)
        {
            htmlContent = htmlContent.Substring(0, promoFrom) + htmlContent.Substring(promoTo);
        }
    }

    // Bodies containing the "next page" text are skipped (null is returned).
    if (htmlContent.IndexOf("下一页", StringComparison.Ordinal) < 0)
    {
        at = new Components.Entity.Article();
        at.ClassId = Convert.ToInt32(type);
        at.Date = DateTime.Now;
        at.Title = title;
        at.Content = Components.Formater.trimTagA(htmlContent);
        at.PassDate = DateTime.Now;
        at.Pass = 1;
        at.PassUserId = Components.Context.GetAdministrator().Id;
        at.TopicType = (int)Components.Enum.TopicType.Bank;
    }

    return at;
}
取出新闻内容后,调用 trimTagA 方法去掉其中的链接(保留链接文字):
/// <summary>
/// Removes hyperlink markup from an HTML fragment while keeping the text that
/// was inside the links.
/// </summary>
/// <param name="strHtml">HTML fragment to clean; may be null or empty.</param>
/// <returns>
/// <paramref name="strHtml"/> with every opening <c>&lt;a ...&gt;</c> and closing
/// <c>&lt;/a&gt;</c> tag removed. Letter case of the remaining content is
/// preserved. Null or empty input is returned unchanged.
/// </returns>
public static string trimTagA(string strHtml)
{
    if (strHtml == null || strHtml.Length == 0)
    {
        return strHtml;
    }

    // Match tags case-insensitively instead of lowercasing the whole document,
    // which would also rewrite the article text. "<a" must be followed by
    // whitespace or ">" so that tags such as <abbr> or <address> are left alone.
    // Opening and closing tags are stripped independently, so an unmatched tag
    // cannot cause an out-of-range Substring.
    return System.Text.RegularExpressions.Regex.Replace(
        strHtml,
        @"<a(?:\s[^>]*)?>|</a\s*>",
        string.Empty,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}
下载事件
/// <summary>
/// Inserts one article for every row of <c>RepeaterBankInfo</c> whose
/// "BookContentCheck" checkbox is ticked, taking the title and content from the
/// row's "lbTitle" and "lbContent" labels, then writes a completion alert script
/// to the response.
/// </summary>
/// <param name="sender">Control that raised the click event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btDownLoad_Click(object sender, System.EventArgs e)
{
    // A single pass is enough: rows that are not ticked are skipped, so no
    // separate "is anything selected" count is needed.
    for (int i = 0; i < RepeaterBankInfo.Items.Count; i++)
    {
        CheckBox ck = RepeaterBankInfo.Items[i].FindControl("BookContentCheck") as CheckBox;
        // FindControl returns null when the control is absent and "as" yields
        // null on a type mismatch; treat both as "not selected".
        if (ck == null || !ck.Checked)
        {
            continue;
        }

        Label titleLabel = RepeaterBankInfo.Items[i].FindControl("lbTitle") as Label;
        Label contentLabel = RepeaterBankInfo.Items[i].FindControl("lbContent") as Label;
        if (titleLabel == null || contentLabel == null)
        {
            // Row lacks the expected labels; there is nothing to save for it.
            continue;
        }

        Components.Entity.Article at = new Components.Entity.Article();
        at.ClassId = Convert.ToInt32(strType);
        at.Date = DateTime.Now;
        at.Title = titleLabel.Text;
        at.Content = contentLabel.Text;
        at.PassDate = DateTime.Now;
        at.Pass = 1;
        at.PassUserId = Components.Context.GetAdministrator().Id;
        at.TopicType = (int)Components.Enum.TopicType.Bank;
        at.Insert();
    }

    // The alert is emitted whether or not any row was selected.
    Response.Write("<script>alert('下载完成!')</script>");
}