小偷程序获取新闻标题及内容

利用字符串截取的方法抓取所需内容

*注意*：截取之前先确认下载到的内容里确实包含要截取的标记（例如 `</div>\r\n\r\n\r\n`），否则 Substring 会报错！并且截取出来的 div 要成对匹配，否则会影响页面样式。

用 `while(html.IndexOf("<li>")!=-1){}` 循环得到里面每个标题；每循环一次执行 `html=html.Substring(html.IndexOf("</a></li>")+"</a></li>".Length);`，排除已处理的一个 li。

/// <summary>
/// Downloads the hexun.com bank news list page, extracts the title and article
/// URL of every list item between the "temp01" and "listdh" divs, fetches each
/// article through <c>GetContentHXWeb</c>, and binds the resulting articles to
/// <c>RepeaterBankInfo</c>.
/// </summary>
/// <remarks>
/// The page is parsed by plain substring search. Items whose expected markers
/// are missing or out of order are skipped instead of throwing; if the list
/// container itself cannot be located, nothing is added for this request.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Linkbutton9_Click(object sender, System.EventArgs e)
{
    const string listStart = "<div class=\"temp01\">";
    const string listEnd = "<div class=\"listdh\">";
    const string itemStart = "<li>";
    const string itemEnd = "</a></li>";
    const string spanEnd = "</span>";
    const string titleMarker = "blank\">";

    string html;
    using (WebClient wc = new WebClient())
    {
        byte[] b = wc.DownloadData("http://bank.hexun.com/lczx/");
        html = Encoding.GetEncoding("gb2312").GetString(b);
    }

    // Keep only the markup between the list container and the pager div.
    int listStartPos = html.IndexOf(listStart, StringComparison.Ordinal);
    int listBodyPos = listStartPos + listStart.Length;
    int listEndPos = listStartPos < 0
        ? -1
        : html.IndexOf(listEnd, listBodyPos, StringComparison.Ordinal);
    if (listEndPos < 0)
    {
        // Page layout changed or the download was truncated: nothing to parse.
        html = string.Empty;
    }
    else
    {
        html = html.Substring(listBodyPos, listEndPos - listBodyPos);
    }

    while (html.IndexOf(itemStart, StringComparison.Ordinal) != -1)
    {
        int itemEndPos = html.IndexOf(itemEnd, StringComparison.Ordinal);
        if (itemEndPos < 0)
        {
            // An unterminated <li> cannot be consumed; stop rather than loop or throw.
            break;
        }

        // The link markup sits between the first </span> and the closing </a></li>.
        int spanPos = html.IndexOf(spanEnd, StringComparison.Ordinal);
        int linkPos = spanPos + spanEnd.Length;
        if (spanPos >= 0 && linkPos <= itemEndPos)
        {
            string temp = html.Substring(linkPos, itemEndPos - linkPos); // one link

            int titlePos = temp.IndexOf(titleMarker, StringComparison.Ordinal);
            int urlStart = temp.IndexOf("http", StringComparison.Ordinal);
            // Search for "html" only after "http" so the URL length can never be negative.
            int urlEnd = urlStart < 0
                ? -1
                : temp.IndexOf("html", urlStart, StringComparison.Ordinal);

            if (titlePos >= 0 && urlEnd >= 0)
            {
                string title = temp.Substring(titlePos + titleMarker.Length); // link text
                string url = temp.Substring(urlStart, urlEnd + 4 - urlStart); // article address

                Components.Entity.Article at = GetContentHXWeb(url, title, "235");
                if (at != null)
                {
                    arr1.Add(at);
                }
            }
        }

        // Drop the <li> just processed so the next iteration sees the following one.
        html = html.Substring(itemEndPos + itemEnd.Length);
    }

    RepeaterBankInfo.DataSource = arr1;
    RepeaterBankInfo.DataBind();
    rowsCount = arr1.Count;
}

//和讯新闻内容抓取
        /// <summary>
        /// Downloads one hexun.com article page and wraps its body in an
        /// <c>Article</c> entity.
        /// </summary>
        /// <param name="url">Absolute address of the article page.</param>
        /// <param name="title">Title to store on the article.</param>
        /// <param name="type">Class id as text; converted with <c>Convert.ToInt32</c>.</param>
        /// <returns>
        /// The populated article, or <c>null</c> when the body markers cannot be
        /// found or the body contains "下一页" (a next-page link), in which case
        /// the article is skipped.
        /// </returns>
        protected Components.Entity.Article GetContentHXWeb(string url,string title,string type)
        {
            const string bodyStart = "<div class=\"concent\" id=\"artibody\">";
            const string bodyEnd = "\r\n\r\n\r\n\r\n</div>\r\n\r\n\r\n";

            string htmlContent;
            using (System.Net.WebClient wc = new System.Net.WebClient())
            {
                byte[] bt = wc.DownloadData(url);
                // Decode as gb2312, like the list page handler in this file, instead of
                // the machine-dependent Encoding.Default.
                htmlContent = System.Text.Encoding.GetEncoding("gb2312").GetString(bt);
            }

            // Cut out the article body; give up on this article when either marker is
            // missing or they appear in the wrong order (previously this threw).
            int startPos = htmlContent.IndexOf(bodyStart, StringComparison.Ordinal);
            if (startPos < 0)
            {
                return null;
            }
            int contentPos = startPos + bodyStart.Length;
            int endPos = htmlContent.IndexOf(bodyEnd, contentPos, StringComparison.Ordinal);
            if (endPos < 0)
            {
                return null;
            }
            htmlContent = htmlContent.Substring(contentPos, endPos - contentPos);

            // NOTE(review): the original had a step here that split the body around a
            // "recommendation" marker, but both marker strings were empty (""), so it
            // rebuilt the same string. The real markers appear to have been lost;
            // restore the step once they are known.

            Components.Entity.Article at = null;
            if (htmlContent.IndexOf("下一页", StringComparison.Ordinal) < 0)
            {
                DateTime now = DateTime.Now;
                at = new Components.Entity.Article();
                at.ClassId = Convert.ToInt32(type);
                at.Date = now;
                at.Title = title;
                at.Content = Components.Formater.trimTagA(htmlContent);
                at.PassDate = now;
                at.Pass = 1;
                at.PassUserId = Components.Context.GetAdministrator().Id;
                at.TopicType = (int)Components.Enum.TopicType.Bank;
            }
            return at;
        }

取出新闻内容去掉里面的链接，调用方法trimTagA

/// <summary>
/// Removes anchor tags from an HTML fragment while keeping the text inside them.
/// </summary>
/// <remarks>
/// Only the <c>&lt;a ...&gt;</c> and <c>&lt;/a&gt;</c> tags themselves are removed,
/// matched case-insensitively. The rest of the markup keeps its original casing
/// (the earlier version lowercased the whole string). Tags that merely start
/// with "a", such as <c>&lt;abbr&gt;</c>, are left alone, and an unbalanced
/// anchor tag is stripped rather than causing an exception.
/// </remarks>
/// <param name="strHtml">HTML fragment to clean; may be null or empty.</param>
/// <returns>The fragment without anchor tags; null or empty input is returned unchanged.</returns>
public static string trimTagA(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return strHtml;
    }

    // "<a" or "</a", then either ">" directly or whitespace followed by attributes.
    // Requiring whitespace or ">" after the "a" keeps <abbr>, <area>, ... intact.
    return System.Text.RegularExpressions.Regex.Replace(
        strHtml,
        @"</?a(\s[^>]*)?>",
        string.Empty,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase
            | System.Text.RegularExpressions.RegexOptions.CultureInvariant);
}

下载事件

    /// <summary>
    /// Saves every article the user ticked in <c>RepeaterBankInfo</c>.
    /// </summary>
    /// <remarks>
    /// For each repeater row whose "BookContentCheck" checkbox is checked, an
    /// <c>Article</c> is built from the row's "lbTitle" and "lbContent" labels and
    /// <c>Insert()</c> is called on it. Rows that lack any of the three controls
    /// are skipped. The completion alert is written to the response whether or
    /// not any row was selected, as before.
    /// </remarks>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void btDownLoad_Click(object sender, System.EventArgs e)
    {
        // A single pass is enough: rows that are not checked are simply ignored,
        // so counting the checked rows first is unnecessary.
        for (int i = 0; i < RepeaterBankInfo.Items.Count; i++)
        {
            CheckBox ck = RepeaterBankInfo.Items[i].FindControl("BookContentCheck") as CheckBox;
            if (ck == null || !ck.Checked)
            {
                continue;
            }

            Label titleLabel = RepeaterBankInfo.Items[i].FindControl("lbTitle") as Label;
            Label contentLabel = RepeaterBankInfo.Items[i].FindControl("lbContent") as Label;
            if (titleLabel == null || contentLabel == null)
            {
                // Row template does not contain the expected labels; nothing to save.
                continue;
            }

            // One timestamp so Date and PassDate are identical.
            DateTime now = DateTime.Now;

            Components.Entity.Article at = new Components.Entity.Article();
            at.ClassId = Convert.ToInt32(strType);
            at.Date = now;
            at.Title = titleLabel.Text;
            at.Content = contentLabel.Text;
            at.PassDate = now;
            at.Pass = 1;
            at.PassUserId = Components.Context.GetAdministrator().Id;
            at.TopicType = (int)Components.Enum.TopicType.Bank;
            at.Insert();
        }

        Response.Write("<script>alert('下载完成！')</script>");
    }

posted on 2008-09-17 21:44 孙雅玲阅读(503) 评论(1) 编辑收藏举报

刷新页面返回顶部

下一站天后

公告