下一站天后

今朝的容颜老于昨晚

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::

 

 

 

利用字符串截取的方法抓取所需内容

*注意*:截取之前要先确认抓取到的页面源码里确实包含用作截取标记的字符串(例如 </div>\r\n\r\n\r\n<!--正文内容 -->),否则 Substring 会报错!并且截取出来的 div 标签要成对匹配,否则会影响页面样式。

用 while(html.IndexOf("<li>")!=-1){} 循环得到里面每个标题;循环体末尾执行 html=html.Substring(html.IndexOf("</a></li>")+"</a></li>".Length); 每循环一次排除一个 li。

/// <summary>
/// Downloads the hexun.com bank-news index page, cuts the link list out of
/// it with plain string searches, fetches every linked article through
/// <c>GetContentHXWeb</c> and binds the collected articles to
/// <c>RepeaterBankInfo</c>.
/// </summary>
/// <param name="sender">Control that raised the click event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Linkbutton9_Click(object sender, System.EventArgs e)
{
    // Markers that delimit the link list and each entry inside it.
    const string listStart = "<div class=\"temp01\">";
    const string listEnd = "<div class=\"listdh\">";
    const string itemOpen = "<li>";
    const string spanClose = "</span>";
    const string itemClose = "</a></li>";
    const string titlePrefix = "blank\">";

    string html;

    // WebClient is IDisposable; release it as soon as the download is done.
    using (WebClient wc = new WebClient())
    {
        byte[] b = wc.DownloadData("http://bank.hexun.com/lczx/");
        html = Encoding.GetEncoding("gb2312").GetString(b);
    }

    // Keep only the HTML between the two list markers. If the page layout
    // changed and a marker is missing, parse nothing instead of letting
    // Substring throw on a negative index.
    int listStartPos = html.IndexOf(listStart);
    int listEndPos = -1;
    if (listStartPos >= 0)
    {
        listStartPos += listStart.Length;
        listEndPos = html.IndexOf(listEnd, listStartPos);
    }

    if (listEndPos >= 0)
    {
        html = html.Substring(listStartPos, listEndPos - listStartPos);
    }
    else
    {
        html = "";
    }

    while (html.IndexOf(itemOpen) != -1)
    {
        int closePos = html.IndexOf(itemClose);
        if (closePos < 0)
        {
            // An <li> without the expected "</a></li>" terminator: nothing
            // more can be consumed, so stop rather than loop forever.
            break;
        }

        int spanPos = html.IndexOf(spanClose);
        if (spanPos >= 0 && spanPos + spanClose.Length <= closePos)
        {
            // Text between "</span>" and "</a></li>" is one anchor: its
            // opening tag (with the href) followed by the link text.
            int anchorStart = spanPos + spanClose.Length;
            string temp = html.Substring(anchorStart, closePos - anchorStart);

            int titlePos = temp.IndexOf(titlePrefix);
            int httpPos = temp.IndexOf("http");
            int htmlPos = httpPos < 0 ? -1 : temp.IndexOf("html", httpPos);

            if (titlePos >= 0 && htmlPos >= 0)
            {
                // Title is everything after target="_blank">.
                string title = temp.Substring(titlePos + titlePrefix.Length);
                // URL runs from "http" through the end of "html".
                string url = temp.Substring(httpPos, htmlPos + 4 - httpPos);

                Components.Entity.Article at = GetContentHXWeb(url, title, "235");
                if (at != null)
                {
                    arr1.Add(at);
                }
            }
        }

        // Drop the entry just handled so the next pass sees the next <li>.
        html = html.Substring(closePos + itemClose.Length);
    }

    RepeaterBankInfo.DataSource = arr1;
    RepeaterBankInfo.DataBind();
    rowsCount = arr1.Count;
}


 

//和讯新闻内容抓取
        protected Components.Entity.Article GetContentHXWeb(string url,string title,string 

type)
        
{
            Components.Entity.Article at 
= null;
           
//根据地址来绑定显示数据
            System.Net.WebClient wc = new System.Net.WebClient();
            
byte[]  bt = wc.DownloadData(url);
            
string htmlContent  = System.Text.Encoding.Default.GetString(bt);           

            
//得到正文内容
            htmlContent=htmlContent.Substring(htmlContent.IndexOf("<div class=\"concent\" 

id
=\"artibody\">")+"<div class=\"concent\" id=\"artibody\">".Length,htmlContent.IndexOf

(
"\r\n\r\n\r\n\r\n</div>\r\n\r\n\r\n<!--正文内容 -->")-htmlContent.IndexOf("<div 

class=\"concent\" id=\"artibody\">")-"<div class=\"concent\" id=\"artibody\">".Length);
            if(htmlContent.IndexOf("<!--新增加推荐阅读 -->")!=-1)
            
{
                
string htmlContent1=htmlContent.Substring(0,htmlContent.IndexOf("<!--新增

加推荐阅读 
-->"));//推荐前面的内容
                string htmlContent2=htmlContent.Substring(htmlContent.IndexOf("<!--新增加

推荐阅读 结束
-->"));//推荐后面的内容
                htmlContent=htmlContent1+htmlContent2;
            }
            

            
if(htmlContent.IndexOf("下一页")<0)
            
{
                at 
= new Components.Entity.Article();
                at.ClassId 
= Convert.ToInt32(type);
                at.Date 
= DateTime.Now;
                at.Title 
=title;
                at.Content
=  Components.Formater.trimTagA(htmlContent);
                at.PassDate 
= DateTime.Now;
                at.Pass 
= 1;
                at.PassUserId 
= Components.Context.GetAdministrator().Id;
                at.TopicType 
= (int)Components.Enum.TopicType.Bank;
            }

            
return at;

        }


取出新闻内容去掉里面的链接,调用方法trimTagA

 

/// <summary>
/// Removes hyperlink tags (<c>&lt;a ...&gt;</c> and <c>&lt;/a&gt;</c>) from an
/// HTML fragment while keeping whatever was between them.
/// </summary>
/// <param name="strHtml">HTML fragment to clean; may be null or empty.</param>
/// <returns>
/// The fragment without anchor tags. Tag matching is case-insensitive, and
/// all remaining text keeps its original casing. A null or empty input is
/// returned as is.
/// </returns>
public static string trimTagA(string strHtml)
{
    if (strHtml == null || strHtml.Length == 0)
    {
        return strHtml;
    }

    // "<a" must be followed by whitespace, ">" or "/" so that other tags
    // starting with the letter a (<abbr>, <area>, <address>, ...) are left
    // alone. Opening and closing tags are removed independently, so an
    // unmatched tag cannot cause an exception or an endless loop.
    return System.Text.RegularExpressions.Regex.Replace(
        strHtml,
        @"<a(?=[\s>/])[^>]*>|</a\s*>",
        "",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}

 

下载事件

    
/// <summary>
/// Saves every checked row of <c>RepeaterBankInfo</c> as a new article and
/// then shows a completion alert in the browser.
/// </summary>
/// <param name="sender">Control that raised the click event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btDownLoad_Click(object sender, System.EventArgs e)
{
    // One pass is enough: only checked rows are inserted, so counting the
    // checked rows beforehand adds nothing.
    for (int i = 0; i < RepeaterBankInfo.Items.Count; i++)
    {
        CheckBox ck = RepeaterBankInfo.Items[i].FindControl("BookContentCheck") as CheckBox;
        if (ck == null || !ck.Checked)
        {
            continue;
        }

        Label titleLabel = RepeaterBankInfo.Items[i].FindControl("lbTitle") as Label;
        Label contentLabel = RepeaterBankInfo.Items[i].FindControl("lbContent") as Label;
        if (titleLabel == null || contentLabel == null)
        {
            // The row template lacks the expected labels; there is nothing
            // to save for this row.
            continue;
        }

        Components.Entity.Article at = new Components.Entity.Article();
        at.ClassId = Convert.ToInt32(strType);
        at.Date = DateTime.Now;
        at.Title = titleLabel.Text;
        at.Content = contentLabel.Text;
        at.PassDate = DateTime.Now;
        at.Pass = 1;
        at.PassUserId = Components.Context.GetAdministrator().Id;
        at.TopicType = (int)Components.Enum.TopicType.Bank;
        at.Insert();
    }

    // The alert is written whether or not any row was checked.
    Response.Write("<script>alert('下载完成!')</script>");
}
 

posted on 2008-09-17 21:44  孙雅玲  阅读(503)  评论(1)  编辑  收藏  举报