F#---百度新闻

#if INTERACTIVE
#r @"C:\Users\v-shuzhu\Desktop\HtmlAgilityPack.dll"
#endif
open System
open System.Diagnostics
open System.Net
open System.Xml
open System.IO
open HtmlAgilityPack
   
/// Downloads the page at `newUrl` and parses it into an HtmlAgilityPack
/// `HtmlDocument`.
///
/// Despite the name, this function blocks: the async workflow is executed
/// with `Async.RunSynchronously` and the parsed document is returned directly.
///
/// The response body is decoded as gb2312; the `true` argument passed to
/// `Load` asks HtmlAgilityPack to detect the encoding from a byte-order mark
/// when one is present. `OptionOutputAsXml` is switched on for the returned
/// document.
///
/// Exceptions raised by the request (e.g. `WebException`, `UriFormatException`)
/// are not caught here and propagate to the caller.
let asyncGrapUrl(newUrl : string) =
    async {
        let httpRequest = HttpWebRequest.Create(newUrl) :?> HttpWebRequest
        // `use!` / `use` dispose the response and its stream once parsing is
        // done, so the underlying connection is released instead of leaking.
        use! httpRespon = Async.AwaitTask(httpRequest.GetResponseAsync())
        use responStream = httpRespon.GetResponseStream()

        let xml = new HtmlDocument()
        // `Load` consumes the stream before returning, so disposing the
        // stream when this workflow finishes is safe.
        xml.Load(responStream, Text.Encoding.GetEncoding("gb2312"), true)
        xml.OptionOutputAsXml <- true

        return xml
    } |> Async.RunSynchronously
     

//let url = @"http://feed.cnblogs.com/blog/sitehome/rss"
// Address of the page that is downloaded and scraped below.
let url = @"http://www.news.baidu.com"

// Parsed document for `url` and its root node.
let html = asyncGrapUrl(url) 
let htmlnode = html.DocumentNode

// `GetElementbyId` returns null when no element has the requested id; fail
// with a descriptive message instead of a bare NullReferenceException later.
let content =
    match html.GetElementbyId("container") with
    | null -> failwithf "No element with id 'container' was found at %s" url
    | node -> node

// `SelectNodes` returns null (not an empty collection) when the XPath matches
// nothing, which would otherwise surface as an ArgumentNullException in Seq.
let dls =
    match content.SelectNodes("/html[1]/body[1]/div[3]/div[3]/div/div/dl") with
    | null -> failwithf "No <dl> nodes matched the expected layout at %s" url
    | nodes -> nodes

// All direct children of every matched <dl>, flattened into one sequence.
let dds = dls |> Seq.collect (fun dl -> dl.ChildNodes)
     
/// Text extracted from the page: the `InnerText` of every node two levels
/// below each entry of `dds`, with whitespace-only entries dropped, joined
/// into a single string.
///
/// Within each entry, a line feed followed by any run of spaces is replaced
/// by `Environment.NewLine`; leading blank lines are then removed from the
/// joined result.
let yyc = 
    dds 
    |> Seq.collect (fun dd -> dd.ChildNodes)
    |> Seq.collect (fun child -> child.ChildNodes)
    |> Seq.map (fun leaf -> leaf.InnerText)
    |> Seq.filter (fun txt -> txt.Trim() <> "")
    |> Seq.map (fun txt ->
        System.Text.RegularExpressions.Regex.Replace(txt, "\n[ ]*", System.Environment.NewLine))
    // String.concat builds the result in one pass; folding with (+) copies the
    // accumulated string on every step.
    |> String.concat ""
    // `\r` is part of the class because Environment.NewLine is "\r\n" on
    // Windows; without it the leading blank lines inserted above never match.
    |> fun joined -> System.Text.RegularExpressions.Regex.Replace(joined, @"^[ \t\r\n]*\n", "")

// Output path: the URL with '.', '/' and ':' each replaced by '0', under D:\.
let fileNameTxt = @"D:\" + url.Replace('.','0').Replace('/','0').Replace(':','0') + ".txt"
// FileMode.Create truncates an existing file; OpenOrCreate would leave stale
// trailing bytes behind whenever the previous output was longer.
let fileInfo = new System.IO.FileStream(fileNameTxt,FileMode.Create,FileAccess.ReadWrite)
let writer = new System.IO.StreamWriter(fileInfo,Text.Encoding.UTF8)
try
    writer.WriteLine(yyc)
finally
    // Disposing the writer flushes its buffer and closes the underlying
    // FileStream; without this the text may never reach the disk.
    writer.Dispose()

还不完善,先保存一下……
目前可以把百度新闻页面里的新闻标题抓出来;要抓取这些标题对应的链接也很简单。下一步将实现选择标题,并抓取相应标题的具体内容。

posted @ 2013-01-05 17:55  ZackZhou  阅读(272)  评论(0)  编辑  收藏  举报