F#---百度新闻
// Baidu News headline scraper (work in progress).
// Downloads the page at `url`, gathers the text found three levels below the
// <dl> elements selected by the XPath further down, and saves it to a .txt
// file on drive D:.
#if INTERACTIVE
#r @"C:\Users\v-shuzhu\Desktop\HtmlAgilityPack.dll"
#endif

open System
open System.Diagnostics
open System.Net
open System.Xml
open System.IO
open HtmlAgilityPack

/// Downloads `newUrl` and parses the response body into an HtmlDocument.
/// The body is loaded with the gb2312 encoding (byte-order-mark detection is
/// enabled through the last argument of Load), and the document is switched
/// to XML output mode before it is returned.
/// Despite the name, this call blocks: the async workflow is executed with
/// Async.RunSynchronously and the parsed document is returned directly.
let asyncGrapUrl(newUrl : string) =
    async {
        let httpRequest = HttpWebRequest.Create(newUrl) :?> HttpWebRequest
        // `use!` / `use` release the response and its stream as soon as the
        // document has been fully loaded, instead of leaking the connection.
        use! httpRespon = Async.AwaitTask(httpRequest.GetResponseAsync())
        use responStream = httpRespon.GetResponseStream()
        let xml = new HtmlDocument()
        xml.Load(responStream, Text.Encoding.GetEncoding("gb2312"), true)
        xml.OptionOutputAsXml <- true
        return xml
    }
    |> Async.RunSynchronously

//let url = @"http://feed.cnblogs.com/blog/sitehome/rss"
let url = @"http://www.news.baidu.com"
let html = asyncGrapUrl(url)
let htmlnode = html.DocumentNode

// GetElementbyId returns null when the id is absent; fail with a readable
// message instead of a NullReferenceException on the next line.
let content =
    match html.GetElementbyId("container") with
    | null -> failwithf "No element with id 'container' was found at %s" url
    | node -> node

// SelectNodes returns null (not an empty collection) when nothing matches.
let dls =
    match content.SelectNodes("/html[1]/body[1]/div[3]/div[3]/div/div/dl") with
    | null -> failwithf "No <dl> elements matched the expected layout at %s" url
    | nodes -> nodes

/// Direct children of `node` as a typed sequence.
let childNodes (node : HtmlNode) = node.ChildNodes :> seq<HtmlNode>

// Children of every <dl>.
let dds = dls |> Seq.collect childNodes

// Walk two more levels down, keep the non-blank inner texts, normalise the
// line breaks and join everything into a single string.
let yyc =
    dds
    |> Seq.collect childNodes
    |> Seq.collect childNodes
    |> Seq.map (fun li -> li.InnerText)
    |> Seq.filter (fun txt -> txt.Trim() <> "")
    |> Seq.map (fun str -> System.Text.RegularExpressions.Regex.Replace(str, "\n[ ]*", System.Environment.NewLine))
    // String.concat builds the result in one pass (the fold with (+) was quadratic).
    |> String.concat ""
    // Drop leading blank lines from the combined text.
    |> fun str -> System.Text.RegularExpressions.Regex.Replace(str, @"^[ \t\n]*\n", "")

let fileNameTxt = @"D:\" + url.Replace('.','0').Replace('/','0').Replace(':','0') + ".txt"
// FileMode.Create truncates an existing file, so a shorter result never
// leaves stale bytes from a previous run behind.
let fileInfo = new System.IO.FileStream(fileNameTxt, FileMode.Create, FileAccess.ReadWrite)
let writer = new System.IO.StreamWriter(fileInfo, Text.Encoding.UTF8)
try
    writer.WriteLine(yyc)
finally
    // Disposing the writer flushes its buffer and closes the underlying stream;
    // without this the file could be left empty or truncated.
    writer.Dispose()
还不完善，先保存一下……
目前可以把百度新闻页面里的新闻标题抓取出来；当然，要抓取这些标题对应的链接也很简单。下一步要实现的是：选择某个标题，并抓取该标题对应的具体内容。