F# -- Grab Web page
open System
open System.Diagnostics
open System.IO
open System.Net
open System.Xml
//open HtmlAgilityPack

/// Downloads the resource at `newUrl` and stores the raw response bytes in a
/// file under D:\ whose name is derived from the URL.
///
/// newUrl  : absolute HTTP(S) address to fetch.
/// returns : full path of the file that was written.
///
/// The call blocks until the download has finished (the async workflow is run
/// synchronously). Network and file-system exceptions propagate to the caller.
let asyncGrapUrl (newUrl : string) =
    async {
        // '.', '/' and ':' are mapped to '0' as before; any other character the
        // file system rejects ('?', '&', '*', ...) is mapped as well so that
        // article URLs carrying a query string still yield a usable file name.
        let invalidChars = Path.GetInvalidFileNameChars()
        let safeName =
            newUrl
            |> String.map (fun c ->
                if c = '.' || c = '/' || c = ':' || Array.exists ((=) c) invalidChars
                then '0'
                else c)
        let fileName = @"D:\" + safeName + ".xml"

        let httpRequest = HttpWebRequest.Create(newUrl) :?> HttpWebRequest
        // `use` / `use!` guarantee the response and both streams are released
        // even when the copy fails half-way.
        use! httpRespon = Async.AwaitTask(httpRequest.GetResponseAsync())
        use responStream = httpRespon.GetResponseStream()
        // FileMode.Create truncates an existing file; OpenOrCreate would leave
        // stale bytes behind whenever the new download is shorter.
        use fileStream = new FileStream(fileName, FileMode.Create, FileAccess.Write)
        // Copy the bytes untouched. Decoding and re-encoding the text here
        // (previously UTF-8 in, GB2312 out) corrupted every non-ASCII character
        // and contradicted the encoding declared inside the document.
        responStream.CopyTo(fileStream)
        return fileName
    }
    |> Async.RunSynchronously

/// Script entry point: downloads the BBC science/technology RSS feed, lists
/// the item titles, asks the user for an item number on stdin, opens that
/// item's link with the system default handler and saves the linked page.
let main () =
    let url = @"http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/sci/tech/rss.xml"
    let filename = asyncGrapUrl url

    // Load from a byte stream so XmlDocument picks the encoding up from the
    // BOM / XML declaration instead of having one forced upon it.
    let loadFeed (path : string) =
        let doc = new XmlDocument()
        use feedStream = new FileStream(path, FileMode.Open, FileAccess.Read)
        doc.Load(feedStream)
        doc
    let xml = loadFeed filename

    // Enumerate the items once and use the same list for display and lookup,
    // so the number shown always refers to the item that gets opened.
    let items = xml.SelectNodes("/rss/channel/item")
    for i in 0 .. items.Count - 1 do
        let title =
            match items.[i].SelectSingleNode("title") with
            | null -> ""
            | titleNode -> titleNode.InnerText
        printfn "%d : %s" (i + 1) title

    let input = Console.ReadLine()
    match Int32.TryParse(input) with
    | true, item when item >= 1 && item <= items.Count ->
        match items.[item - 1].SelectSingleNode("link") with
        | null ->
            eprintfn "Item %d has no link." item
        | linkNode ->
            let newUrl = linkNode.InnerText.Trim()
            // The link comes from the downloaded feed; only hand it to the
            // shell when it really is a web address.
            match Uri.TryCreate(newUrl, UriKind.Absolute) with
            | true, uri when uri.Scheme = Uri.UriSchemeHttp || uri.Scheme = Uri.UriSchemeHttps ->
                let proStart = new ProcessStartInfo(UseShellExecute = true, FileName = newUrl)
                use proc = new Process()
                proc.StartInfo <- proStart
                proc.Start() |> ignore
                asyncGrapUrl newUrl |> ignore
            | _ ->
                eprintfn "Refusing to open non-HTTP link: %s" newUrl
    | _ ->
        eprintfn "Please enter a number between 1 and %d." items.Count

main ()
目前还没有完善,中文乱码。在英文系统下,没有实现抓取正文,只获取全部源代码。