// Interactive Web Crawling with F#

//---------------------------------------------------------------------------
// Part O. Hello World
// Classic first check: print once via .NET Console, once via F#'s Printf.
//System.Console.WriteLine("Hello World");;
System.Console.WriteLine("Hello World");;
open Printf;;
printf "Hello World\n";;
//---------------------------------------------------------------------------
// Part I. Web.

open System.Net
open System
open System.IO
// Identity function, handy as a placeholder in pipelines.
let id x = x
// Synchronous HTTP GET, spelled out step by step for interactive use.
// NOTE(review): response/reader are left open here on purpose for
// exploration; the `http` function below wraps the same steps.
let req = WebRequest.Create("http://www.microsoft.com")
let resp = req.GetResponse()
let stream = resp.GetResponseStream()
let reader = new IO.StreamReader(stream)
let html = reader.ReadToEnd();;
// Echo the fetched HTML in F# Interactive.
html;;
/// Fetch the contents of a web page as a string.
/// Fix vs. original: the StreamReader was never closed (only the
/// response was), leaking the reader and its underlying stream.
let http(url: string) =
    let req = WebRequest.Create(url) in
    let resp = req.GetResponse() in
    let stream = resp.GetResponseStream() in
    let reader = new IO.StreamReader(stream) in
    let html = reader.ReadToEnd() in
    // Close the reader (which also closes the stream), then the response.
    // NOTE(review): an exception during ReadToEnd still leaks — confirm
    // whether try/finally is wanted in this tutorial context.
    reader.Close();
    resp.Close();
    html
// Fetch a few pages to play with (network side effects at eval time).
let google = http("http://www.google.com");;
let bbc = http("http://news.bbc.co.uk");;
let msft = http("http://www.microsoft.com");;
// RSS feed used by the XML and WinForms demos below.
let nytRSS = http("http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml");;
//let bbcRSS = http("http://www.bbc.co.uk/go/homepage/int/ne/nrss/log/i/-/news/rss/newsonline_uk_edition/front_page/rss.xml")
// ----------------------------
// Windows Forms

// Build a top-most form hosting a RichTextBox that fills the window.
open System.Windows.Forms;;
open System.Drawing;;
let form = new Form();;
form.Visible <- true;;
form.Text <- "Welcome to F# Interactive Programming";;
form.TopMost <- true;;
let textB = new RichTextBox();;
form.Controls.Add(textB);;
textB.Dock <- DockStyle.Fill ;;
// Show the RSS text fetched earlier as the initial contents.
textB.Text <- nytRSS;;
textB.ForeColor <- Color.DarkBlue;;
// NOTE(review): this entry ends with a single ';' rather than ';;', so
// fsi evaluates it together with the next line — harmless but inconsistent.
textB.Font <- new Font("Lucida Console",12.0f,FontStyle.Bold) ;
form.Size <- new Size(400,600);;
/// Replace the contents of the text box.
let setText text =
textB.Text <- text
/// Append a line (text plus newline) to the text box.
let appendText text = textB.AppendText(text + "\n");;
setText "hello";;
setText "hello again";;
//let (|>) x f = f x
/// Format any value as a string using F# 1.x structured formatting
/// (any_to_layout/layout_to_string are obsolete; sprintf "%A" today).
let any_to_string_ex opts x = x |> any_to_layout opts |> layout_to_string opts
/// Display any value in the form's text box, wrapped to roughly the
/// window width (form.Width/16 ~ characters per line at this font).
let show x =
let opts= { format_options.Default with printWidth = form.Width/16 } in
setText (any_to_string_ex opts x);;

(1,2,3) |> show;;
Array.create 100 (1,2,3) |> show;;
nytRSS |> setText;;
show 1;;

// ----------------------------
// Scan RSS for news titles
open System.Xml;;
open System.Collections;;
open System.Collections.Generic;;
// Parse the RSS text into a DOM and select all <title> elements.
let xdoc = new XmlDocument();;
xdoc.LoadXml(nytRSS);;
xdoc.SelectNodes("//title");;
xdoc.SelectNodes("//title") |> show;;
/// Hmmm... XPathNodeList supports System.IEnumerable
/// First extract the text from the nodes then display...
xdoc.SelectNodes("//title")
|> IEnumerable.map_with_type (fun (i:XmlNode) -> i.InnerText)
|> IEnumerable.to_list
|> show;;
// ----------------------------
// Search for URLs in HTML
open System.Text.RegularExpressions;;
/// Loose pattern for bare http:// URLs (letters, digits, and ./_- ).
/// Verbatim (@) strings so regex backslashes are not string escapes.
let httpPat = @"http://[a-z-A-Z0-9./_]*"
/// Matches href="...", capturing an absolute http:// URL in group 1.
/// Fix vs. original: "\s" in a plain string literal is an invalid
/// escape sequence; the verbatim form passes \s to the regex engine
/// unchanged (the runtime pattern is byte-identical to the intent).
let urlPat = @"href=\s*""[^""h]*(http://[^&""]*)""";;
let bbcUrls = Regex.Matches(bbc,urlPat);;
/// Extract all group-1 captures of urlPat (absolute http:// URLs)
/// from an HTML string.
/// (IEnumerable.map_with_type/to_list are obsolete F# 1.x; Seq today.)
let getUrls (txt:string) =
Regex.Matches(txt,urlPat)
|> IEnumerable.map_with_type (fun (m:Match) -> (m.Groups.Item(1)).Value)
|> IEnumerable.to_list;;
/// Log url to the text box, fetch the page, and return the list of
/// URLs found on it. Any fetch failure yields an empty page (no URLs).
let collectUrls url =
appendText url;
// Pump the message queue so the UI stays live during the blocking fetch.
Application.DoEvents();
let html =
// Best-effort: swallow all fetch errors and treat the page as empty.
try http(url)
with _ -> "" in
let urls = getUrls html in
urls;;
collectUrls "http://news.google.com" |> show;;
// ----------------------------
// Crawling (Synchronous)
let crawlLimit = 10;;
/// Depth-first crawl: add url to the visited set, then fold crawl over
/// the URLs linked from it. Stops when crawlLimit pages have been seen
/// or url was already visited.
/// (`or`, Set.size, Set.mem, List.fold_left are F# 1.x spellings of
/// ||, Set.count, Set.contains, List.fold.)
let rec crawl sofar url =
if Set.size sofar >= crawlLimit or Set.mem url sofar
then sofar
else
let urls = collectUrls url in
List.fold_left crawl (Set.add url sofar) urls;;

textB.Clear();;
crawl Set.empty "http://news.google.com";;
// ----------------------------
// HTTP Requests (Asynchronous)
open System.Threading
open Microsoft.FSharp.Idioms
open System.Collections.Generic
/// Fetch a web page asynchronously; the HTML is handed to `cont` on a
/// thread-pool callback thread when the download completes.
/// Fixes vs. original: the StreamReader was never closed, the state
/// argument of BeginGetResponse was a boxed 0 instead of null, and the
/// IAsyncResult was bound to an unused local.
let httpAsync (url:string) (cont: string -> unit) =
    let req = WebRequest.Create(url) in
    req.BeginGetResponse((fun iar ->
        let rsp = req.EndGetResponse(iar) in
        let str = new StreamReader(rsp.GetResponseStream()) in
        let html = str.ReadToEnd() in
        // Close the reader (closes the stream too), then the response,
        // before invoking the continuation.
        str.Close();
        rsp.Close();
        cont html), null) |> ignore
// Kick off two async fetches; whichever completes last wins the text box.
do httpAsync "http://www.microsoft.com" (fun html -> show html)
do httpAsync "http://www.google.com" (fun html -> show html)
/// Fetch url asynchronously and pass the extracted URL list to cont.
let collectUrlsAsync url cont = httpAsync url (getUrls >> cont)
do collectUrlsAsync "http://news.google.com" (fun urls -> show urls)

// ----------------------------
// Crawling (Asynchronous)

/// Spawn a worker thread
/// (queues f on the .NET thread pool; the queue result is discarded).
let spawn (f : unit -> unit) = ThreadPool.QueueUserWorkItem(fun _ -> f ()) |> ignore
/// Add text to the window from a worker thread
/// (marshals the update onto the UI thread via Control.Invoke).
let appendTextRemote t =
form.Invoke(new MethodInvoker(fun () -> appendText t)) |> ignore

/// Use a Dictionary as a set: returns true when x was already present;
/// otherwise records x (with a dummy value of 1) and returns false.
let addToSet (d: Dictionary<_,_>) x =
    if d.ContainsKey(x) then true
    else (d.Add(x,1); false)

/// Async crawling
/// Crawls outward from url on thread-pool workers, accumulating visited
/// URLs in a locked Dictionary and echoing progress into the form.
let acrawl(url:string) =
appendTextRemote (sprintf "Crawling %s..." url);
// Local state, protected by locks
let sofar = new Dictionary<_,_>() in
let rec search url =
// addToSet both tests membership and records the url, under the lock.
let wasPresent = lock sofar (fun () -> addToSet sofar url) in
// NOTE(review): sofar.Count is read outside the lock, so the 2x limit
// is approximate under contention — fine for a demo cut-off.
if not wasPresent && sofar.Count < crawlLimit * 2 then begin
spawn (fun () ->
collectUrlsAsync url (fun urls -> List.iter search urls;
appendTextRemote url)
);
end in
spawn (fun () -> search url)
do textB.Clear()
do acrawl "http://news.google.com"
// ---------------------------------------------
// Random web walk
// List functions
// Random numbers
/// Shared pseudo-random source for the walk.
let rand = new System.Random()
/// A uniform random integer in [0, n).
let dice n = rand.Next(n)
/// Pick a random element of xs, or dflt when xs is empty.
let diceList xs dflt =
    match xs with
    | [] -> dflt
    | _ -> List.nth xs (dice (List.length xs));;
// Web browser control inside a form
open System
open System.Windows.Forms;;
// Embedded browser control filling a second, top-most form.
let wb = new WebBrowser();;
wb.Dock <- DockStyle.Fill;;
wb.AllowNavigation <- true;;
let webForm = new Form();;
webForm.Controls.Add(wb);;
webForm.Visible <- true;;
webForm.Size <- new Size(600,400);;
webForm.TopMost <- true;;
// Point it at pages and get the text
wb.Navigate("http://news.bbc.co.uk");;
// NOTE(review): Navigate is asynchronous — DocumentText is likely still
// empty or stale if read immediately; re-evaluate after the page loads.
let text = wb.DocumentText
// Regular expressions
open System.Text.RegularExpressions
// Matches BBC News story URLs (paths ending in "stm").
let rx = new Regex("http://news.bbc.co.uk/[a-z0-9_.-/]+stm")
// Regular expression to filter urls
/// Collect the href attributes of all links in the document that look
/// like BBC News story URLs.
let urlsOfDocument (doc : HtmlDocument) =
let urls = doc.Links |> IEnumerable.map_with_type (fun (elt:HtmlElement) -> elt.GetAttribute("href")) in
let urls = urls |> IEnumerable.filter (fun url -> rx.IsMatch(url)) in
let urls = urls |> IEnumerable.to_list in
urls;;
// Test it
urlsOfDocument (wb.Document) |> show;;
/// Pick a random story link from the document, falling back to the BBC
/// front page when no links match; logs the chosen URL to stdout.
let randomLink doc =
let urls = urlsOfDocument doc in
let url = diceList urls "http://news.bbc.co.uk" in
Printf.printf "JUMP: %s\n" url;
url;;
// Test it
randomLink wb.Document |> show;;
// Click on a timer event
/// Navigate the embedded browser to a random link on the current page.
let randomClick () = wb.Navigate(randomLink(wb.Document))
// WinForms timer: Tick fires on the UI thread every 1.5 seconds,
// driving the random walk. Stop() below ends the walk.
let timer = new Timer();;
timer.Interval <- 1500;;
timer.Tick.Add(fun _ -> randomClick ());;
timer.Start();;
// Enough!
timer.Stop();;

// posted on 2010-05-21 16:17 by Yang Jian (杨剑) — reads: 278, comments: 0
//
// Navigation