// Interactive Web Crawling with F#

//---------------------------------------------------------------------------
// Part O. Hello World
// Classic first check: print once via .NET Console, once via F#'s Printf.
//System.Console.WriteLine("Hello World");;
System.Console.WriteLine("Hello World");;
open Printf;;
printf "Hello World\n";;
//---------------------------------------------------------------------------
// Part I. Web.

open System.Net
open System
open System.IO
// Identity function, handy as a placeholder in pipelines.
let id x = x
// Synchronous HTTP GET, spelled out step by step for interactive use.
// NOTE(review): response/reader are left open here on purpose for
// exploration; the `http` function below wraps the same steps.
let req = WebRequest.Create("http://www.microsoft.com")
let resp = req.GetResponse()
let stream = resp.GetResponseStream()
let reader = new IO.StreamReader(stream)
let html = reader.ReadToEnd();;
// Echo the fetched HTML in F# Interactive.
html;;
/// Fetch the contents of a web page as a string.
/// Fix vs. original: the StreamReader was never closed (only the
/// response was), leaking the reader and its underlying stream.
let http(url: string) =
    let req = WebRequest.Create(url) in
    let resp = req.GetResponse() in
    let stream = resp.GetResponseStream() in
    let reader = new IO.StreamReader(stream) in
    let html = reader.ReadToEnd() in
    // Close the reader (which also closes the stream), then the response.
    // NOTE(review): an exception during ReadToEnd still leaks — confirm
    // whether try/finally is wanted in this tutorial context.
    reader.Close();
    resp.Close();
    html
// Fetch a few pages to play with (network side effects at eval time).
let google = http("http://www.google.com");;
let bbc = http("http://news.bbc.co.uk");;
let msft = http("http://www.microsoft.com");;
// RSS feed used by the XML and WinForms demos below.
let nytRSS = http("http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml");;
//let bbcRSS = http("http://www.bbc.co.uk/go/homepage/int/ne/nrss/log/i/-/news/rss/newsonline_uk_edition/front_page/rss.xml")
// ----------------------------
// Windows Forms

// Build a top-most form hosting a RichTextBox that fills the window.
open System.Windows.Forms;;
open System.Drawing;;
let form = new Form();;
form.Visible <- true;;
form.Text <- "Welcome to F# Interactive Programming";;
form.TopMost <- true;;
let textB = new RichTextBox();;
form.Controls.Add(textB);;
textB.Dock <- DockStyle.Fill ;;
// Show the RSS text fetched earlier as the initial contents.
textB.Text <- nytRSS;;
textB.ForeColor <- Color.DarkBlue;;
// NOTE(review): this entry ends with a single ';' rather than ';;', so
// fsi evaluates it together with the next line — harmless but inconsistent.
textB.Font <- new Font("Lucida Console",12.0f,FontStyle.Bold) ;
form.Size <- new Size(400,600);;
/// Replace the contents of the text box.
let setText text =
textB.Text <- text
/// Append a line (text plus newline) to the text box.
let appendText text = textB.AppendText(text + "\n");;
setText "hello";;
setText "hello again";;
//let (|>) x f = f x
/// Format any value as a string using F# 1.x structured formatting
/// (any_to_layout/layout_to_string are obsolete; sprintf "%A" today).
let any_to_string_ex opts x = x |> any_to_layout opts |> layout_to_string opts
/// Display any value in the form's text box, wrapped to roughly the
/// window width (form.Width/16 ~ characters per line at this font).
let show x =
let opts= { format_options.Default with printWidth = form.Width/16 } in
setText (any_to_string_ex opts x);;

(1,2,3) |> show;;
Array.create 100 (1,2,3) |> show;;
nytRSS |> setText;;
show 1;;

// ----------------------------
// Scan RSS for news titles
open System.Xml;;
open System.Collections;;
open System.Collections.Generic;;
// Parse the RSS text into a DOM and select all <title> elements.
let xdoc = new XmlDocument();;
xdoc.LoadXml(nytRSS);;
xdoc.SelectNodes("//title");;
xdoc.SelectNodes("//title") |> show;;
/// Hmmm... XPathNodeList supports System.IEnumerable
/// First extract the text from the nodes then display...
xdoc.SelectNodes("//title")
|> IEnumerable.map_with_type (fun (i:XmlNode) -> i.InnerText)
|> IEnumerable.to_list
|> show;;
// ----------------------------
// Search for URLs in HTML
open System.Text.RegularExpressions;;
/// Loose pattern for bare http:// URLs (letters, digits, and ./_- ).
/// Verbatim (@) strings so regex backslashes are not string escapes.
let httpPat = @"http://[a-z-A-Z0-9./_]*"
/// Matches href="...", capturing an absolute http:// URL in group 1.
/// Fix vs. original: "\s" in a plain string literal is an invalid
/// escape sequence; the verbatim form passes \s to the regex engine
/// unchanged (the runtime pattern is byte-identical to the intent).
let urlPat = @"href=\s*""[^""h]*(http://[^&""]*)""";;
let bbcUrls = Regex.Matches(bbc,urlPat);;
/// Extract all group-1 captures of urlPat (absolute http:// URLs)
/// from an HTML string.
/// (IEnumerable.map_with_type/to_list are obsolete F# 1.x; Seq today.)
let getUrls (txt:string) =
Regex.Matches(txt,urlPat)
|> IEnumerable.map_with_type (fun (m:Match) -> (m.Groups.Item(1)).Value)
|> IEnumerable.to_list;;
/// Log url to the text box, fetch the page, and return the list of
/// URLs found on it. Any fetch failure yields an empty page (no URLs).
let collectUrls url =
appendText url;
// Pump the message queue so the UI stays live during the blocking fetch.
Application.DoEvents();
let html =
// Best-effort: swallow all fetch errors and treat the page as empty.
try http(url)
with _ -> "" in
let urls = getUrls html in
urls;;
collectUrls "http://news.google.com" |> show;;
// ----------------------------
// Crawling (Synchronous)
let crawlLimit = 10;;
/// Depth-first crawl: add url to the visited set, then fold crawl over
/// the URLs linked from it. Stops when crawlLimit pages have been seen
/// or url was already visited.
/// (`or`, Set.size, Set.mem, List.fold_left are F# 1.x spellings of
/// ||, Set.count, Set.contains, List.fold.)
let rec crawl sofar url =
if Set.size sofar >= crawlLimit or Set.mem url sofar
then sofar
else
let urls = collectUrls url in
List.fold_left crawl (Set.add url sofar) urls;;

textB.Clear();;
crawl Set.empty "http://news.google.com";;
// ----------------------------
// HTTP Requests (Asynchronous)
open System.Threading
open Microsoft.FSharp.Idioms
open System.Collections.Generic
/// Fetch a web page asynchronously; the HTML is handed to `cont` on a
/// thread-pool callback thread when the download completes.
/// Fixes vs. original: the StreamReader was never closed, the state
/// argument of BeginGetResponse was a boxed 0 instead of null, and the
/// IAsyncResult was bound to an unused local.
let httpAsync (url:string) (cont: string -> unit) =
    let req = WebRequest.Create(url) in
    req.BeginGetResponse((fun iar ->
        let rsp = req.EndGetResponse(iar) in
        let str = new StreamReader(rsp.GetResponseStream()) in
        let html = str.ReadToEnd() in
        // Close the reader (closes the stream too), then the response,
        // before invoking the continuation.
        str.Close();
        rsp.Close();
        cont html), null) |> ignore
// Kick off two async fetches; whichever completes last wins the text box.
do httpAsync "http://www.microsoft.com" (fun html -> show html)
do httpAsync "http://www.google.com" (fun html -> show html)
/// Fetch url asynchronously and pass the extracted URL list to cont.
let collectUrlsAsync url cont = httpAsync url (getUrls >> cont)
do collectUrlsAsync "http://news.google.com" (fun urls -> show urls)

// ----------------------------
// Crawling (Asynchronous)

/// Spawn a worker thread
/// (queues f on the .NET thread pool; the queue result is discarded).
let spawn (f : unit -> unit) = ThreadPool.QueueUserWorkItem(fun _ -> f ()) |> ignore
/// Add text to the window from a worker thread
/// (marshals the update onto the UI thread via Control.Invoke).
let appendTextRemote t =
form.Invoke(new MethodInvoker(fun () -> appendText t)) |> ignore

/// Use a Dictionary as a set: returns true when x was already present;
/// otherwise records x (with a dummy value of 1) and returns false.
let addToSet (d: Dictionary<_,_>) x =
    if d.ContainsKey(x) then true
    else (d.Add(x,1); false)

/// Async crawling
/// Crawls outward from url on thread-pool workers, accumulating visited
/// URLs in a locked Dictionary and echoing progress into the form.
let acrawl(url:string) =
appendTextRemote (sprintf "Crawling %s..." url);
// Local state, protected by locks
let sofar = new Dictionary<_,_>() in
let rec search url =
// addToSet both tests membership and records the url, under the lock.
let wasPresent = lock sofar (fun () -> addToSet sofar url) in
// NOTE(review): sofar.Count is read outside the lock, so the 2x limit
// is approximate under contention — fine for a demo cut-off.
if not wasPresent && sofar.Count < crawlLimit * 2 then begin
spawn (fun () ->
collectUrlsAsync url (fun urls -> List.iter search urls;
appendTextRemote url)
);
end in
spawn (fun () -> search url)
do textB.Clear()
do acrawl "http://news.google.com"
// ---------------------------------------------
// Random web walk
// List functions
// Random numbers
/// Shared pseudo-random source for the walk.
let rand = new System.Random()
/// A uniform random integer in [0, n).
let dice n = rand.Next(n)
/// Pick a random element of xs, or dflt when xs is empty.
let diceList xs dflt =
    match xs with
    | [] -> dflt
    | _ -> List.nth xs (dice (List.length xs));;
// Web browser control inside a form
open System
open System.Windows.Forms;;
// Embedded browser control filling a second, top-most form.
let wb = new WebBrowser();;
wb.Dock <- DockStyle.Fill;;
wb.AllowNavigation <- true;;
let webForm = new Form();;
webForm.Controls.Add(wb);;
webForm.Visible <- true;;
webForm.Size <- new Size(600,400);;
webForm.TopMost <- true;;
// Point it at pages and get the text
wb.Navigate("http://news.bbc.co.uk");;
// NOTE(review): Navigate is asynchronous — DocumentText is likely still
// empty or stale if read immediately; re-evaluate after the page loads.
let text = wb.DocumentText
// Regular expressions
open System.Text.RegularExpressions
// Matches BBC News story URLs (paths ending in "stm").
let rx = new Regex("http://news.bbc.co.uk/[a-z0-9_.-/]+stm")
// Regular expression to filter urls
/// Collect the href attributes of all links in the document that look
/// like BBC News story URLs.
let urlsOfDocument (doc : HtmlDocument) =
let urls = doc.Links |> IEnumerable.map_with_type (fun (elt:HtmlElement) -> elt.GetAttribute("href")) in
let urls = urls |> IEnumerable.filter (fun url -> rx.IsMatch(url)) in
let urls = urls |> IEnumerable.to_list in
urls;;
// Test it
urlsOfDocument (wb.Document) |> show;;
/// Pick a random story link from the document, falling back to the BBC
/// front page when no links match; logs the chosen URL to stdout.
let randomLink doc =
let urls = urlsOfDocument doc in
let url = diceList urls "http://news.bbc.co.uk" in
Printf.printf "JUMP: %s\n" url;
url;;
// Test it
randomLink wb.Document |> show;;
// Click on a timer event
/// Navigate the embedded browser to a random link on the current page.
let randomClick () = wb.Navigate(randomLink(wb.Document))
// WinForms timer: Tick fires on the UI thread every 1.5 seconds,
// driving the random walk. Stop() below ends the walk.
let timer = new Timer();;
timer.Interval <- 1500;;
timer.Tick.Add(fun _ -> randomClick ());;
timer.Start();;
// Enough!
timer.Stop();;

// posted on 2010-05-21 16:17 by Yang Jian (杨剑) — reads: 278, comments: 0
//
// Navigation