Implementing a simple crawler in Erlang

The crawler is split across two modules. The first, spider.erl, fetches a page over a raw socket, extracts the link tags, and writes them out as an HTML list:

-module(spider).
-compile(export_all).
-import(lists, [reverse/1, reverse/2, map/2]).

%% Fetch "/" from Host over a raw TCP socket and return the full
%% HTTP response (status line and headers included) as one binary.
nano_get_url(Host) ->
    {ok, Socket} = gen_tcp:connect(Host, 80, [binary, {packet, 0}]),
    ok = gen_tcp:send(Socket, "GET / HTTP/1.0\r\n\r\n"),
    receive_data(Socket, []).

%% Accumulate TCP fragments until the server closes the connection.
receive_data(Socket, SoFar) ->
    receive
        {tcp, Socket, Bin} ->
            receive_data(Socket, [Bin | SoFar]);
        {tcp_closed, Socket} ->
            list_to_binary(reverse(SoFar))
    end.
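
The request carries no Host header, so servers that rely on name-based virtual hosting may return the wrong site or an error page. A minimal variant that adds one (a hypothetical helper, not in the original post):

%% Hypothetical variant of nano_get_url/1 that also sends a Host
%% header, which many virtual-hosting servers expect.
nano_get_url_with_host(Host) ->
    {ok, Socket} = gen_tcp:connect(Host, 80, [binary, {packet, 0}]),
    Request = ["GET / HTTP/1.0\r\nHost: ", Host, "\r\n\r\n"],
    ok = gen_tcp:send(Socket, Request),
    receive_data(Socket, []).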

%% Write the URL list to File as a small HTML page.
urls2htmlFile(Urls, File) ->
    file:write_file(File, urls2html(Urls)).

%% Extract "<a href ...>" tags from a page binary.
bin2urls(Bin) -> gather_urls(binary_to_list(Bin), []).

%% Like bin2urls/1, but also picks up "<link href ...>" tags.
bin2urls2(Bin) -> gather_urls2(binary_to_list(Bin), []).

urls2html(Urls) -> [h1("Urls"), make_list(Urls)].


h1(Title) -> ["<h1>",Title,"</h1>\n"].

make_list(L) ->
    ["<ul>\n",
     map(fun(I) -> ["<li>", I, "</li>\n"] end, L),
     "</ul>\n"].
 

 gather_urls("<a href" ++ T,L) ->
    {Url,T1}=collect_url_body(T,reverse("<a href")),
     
%%      case spider1:gather_urls(Url, []) of
%%                              []->
%%                                  nothing;
%%                              [SubUrl]->
%%                                  io:format("SubUrl:~p~n",[SubUrl]),
%%                                  make2(SubUrl)
%%                          end,
  gather_urls(T1,[Url|L]);
    
 gather_urls([_|T],L) -> 
    gather_urls(T,L); 

 gather_urls([],L) -> 
    L.
 
 gather_urls2("<a href" ++ T,L) ->
    {Url,T1}=collect_url_body(T,reverse("<a href")),
    gather_urls2(T1,[Url|L]);

 gather_urls2("<link href" ++ T,L) ->
    {Url,T1}=collect_url_body(T,reverse("<link href")),
    gather_urls2(T1,[Url|L]);

 gather_urls2([_|T],L) -> 
    gather_urls2(T,L);
 gather_urls2([],L) -> 
    L.

collect_url_body("</a>" ++ T,L) -> {reverse(L,"</a>"),T};
collect_url_body(">" ++ T,L) -> {reverse(L,">"),T};
collect_url_body([H|T],L)       -> collect_url_body(T,[H|L]);
collect_url_body([],_)          -> {[],[]}.
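
The extractor returns the whole opening tag, up to and including the closing >, and the tags come back in reverse order of appearance because gather_urls/2 never reverses its accumulator. A quick shell check (the page snippet is illustrative):

1> spider:bin2urls(<<"<p><a href=\"http://example.com\">home</a></p>">>).
["<a href=\"http://example.com\">"]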

%% Crawl www.baidu.com, follow each absolute link one level deep,
%% and write the collected URLs into the (pre-existing) http directory.
make() ->
    B = nano_get_url("www.baidu.com"),
    L = bin2urls(B),
    MakeSubFun =
        fun(Url) ->
                io:format("Url1:~p~n", [Url]),
                %% Each tag is assumed to hold at most one absolute
                %% URL; a tag with several would crash this case.
                case spider1:gather_urls(Url, []) of
                    [] ->
                        nothing;
                    [SubUrl] ->
                        io:format("SubUrl:~p~n", [SubUrl]),
                        make2(SubUrl)
                end
        end,
    lists:foreach(MakeSubFun, L),
    urls2htmlFile(L, "http\\1.html").

%% Fetch one second-level page and write its links to http\<host>.html.
make2(SubUrl) ->
    B = nano_get_url(SubUrl),
    L = bin2urls2(B),
    urls2htmlFile(L, "http\\" ++ SubUrl ++ ".html").
The second module, spider1.erl, goes in its own file; it pulls the bare host name out of an absolute http:// URL:

-module(spider1).
-compile(export_all).
-import(lists, [reverse/1, reverse/2]).
 
gather_urls("http://" ++ T,L) ->
    {Url,T1}=collect_url_body(T,reverse("")),
    gather_urls(T1,[Url|L]);

gather_urls([_|T],L) -> 
    gather_urls(T,L);
gather_urls([],L) -> 
    L.

collect_url_body("/" ++ T, W) -> {reverse(W,""),T};
collect_url_body("\"" ++ T, W) -> {reverse(W,""),T};
collect_url_body([Q|T1],W) -> collect_url_body(T1,[Q|W]);
collect_url_body([],_) -> {[],[]}.
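
Given a tag extracted by spider, spider1:gather_urls/2 reduces it to the bare host, stopping at the first / or closing quote (the input here is illustrative):

1> spider1:gather_urls("<a href=\"http://www.baidu.com/more\">", []).
["www.baidu.com"]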

A run prints each extracted tag as Url1:... and each followed host as SubUrl:..., then writes the HTML files under http\. (The screenshot of the output is omitted.)
The code above crawls a page's links and their second-level links; it only looks at the URLs in <a href .../> (and <link href .../>) tags, so it can only collect links. There are now quite a few articles on Erlang crawlers that can also fetch images and other resources. The code is somewhat messy and unoptimized, and is offered for reference only.
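
To try it out, compile both modules and create the http directory by hand first; make/0 writes Windows-style paths such as http\1.html, and file:write_file/2 silently returns {error,enoent} if the directory does not exist:

1> c(spider), c(spider1).
{ok,spider1}
2> spider:make().

Also note that an unreachable host makes the {ok,Socket} match in nano_get_url/1 fail, which aborts the whole run, so wrapping each fetch in a try/catch would be a natural hardening step.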

posted @ 2012-10-31 17:28  孤独信徒