用Erlang写一个抓写真图片的程序

某论坛有写真图片,时不时会更新,写一个Erlang抓图片的程序,一来有需求,二来练习一下Erlang编程。

好的,那么开始。

 

首先,程序需要分成几个模块:处理网络请求的模块httpDownload,处理网络返回数据的模块dealPageData,以及控制整体流程的模块downImg。由于需要控制网络请求的并发数量,还要构建一个信号量模块semaphore。

先来看semaphore.erl的代码:

-module(semaphore).
-export([start/1, stop/0]).
-export([wait/0, signal/0]).

%% @doc Spawn the semaphore process with ResCount free slots and register
%% it under the name `mutex'. Returns the result of register/2.
start(ResCount) ->
    Server = spawn(fun() -> init(ResCount) end),
    register(mutex, Server).
    
%% @doc Send the atom `stop' to the process registered as `mutex'.
%% The send is asynchronous; the value of the send expression (`stop')
%% is returned.
stop() ->
    mutex ! stop.
    
%% @doc Request one slot from the process registered as `mutex' and block
%% until an `ok' message arrives in the caller's mailbox. Returns `ok'.
wait() ->
    Waiter = self(),
    mutex ! {wait, Waiter},
    receive
        ok ->
            ok
    end.
    
    
%% @doc Tell the process registered as `mutex' that the caller releases
%% a slot. Does not wait for any reply; always returns `ok'.
signal() ->
    Releaser = self(),
    mutex ! {signal, Releaser},
    ok.
    
%% @doc Body of the semaphore process: enter the serving loop with
%% SlotCount slots available.
init(SlotCount) -> free(SlotCount).
    
%% @doc Serving state of the semaphore; the argument is the number of
%% slots currently available.
%%
%% With zero slots the process logs and switches to busy/0. Otherwise it
%% handles one message and recurses:
%%   `stop'          - run terminate/0 and leave the loop;
%%   `{signal, _}'   - one more slot becomes available;
%%   `{wait, Pid}'   - reply `ok' to Pid and hand out one slot.
free(0) ->
    io:format("wait, busy~n", []),
    busy();
free(Available) ->
    %% The three patterns are mutually exclusive, so clause order has no
    %% effect on which queued message is taken first.
    receive
        stop ->
            terminate();
        {signal, _Releaser} ->
            io:format("signal, ok~n", []),
            free(Available + 1);
        {wait, Waiter} ->
            %% Unblock the waiter before logging.
            Waiter ! ok,
            io:format("wait, ok~n", []),
            free(Available - 1)
    end.
    
    
%% @doc Saturated state of the semaphore: no slot is available.
%%
%% Only `{signal, _}' is consumed here; any other message stays in the
%% mailbox until the process is back in free/1 with exactly one slot.
busy() ->
    receive
        {signal, _Releaser} ->
            io:format("signal, ok~n", []),
            free(1)
    end.
    
    
%% @doc Shutdown step of the semaphore process.
%%
%% Kills every process whose `{wait, Pid}' request is already queued in
%% the mailbox, then returns `ok' as soon as no such request is left
%% (zero timeout, so it never blocks).
terminate() ->
    receive
        {wait, Waiter} ->
            exit(Waiter, kill),
            terminate()
    after 0 ->
        ok
    end.
    

然后在semaphore的基础上可以构建httpDownload.erl

-module(httpDownload).
-export([request/2, init/1, stop/0]).

%% @doc Start everything the downloader needs and return `ok'.
%%
%% In order: the semaphore with DownloadThreadCount slots, the inets
%% application, and three registered processes:
%%   `requestable'  - theMapLoop/1 starting from an empty set,
%%   `queueCount'   - requestQueueCount/1 starting from 0,
%%   `httpDownload' - the dispatch loop/0.
init(DownloadThreadCount) ->
    semaphore:start(DownloadThreadCount),
    inets:start(),
    SeenPid = spawn(fun() -> theMapLoop(sets:new()) end),
    register(requestable, SeenPid),
    CounterPid = spawn(fun() -> requestQueueCount(0) end),
    register(queueCount, CounterPid),
    DispatcherPid = spawn(fun() -> loop() end),
    register(httpDownload, DispatcherPid),
    ok.
        

%% @doc Send the atom `exit' to the process registered as `httpDownload'.
%% Returns the value of the send expression (`exit').
stop() ->
    httpDownload ! exit.


%% @doc Counter process loop tracking the number of queued requests.
%%
%% `add' increments and `remove' decrements the counter. If neither
%% arrives within 1000 ms the value is left unchanged. The current value
%% is printed on every iteration, including timeouts. Never returns.
requestQueueCount(Count) ->
    Delta =
        receive
            add -> 1;
            remove -> -1
        after 1000 ->
            0
        end,
    Updated = Count + Delta,
    io:format("--------------   ~p   --------------~n", [Updated]),
    requestQueueCount(Updated).


%% @doc Worker body: fetch Url and report the outcome to From.
%%
%% On `{ok, {_, _, Data}}' from httpc:request/1 the worker sends
%% `{ok, {State, {Url, Data}}}' to From; on any other return value it
%% sends `{error, Url ++ " request failed"}'.
%%
%% The semaphore slot taken by the dispatcher before this worker was
%% spawned is released in an `after' section, so it is given back even
%% when the request or the message construction raises. Without that, an
%% exception here would permanently lose one download slot.
%% Returns `ok'.
requestThread(From, Url, State) ->
    try
        case httpc:request(Url) of
            {ok, {_, _, Data}} ->
                io:format("ok request: ~p~n", [Url]),
                From ! {ok, {State, {Url, Data}}};
            _ ->
                From ! {error, Url ++ " request failed"}
        end
    after
        semaphore:signal()
    end,
    ok.


%% @doc Process loop that remembers every element it has been asked about.
%%
%% For a `{From, El}' message it replies to From with `true' when El has
%% not been seen before (and records it), or `false' when it has.
%% Any other message is discarded. Never returns.
theMapLoop(Seen) ->
    receive
        {Caller, Item} ->
            IsNew = not sets:is_element(Item, Seen),
            Caller ! IsNew,
            NextSeen =
                case IsNew of
                    true -> sets:add_element(Item, Seen);
                    false -> Seen
                end,
            theMapLoop(NextSeen);
        _Ignored ->
            theMapLoop(Seen)
    end.


%% @doc Dispatch loop of the `httpDownload' process.
%%
%% For each `{From, Url, State}' request it first blocks in
%% semaphore:wait/0 until a slot is granted, then tells `queueCount' that
%% one request left the queue and spawns a worker running
%% requestThread/3. The atom `exit' ends the loop with the value `stop'.
loop() ->
    receive
        exit ->
            stop;
        {From, Url, State} ->
            semaphore:wait(),
            queueCount ! remove,
            Worker = fun() -> requestThread(From, Url, State) end,
            spawn(Worker),
            loop()
    end.




%% @doc Queue a download of Url unless that Url was already requested.
%%
%% Asks the `requestable' process whether Url is new and waits for its
%% `true'/`false' reply. When new, `queueCount' is incremented and the
%% request is handed to `httpDownload' with the caller as reply target;
%% otherwise a notice is printed. Always returns `ok'.
request(Url, State) ->
    Caller = self(),
    requestable ! {Caller, Url},
    receive
        false ->
            io:format("request already exists: ~p~n", [Url]);
        true ->
            queueCount ! add,
            httpDownload ! {Caller, Url, State}
    end,
    ok.
    
    

接下来是处理网络请求数据的dealPageData.erl

-module(dealPageData).
-export([deal/2, fileExists/1]).


%% @doc Check whether the local file that Url maps to can be read.
%%
%% The file name is Url with every ":" and "/" replaced by "_", placed
%% under imgSavePath(). Returns `true' when file:read_file_info/1
%% succeeds, otherwise `{false, FullPath}' with the computed binary path.
fileExists(Url) ->
    SaveDir = imgSavePath(),
    SafeName = binary:replace(binary:list_to_bin(Url), [<<":">>,<<"/">>],<<"_">>,[global]),
    FullPath = <<SaveDir/binary, SafeName/binary>>,
    case file:read_file_info(FullPath) of
        {ok, _Info} ->
            true;
        _Missing ->
            {false, FullPath}
    end.

    
%% @doc Directory prefix (as a binary, with trailing slash) that saved
%% image file names are appended to.
imgSavePath() ->
    <<"d:/images/">>.
    

%% @doc Cut the matched substrings out of Data.
%%
%% Captures is a list of per-match capture lists as produced by
%% re:run/3 with the `global' option; only the last `{Start, Len}' of
%% each match is used (Start is zero-based). Each substring is pushed
%% onto the front of Result, so the output is in reverse match order.
captureData(Data, Captures, Result) ->
    lists:foldl(
        fun(Capture, Acc) ->
            {Start, Len} = lists:last(Capture),
            [string:substr(Data, Start + 1, Len) | Acc]
        end,
        Result,
        Captures).
    
%% @doc Process one downloaded resource according to its kind.
%%
%% deal(main, {_, Data}): collect every "thread-N-1-1.html" link found
%%   in Data. Returns `{main, Links}' or `{error, Msg}' when none match.
%% deal(page, {_, Data}): collect the value of every src="....jpg"
%%   attribute in Data. Returns `{page, Urls}' or `{error, Msg}'.
%% deal(img, {Url, Data}): write Data to the file that fileExists/1
%%   derives from Url unless it is already there. Returns
%%   `{img, Msg}' on success or `{error, Msg}'.
%%
%% Link lists come from captureData/3 and are therefore in reverse order
%% of appearance.
deal(main, {_, Data}) ->
    %% The dot before "html" is escaped so that only a literal "." is
    %% accepted there, matching how ".jpg" is handled for pages.
    extractLinks(main, Data, "thread-[0-9]+-1-1\\.html",
                 "nomatch page link at main page");
deal(page, {_, Data}) ->
    extractLinks(page, Data, "src=\"([^\"]+\\.jpg)\"",
                 "nomatch image link at page");
deal(img, {Url, Data}) ->
    case fileExists(Url) of
        true ->
            {error, "file already exists"};
        {false, FilePath} ->
            case file:write_file(FilePath, Data) of
                ok -> {img, Url ++ " saved"};
                {error, _} -> {error, "error when save file"}
            end
    end.

%% Run Pattern over Data globally and tag the extracted substrings with
%% Tag; returns {error, NoMatchMsg} when nothing matches.
extractLinks(Tag, Data, Pattern, NoMatchMsg) ->
    case re:run(Data, Pattern, [global]) of
        {match, Captured} ->
            {Tag, captureData(Data, Captured, [])};
        nomatch ->
            {error, NoMatchMsg}
    end.

    

最后是将这些功能模块串起来的downImg.erl

-module(downImg).
-export([start/0]).



%% @doc Base URL that forum-relative page names are appended to.
httpRoot() ->
    "http://f1.avzcf.info/bbs/".
    
%% @doc Name of the index page the crawl starts from, relative to
%% httpRoot().
mainPage() ->
    "forum-13-1.html".


%% @doc Main control loop: consume download results and schedule the
%% follow-up requests. Never returns.
%%
%% `{ok, {State, Data}}' is passed to dealPageData:deal/2:
%%   `{main, Links}'  - request each link, prefixed with httpRoot(), as a page;
%%   `{page, Urls}'   - request each url as an image unless
%%                      dealPageData:fileExists/1 reports it as present;
%%   `{img, Msg}' / `{error, Msg}' - print Msg.
%% `{error, Msg}' received directly is printed as well.
loop() ->
    receive
        {error, Reason} ->
            io:format("~p~n",[Reason]);
        {ok, {State, Data}} ->
            case dealPageData:deal(State, Data) of
                {main, Threads} ->
                    QueuePage =
                        fun(Thread) ->
                            httpDownload:request((httpRoot() ++ Thread), page)
                        end,
                    lists:foreach(QueuePage, Threads);
                {page, Images} ->
                    QueueImage =
                        fun(Image) ->
                            case dealPageData:fileExists(Image) of
                                {false, _Path} -> httpDownload:request(Image, img);
                                true -> noDownload
                            end
                        end,
                    lists:foreach(QueueImage, Images);
                {Tag, Msg} when Tag =:= img; Tag =:= error ->
                    io:format("~p~n",[Msg])
            end
    end,
    loop().


%% @doc Entry point: initialise the downloader with 3 slots, request the
%% main index page and enter the control loop (which does not return).
start() ->
    httpDownload:init(3),
    EntryUrl = httpRoot() ++ mainPage(),
    httpDownload:request(EntryUrl, main),
    loop().

代码贴完。调用downImg:start()即可启动抓取任务,最多同时发起3个网络请求。

posted on 2012-06-17 21:35  盐味  阅读(907)  评论(0)  编辑  收藏  举报