用Erlang写一个抓写真图片的程序
某论坛有写真图片,时不时会更新,写一个Erlang抓图片的程序,一来有需求,二来练习一下Erlang编程。
好的,那么开始。
首先,程序需要分成几个模块:一个处理网络请求的模块httpDownload,一个处理网络返回数据的模块dealPageData,以及一个控制整体流程的模块downImg。由于网络请求需要控制并发数量,还需要另外构建一个信号量模块semaphore。
首先是semaphore.erl的代码
%% @doc Counting semaphore implemented as a single process registered
%% under the name `mutex'.
%%
%% The process alternates between two states: `free/1' while at least one
%% permit is available and `busy/0' while none is. `wait/0' requests that
%% arrive in the busy state simply stay in the mailbox until a permit is
%% released.
-module(semaphore).
-export([start/1, stop/0]).
-export([wait/0, signal/0]).

%% @doc Spawns the semaphore process with `ResCount' permits and registers
%% it as `mutex'. Returns the result of `register/2'.
start(ResCount) ->
    register(mutex, spawn(fun() -> init(ResCount) end)).

%% @doc Asks the semaphore process to shut down. Callers still blocked in
%% `wait/0' at that point are killed (see `terminate/0').
stop() ->
    mutex ! stop.

%% @doc Acquires one permit, blocking the caller until it is granted.
wait() ->
    mutex ! {wait, self()},
    receive
        ok -> ok
    end.

%% @doc Releases one permit. Never blocks.
signal() ->
    mutex ! {signal, self()},
    ok.

init(InitialValue) ->
    free(InitialValue).

%% At least one permit is available (or we have just run out).
free(0) ->
    io:format("wait, busy~n", []),
    busy();
free(Available) ->
    receive
        {wait, Pid} ->
            Pid ! ok,
            io:format("wait, ok~n", []),
            free(Available - 1);
        {signal, _} ->
            io:format("signal, ok~n", []),
            free(Available + 1);
        stop ->
            terminate()
    end.

%% No permit is available: only a release or a shutdown request can make
%% progress. `stop' must be accepted here as well, otherwise a shutdown
%% requested while all permits are taken is ignored until some holder
%% happens to call `signal/0'.
busy() ->
    receive
        {signal, _} ->
            io:format("signal, ok~n", []),
            free(1);
        stop ->
            terminate()
    end.

%% Drains the mailbox, killing every process that is still waiting for a
%% permit, then lets the semaphore process end.
terminate() ->
    receive
        {wait, Pid} ->
            exit(Pid, kill),
            terminate()
    after 0 ->
        ok
    end.
然后在semaphore的基础上可以构建httpDownload.erl
%% @doc HTTP downloader with de-duplication and a bounded number of
%% concurrent requests.
%%
%% `init/1' registers three helper processes:
%%   requestable  - remembers every URL ever requested and rejects repeats
%%   queueCount   - counts queued requests and prints the count
%%   httpDownload - dispatcher; takes a semaphore permit per request and
%%                  spawns a worker for it
-module(httpDownload).
-export([request/2, init/1, stop/0]).

%% @doc Starts the semaphore with `DownloadThreadCount' permits, starts
%% inets and registers the helper processes. Returns `ok'.
init(DownloadThreadCount) ->
    semaphore:start(DownloadThreadCount),
    inets:start(),
    register(requestable, spawn(fun() -> theMapLoop(sets:new()) end)),
    register(queueCount, spawn(fun() -> requestQueueCount(0) end)),
    register(httpDownload, spawn(fun() -> loop() end)),
    ok.

%% @doc Stops the helper processes started by `init/1'.
%% The de-duplication and counter processes are stopped too; otherwise
%% they would outlive the dispatcher and the counter would keep printing
%% once per second. The semaphore is not stopped here. Returns `exit',
%% the value of the final send.
stop() ->
    requestable ! stop,
    queueCount ! stop,
    httpDownload ! exit.

%% Counter process: `add' / `remove' adjust the count; the current value
%% is printed after every change and after each second without messages.
requestQueueCount(Count) ->
    receive
        stop -> ok;
        add -> reportQueueCount(Count + 1);
        remove -> reportQueueCount(Count - 1)
    after 1000 ->
        reportQueueCount(Count)
    end.

reportQueueCount(Count) ->
    io:format("-------------- ~p --------------~n", [Count]),
    requestQueueCount(Count).

%% Worker body: performs one request and reports the outcome to `From' as
%% `{ok, {State, {Url, Data}}}' or `{error, Message}'.
%% The permit is released in `after' so that a crashing worker cannot
%% permanently reduce the number of available download slots.
requestThread(From, Url, State) ->
    try
        case httpc:request(Url) of
            {ok, {_, _, Data}} ->
                io:format("ok request: ~p~n", [Url]),
                From ! {ok, {State, {Url, Data}}};
            _ ->
                From ! {error, Url ++ " request failed"}
        end
    after
        semaphore:signal()
    end.

%% De-duplication process: answers `true' the first time an element is
%% seen and `false' on every later occurrence.
theMapLoop(Set) ->
    receive
        stop ->
            ok;
        {From, El} ->
            case sets:is_element(El, Set) of
                true ->
                    From ! false,
                    theMapLoop(Set);
                false ->
                    From ! true,
                    theMapLoop(sets:add_element(El, Set))
            end;
        _ ->
            %% Discard anything else so the mailbox cannot grow.
            theMapLoop(Set)
    end.

%% Dispatcher: blocks until a permit is available, then hands the request
%% to a freshly spawned worker.
loop() ->
    receive
        {From, Url, State} ->
            semaphore:wait(),
            queueCount ! remove,
            spawn(fun() -> requestThread(From, Url, State) end),
            loop();
        exit ->
            stop
    end.

%% @doc Queues `Url' for download unless it has been requested before.
%% `State' is passed back unchanged in the result message, which is sent
%% to the calling process. Always returns `ok'.
request(Url, State) ->
    requestable ! {self(), Url},
    receive
        true ->
            queueCount ! add,
            httpDownload ! {self(), Url, State};
        false ->
            io:format("request already exists: ~p~n", [Url])
    end,
    ok.
接下来是处理网络请求数据的dealPageData.erl
%% @doc Turns downloaded data into the next step of the crawl: thread
%% links from the forum index, image links from a thread page, or a saved
%% image file.
-module(dealPageData).
-export([deal/2, fileExists/1]).

%% @doc Checks whether the image behind `Url' has already been saved.
%% The file name is the URL with every ":" and "/" replaced by "_",
%% placed under `imgSavePath/0'. Returns `true', or `{false, Path}' with
%% the path the image should be written to.
fileExists(Url) ->
    FileName = binary:replace(binary:list_to_bin(Url),
                              [<<":">>, <<"/">>], <<"_">>, [global]),
    FileFullPath = <<(imgSavePath())/binary, FileName/binary>>,
    case file:read_file_info(FileFullPath) of
        {ok, _} -> true;
        _ -> {false, FileFullPath}
    end.

%% Target directory for saved images. It is not created by this module.
imgSavePath() ->
    <<"d:/images/">>.

%% Cuts the last capture of every match out of `Data' and pushes it onto
%% `Result', so the result ends up in reverse match order. The last
%% capture is the whole match when the pattern has no group, otherwise
%% the last group. `re:run/3' offsets are 0-based, list positions 1-based.
captureData(_, [], Result) ->
    Result;
captureData(Data, [Capture | T], Result) ->
    {Start, Len} = lists:last(Capture),
    captureData(Data, T, [lists:sublist(Data, Start + 1, Len) | Result]).

%% @doc Processes one download result according to its kind.
%%   `main' - returns `{main, ThreadLinks}'
%%   `page' - returns `{page, ImageLinks}'
%%   `img'  - writes the data to disk and returns `{img, Message}'
%% Every failure is reported as `{error, Message}'.
deal(main, {_, Data}) ->
    %% The dot before "html" is escaped; unescaped it matched any
    %% character and accepted strings that are not thread links.
    extractLinks(main, Data, "thread-[0-9]+-1-1\\.html",
                 "nomatch page link at main page",
                 "error when create page link regexp");
deal(page, {_, Data}) ->
    extractLinks(page, Data, "src=\"([^\"]+\\.jpg)\"",
                 "nomatch image link at page",
                 "error when create image link regexp");
deal(img, {Url, Data}) ->
    case fileExists(Url) of
        true ->
            {error, "file already exists"};
        {false, FilePath} ->
            case file:write_file(FilePath, Data) of
                ok -> {img, Url ++ " saved"};
                {error, _} -> {error, "error when save file"}
            end
    end.

%% Runs `Pattern' over `Data' and returns `{Tag, Links}', or an error
%% tuple carrying the message that fits the failure.
extractLinks(Tag, Data, Pattern, NoMatchMsg, BadRegexpMsg) ->
    case re:compile(Pattern) of
        {ok, Reg} ->
            case re:run(Data, Reg, [global]) of
                {match, Captured} -> {Tag, captureData(Data, Captured, [])};
                nomatch -> {error, NoMatchMsg}
            end;
        {error, _} ->
            {error, BadRegexpMsg}
    end.
最后是将这些功能模块串起来的downImg.erl
%% @doc Crawl driver: requests the forum index, then reacts to every
%% download result by queueing thread pages and images.
-module(downImg).
-export([start/0]).

%% Base URL that thread links from the index page are appended to.
httpRoot() ->
    "http://f1.avzcf.info/bbs/".

%% Index page the crawl starts from, relative to `httpRoot/0'.
mainPage() ->
    "forum-13-1.html".

%% @doc Starts the downloader with 3 concurrent requests, queues the
%% index page and then handles results in the calling process.
%% Does not return: `loop/0' recurses unconditionally.
start() ->
    httpDownload:init(3),
    IndexUrl = httpRoot() ++ mainPage(),
    httpDownload:request(IndexUrl, main),
    loop().

%% Waits for the next download result, processes it, and repeats.
loop() ->
    receive
        {ok, {Kind, Payload}} ->
            handleResult(dealPageData:deal(Kind, Payload));
        {error, Reason} ->
            printTerm(Reason)
    end,
    loop().

%% Acts on what dealPageData:deal/2 extracted from one download.
handleResult({main, ThreadLinks}) ->
    lists:foreach(fun requestThreadPage/1, ThreadLinks);
handleResult({page, ImageLinks}) ->
    lists:foreach(fun requestImageUnlessSaved/1, ImageLinks);
handleResult({img, Note}) ->
    printTerm(Note);
handleResult({error, Reason}) ->
    printTerm(Reason).

%% Thread links are relative, so they are prefixed with the base URL.
requestThreadPage(Link) ->
    httpDownload:request(httpRoot() ++ Link, page).

%% Image links are requested exactly as captured.
%% NOTE(review): a relative src value would need resolving against the
%% base URL first - confirm against the forum's markup.
requestImageUnlessSaved(ImageUrl) ->
    case dealPageData:fileExists(ImageUrl) of
        true -> noDownload;
        {false, _} -> httpDownload:request(ImageUrl, img)
    end.

printTerm(Term) ->
    io:format("~p~n", [Term]).
代码贴完。调用downImg:start()即可启动抓取任务,程序最多同时发起3个网络请求。