第十八章 构造基于OTP的系统
18.1 通用的事件处理
%% Generic event handler: a registered process holds the current handler fun
%% and applies it to every event it receives. The handler can be replaced at
%% runtime without stopping the process.

%% Spawn the handler process and register it under Name.
%% The initial handler is no_op/1, so events are accepted but ignored until
%% add_handler/2 installs another fun. Returns true (the result of register/2).
make(Name) ->
    register(Name, spawn(fun() -> my_handler(fun no_op/1) end)).

%% Replace the handler fun of the process registered as Name with Fun.
%% Returns the message that was sent, {add, Fun}.
add_handler(Name, Fun) ->
    Name ! {add, Fun}.

%% Send event X to the process registered as Name.
%% Returns the message that was sent, {event, X}.
event(Name, X) ->
    Name ! {event, X}.

%% Receive loop of the handler process.
%%   {add, Fun1}  -> continue with Fun1 as the handler
%%   {event, Any} -> apply the current handler to Any, then continue
my_handler(Fun) ->
    receive
        {add, Fun1} ->
            my_handler(Fun1);
        {event, Any} ->
            run_handler(Fun, Any),
            my_handler(Fun)
    end.

%% Apply Fun to Event; the result is discarded by the caller.
%% A failing handler must not take the event process down, but the failure
%% is reported to error_logger instead of being silently dropped.
run_handler(Fun, Event) ->
    try
        Fun(Event)
    catch
        Class:Reason ->
            error_logger:error_report([{event_handler, handler_failed},
                                       {event, Event},
                                       {class, Class},
                                       {reason, Reason}])
    end.

%% Default handler: ignores the event.
no_op(_) ->
    void.
1> c(event_handler). {ok,event_handler} 2> event_handler:make(errors). true 3> event_handler:event(errors, hi). {event,hi}
-module(motor_controller).

-export([add_event_handler/0]).

%% Install controller/1 as the handler fun of the `errors` event process.
%% Returns whatever event_handler:add_handler/2 returns.
add_event_handler() ->
    Handler = fun controller/1,
    event_handler:add_handler(errors, Handler).

%% Handle a single event by printing a message:
%%   too_hot        -> tell the operator to turn off the motor
%%   anything else  -> report that this module ignored the event
controller(too_hot) ->
    io:format("Turn off the motor~n");
controller(Event) ->
    io:format("~w ignored event: ~p~n", [?MODULE, Event]).
4> c(motor_controller). {ok,motor_controller} 5> motor_controller:add_event_handler(). {add,#Fun<motor_controller.0.125151531>} 6> event_handler:event(errors, cool). motor_controller ignored event: cool {event,cool} 7> event_handler:event(errors, too_hot). Turn off the motor {event,too_hot}
从这个简单的例子可以看到Erlang的强大之处:非常延迟的绑定^_^, 正如作者所说:使用这个机制, 我们可以构造一个可以随着时间推移不断演化的系统, 而演化的过程无需停机, 甚至也不需要升级代码。
18.2 错误日志
错误日志的三种视角:如何调用函数记录错误日志; 如何配置错误日志的存储; 如何对错误日志进行分析。
18.2.1 记录一个错误
# 错误日志的API # 向错误日志发送一个错误消息 # error_logger:error_msg(String) -> ok 8> error_logger:error_msg("An error has occurred\n"). ok 9> =ERROR REPORT==== 8-Nov-2013::13:22:42 === An error has occurred # 向错误日志发送一个带参数的错误消息 # error_logger:error_msg(Format, Data) -> ok 10> error_logger:error_msg("~s, An error has occurred\n", ["Joe"]). =ERROR REPORT==== 8-Nov-2013::13:25:39 === Joe, An error has occurred ok # 向错误日志发送一个标准错误报告 # error_logger:error_report(Report) -> ok # @type Report = [{Tag, Data} | term()] | string() | term() # @type Tag = term() # @type Data = term() 11> error_logger:error_report([{tag1, data1}, a_term, {tag2, data}]). =ERROR REPORT==== 8-Nov-2013::13:29:14 === tag1: data1 a_term tag2: data ok
18.2.2 配置错误日志
- 1. 标准错误日志
# 适合程序开发的环境 $ erl -boot start_clean # 适合产品化系统的环境 $ erl -boot start_sasl
- 2. 不进行配置的SASL
matrix@MBP:18 $ erl -boot start_sasl Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:4:4] [async-threads:0] [hipe] [kernel-poll:false] =PROGRESS REPORT==== 8-Nov-2013::13:32:52 === supervisor: {local,sasl_safe_sup} started: [{pid,<0.34.0>}, {name,alarm_handler}, {mfargs,{alarm_handler,start_link,[]}}, {restart_type,permanent}, {shutdown,2000}, {child_type,worker}] =PROGRESS REPORT==== 8-Nov-2013::13:32:52 === supervisor: {local,sasl_safe_sup} started: [{pid,<0.35.0>}, {name,overload}, {mfargs,{overload,start_link,[]}}, {restart_type,permanent}, {shutdown,2000}, {child_type,worker}] =PROGRESS REPORT==== 8-Nov-2013::13:32:52 === supervisor: {local,sasl_sup} started: [{pid,<0.33.0>}, {name,sasl_safe_sup}, {mfargs, {supervisor,start_link, [{local,sasl_safe_sup},sasl,safe]}}, {restart_type,permanent}, {shutdown,infinity}, {child_type,supervisor}] =PROGRESS REPORT==== 8-Nov-2013::13:32:52 === supervisor: {local,sasl_sup} started: [{pid,<0.36.0>}, {name,release_handler}, {mfargs,{release_handler,start_link,[]}}, {restart_type,permanent}, {shutdown,2000}, {child_type,worker}] =PROGRESS REPORT==== 8-Nov-2013::13:32:52 === application: sasl started_at: nonode@nohost Eshell V5.9.1 (abort with ^G) 1> error_logger:error_msg("This is an error\n"). ok 2> =ERROR REPORT==== 8-Nov-2013::13:33:45 === This is an error
- 3. 控制记录何种日志
SASL可以记录三类报告:监管报告(supervisor report), 在OTP监管进程启动或者停止被监管的进程时产生;进度报告(progress report), 在每次OTP监管进程启动或者停止的时候产生;崩溃报告(crash report), 当被监管的进程退出时, 如果退出原因不是normal或者shutdown则产生此报告。
%% elog1.config
%% SASL configuration that sets sasl_error_logger to false.
[{sasl, [{sasl_error_logger, false}]}].
matrix@MBP:18 $ erl -boot start_sasl -config elog1 Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:4:4] [async-threads:0] [hipe] [kernel-poll:false] Eshell V5.9.1 (abort with ^G) 1> error_logger:error_msg("This is an error\n"). ok =ERROR REPORT==== 8-Nov-2013::13:40:09 === This is an error
- 4. 文本文件和shell
%% elog2.config
%% SASL configuration that points sasl_error_logger at a plain text file
%% (the THELOG file under error_logs).
[{sasl, [{sasl_error_logger, {file, "/Users/matrix/Documents/Erlang/erlp/18/error_logs/THELOG"}}]}].
matrix@MBP:18 $ cat error_logs/THELOG =PROGRESS REPORT==== 8-Nov-2013::13:45:27 === supervisor: {local,sasl_safe_sup} started: [{pid,<0.35.0>}, {name,alarm_handler}, {mfargs,{alarm_handler,start_link,[]}}, {restart_type,permanent}, {shutdown,2000}, {child_type,worker}] =PROGRESS REPORT==== 8-Nov-2013::13:45:27 === supervisor: {local,sasl_safe_sup} started: [{pid,<0.36.0>}, {name,overload}, {mfargs,{overload,start_link,[]}}, {restart_type,permanent}, {shutdown,2000}, {child_type,worker}] =PROGRESS REPORT==== 8-Nov-2013::13:45:27 === supervisor: {local,sasl_sup} started: [{pid,<0.34.0>}, {name,sasl_safe_sup}, {mfargs, {supervisor,start_link, [{local,sasl_safe_sup},sasl,safe]}}, {restart_type,permanent}, {shutdown,infinity}, {child_type,supervisor}] =PROGRESS REPORT==== 8-Nov-2013::13:45:27 === supervisor: {local,sasl_sup} started: [{pid,<0.37.0>}, {name,release_handler}, {mfargs,{release_handler,start_link,[]}}, {restart_type,permanent}, {shutdown,2000}, {child_type,worker}] =PROGRESS REPORT==== 8-Nov-2013::13:45:27 === application: sasl started_at: nonode@nohost
- 5. 循环日志和shell
%% elog3.config
[{sasl, [
    {sasl_error_logger, false},
    %% Output directory for the log files
    {error_logger_mf_dir, "/Users/matrix/Documents/Erlang/erlp/18/error_logs"},
    %% Maximum size of a single log file
    {error_logger_mf_maxbytes, 10485760},
    %% Maximum number of log files
    {error_logger_mf_maxfiles, 10}
]}].
- 6. 产品化环境
%% elog4.config
[{sasl, [
    {sasl_error_logger, false},
    %% Record error reports only. The SASL parameter is named errlog_type;
    %% a misspelled key is simply ignored, so the filter would not apply.
    {errlog_type, error},
    %% Output directory for the log files
    {error_logger_mf_dir, "/Users/matrix/Documents/Erlang/erlp/18/error_logs"},
    %% Maximum size of a single log file
    {error_logger_mf_maxbytes, 10485760},
    %% Maximum number of log files
    {error_logger_mf_maxfiles, 10}
]}].
18.2.3 分析错误
18.3 警报管理
-module(my_alarm_handler).
-behaviour(gen_event).

%% gen_event callbacks that this handler has to provide.
-export([init/1, handle_event/2, handle_call/2, handle_info/2,
         terminate/2, code_change/3]).

%% Initialise the handler. Prints the arguments it was installed with and
%% starts with state 0. The state N is incremented each time a tooHot alarm
%% is set. Must return {ok, State}.
init(Args) ->
    io:format("*** my_alarm_handler init:~p~n", [Args]),
    {ok, 0}.

%% handle_event(Event, State) -> {ok, NewState}
%% Event is a tuple {EventType, EventArg}; the clauses below match
%% EventType set_alarm and clear_alarm, EventArg is supplied by the caller.

%% tooHot alarm raised: log it and count it.
handle_event({set_alarm, tooHot}, N) ->
    error_logger:error_msg("*** Tell the Engineer to turn on the fan~n"),
    {ok, N + 1};
%% tooHot alarm cleared: log it, the counter is left unchanged.
handle_event({clear_alarm, tooHot}, N) ->
    error_logger:error_msg("*** Danger over. Turn off the fan~n"),
    {ok, N};
%% Any other event is only printed.
handle_event(Event, N) ->
    io:format("*** unmatched event:~p~n", [Event]),
    {ok, N}.

%% Every call is answered with the current counter value.
handle_call(_Request, N) ->
    Reply = N,
    {ok, Reply, N}.

%% Other messages are ignored.
handle_info(_Info, N) ->
    {ok, N}.

%% Nothing to clean up.
terminate(_Reason, _N) ->
    ok.

%% The state format does not change between versions, so it is kept as is.
code_change(_OldVsn, N, _Extra) ->
    {ok, N}.
matrix@MBP:18 $ erl -boot start_sasl -config elog3 Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:4:4] [async-threads:0] [hipe] [kernel-poll:false] Eshell V5.9.1 (abort with ^G) 1> c(my_alarm_handler). {ok,my_alarm_handler} # 普通的警报 2> alarm_handler:set_alarm(tooHot). ok 3> =INFO REPORT==== 8-Nov-2013::14:02:43 === alarm_handler: {set,tooHot} # 使用自定义的警报 3> gen_event:swap_handler(alarm_handler, {alarm_handler, swap}, {my_alarm_handler, xyz}). *** my_alarm_handler init:{xyz,{alarm_handler,[tooHot]}} ok # 触发警报 4> alarm_handler:set_alarm(tooHot). =ERROR REPORT==== 8-Nov-2013::14:03:51 === *** Tell the Engineer to turn on the fan ok # 清除警报 5> alarm_handler:clear_alarm(tooHot). =ERROR REPORT==== 8-Nov-2013::14:04:07 === *** Danger over. Turn off the fan ok # 查看错误日志报告 6> rb:start([{max, 20}]). rb: reading report...done. rb: reading report...done. rb: reading report...done. {ok,<0.51.0>} 7> rb:list(). No Type Process Date Time == ==== ======= ==== ==== 20 progress <0.30.0> 2013-11-08 13:51:15 19 progress <0.30.0> 2013-11-08 13:51:15 18 progress <0.23.0> 2013-11-08 13:51:15 17 error <0.24.0> 2013-11-08 13:51:25 16 progress <0.30.0> 2013-11-08 13:55:08 15 progress <0.30.0> 2013-11-08 13:55:08 14 progress <0.30.0> 2013-11-08 13:55:08 13 progress <0.30.0> 2013-11-08 13:55:08 12 progress <0.23.0> 2013-11-08 13:55:08 11 crash_report rb 2013-11-08 13:55:23 10 error <0.24.0> 2013-11-08 13:56:16 9 crash_report rb 2013-11-08 13:56:20 8 progress <0.30.0> 2013-11-08 14:01:19 7 progress <0.30.0> 2013-11-08 14:01:19 6 progress <0.30.0> 2013-11-08 14:01:19 5 progress <0.30.0> 2013-11-08 14:01:19 4 progress <0.23.0> 2013-11-08 14:01:19 3 info_report <0.30.0> 2013-11-08 14:02:43 2 error <0.30.0> 2013-11-08 14:03:51 1 error <0.30.0> 2013-11-08 14:04:07 ok 8> rb:show(1). ERROR REPORT <0.34.0> 2013-11-08 14:04:07 =============================================================================== *** Danger over. Turn off the fan ok 9> rb:show(2). 
ERROR REPORT <0.34.0> 2013-11-08 14:03:51 =============================================================================== *** Tell the Engineer to turn on the fan ok
18.4 应用服务
18.4.1 素数服务
使用gen_server模块快速生成代码, 然后添加获取素数的方法, 并添加警报处理。
-module(prime_server).
-behaviour(gen_server).

%% API
-export([new_prime/1, start_link/0]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

%% Start the server, registered locally under the module name.
start_link() ->
    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

%% Ask the server for a new prime; N is passed on to lib_primes:make_prime/1.
%% The call uses an explicit timeout of 20000 ms.
new_prime(N) ->
    gen_server:call(?MODULE, {prime, N}, 20000).

%% The state is a counter of handled prime requests, starting at 0.
init([]) ->
    process_flag(trap_exit, true),
    io:format("~p starting~n", [?MODULE]),
    {ok, 0}.

%% Reply with a new prime and count the request.
handle_call({prime, K}, _From, N) ->
    {reply, make_new_prime(K), N + 1}.

handle_cast(_Msg, N) ->
    {noreply, N}.

handle_info(_Info, N) ->
    {noreply, N}.

terminate(_Reason, _N) ->
    io:format("~p stopping~n", [?MODULE]),
    ok.

code_change(_OldVsn, N, _Extra) ->
    {ok, N}.

%% Generate a prime via lib_primes:make_prime/1.
%% For K > 100 the tooHot alarm is raised for the duration of the computation.
%% The alarm is cleared in an `after` section so that it does not stay set
%% when make_prime/1 raises; the exception itself still propagates.
make_new_prime(K) when K > 100 ->
    alarm_handler:set_alarm(tooHot),
    try
        lib_primes:make_prime(K)
    after
        alarm_handler:clear_alarm(tooHot)
    end;
make_new_prime(K) ->
    lib_primes:make_prime(K).
18.4.2 面积服务
与素数服务代码相差无几, 但是计算面积的函数对于长方形需要提供两个参数, 而 handle_call({area, Thing}, …) 只提供了对一个参数的支持。
18.5 监控树
监控树有两种类型:one-for-one(对应OTP的重启策略 one_for_one), 如果一个进程失效, 它的监控进程就会重启它; all-for-one(对应OTP的重启策略 one_for_all), 如果一个进程失效, 所有的工作进程都会被终止, 然后被重启。
%% RestartStrategy可以是one_for_one或one_for_all %% MaxRestarts, Time限定了在Time秒内最多可以重启MaxRestarts次, 否则将终止所有工作进程, 以防止出现死循环 init(...) -> {ok, {{RestartStrategy, MaxRestarts, Time}, [Worker1, Worker2, ...]}}.
-module(sellaprime_supervisor).
-behaviour(supervisor).

-export([start/0, start_in_shell_for_testing/0, start_link/1, init/1]).

%% Start the supervision tree from a freshly spawned process and return that
%% process' pid. supervisor:start_link/3 calls init/1 below.
%% The supervisor is linked to the process that starts it and shuts down when
%% that process exits, so the spawned process has to stay alive after a
%% successful start instead of returning immediately.
start() ->
    spawn(fun() ->
                  case supervisor:start_link({local, ?MODULE}, ?MODULE, _Arg = []) of
                      {ok, _Pid} ->
                          %% Block forever to keep the supervisor's parent alive.
                          receive after infinity -> ok end;
                      _Other ->
                          ok
                  end
          end).

%% Start the tree from the shell and unlink it from the calling process.
%% Returns true (the result of unlink/1).
start_in_shell_for_testing() ->
    {ok, Pid} = supervisor:start_link({local, ?MODULE}, ?MODULE, _Arg = []),
    unlink(Pid).

%% Start the tree linked to the caller.
start_link(Args) ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, Args).

%% Supervisor callback.
%% Returns {ok, {SupFlags, Children}} where
%%   SupFlags : {supervision_strategy(), maxR(), maxT()}
%%   supervision_strategy() : one_for_one | one_for_all | simple_one_for_one
%%   Children : [ChildStartSpecification]
%%   ChildStartSpecification : {internal_name(),
%%                              {module(), function(), args()},
%%                              restart_type(),
%%                              shutdown_time(),
%%                              child_type(),
%%                              modules()}
init([]) ->
    %% Replace the standard alarm handler with the custom one.
    gen_event:swap_handler(alarm_handler,
                           {alarm_handler, swap},
                           {my_alarm_handler, xyz}),
    {ok, {{one_for_one, 3, 10},
          [{tag1,
            {area_server, start_link, []},
            permanent,
            10000,
            worker,
            [area_server]},
           {tag2,
            {prime_server, start_link, []},
            permanent,
            10000,
            worker,
            [prime_server]}
          ]}}.
18.6 启动整个系统
1> sellaprime_supervisor:start_in_shell_for_testing(). *** my_alarm_handler init:{xyz,error} area_server starting prime_server starting true # 正常计算面积 2> area_server:area({square, 10}). 100 # 错误的参数 3> area_server:area({rectangle, 10, 20}). area_server stopping area_server starting ** exception exit: {{function_clause,[{area_server,compute_area, [{rectangle,10,20}], [{file,"area_server.erl"},{line,44}]}, {area_server,handle_call,3, [{file,"area_server.erl"},{line,32}]}, {gen_server,handle_msg,5, [{file,"gen_server.erl"},{line,588}]}, {proc_lib,init_p_do_apply,3, [{file,"proc_lib.erl"},{line,227}]}]}, {gen_server,call,[area_server,{area,{rectangle,10,20}}]}} in function gen_server:call/2 (gen_server.erl, line 180) 4> =ERROR REPORT==== 8-Nov-2013::14:52:19 === ** Generic server area_server terminating ** Last message in was {area,{rectangle,10,20}} ** When Server state == 1 ** Reason for termination == ** {function_clause,[{area_server,compute_area, [{rectangle,10,20}], [{file,"area_server.erl"},{line,44}]}, {area_server,handle_call,3, [{file,"area_server.erl"},{line,32}]}, {gen_server,handle_msg,5, [{file,"gen_server.erl"},{line,588}]}, {proc_lib,init_p_do_apply,3, [{file,"proc_lib.erl"},{line,227}]}]} # 生成素数 5> prime_server:new_prime(20). Generating a 20 digit prime ................................. 46877940609417488267 # 触发警报 6> prime_server:new_prime(120). Generating a 120 digit prime . =ERROR REPORT==== 8-Nov-2013::14:58:22 === *** Tell the Engineer to turn on the fan =ERROR REPORT==== 8-Nov-2013::14:58:22 === *** Tell the Engineer to turn on the fan ...................................... =ERROR REPORT==== 8-Nov-2013::14:58:22 === *** Danger over. Turn off the fan 296183949730132038037133059736194002935394554326329071373943691961695511107247366466697700357121237051115084833162714631 =ERROR REPORT==== 8-Nov-2013::14:58:22 === *** Danger over. Turn off the fan # 查看错误日志报告 7> rb:start([{max, 20}]). rb: reading report...done. {ok,<0.96.0>} 8> rb:list(). 
No Type Process Date Time == ==== ======= ==== ==== 20 error <0.24.0> 2013-11-08 14:52:19 19 crash_report area_server 2013-11-08 14:52:19 18 supervisor_report <0.24.0> 2013-11-08 14:52:19 17 progress <0.24.0> 2013-11-08 14:52:19 16 error <0.24.0> 2013-11-08 14:53:17 15 crash_report prime_server 2013-11-08 14:53:17 14 supervisor_report <0.24.0> 2013-11-08 14:53:17 13 progress <0.24.0> 2013-11-08 14:53:17 12 error <0.24.0> 2013-11-08 14:54:48 11 crash_report prime_server 2013-11-08 14:54:48 10 supervisor_report <0.24.0> 2013-11-08 14:54:48 9 progress <0.24.0> 2013-11-08 14:54:48 8 error <0.24.0> 2013-11-08 14:57:28 7 crash_report prime_server 2013-11-08 14:57:28 6 supervisor_report <0.24.0> 2013-11-08 14:57:28 5 progress <0.24.0> 2013-11-08 14:57:28 4 error <0.30.0> 2013-11-08 14:58:22 3 error <0.30.0> 2013-11-08 14:58:22 2 error <0.30.0> 2013-11-08 14:58:22 1 error <0.30.0> 2013-11-08 14:58:22 ok 9> rb:show(19). CRASH REPORT <0.56.0> 2013-11-08 14:52:19 =============================================================================== Crashing process initial_call {area_server,init,['Argument__1']} pid <0.56.0> registered_name area_server error_info {exit, {function_clause, [{area_server,compute_area, [{rectangle,10,20}], [{file,"area_server.erl"},{line,44}]}, {area_server,handle_call,3, [{file,"area_server.erl"},{line,32}]}, {gen_server,handle_msg,5, [{file,"gen_server.erl"},{line,588}]}, {proc_lib,init_p_do_apply,3, [{file,"proc_lib.erl"},{line,227}]}]}, [{gen_server,terminate,6,[{file,"gen_server.erl"},{line,747}]}, {proc_lib,init_p_do_apply,3, [{file,"proc_lib.erl"},{line,227}]}]} ancestors [sellaprime_supervisor,<0.43.0>] messages [] links [<0.55.0>] dictionary [] trap_exit true status running heap_size 610 stack_size 24 reductions 176 ok
18.7 应用程序
%% Application resource file for the sellaprime application.
{application, sellaprime,
 [%% Short description
  {description, "The Prime Number Shop"},
  %% Version
  {vsn, "1.0"},
  %% Modules that belong to the application
  {modules, [sellaprime_app, sellaprime_supervisor, area_server,
             prime_server, lib_lin, lib_primes, my_alarm_handler]},
  %% Registered process names. The supervisor registers itself under its
  %% module name, sellaprime_supervisor.
  {registered, [area_server, prime_server, sellaprime_supervisor]},
  %% Applications that must be started before this one
  {applications, [kernel, stdlib]},
  %% Application callback module and its start arguments
  {mod, {sellaprime_app, []}},
  {start_phases, []}
 ]}.
-module(sellaprime_app).
-behaviour(application).

-export([start/2, stop/1]).

%% Application start callback: hands the start arguments to the top-level
%% supervisor and returns its start_link result.
start(_StartType, Args) ->
    sellaprime_supervisor:start_link(Args).

%% Application stop callback: nothing to clean up.
stop(_State) ->
    ok.
# 显示加载的应用程序 1> application:loaded_applications(). [{kernel,"ERTS CXC 138 10","2.15.1"}, {sasl,"SASL CXC 138 11","2.2.1"}, {stdlib,"ERTS CXC 138 10","1.18.1"}] # 加载指定的应用程序 2> application:load(sellaprime). ok 3> application:loaded_applications(). [{kernel,"ERTS CXC 138 10","2.15.1"}, {sasl,"SASL CXC 138 11","2.2.1"}, {stdlib,"ERTS CXC 138 10","1.18.1"}, {sellaprime,"The Prime Number Shop","1.0"}] # 启动应用程序 4> application:start(sellaprime). *** my_alarm_handler init:{xyz,{alarm_handler,[]}} area_server starting prime_server starting ok # 停止应用程序 5> application:stop(sellaprime). prime_server stopping area_server stopping ok 6> =INFO REPORT==== 8-Nov-2013::15:12:35 === application: sellaprime exited: stopped type: temporary # 卸载应用程序 7> application:unload(sellaprime). ok 8> application:loaded_applications(). [{kernel,"ERTS CXC 138 10","2.15.1"}, {sasl,"SASL CXC 138 11","2.2.1"}, {stdlib,"ERTS CXC 138 10","1.18.1"}]
18.8 文件系统的组织
文件 | 功能 |
area_server.erl | 面积服务, gen_server回调 |
prime_server.erl | 素数服务, gen_server回调 |
sellaprime_supervisor.erl | 监控回调 |
sellaprime_app.erl | 应用程序回调 |
my_alarm_handler.erl | gen_event的事件回调 |
sellaprime.app | 应用程序规范 |
elog4.config | 错误日志配置文件 |
18.9 应用程序监视器
18.10 进一步深入
gen_server, gen_event, supervisors