第十八章 构造基于OTP的系统
18.1 通用的事件处理
%% Spawns a process that runs my_handler/1 with the do-nothing callback
%% no_op/1 and registers that process under Name.
%% Returns the result of register/2.
make(Name) ->
    Loop = fun() -> my_handler(fun no_op/1) end,
    HandlerPid = spawn(Loop),
    register(Name, HandlerPid).
%% Message loop of the handler process; Fun is the callback currently in use.
%%   {add, Fun1}  - continue with Fun1 as the callback
%%   {event, Any} - apply the current callback to Any
%% The callback runs under `catch`, so an exception raised by it is
%% discarded and the loop carries on. This function never returns.
my_handler(Fun) ->
    NextFun =
        receive
            {add, Fun1} ->
                Fun1;
            {event, Any} ->
                (catch Fun(Any)),
                Fun
        end,
    my_handler(NextFun).
%% Sends {add, Fun} to the process Name so that its my_handler/1 loop
%% continues with Fun as the callback. Returns the message that was sent.
add_handler(Name, Fun) ->
    erlang:send(Name, {add, Fun}).
%% Sends the event X to the process Name as {event, X}.
%% Returns the message that was sent.
event(Name, X) ->
    erlang:send(Name, {event, X}).
%% Default event callback: ignores its argument and returns the atom void.
no_op(_Event) ->
    void.
运行结果:
1> c(event_handler).
{ok,event_handler}
2> event_handler:make(errors).
true
3> event_handler:event(errors, hi).
{event,hi}
为处理程序添加一个简单的回调模块
-module(motor_controller).
-export([add_event_handler/0]).
%% Installs controller/1 as the callback of the handler process registered
%% as `errors`.
add_event_handler() ->
    Callback = fun controller/1,
    event_handler:add_handler(errors, Callback).
%% Event callback: prints a shutdown instruction for too_hot and an
%% "ignored" line, tagged with this module's name, for any other event.
controller(too_hot) ->
    io:format("Turn off the motor~n");
controller(Other) ->
    io:format("~w ignored event: ~p~n", [?MODULE, Other]).
运行结果:
4> c(motor_controller).
{ok,motor_controller}
5> motor_controller:add_event_handler().
{add,#Fun<motor_controller.0.125151531>}
6> event_handler:event(errors, cool).
motor_controller ignored event: cool
{event,cool}
7> event_handler:event(errors, too_hot).
Turn off the motor
{event,too_hot}
从这个简单的例子可以看到Erlang的强大之处:非常延迟的绑定。正如作者所说:使用这个机制, 我们可以构造一个可以随着时间推移不断演化的系统, 而演化的过程无需停机, 甚至也不需要升级代码。
18.2 错误日志
错误日志的三种视角:如何调用函数记录错误日志; 如何配置错误日志的存储; 如何对错误日志进行分析。
18.2.1 记录一个错误
# 错误日志的API
# 向错误日志发送一个错误消息
# error_logger:error_msg(String) -> ok
8> error_logger:error_msg("An error has occurred\n").
ok
9>
=ERROR REPORT==== 8-Nov-2013::13:22:42 ===
An error has occurred
# 向错误日志发送一个带参数的错误消息
# error_logger:error_msg(Format, Data) -> ok
10> error_logger:error_msg("~s, An error has occurred\n", ["Joe"]).
=ERROR REPORT==== 8-Nov-2013::13:25:39 ===
Joe, An error has occurred
ok
# 向错误日志发送一个标准错误报告
# error_logger:error_report(Report) -> ok
# @type Report = [{Tag, Data} | term()] | string() | term()
# @type Tag = term()
# @type Data = term()
11> error_logger:error_report([{tag1, data1}, a_term, {tag2, data}]).
=ERROR REPORT==== 8-Nov-2013::13:29:14 ===
tag1: data1
a_term
tag2: data
ok
18.2.2 配置错误日志
- 标准错误日志
# 适合程序开发的环境
$ erl -boot start_clean
# 适合产品化系统的环境
$ erl -boot start_sasl
- 不进行配置的SASL
matrix@MBP:18 $ erl -boot start_sasl
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:4:4] [async-threads:0] [hipe] [kernel-poll:false]
=PROGRESS REPORT==== 8-Nov-2013::13:32:52 ===
supervisor: {local,sasl_safe_sup}
started: [{pid,<0.34.0>},
{name,alarm_handler},
{mfargs,{alarm_handler,start_link,[]}},
{restart_type,permanent},
{shutdown,2000},
{child_type,worker}]
=PROGRESS REPORT==== 8-Nov-2013::13:32:52 ===
supervisor: {local,sasl_safe_sup}
started: [{pid,<0.35.0>},
{name,overload},
{mfargs,{overload,start_link,[]}},
{restart_type,permanent},
{shutdown,2000},
{child_type,worker}]
=PROGRESS REPORT==== 8-Nov-2013::13:32:52 ===
supervisor: {local,sasl_sup}
started: [{pid,<0.33.0>},
{name,sasl_safe_sup},
{mfargs,
{supervisor,start_link,
[{local,sasl_safe_sup},sasl,safe]}},
{restart_type,permanent},
{shutdown,infinity},
{child_type,supervisor}]
=PROGRESS REPORT==== 8-Nov-2013::13:32:52 ===
supervisor: {local,sasl_sup}
started: [{pid,<0.36.0>},
{name,release_handler},
{mfargs,{release_handler,start_link,[]}},
{restart_type,permanent},
{shutdown,2000},
{child_type,worker}]
=PROGRESS REPORT==== 8-Nov-2013::13:32:52 ===
application: sasl
started_at: nonode@nohost
Eshell V5.9.1 (abort with ^G)
1> error_logger:error_msg("This is an error\n").
ok
2>
=ERROR REPORT==== 8-Nov-2013::13:33:45 ===
This is an error
- 控制记录何种日志
SASL可以记录三类报告:监管报告, 在监管进程启动或者停止被监管的进程时产生; 进度报告, 在OTP监管进程自身启动或者停止时产生; 崩溃报告, 当被监管的进程退出时, 如果退出原因不是normal或者shutdown则产生此报告。
%% elog1.config
%% Sets the SASL parameter sasl_error_logger to false. In the run shown
%% after this file no progress reports are printed at start-up; only the
%% explicit error report appears.
[{sasl, [{sasl_error_logger, false}]}].
运行结果:
matrix@MBP:18 $ erl -boot start_sasl -config elog1
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:4:4] [async-threads:0] [hipe] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> error_logger:error_msg("This is an error\n").
ok
=ERROR REPORT==== 8-Nov-2013::13:40:09 ===
This is an error
- 文本文件和shell
%% elog2.config
%% Points sasl_error_logger at a file: the SASL reports are written to
%% THELOG (its contents are listed with `cat` after this file).
[{sasl, [{sasl_error_logger, {file, "/Users/matrix/Documents/Erlang/erlp/18/error_logs/THELOG"}}]}].
运行结果:
matrix@MBP:18 $ cat error_logs/THELOG
=PROGRESS REPORT==== 8-Nov-2013::13:45:27 ===
supervisor: {local,sasl_safe_sup}
started: [{pid,<0.35.0>},
{name,alarm_handler},
{mfargs,{alarm_handler,start_link,[]}},
{restart_type,permanent},
{shutdown,2000},
{child_type,worker}]
=PROGRESS REPORT==== 8-Nov-2013::13:45:27 ===
supervisor: {local,sasl_safe_sup}
started: [{pid,<0.36.0>},
{name,overload},
{mfargs,{overload,start_link,[]}},
{restart_type,permanent},
{shutdown,2000},
{child_type,worker}]
=PROGRESS REPORT==== 8-Nov-2013::13:45:27 ===
supervisor: {local,sasl_sup}
started: [{pid,<0.34.0>},
{name,sasl_safe_sup},
{mfargs,
{supervisor,start_link,
[{local,sasl_safe_sup},sasl,safe]}},
{restart_type,permanent},
{shutdown,infinity},
{child_type,supervisor}]
=PROGRESS REPORT==== 8-Nov-2013::13:45:27 ===
supervisor: {local,sasl_sup}
started: [{pid,<0.37.0>},
{name,release_handler},
{mfargs,{release_handler,start_link,[]}},
{restart_type,permanent},
{shutdown,2000},
{child_type,worker}]
=PROGRESS REPORT==== 8-Nov-2013::13:45:27 ===
application: sasl
started_at: nonode@nohost
- 循环日志和shell
%% elog3.config
%% Rotating (multi-file) log configuration for SASL.
[{sasl, [
{sasl_error_logger, false},
%% Directory the rotating log files are written to
{error_logger_mf_dir, "/Users/matrix/Documents/Erlang/erlp/18/error_logs"},
%% Maximum size of a single log file, in bytes
{error_logger_mf_maxbytes, 10485760},
%% Maximum number of log files
{error_logger_mf_maxfiles, 10}
]}].
运行后会在错误日志存储目录下发现已经生成了名字为1的日志文件。
- 产品化环境
%% elog4.config
%% Rotating log configuration intended for a production system.
[{sasl, [
{sasl_error_logger, false},
%% Log error reports only
{errorlog_type, error},
%% Directory the rotating log files are written to
{error_logger_mf_dir, "/Users/matrix/Documents/Erlang/erlp/18/error_logs"},
%% Maximum size of a single log file, in bytes
{error_logger_mf_maxbytes, 10485760},
%% Maximum number of log files
{error_logger_mf_maxfiles, 10}
]}].
18.2.3 分析错误
使用rb模块读取错误日志
18.3 警报管理
-module(my_alarm_handler).
-behaviour(gen_event).
%% 需要自定义的gen_event的回调函数
-export([init/1, handle_event/2, handle_call/2,
handle_info/2, terminate/2]).
%% gen_event callback: prints the arguments the handler was installed with
%% and returns {ok, State} with an initial state of 0.
init(Args) ->
    io:format("*** my_alarm_handler init:~p~n", [Args]),
    InitialCount = 0,
    {ok, InitialCount}.
%% gen_event callback: handle_event(Event, State) -> {ok, NewState}.
%% The alarm events matched here are {set_alarm, Alarm} and
%% {clear_alarm, Alarm}. State is an integer that is incremented each time
%% the tooHot alarm is set.
%%
%% tooHot raised: report it through the error logger and bump the counter.
handle_event({set_alarm, tooHot}, Count) ->
    error_logger:error_msg("*** Tell the Engineer to turn on the fan~n"),
    {ok, Count + 1};
%% tooHot cleared: report it; the counter is left unchanged.
handle_event({clear_alarm, tooHot}, Count) ->
    error_logger:error_msg("*** Danger over. Turn off the fan~n"),
    {ok, Count};
%% Any other event is only printed.
handle_event(Other, Count) ->
    io:format("*** unmatched event:~p~n", [Other]),
    {ok, Count}.
%% gen_event callback: every request is answered with the current state,
%% which is left unchanged.
handle_call(_Request, Count) ->
    {ok, Count, Count}.
%% gen_event callback: other messages are ignored; the state is kept.
handle_info(_Info, Count) ->
    {ok, Count}.
%% gen_event callback: nothing to clean up when the handler is removed.
terminate(_Reason, _Count) ->
    ok.
运行结果:
matrix@MBP:18 $ erl -boot start_sasl -config elog3
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:4:4] [async-threads:0] [hipe] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> c(my_alarm_handler).
{ok,my_alarm_handler}
# 普通的警报
2> alarm_handler:set_alarm(tooHot).
ok
3>
=INFO REPORT==== 8-Nov-2013::14:02:43 ===
alarm_handler: {set,tooHot}
# 使用自定义的警报
3> gen_event:swap_handler(alarm_handler, {alarm_handler, swap}, {my_alarm_handler, xyz}).
*** my_alarm_handler init:{xyz,{alarm_handler,[tooHot]}}
ok
# 触发警报
4> alarm_handler:set_alarm(tooHot).
=ERROR REPORT==== 8-Nov-2013::14:03:51 ===
*** Tell the Engineer to turn on the fan
ok
# 清除警报
5> alarm_handler:clear_alarm(tooHot).
=ERROR REPORT==== 8-Nov-2013::14:04:07 ===
*** Danger over. Turn off the fan
ok
# 查看错误日志报告
6> rb:start([{max, 20}]).
rb: reading report...done.
rb: reading report...done.
rb: reading report...done.
{ok,<0.51.0>}
7> rb:list().
No Type Process Date Time
== ==== ======= ==== ====
20 progress <0.30.0> 2013-11-08 13:51:15
19 progress <0.30.0> 2013-11-08 13:51:15
18 progress <0.23.0> 2013-11-08 13:51:15
17 error <0.24.0> 2013-11-08 13:51:25
16 progress <0.30.0> 2013-11-08 13:55:08
15 progress <0.30.0> 2013-11-08 13:55:08
14 progress <0.30.0> 2013-11-08 13:55:08
13 progress <0.30.0> 2013-11-08 13:55:08
12 progress <0.23.0> 2013-11-08 13:55:08
11 crash_report rb 2013-11-08 13:55:23
10 error <0.24.0> 2013-11-08 13:56:16
9 crash_report rb 2013-11-08 13:56:20
8 progress <0.30.0> 2013-11-08 14:01:19
7 progress <0.30.0> 2013-11-08 14:01:19
6 progress <0.30.0> 2013-11-08 14:01:19
5 progress <0.30.0> 2013-11-08 14:01:19
4 progress <0.23.0> 2013-11-08 14:01:19
3 info_report <0.30.0> 2013-11-08 14:02:43
2 error <0.30.0> 2013-11-08 14:03:51
1 error <0.30.0> 2013-11-08 14:04:07
ok
8> rb:show(1).
ERROR REPORT <0.34.0> 2013-11-08 14:04:07
===============================================================================
*** Danger over. Turn off the fan
ok
9> rb:show(2).
ERROR REPORT <0.34.0> 2013-11-08 14:03:51
===============================================================================
*** Tell the Engineer to turn on the fan
ok
18.4 应用服务
18.4.1 素数服务
使用gen_server模块快速生成代码, 然后添加获取素数的方法, 并添加警报处理。
-module(prime_server).
-behaviour(gen_server).
-export([new_prime/1, start_link/0]).
%% 必须自定义实现的gen_server回调函数
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
%% Starts the server linked to the caller and registered locally under the
%% module name; init/1 is called with [].
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).
%% Client API: sends {prime, N} to the registered server and returns its
%% reply. An explicit timeout of 20000 ms is passed to gen_server:call/3.
new_prime(N) ->
    Timeout = 20000,
    gen_server:call(?MODULE, {prime, N}, Timeout).
%% gen_server callback: makes the server trap exits, announces the start
%% and begins with a state of 0.
init([]) ->
    process_flag(trap_exit, true),
    io:format("~p starting~n", [?MODULE]),
    InitialCount = 0,
    {ok, InitialCount}.
%% gen_server callback: replies to {prime, K} with make_new_prime(K) and
%% increments the integer state by one.
handle_call({prime, K}, _From, N) ->
    Prime = make_new_prime(K),
    {reply, Prime, N + 1}.
%% gen_server callback: casts are ignored; the state is kept.
handle_cast(_Msg, Count) ->
    {noreply, Count}.
%% gen_server callback: other messages are ignored; the state is kept.
handle_info(_Info, Count) ->
    {noreply, Count}.
%% gen_server callback: announces that the server is stopping.
terminate(_Reason, _Count) ->
    io:format("~p stopping~n", [?MODULE]),
    ok.
%% gen_server callback: the state is carried over unchanged on a code upgrade.
code_change(_OldVsn, Count, _Extra) ->
    {ok, Count}.
%% Produces a prime through lib_primes:make_prime(K).
%% For K > 100 the tooHot alarm is raised while the prime is being computed.
%% The alarm is cleared in an `after` section, so it is also cleared when
%% make_prime/1 raises; previously a failure left the alarm set.
make_new_prime(K) when K > 100 ->
    alarm_handler:set_alarm(tooHot),
    try
        lib_primes:make_prime(K)
    after
        alarm_handler:clear_alarm(tooHot)
    end;
make_new_prime(K) ->
    lib_primes:make_prime(K).
18.4.2 面积服务
与素数服务代码相差无几, 但是计算面积的函数对于长方形需要提供两个参数, 而 handle_call({area, Thing}, …) 只提供了对一个参数的支持。
18.5 监控树
监控树有两种类型:one-for-one, 如果一个进程失效, 它的监控进程就会重启它; all-for-one, 如果一个进程失效, 所有的工作进程都会被终止, 然后被重启。
监控树的形式:
%% RestartStrategy可以是one_for_one或one_for_all
%% MaxRestarts, Time限定了在Time时间内最多可以重启MaxRestarts次, 否则将终止所有工作进程, 以防止出现死循环
init(...) ->
{ok, {RestartStrategy, MaxRestarts, Time},
[Worker1, Worker2, ...]}.
实现自己的监控树:
-module(sellaprime_supervisor).
-behaviour(supervisor).
-export([start/0, start_in_shell_for_testing/0, start_link/1, init/1]).
%% Starts the supervision tree from a freshly spawned process and returns
%% that process's pid. supervisor:start_link/3 calls init/1 of this module.
%%
%% The spawned process is the supervisor's parent and is linked to it. A
%% supervisor terminates when its parent exits, so after a successful start
%% the parent waits forever instead of returning; previously it returned at
%% once and took the supervisor down with it.
start() ->
    spawn(fun() ->
        case supervisor:start_link({local, ?MODULE}, ?MODULE, _Arg = []) of
            {ok, _SupPid} ->
                %% Park here so the supervisor keeps a live parent.
                receive after infinity -> ok end;
            _NotStarted ->
                %% Nothing to keep alive; finish as the original did.
                ok
        end
    end).
%% Starts the supervisor linked to the calling process and then removes
%% that link. Returns the result of unlink/1.
start_in_shell_for_testing() ->
    SupName = {local, ?MODULE},
    {ok, SupPid} = supervisor:start_link(SupName, ?MODULE, []),
    unlink(SupPid).
%% Starts the supervisor linked to the caller and registered locally under
%% the module name; Args is handed to init/1.
start_link(Args) ->
    SupName = {local, ?MODULE},
    supervisor:start_link(SupName, ?MODULE, Args).
%% supervisor callback. Replaces the alarm_handler event handler with
%% my_alarm_handler, then returns {ok, {SupFlags, Children}} where
%%   SupFlags = {Strategy, MaxRestarts, MaxTime}
%%              (Strategy is e.g. one_for_one or one_for_all)
%%   Child    = {Id, {Module, Function, Args},
%%               Restart, Shutdown, Type, Modules}
init([]) ->
    %% Use the custom alarm handler.
    gen_event:swap_handler(alarm_handler,
                           {alarm_handler, swap},
                           {my_alarm_handler, xyz}),
    SupFlags = {one_for_one, 3, 10},
    AreaServer = {tag1,
                  {area_server, start_link, []},
                  permanent,
                  10000,
                  worker,
                  [area_server]},
    PrimeServer = {tag2,
                   {prime_server, start_link, []},
                   permanent,
                   10000,
                   worker,
                   [prime_server]},
    {ok, {SupFlags, [AreaServer, PrimeServer]}}.
18.6 启动整个系统
1> sellaprime_supervisor:start_in_shell_for_testing().
*** my_alarm_handler init:{xyz,error}
area_server starting
prime_server starting
true
# 正常计算面积
2> area_server:area({square, 10}).
100
# 错误的参数
3> area_server:area({rectangle, 10, 20}).
area_server stopping
area_server starting
** exception exit: {{function_clause,[{area_server,compute_area,
[{rectangle,10,20}],
[{file,"area_server.erl"},{line,44}]},
{area_server,handle_call,3,
[{file,"area_server.erl"},{line,32}]},
{gen_server,handle_msg,5,
[{file,"gen_server.erl"},{line,588}]},
{proc_lib,init_p_do_apply,3,
[{file,"proc_lib.erl"},{line,227}]}]},
{gen_server,call,[area_server,{area,{rectangle,10,20}}]}}
in function gen_server:call/2 (gen_server.erl, line 180)
4>
=ERROR REPORT==== 8-Nov-2013::14:52:19 ===
** Generic server area_server terminating
** Last message in was {area,{rectangle,10,20}}
** When Server state == 1
** Reason for termination ==
** {function_clause,[{area_server,compute_area,
[{rectangle,10,20}],
[{file,"area_server.erl"},{line,44}]},
{area_server,handle_call,3,
[{file,"area_server.erl"},{line,32}]},
{gen_server,handle_msg,5,
[{file,"gen_server.erl"},{line,588}]},
{proc_lib,init_p_do_apply,3,
[{file,"proc_lib.erl"},{line,227}]}]}
# 生成素数
5> prime_server:new_prime(20).
Generating a 20 digit prime .................................
46877940609417488267
# 触发警报
6> prime_server:new_prime(120).
Generating a 120 digit prime .
=ERROR REPORT==== 8-Nov-2013::14:58:22 ===
*** Tell the Engineer to turn on the fan
=ERROR REPORT==== 8-Nov-2013::14:58:22 ===
*** Tell the Engineer to turn on the fan
......................................
=ERROR REPORT==== 8-Nov-2013::14:58:22 ===
*** Danger over. Turn off the fan
296183949730132038037133059736194002935394554326329071373943691961695511107247366466697700357121237051115084833162714631
=ERROR REPORT==== 8-Nov-2013::14:58:22 ===
*** Danger over. Turn off the fan
# 查看错误日志报告
7> rb:start([{max, 20}]).
rb: reading report...done.
{ok,<0.96.0>}
8> rb:list().
No Type Process Date Time
== ==== ======= ==== ====
20 error <0.24.0> 2013-11-08 14:52:19
19 crash_report area_server 2013-11-08 14:52:19
18 supervisor_report <0.24.0> 2013-11-08 14:52:19
17 progress <0.24.0> 2013-11-08 14:52:19
16 error <0.24.0> 2013-11-08 14:53:17
15 crash_report prime_server 2013-11-08 14:53:17
14 supervisor_report <0.24.0> 2013-11-08 14:53:17
13 progress <0.24.0> 2013-11-08 14:53:17
12 error <0.24.0> 2013-11-08 14:54:48
11 crash_report prime_server 2013-11-08 14:54:48
10 supervisor_report <0.24.0> 2013-11-08 14:54:48
9 progress <0.24.0> 2013-11-08 14:54:48
8 error <0.24.0> 2013-11-08 14:57:28
7 crash_report prime_server 2013-11-08 14:57:28
6 supervisor_report <0.24.0> 2013-11-08 14:57:28
5 progress <0.24.0> 2013-11-08 14:57:28
4 error <0.30.0> 2013-11-08 14:58:22
3 error <0.30.0> 2013-11-08 14:58:22
2 error <0.30.0> 2013-11-08 14:58:22
1 error <0.30.0> 2013-11-08 14:58:22
ok
9> rb:show(19).
CRASH REPORT <0.56.0> 2013-11-08 14:52:19
===============================================================================
Crashing process
initial_call {area_server,init,['Argument__1']}
pid <0.56.0>
registered_name area_server
error_info
{exit,
{function_clause,
[{area_server,compute_area,
[{rectangle,10,20}],
[{file,"area_server.erl"},{line,44}]},
{area_server,handle_call,3,
[{file,"area_server.erl"},{line,32}]},
{gen_server,handle_msg,5,
[{file,"gen_server.erl"},{line,588}]},
{proc_lib,init_p_do_apply,3,
[{file,"proc_lib.erl"},{line,227}]}]},
[{gen_server,terminate,6,[{file,"gen_server.erl"},{line,747}]},
{proc_lib,init_p_do_apply,3,
[{file,"proc_lib.erl"},{line,227}]}]}
ancestors [sellaprime_supervisor,<0.43.0>]
messages []
links [<0.55.0>]
dictionary []
trap_exit true
status running
heap_size 610
stack_size 24
reductions 176
ok
18.7 应用程序
应用程序信息文件
%% Application resource file for sellaprime.
{application, sellaprime,
 [%% One-line description
  {description, "The Prime Number Shop"},
  %% Version
  {vsn, "1.0"},
  %% Modules that make up the application
  {modules, [sellaprime_app, sellaprime_supervisor, area_server,
             prime_server, lib_lin, lib_primes, my_alarm_handler]},
  %% Registered process names. The supervisor registers itself under its
  %% module name, sellaprime_supervisor (it was listed as sellaprime_super).
  {registered, [area_server, prime_server, sellaprime_supervisor]},
  %% Applications that must be started before this one.
  %% NOTE(review): the supervisor's init/1 talks to alarm_handler, which is
  %% started by sasl; consider listing sasl here, confirm against how the
  %% node is booted.
  {applications, [kernel, stdlib]},
  %% Callback module and its start arguments
  {mod, {sellaprime_app, []}},
  {start_phases, []}
 ]}.
应用程序回调模块
-module(sellaprime_app).
-behaviour(application).
-export([start/2, stop/1]).
%% application callback: starts the top-level supervisor with the start
%% arguments and returns its result.
start(_StartType, StartArgs) ->
    sellaprime_supervisor:start_link(StartArgs).
%% application callback: no clean-up is done on stop.
stop(_AppState) ->
    ok.
运行结果:
# 显示加载的应用程序
1> application:loaded_applications().
[{kernel,"ERTS CXC 138 10","2.15.1"},
{sasl,"SASL CXC 138 11","2.2.1"},
{stdlib,"ERTS CXC 138 10","1.18.1"}]
# 加载指定的应用程序
2> application:load(sellaprime).
ok
3> application:loaded_applications().
[{kernel,"ERTS CXC 138 10","2.15.1"},
{sasl,"SASL CXC 138 11","2.2.1"},
{stdlib,"ERTS CXC 138 10","1.18.1"},
{sellaprime,"The Prime Number Shop","1.0"}]
# 启动应用程序
4> application:start(sellaprime).
*** my_alarm_handler init:{xyz,{alarm_handler,[]}}
area_server starting
prime_server starting
ok
# 停止应用程序
5> application:stop(sellaprime).
prime_server stopping
area_server stopping
ok
6>
=INFO REPORT==== 8-Nov-2013::15:12:35 ===
application: sellaprime
exited: stopped
type: temporary
# 卸载应用程序
7> application:unload(sellaprime).
ok
8> application:loaded_applications().
[{kernel,"ERTS CXC 138 10","2.15.1"},
{sasl,"SASL CXC 138 11","2.2.1"},
{stdlib,"ERTS CXC 138 10","1.18.1"}]
18.8 文件系统的组织
文件 | 功能 |
---|---|
area_server.erl | 面积服务, gen_server回调 |
prime_server.erl | 素数服务, gen_server回调 |
sellaprime_supervisor.erl | 监控回调 |
sellaprime_app.erl | 应用程序回调 |
my_alarm_handler.erl | gen_event的事件回调 |
sellaprime.app | 应用程序规范 |
elog4.config | 错误日志配置文件 |
18.9 应用程序监视器
使用appmon:start()启动图形界面的查看程序。
18.10 进一步深入
5.5.5版文档
gen_server, gen_event, supervisors
http://www.erlang.org/documentation/doc-5.5.5/pdf/design_principles-5.5.5.pdf
如何创建一个启动文件
http://www.erlang.org/documentation/doc-5.5.5/pdf/system_principles-5.5.5.pdf
应用程序监视器
http://www.erlang.org/documentation/doc-5.5.5/pdf/appmon-2.1.9.pdf
更多文档
http://www.erlang.org/documentation/