从开始学习编程的时候Array就是基础数据结构,也是被使用最频繁的,但是在Erlang中一等公民是List和tuple,在项目中到处都是List的各种处理,但是Array却少见踪迹.好奇心驱使,最近翻看了一下Array的代码实现.
 

array基础

  [1] array可动态扩展大小;可固定大小,可按需自动增长.
  [2] 如果没有明确赋值会使用默认值undefined,要区分是否赋值过可以使用其它的值
  [3] 索引计数从0开始,这是有意为之的设计决策,与tuple、list等从1开始计数的数据结构不同
  [4] array从不自动收缩,如果索引i的位置被赋值了,中间的[0,i]都会处于可访问的状态(注意resize/2可能和你猜测的效果不同)
  [5] 数组不可直接比较是否相等
Eshell V5.9  (abort with ^G)
1>  A1 = array:set(17, true, array:new()).
{array,18,100,undefined,
       {10,
        {undefined,undefined,undefined,undefined,undefined,
                   undefined,undefined,true,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
2>   array:set(17, undefined, array:new()). %%即使赋值的时候使用的是默认值,对应数据组依然会展开
{array,18,100,undefined,
       {10,
        {undefined,undefined,undefined,undefined,undefined,
                   undefined,undefined,undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
3>  array:set(1214, true, A1).
{array,1215,10000,undefined,
       {{{10,
          {undefined,undefined,undefined,undefined,undefined,
                     undefined,undefined,true,undefined,undefined},
          10,10,10,10,10,10,10,10,10},
         100,100,100,100,100,100,100,100,100,100},
        {100,100,
         {10,
          {undefined,undefined,undefined,undefined,true,undefined,
                     undefined,undefined,undefined,undefined},
          10,10,10,10,10,10,10,10,10},
         100,100,100,100,100,100,100,100},
        1000,1000,1000,1000,1000,1000,1000,1000,1000}}

%% array:reset

2> array:set(12,test,array:new()).
{array,13,100,undefined,
       {10,
        {undefined,undefined,test,undefined,undefined,undefined,
                   undefined,undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
4> array:reset(12,v(2)).
{array,13,100,undefined,
       {10,
        {undefined,undefined,undefined,undefined,undefined,
                   undefined,undefined,undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
5> 

看一下扩展的例子:
12> array:new({default,null}).
{array,0,10,null,10}
13>  array:get(24,  v(12)).  
null
15>  array:default( v(12)).   
null
16>  array:set(124, true, A1).
{array,125,1000,undefined,
       {{10,
         {undefined,undefined,undefined,undefined,undefined,
                    undefined,undefined,true,undefined,undefined},
         10,10,10,10,10,10,10,10,10},
        {10,10,
         {undefined,undefined,undefined,undefined,true,undefined,
                    undefined,undefined,undefined,undefined},
         10,10,10,10,10,10,10,10},
        100,100,100,100,100,100,100,100,100}}
17>  array:set(1214, true, A1).
{array,1215,10000,undefined,
       {{{10,
          {undefined,undefined,undefined,undefined,undefined,
                     undefined,undefined,true,undefined,undefined},
          10,10,10,10,10,10,10,10,10},
         100,100,100,100,100,100,100,100,100,100},
        {100,100,
         {10,
          {undefined,undefined,undefined,undefined,true,undefined,
                     undefined,undefined,undefined,undefined},
          10,10,10,10,10,10,10,10,10},
         100,100,100,100,100,100,100,100},
        1000,1000,1000,1000,1000,1000,1000,1000,1000}}
18>

 

还是tuple

   array数据结构的实现还是tuple,array被定义为一个record,其中包含了一个tuple tree.
%% Representation of an array: bookkeeping fields plus the tuple tree that
%% holds the entries. A `max' of 0 marks a fixed-size array (see fix/1 and
%% is_fix/1).
-record(array, {size :: non_neg_integer(),     %% number of defined entries
          max  :: non_neg_integer(),     %% maximum number of entries
                              %% in current tree
          default,     %% the default value (usually 'undefined')
          elements     %% the tuple tree
            }).
 
由于record本质上也是tuple,所以array.erl到处可见tuple的各种操作.先复习一下两个使用频率最高的函数erlang:make_tuple/2,3和setelement/3
Eshell V5.9  (abort with ^G)
1> erlang:make_tuple(5,a).
{a,a,a,a,a}
2>  erlang:make_tuple(4, []).
{[],[],[],[]}
3>  erlang:make_tuple(9, [],[{1,a},{3,q}]).
{a,[],q,[],[],[],[],[],[]}
4>

Eshell V5.9  (abort with ^G)
1> T={a,b,c,d}.
{a,b,c,d}
2> setelement(1,T,hello).
{hello,b,c,d}
3> setelement(2,T,kk).
{a,kk,c,d}
在array中下面这些操作其实都是在处理tuple,一起复习一下:
%% Builds a fresh internal node: a tuple of ?NODESIZE+1 elements, every one
%% of them equal to S. The last slot is (re)written with S explicitly; it is
%% the slot that ?NODEPATTERN(S) reads the subtree size from.
-define(NEW_NODE(S),  % beware of argument duplication!
     setelement((?NODESIZE+1),erlang:make_tuple((?NODESIZE+1),(S)),(S))).
    
%% Builds a fresh leaf: a tuple of ?LEAFSIZE elements, all set to the
%% default value D.
-define(NEW_LEAF(D), erlang:make_tuple(?LEAFSIZE,(D))).


%% Returns `true' when the argument is an #array{} record whose `size' and
%% `max' fields are both integers, and `false' for any other term.
%% Only the record shell is inspected; the element tree is not validated.
is_array(#array{size = Size, max = Max})
  when is_integer(Size), is_integer(Max) ->
    true;
is_array(_) ->
    false.

%% Returns the `size' field of the array.
%% Raises `badarg' if the argument is not an #array{} record.
size(#array{size = N}) -> N;
size(_) -> erlang:error(badarg).


%% Returns the default value stored in the array record.
%% Raises `badarg' if the argument is not an #array{} record.
default(#array{default = D}) -> D;
default(_) -> erlang:error(badarg).

%% Returns `true' when the array is fixed-size, which is encoded as
%% `max = 0', and `false' for every other #array{} record.
%% There is no clause for non-array arguments.
is_fix(#array{max = 0}) -> true;
is_fix(#array{}) -> false.

%% Fixes the size of the array by setting `max' to 0. All other fields,
%% including the element tree, are left untouched.
fix(#array{}=A) ->
    A#array{max = 0}.

%% Makes the array extendible again by recomputing `max' from the current
%% size: the capacity is searched upwards from ?LEAFSIZE until it exceeds
%% the highest index in use (N-1).
relax(#array{size = N}=A) ->
    A#array{max = find_max(N-1, ?LEAFSIZE)}.   %% LEAFSIZE = 10
 

%% Returns the first value in the sequence M, ?extend(M), ?extend(?extend(M)),
%% ... that is strictly greater than index I.
%% NOTE(review): ?extend is defined outside this excerpt; the shell output in
%% this article (10 -> 100 -> 1000 -> 10000) suggests it multiplies by the
%% node size -- confirm against array.erl.
find_max(I, M) when I >= M ->
    find_max(I, ?extend(M));
find_max(_I, M) ->
    M.
 
    elements的构成是tuple tree,每LEAFSIZE(默认基数是10)个元素一组,如果对应数据位上的元素有值,那么该组的数据会展开,如果整组数据都没有赋值,那么该组数据将以LEAFSIZE占位即10.即使赋值的时候使用的是默认值,对应数据组依然会展开;内部节点的最后一个元素缓存了每一个子树可能存储的子元素个数.
    从一个set方法就可以了解array中element的数据结构在运行时是如何使用的.这里会有两个概念,第一个是"节点的展开",在节点没有展开的时候是一个数字占位,当对节点内数据位进行赋值就需要展开节点即变成一个tuple.另外一个概念就是array的扩容,在array没有固定大小的情况下,如果赋值的数据位置超过了当前array的最大值就会进行扩容.
%% Stores Value at index I and returns the updated array.
%% I must be a non-negative integer and the third argument an #array{}
%% record; otherwise `badarg' is raised. `badarg' is also raised when the
%% array is fixed-size (max == 0) and I is not below the current size.
set(I, Value, #array{size = N, max = M, default = D, elements = E}=A)
  when is_integer(I), I >= 0 ->
    if I < N ->  %% index is below the current size: update in place
               A#array{elements = set_1(I, E, Value, D)};
       I < M ->  %% beyond the current size but still within the tree's capacity
               %% (note that this cannot happen if M == 0, since I >= 0)
               A#array{size = I+1,elements = set_1(I, E, Value, D)};
       M > 0 ->  %% not fixed-size: grow the tree first, then store
             {E1, M1} = grow(I, E, M),
             A#array{size = I+1, max = M1,elements = set_1(I, E1, Value, D)};
       true ->
              %% fixed-size array and I is out of range
              erlang:error(badarg)
    end;
set(_I, _V, _A) ->
    erlang:error(badarg).

%% See get_1/3 for details about switching and the NODEPATTERN macro.

%% Writes X at relative index I inside subtree E and returns the new subtree.
%% Clause 1: E is an internal node whose last slot holds the child size S;
%%           descend into child `I div S + 1' with the remainder index.
%% Clause 2: E is an integer, i.e. an unexpanded placeholder; expand it.
%% Clause 3: anything else is treated as a leaf tuple; overwrite slot I+1.
%% The clause order is deliberate; see the NODEPATTERN comment on get_1/3.
set_1(I, E=?NODEPATTERN(S), X, D) ->
    I1 = I div S + 1,
    setelement(I1, E, set_1(I rem S, element(I1, E), X, D));
set_1(I, E, X, D) when is_integer(E) ->
    expand(I, E, X, D);
set_1(I, E, X, _D) ->
    setelement(I+1, E, X).


%% Enlarging the array upwards to accommodate an index `I'

%% Returns {NewElements, NewMax} such that index I fits below NewMax.
%% When E is an integer the whole tree is still an unexpanded placeholder,
%% so only a larger placeholder is needed: the new capacity M1 is used both
%% as the element term and as the new max. Otherwise the existing tuple
%% tree is wrapped in new parent nodes by grow_1/3.
grow(I, E, _M) when is_integer(E) ->
    M1 = find_max(I, E),
    {M1, M1};
grow(I, E, M) ->
    grow_1(I, E, M).

%% While index I does not fit below capacity M, builds a new node whose
%% slots are placeholders of size M, installs the current tree E as its
%% first child, and retries with the extended capacity. Returns {E, M}
%% unchanged once I < M.
grow_1(I, E, M) when I >= M ->
    grow(I, setelement(1, ?NEW_NODE(M), E), ?extend(M));
grow_1(_I, E, M) ->
    {E, M}.


%% Insert an element in an unexpanded node, expanding it as necessary.

%% Replaces an unexpanded placeholder of size S with a real subtree that
%% holds X at relative index I.
%% Clause 1: S is larger than a leaf, so a new internal node with child size
%%           S1 = ?reduce(S) is created and only the child on the path to I
%%           is expanded recursively; its siblings stay as placeholders.
%% Clause 2: leaf level; a new leaf filled with the default D is created and
%%           slot I+1 is set to X.
%% NOTE(review): ?reduce is defined outside this excerpt; presumably the
%% inverse of ?extend -- confirm against array.erl.
expand(I, S, X, D) when S > ?LEAFSIZE ->
    S1 = ?reduce(S),
    setelement(I div S1 + 1, ?NEW_NODE(S1),
            expand(I rem S1, S1, X, D));
expand(I, _S, X, D) ->
    setelement(I+1, ?NEW_LEAF(D), X).
   对于size在array中是一个比较弱化的概念,size变化规则:元素位置参数i是非负整数,如果array固定了size(执行了fix函数)那么如果i大于最大索引(即i>size(Array)-1)就会报错;对于没有固定大小的array,如果i>size(Array)-1,Array的Size会扩展到i+1;
Eshell V5.9  (abort with ^G)
1>  A1 = array:set(17, true, array:new()).
{array,18,100,undefined,
       {10,
        {undefined,undefined,undefined,undefined,undefined,
                   undefined,undefined,true,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
   对于数据量较大的情况,提高速度的关键是减少测试(条件判断)的次数,尽量让最有可能的路径尽可能短. 特别要注意的是对于较大的树,访问到一个叶子节点的概率比访问到一个内部节点的概率要小很多.如果要微调 set_1 get_1一定要反复测试,参数的顺序都会影响效率.get方法使用了稀疏遍历的方式尽可能快的定位到要访问的元素.
  
%% -define(NODEPATTERN(S), {_,_,_,_,_,_,_,_,_,_,S}). % NODESIZE+1 elements!

%% Returns the value stored at index I.
%% I must be a non-negative integer and the second argument an #array{}
%% record; otherwise `badarg' is raised.
get(I, #array{size = N, max = M, elements = E, default = D})
  when is_integer(I), I >= 0 ->
    if I < N ->
         %% within the current size: look the entry up in the tree
         get_1(I, E, D);
       M > 0 ->
         %% past the end of an extendible array: answer with the default
         D;
       true ->
         %% past the end of a fixed-size array (max == 0)
         erlang:error(badarg)
    end;
get(_I, _A) ->
    erlang:error(badarg).

%% The use of NODEPATTERN(S) to select the right clause is just a hack,
%% but it is the only way to get the maximum speed out of this loop
%% (using the Beam compiler in OTP 11).

%% Looks up relative index I inside subtree E.
%% Clause 1: E is an internal node whose last slot holds the child size S;
%%           descend into child `I div S + 1' with the remainder index.
%% Clause 2: E is an integer, i.e. an unexpanded placeholder, so nothing was
%%           ever stored below it; return the default D.
%% Clause 3: anything else is treated as a leaf tuple; return slot I+1.
get_1(I, E=?NODEPATTERN(S), D) ->
    get_1(I rem S, element(I div S + 1, E), D);
get_1(_I, E, D) when is_integer(E) ->
    D;
get_1(I, E, _D) ->
    element(I+1, E).
  写个例子看看 NODEPATTERN(S):
20> A = array:from_list([1,2,3,4,5,a,b,c,d,e,f,g]).
{array,12,100,undefined,
       {{1,2,3,4,5,a,b,c,d,e},
        {f,g,undefined,undefined,undefined,undefined,undefined,
           undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}

23> {_,_,_,_,E} =A.
{array,12,100,undefined,
       {{1,2,3,4,5,a,b,c,d,e},
        {f,g,undefined,undefined,undefined,undefined,undefined,
           undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
24>  {_,_,_,_,_,_,_,_,_,_,S} = E .
{{1,2,3,4,5,a,b,c,d,e},
{f,g,undefined,undefined,undefined,undefined,undefined,
    undefined,undefined,undefined},
10,10,10,10,10,10,10,10,10}
25> S.
10
  array有两种遍历方式,一种是逐一遍历比如方法foldl foldr map,另外一种就是稀疏遍历比如sparse_foldl sparse_foldr sparse_map sparse_size sparse_to_list sparse_to_orddict
1> A= array:set(17, true, array:new()).
{array,18,100,undefined,
       {10,
        {undefined,undefined,undefined,undefined,undefined,
                   undefined,undefined,true,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
2> A2= array:set(30, ki, A).
{array,31,100,undefined,
       {10,
        {undefined,undefined,undefined,undefined,undefined,
                   undefined,undefined,true,undefined,undefined},
        10,
        {ki,undefined,undefined,undefined,undefined,undefined,
            undefined,undefined,undefined,undefined},
        10,10,10,10,10,10,10}}
3> array:sparse_to_list(A2).
[true,ki]
4>
 

数据结构转换

  上面也提到了array可以在List和orddict之间进行转换,稀疏方式转换会忽略掉默认值.所以如果使用稀疏方式反复进行转换,得到的array内部结构可能不同.看一下下面v(7) v(8)的值.
from_list(List::list()) -> array()
from_list(List::list(), Default::term()) -> array()
to_list(Array::array()) -> list()
sparse_to_list(Array::array()) -> list()  
Eshell V5.9  (abort with ^G)
1> 
1> array:from_list(lists:seq(1,12)).
{array,12,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,undefined,undefined,undefined,undefined,undefined,
         undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
2>  array:to_list(v(1)).
[1,2,3,4,5,6,7,8,9,10,11,12]
3>  array:sparse_to_list(v(1)).
[1,2,3,4,5,6,7,8,9,10,11,12]
4> array:set(17,kk,v(1)).
{array,18,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,undefined,undefined,undefined,undefined,undefined,kk,
         undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
5> array:to_list(v(4)).
[1,2,3,4,5,6,7,8,9,10,11,12,undefined,undefined,undefined,
undefined,undefined,kk]
6>  array:sparse_to_list(v(4)).
[1,2,3,4,5,6,7,8,9,10,11,12,kk]
7> array:from_list(v(6)).
{array,13,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,kk,undefined,undefined,undefined,undefined,undefined,
         undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
8> array:from_list(v(5)).
{array,18,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,undefined,undefined,undefined,undefined,undefined,kk,
         undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
 
 orddict和array之间的转换类似,只不过由于orddict已经包含了数据索引的位置,所以反复转换依然可以重建成为一致的数据结构.
 
 Eshell V5.9  (abort with ^G)
1> array:from_list(lists:seq(1,12)).
{array,12,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,undefined,undefined,undefined,undefined,undefined,
         undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
2>  array:set(17,kk,v(1)).
{array,18,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,undefined,undefined,undefined,undefined,undefined,kk,
         undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
3> array:to_orddict(v(1)).
[{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,8},{8,9},{9,10},{10,11},{11,12}]
4> array:to_orddict(v(2)).
[{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,8},{8,9},{9,10},{10,11},{11,12},{12,undefined},{13,undefined},{14,undefined},{15,undefined},{16,undefined},{17,kk}]
5> array:sparse_to_orddict(v(2)).
[{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,8},{8,9},{9,10},{10,11},{11,12},{17,kk}]
6> array:from_orddict(v(4)).
{array,18,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,undefined,undefined,undefined,undefined,undefined,kk,
         undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
7> array:from_orddict(v(5)).
{array,18,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,undefined,undefined,undefined,undefined,undefined,kk,
         undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
8> array:from_orddict(v(3)).
{array,12,100,undefined,
       {{1,2,3,4,5,6,7,8,9,10},
        {11,12,undefined,undefined,undefined,undefined,undefined,
         undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10}}
9> 

 

Code snippet

 
代码有两处细节平时很少这样写:
Eshell V5.9  (abort with ^G)
1> F=fun(M,N)->{M,N} end.
#Fun<erl_eval.12.111823515>
2> F2=fun(K)->F(K,if K>0->K;true -> 100 end) end.
#Fun<erl_eval.6.111823515>
3> F2(10).
{10,10}
4> F2(-1).
{-1,100}
5>

5> F=fun(E={_,_,A})-> {A} end .  
#Fun<erl_eval.6.111823515>
7> F({1,2,a}).
{a}
8> F2=fun({_,_,A}=E)-> {A} end .
#Fun<erl_eval.6.111823515>
9> F2({1,2,a}).
{a}

 

Ya解决方案

2012-07-11 15:40 更新

https://github.com/baryluk/ral

% Random access list. % Random access lists (ral) is a functional (immutable) data % structure which are better (in speed) both from normal lists, % and from balanced binary search trees. % Time complexity of various basic operations % % Operation RAL List BST % head O(1) O(1) O(log n) % tail O(1) O(1) O(log n) % cons O(1) O(1) O(log n) % nth O(log n) O(n) O(log n) % nthtail O(log n) O(n) O(log n) % % in some situations nth or nthtail can be faster in RAL. O(log n) is worst case complexity. % for some values it is faster.

LYSE上一段很好的总结:

Erlang arrays, at the opposite of their imperative counterparts, are not able to have such things as constant-time insertion or lookup. Because they're usually slower than those in languages which support destructive assignment and that the style of programming done with Erlang doesn't necessary lend itself too well to arrays and matrices, they are rarely used in practice.

Generally, Erlang programmers who need to do matrix manipulations and other uses requiring arrays tend to use concepts called Ports to let other languages do the heavy lifting, or C-Nodes, Linked in drivers and NIFs (Experimental, R13B03+).

Arrays are also weird in the sense that they're one of the few data structures to be 0-indexed (at the opposite of tuples or lists), along with indexing in the regular expressions module. Be careful with them.

Link: http://learnyousomeerlang.com/a-short-visit-to-common-data-structures#arrays

 

 

 

 

 

 

 

 

 

 

 

 

 

 

                                                        晚安!