栈和寄存器虚拟机比较(以python和lua为例)

指令长度

python

python的指令定长,长度为16bit,其中8bit操作码,8bit操作数。

///@file: Python-3.6.0\Include\code.h
/* One bytecode unit: a 16-bit word that packs an 8-bit opcode and an
 * 8-bit argument. */
typedef uint16_t _Py_CODEUNIT;

/*
 * Split a code unit into opcode and argument.
 *
 * Which half of the word holds the opcode depends on byte order: the high
 * byte on big-endian builds, the low byte on little-endian builds. In both
 * cases that is the byte stored at the lower memory address, with the
 * argument in the byte after it.
 *
 * The ">> 8" form is not masked, so it yields an 8-bit value only when
 * 'word' is no wider than 16 bits (i.e. a _Py_CODEUNIT).
 */
#ifdef WORDS_BIGENDIAN
#  define _Py_OPCODE(word) ((word) >> 8)
#  define _Py_OPARG(word) ((word) & 255)
#else
#  define _Py_OPCODE(word) ((word) & 255)
#  define _Py_OPARG(word) ((word) >> 8)
#endif

///@file: Python-3.6.0\Python\ceval.c
PyObject *
_PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
{
///...
#define NEXTOPARG()  do { \
        _Py_CODEUNIT word = *next_instr; \
        opcode = _Py_OPCODE(word); \
        oparg = _Py_OPARG(word); \
        next_instr++; \
    } while (0)
}

lua

lua一条指令的长度为一个int,也就是32bit,其中操作码为6bit,剩余的操作数有26bit。由于操作数中有一些用来编码寄存器的编号,所以单条指令比python的指令要长(32bit vs 16 bits)。

///@file: lua-5.3.4\src\lopcodes.h
/*===========================================================================
  We assume that instructions are unsigned numbers.
  All instructions have an opcode in the first 6 bits.
  Instructions can have the following fields:
	'A' : 8 bits
	'B' : 9 bits
	'C' : 9 bits
	'Ax' : 26 bits ('A', 'B', and 'C' together)
	'Bx' : 18 bits ('B' and 'C' together)
	'sBx' : signed Bx

  A signed argument is represented in excess K; that is, the number
  value is the unsigned value minus K. K is exactly the maximum value
  for that argument (so that -max is represented by 0, and +max is
  represented by 2*max), which is half the maximum for the corresponding
  unsigned argument.
===========================================================================*/


enum OpMode {iABC, iABx, iAsBx, iAx};  /* basic instruction format */

add为例

add是我们最为熟悉的基础操作,可能大家上学最开始学习的就是加法,甚至不需要上学大家都需要使用加法。

python

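python的加法默认就是取栈顶最顶端的两个元素,求和之后再次压入堆栈顶端。(注意:下面摘录的是结构相同的BINARY_AND(按位与)指令的实现;BINARY_ADD同样是先POP出右操作数、用TOP取得左操作数,计算后再用SET_TOP把结果写回栈顶,只是计算部分调用的是PyNumber_Add。)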

        TARGET(BINARY_AND) {
            PyObject *right = POP();
            PyObject *left = TOP();
            PyObject *res = PyNumber_And(left, right);
            Py_DECREF(left);
            Py_DECREF(right);
            SET_TOP(res);
            if (res == NULL)
                goto error;
            DISPATCH();
        }

lua

在lua的操作中,加法的a = b + c,三个操作数都是由指令中的寄存器指定,而不是跟栈顶位置强绑定。

      vmcase(OP_ADD) {
        TValue *rb = RKB(i);
        TValue *rc = RKC(i);
        lua_Number nb; lua_Number nc;
        if (ttisinteger(rb) && ttisinteger(rc)) {
          lua_Integer ib = ivalue(rb); lua_Integer ic = ivalue(rc);
          setivalue(ra, intop(+, ib, ic));
        }
        else if (tonumber(rb, &nb) && tonumber(rc, &nc)) {
          setfltvalue(ra, luai_numadd(L, nb, nc));
        }
        else { Protect(luaT_trybinTM(L, rb, rc, ra, TM_ADD)); }
        vmbreak;
      }

指令生成

在lua的指令生成中,在解析a = b * c + d * e时,第一个子表达式b * c解析完成之后、解析d * e之前,就已经为b * c的结果分配了寄存器。这也是lua的一个重要特点:它并没有语法树,虚拟机指令是在语法解析的同时实时生成的。作为对比,如果要使用基于堆栈的虚拟机,那么在计算a的加法的时候,必须要先完成b * c和d * e两个子表达式的计算,之后才能生成两个子表达式的加法。

/*
** Process 1st operand 'v' of binary operation 'op' before reading
** 2nd operand.
**
** fs: state of the function currently being compiled.
** op: the binary operator that follows the first operand.
** v:  descriptor of the first operand; it is handed to the helper chosen
**     for 'op' and may be updated by it.
**
** Nothing is returned; the work is done through the helpers called in each
** case below.
*/
void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) {
  switch (op) {
    case OPR_AND: {
      luaK_goiftrue(fs, v);  /* go ahead only if 'v' is true */
      break;
    }
    case OPR_OR: {
      luaK_goiffalse(fs, v);  /* go ahead only if 'v' is false */
      break;
    }
    case OPR_CONCAT: {
      luaK_exp2nextreg(fs, v);  /* operand must be on the 'stack' */
      break;
    }
    /* arithmetic and bitwise operators share one treatment */
    case OPR_ADD: case OPR_SUB:
    case OPR_MUL: case OPR_DIV: case OPR_IDIV:
    case OPR_MOD: case OPR_POW:
    case OPR_BAND: case OPR_BOR: case OPR_BXOR:
    case OPR_SHL: case OPR_SHR: {
      /* only operands that tonumeral() rejects are converted here;
         NOTE(review): luaK_exp2RK presumably puts 'v' into
         register-or-constant form -- confirm in lcode.c */
      if (!tonumeral(v, NULL))
        luaK_exp2RK(fs, v);
      /* else keep numeral, which may be folded with 2nd operand */
      break;
    }
    default: {
      /* every other operator: convert unconditionally, no numeral
         exception */
      luaK_exp2RK(fs, v);
      break;
    }
  }
}

lua寄存器编号溢出

寄存器编号最短是8bits,这8bits可表示的无符号整数最大值是255(共256个取值),如果表达式太复杂,是不是会造成寄存器编号无法编码到指令中呢?

生成测试代码

tsecer@harry: cat genloop.py 
import sys
sys.setrecursionlimit(1500)

def pexp(level):
    # Recursively print `level` nested "( a + ... )" groups on a single line;
    # end='' suppresses the newline after every fragment.
    # At level 0 nothing is printed, so the innermost group has no right-hand
    # operand ("( a + )").
    if level > 0:
        print('( a + ', end='')
        pexp(level - 1)
        print(')', end='')
print('a = 1')
pexp(1000)
tsecer@harry: python3 genloop.py  > reg.overflow.lua

生成的lua代码大致这样

a = 1
( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( 
a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a 
+ ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a + ( a

嵌套层数先溢出

执行之后发现还没有到寄存器分配就已经报错,而此时freereg只用到了99个。

tsecer@harry: src/luac -l reg.overflow.lua 
src/luac: reg.overflow.lua:2: too many C levels (limit is 200) in main function near 'a'
tsecer@harry: 
(gdb) bt
#0  errorlimit (fs=0x7fffffffbab0, limit=200, what=0x4268ae "C levels") at lparser.c:80
#1  0x00000000004101c5 in checklimit (fs=0x7fffffffbab0, v=201, l=200, what=0x4268ae "C levels") at lparser.c:93
#2  0x0000000000410cff in enterlevel (ls=0x7fffffffbaf0) at lparser.c:334
#3  0x0000000000412ab1 in subexpr (ls=0x7fffffffbaf0, v=0x7fffffff1320, limit=0) at lparser.c:1051
#4  0x0000000000412c15 in expr (ls=0x7fffffffbaf0, v=0x7fffffff1320) at lparser.c:1079
#5  0x00000000004124ac in primaryexp (ls=0x7fffffffbaf0, v=0x7fffffff1320) at lparser.c:877
#6  0x0000000000412541 in suffixedexp (ls=0x7fffffffbaf0, v=0x7fffffff1320) at lparser.c:898
#7  0x0000000000412871 in simpleexp (ls=0x7fffffffbaf0, v=0x7fffffff1320) at lparser.c:978
#8  0x0000000000412b25 in subexpr (ls=0x7fffffffbaf0, v=0x7fffffff1320, limit=10) at lparser.c:1059
#9  0x0000000000412b88 in subexpr (ls=0x7fffffffbaf0, v=0x7fffffff14d0, limit=0) at lparser.c:1069
(gdb) p fs->freereg
$6 = 99 'c'
(gdb) 

对应的代码

static BinOpr subexpr (LexState *ls, expdesc *v, int limit) {
  BinOpr op;
  UnOpr uop;
  enterlevel(ls);
///...
}

/*
** Record entry into one more nesting level: increment the 'nCcalls'
** counter kept in the lua_State and hand the new value to checklimit()
** together with the bound LUAI_MAXCCALLS and the label "C levels".
** NOTE(review): checklimit() is presumably what raises the error when the
** bound is exceeded -- confirm in lparser.c.
*/
static void enterlevel (LexState *ls) {
  lua_State *L = ls->L;
  ++L->nCcalls;  /* counter lives in the lua_State, not in the LexState */
  checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels");
}

/*
** maximum depth for nested C calls and syntactical nested non-terminals
** in a program. (Value must fit in an unsigned short int.)
*/
#if !defined(LUAI_MAXCCALLS)
#define LUAI_MAXCCALLS		200
#endif

修改LUAI_MAXCCALLS宏

如果把LUAI_MAXCCALLS这个值修改的足够大呢,例如1024?可以看到,lua在生成代码(分配寄存器)的时候依然有检测,会提前报错结束(而不是生成错误的字节码):

/* Maximum number of registers in a Lua function (must fit in 8 bits) */
#define MAXREGS		255
/*
** Check register-stack level, keeping track of its maximum size
** in field 'maxstacksize'
**
** fs: state of the function being compiled; 'fs->freereg' is read and
**     'fs->f->maxstacksize' may be updated.
** n:  value added to 'fs->freereg' to obtain the level being checked.
**
** If the resulting level reaches MAXREGS, luaX_syntaxerror() is called with
** "function or expression needs too many registers".
*/
void luaK_checkstack (FuncState *fs, int n) {
  int newstack = fs->freereg + n;  /* level after adding 'n' */
  if (newstack > fs->f->maxstacksize) {  /* new high-water mark? */
    /* '>=' (not '>'): a level equal to MAXREGS is already rejected */
    if (newstack >= MAXREGS)
      luaX_syntaxerror(fs->ls,
        "function or expression needs too many registers");
    /* NOTE(review): this line is outside the 'if' above, so it is skipped
       on error only if luaX_syntaxerror does not return -- confirm */
    fs->f->maxstacksize = cast_byte(newstack);
  }
}

通过gdb看调用链如下所示

Breakpoint 2, luaK_checkstack (fs=0x7fffffffbab0, n=1) at lcode.c:365
365           luaX_syntaxerror(fs->ls,
(gdb) p fs->freereg 
$2 = 254 '\376'
(gdb) bt
#0  luaK_checkstack (fs=0x7fffffffbab0, n=1) at lcode.c:365
#1  0x0000000000420e55 in luaK_reserveregs (fs=0x7fffffffbab0, n=1) at lcode.c:376
#2  0x0000000000421a4f in luaK_exp2nextreg (fs=0x7fffffffbab0, e=0x7ffffffe0d90) at lcode.c:706
#3  0x0000000000421b08 in luaK_exp2anyreg (fs=0x7fffffffbab0, e=0x7ffffffe0d90) at lcode.c:725
#4  0x0000000000421cb2 in luaK_exp2RK (fs=0x7fffffffbab0, e=0x7ffffffe0d90) at lcode.c:775
#5  0x0000000000422800 in luaK_infix (fs=0x7fffffffbab0, op=OPR_ADD, v=0x7ffffffe0d90) at lcode.c:1106
#6  0x0000000000412bb9 in subexpr (ls=0x7fffffffbaf0, v=0x7ffffffe0d90, limit=0) at lparser.c:1067
#7  0x0000000000412c67 in expr (ls=0x7fffffffbaf0, v=0x7ffffffe0d90) at lparser.c:1079
#8  0x00000000004124fc in primaryexp (ls=0x7fffffffbaf0, v=0x7ffffffe0d90) at lparser.c:877
#9  0x0000000000412591 in suffixedexp (ls=0x7fffffffbaf0, v=0x7ffffffe0d90) at lparser.c:898
#10 0x00000000004128c3 in simpleexp (ls=0x7fffffffbaf0, v=0x7ffffffe0d90) at lparser.c:978
#11 0x0000000000412b77 in subexpr (ls=0x7fffffffbaf0, v=0x7ffffffe0d90, limit=10) at lparser.c:1059
#12 0x0000000000412bda in subexpr (ls=0x7fffffffbaf0, v=0x7ffffffe0f40, limit=0) at lparser.c:1069

作者的说明

在lua作者的说明文档中,有大量的篇幅(第7部分"The Virtual Machine")说明了lua在实现时在基于stack和register之间的考虑。在5.0之前,lua一直使用的是基于堆栈的虚拟机,从2003年发布的5.0版本开始,lua改为使用基于寄存器的虚拟机。
作者提到两个优点:

  • 避免push/pop操作
    使用寄存器避免了代价很高的push/pop操作,因为操作数必须在栈顶,所以在执行动作之前必须要先push到堆栈上,反之在动作完成之后从堆栈上pop掉。考虑到这些push和pop可能会涉及到数值拷贝,或者操作数引用计数的增加,这个代价比较高。
  • 实时生成代码
    Some authors also defend register-based virtual machines based on their suitability for on-the-fly compilation.

指令指针(pc)递增的时机

在lua和python中都有一个有意思的现象,就是先取出指令内容,然后递增pc指针,然后再解码并执行指令(注意:不是在指令解码并执行之后才递增pc)。这样的好处个人理解是对于call这种指令,在解码执行前就递增的话,当执行call的时候,自动保存的返回地址就是下一条指令,这样实现call指令更简单一些。
这一点应该和硬件CPU的处理方法相同,这也意味着,当在gdb中设置数据断点时,命中之后pc指针指向的是触发该断点指令的下一条指令。

lua

lua的解码执行

/* fetch an instruction and prepare its execution */
#define vmfetch()	{ \
  i = *(ci->u.l.savedpc++); 
  
  /* main loop of interpreter */
  for (;;) {
    Instruction i;
    StkId ra;
    vmfetch();

python

python的解码执行

#define NEXTOPARG()  do { \
        _Py_CODEUNIT word = *next_instr; \
        opcode = _Py_OPCODE(word); \
        oparg = _Py_OPARG(word); \
        next_instr++; \
    } while (0)
    
    
for (;;) {
        NEXTOPARG();
    dispatch_opcode:
#ifdef DYNAMIC_EXECUTION_PROFILE
#ifdef DXPAIRS
        dxpairs[lastopcode][opcode]++;
        lastopcode = opcode;
#endif
        dxp[opcode]++;
#endif

#ifdef LLTRACE
        /* Instruction tracing */

        if (lltrace) {
            if (HAS_ARG(opcode)) {
                printf("%d: %d, %d\n",
                       f->f_lasti, opcode, oparg);
            }
            else {
                printf("%d: %d\n",
                       f->f_lasti, opcode);
            }
        }
#endif

        switch (opcode) {

        /* BEWARE!
           It is essential that any operation that fails sets either
           x to NULL, err to nonzero, or why to anything but WHY_NOT,
           and that no operation that succeeds does this! */

        TARGET(NOP)
            FAST_DISPATCH();

文档资料

指令周期这个文档还可以看到提前递增pc的重要原因:现在的cpu都是流水线的,在取指令之后立即递增pc,这样在当前指令解码执行的时候,取指令的逻辑单元就可以继续取下一条指令,从而形成流水线。

In simpler CPUs, the instruction cycle is executed sequentially, each instruction being processed before the next one is started. In most modern CPUs, the instruction cycles are instead executed concurrently, and often in parallel, through an instruction pipeline: the next instruction starts being processed before the previous instruction has finished, which is possible because the cycle is broken up into separate steps.[1]

posted on 2023-01-12 20:14  tsecer  阅读(180)  评论(0编辑  收藏  举报

导航