自己写个JavaScript parser (分析器)系列 (2)

注:参考自http://dukeland.hk,本博客系列内容为自己解读的成果,以备将来自己回顾使用。所有版权归原作者所有,如有任何问题,请联系原作者

 

考虑到“字”分析比较简单,所以和“词”分析写到一起,形成这篇文章:

1, reader.js

// The character reader exists to serve the scanner, so it does three things:
// 1. stores the data; 2. stores its own position; 3. exposes functions to read
// the next character and to step back over characters already read.
/**
 * Character-level reader over an in-memory source string.
 *
 * @param {string} str - the source text to read from
 */
function Reader(str){
    this.data = str;
    this.currPos = 0;
    this.dataLength = str.length;
}

/**
 * Reads the next character and advances the cursor.
 *
 * The cursor advances on EVERY call, including reads past the end of the
 * data, so that each read can be undone by exactly one retract(). (Previously
 * an end-of-stream read did not advance, so a following retract() stepped back
 * over a real character and the caller saw that character twice.)
 *
 * @returns {string|number} the next character, or -1 at end of stream
 */
Reader.prototype.nextChar = function (){
    var pos = this.currPos++;
    if (pos >= this.dataLength){
        return -1; //end of stream
    }
    return this.data[pos];
};

/**
 * Steps the cursor back, which makes scanning ambiguous tokens easier
 * (read one character ahead, then give it back if it does not belong).
 *
 * @param {number} [n=1] - how many reads to undo
 */
Reader.prototype.retract = function (n){
    if (n === undefined || n === null){
        n = 1;
    }
    this.currPos -= n;
    // never move in front of the first character
    if (this.currPos < 0){
        this.currPos = 0;
    }
};

2,scanner.js

// Tokenizing exists to feed the parser, so the scanner mirrors the reader:
// 1. it keeps the data source; 2. it keeps its own state; 3. it exposes
// functions to build a token and to read the next token.
/**
 * Lexical scanner that turns the characters supplied by a reader into tokens.
 *
 * @param {Object} reader - object providing nextChar() (returns a character,
 *     or -1 at end of stream) and retract() (undoes one read)
 */
function Scanner(reader){
    this.reader = reader;
    this.currentToken = new Token(); //storing the current analysed token
    this.currLine = 0; //the line number of the current line being read
    this.state = Scanner.START_STATE;
}

// The start state is also the state the scanner rests in between tokens.
// Reading a letter moves to the identifier state. Ambiguous characters such
// as '/' get a dedicated state so the logic does not become tangled.
Scanner.START_STATE = 1;
Scanner.IDENTIFIER_STATE = Scanner.START_STATE + 1;
Scanner.SLASH_STATE = Scanner.IDENTIFIER_STATE + 1;

// Characters that always form a token on their own, mapped to the NAME of the
// constant in Token.tokens. Names are resolved when scanning, not when this
// file loads, so this file does not require Token to be defined first.
Scanner._SINGLE_CHAR_TOKENS = {
    ":": "COLON_TOKEN",
    ";": "SEMICOLON_TOKEN",
    "(": "LEFTPAREN_TOKEN",
    ")": "RIGHTPAREN_TOKEN",
    "{": "LEFTBRACE_TOKEN",
    "}": "RIGHTBRACE_TOKEN",
    "%": "MOD_TOKEN",
    "*": "MULT_TOKEN"
};

// Ambiguous operators: the token produced when the character stands alone and
// the tokens produced when it is followed by a specific second character.
Scanner._OPERATOR_TOKENS = {
    "!": { alone: "NOT_TOKEN", followedBy: { "=": "NOTEQUAL_TOKEN" } },
    "+": { alone: "PLUS_TOKEN", followedBy: { "=": "PLUSASSIGN_TOKEN", "+": "PLUSPLUS_TOKEN" } },
    "-": { alone: "MINUS_TOKEN", followedBy: { "=": "MINUSASSIGN_TOKEN", "-": "MINUSMINUS_TOKEN" } },
    "=": { alone: "ASSIGN_TOKEN", followedBy: { "=": "EQUAL_TOKEN" } },
    ">": { alone: "GREATER_TOKEN", followedBy: { "=": "GREATEREQUAL_TOKEN" } },
    "<": { alone: "LESS_TOKEN", followedBy: { "=": "LESSEQUAL_TOKEN" } }
};

// True when key is a character that is an own property of table.
// The typeof check rejects the -1 end-of-stream marker.
Scanner._has = function (table, key){
    return typeof key === "string" && Object.prototype.hasOwnProperty.call(table, key);
};

// True when c is an ASCII letter (false for the -1 end-of-stream marker).
Scanner._isLetter = function (c){
    return typeof c === "string" && ((c >= "a" && c <= "z") || (c >= "A" && c <= "Z"));
};

// True when c is an ASCII digit (false for the -1 end-of-stream marker).
Scanner._isDigit = function (c){
    return typeof c === "string" && c >= "0" && c <= "9";
};

/**
 * Records a token in this.currentToken. A token stores two things: its type
 * and its text.
 *
 * @param {number} type - one of the Token.tokens constants
 * @param {string} [text] - the source text of the token, when it matters
 * @returns {number} the type that was passed in
 */
Scanner.prototype.makeToken = function (type, text){
    this.currentToken.type = type;
    this.currentToken.text = text;
    return type;
};

// Gives back a look-ahead character. The end-of-stream marker is never given
// back: retracting after it would step back over a real character and make
// the scanner emit the previous token again.
Scanner.prototype._unread = function (c){
    if (c !== -1){
        this.reader.retract();
    }
};

// Advances currLine when c ends a line. "\r\n" counts once: a "\r" directly
// followed by "\n" is skipped here and the "\n" is counted when it is read.
Scanner.prototype._countLineBreak = function (c){
    if (c === "\n"){
        this.currLine++;
    }else if (c === "\r"){
        var next = this.reader.nextChar();
        this._unread(next);
        if (next !== "\n"){
            this.currLine++;
        }
    }
};

// Reads the remaining digits of an integer literal whose first digit was
// already consumed.
Scanner.prototype._scanNumber = function (firstDigit){
    var text = firstDigit;
    var d = this.reader.nextChar();
    while (Scanner._isDigit(d)){
        text += d;
        d = this.reader.nextChar();
    }
    this._unread(d);
    return this.makeToken(Token.tokens.INTLITERAL_TOKEN, text);
};

// Turns a completed run of letters into a keyword, boolean literal or
// identifier token.
Scanner.prototype._makeWordToken = function (word){
    switch (word){
        case "var":
            return this.makeToken(Token.tokens.VAR_TOKEN);
        case "true": case "false": case "TRUE": case "FALSE":
            return this.makeToken(Token.tokens.BOOLLITERAL_TOKEN, word);
        case "if":
            return this.makeToken(Token.tokens.IF_TOKEN);
        case "else":
            return this.makeToken(Token.tokens.ELSE_TOKEN);
        case "while":
            return this.makeToken(Token.tokens.WHILE_TOKEN);
        case "print":
            return this.makeToken(Token.tokens.PRINT_TOKEN);
        default:
            return this.makeToken(Token.tokens.IDENTIFIER_TOKEN, word);
    }
};

// Handles a start-state character that is not a letter, digit or '/'.
// Returns true when a token was stored in this.currentToken, false when the
// character was skipped (whitespace, line break, unknown or erroneous input).
Scanner.prototype._scanSymbol = function (c){
    var tokens = Token.tokens;
    var next;
    if (c === -1){
        this.makeToken(tokens.EOS_TOKEN);
        return true;
    }
    if (Scanner._has(Scanner._SINGLE_CHAR_TOKENS, c)){
        this.makeToken(tokens[Scanner._SINGLE_CHAR_TOKENS[c]]);
        return true;
    }
    if (Scanner._has(Scanner._OPERATOR_TOKENS, c)){
        var rule = Scanner._OPERATOR_TOKENS[c];
        next = this.reader.nextChar();
        if (Scanner._has(rule.followedBy, next)){
            this.makeToken(tokens[rule.followedBy[next]]);
        }else{
            // remember to give the look-ahead back, or a character is lost
            this._unread(next);
            this.makeToken(tokens[rule.alone]);
        }
        return true;
    }
    if (c === "&" || c === "|"){
        next = this.reader.nextChar();
        if (next === c){
            this.makeToken(c === "&" ? tokens.AND_TOKEN : tokens.OR_TOKEN);
            return true;
        }
        // a lone '&' or '|' is reported and skipped; scanning continues
        this._unread(next);
        Errors.push({
            type: Errors.SYNTAX_ERROR,
            msg: "You have only one " + c,
            line: this.currLine
        });
        return false;
    }
    // everything else is ignored; line breaks are counted on the way
    this._countLineBreak(c);
    return false;
};

// Reads a line comment after "//". The terminating line break is given back
// so the start state counts it; end of stream also ends the comment.
Scanner.prototype._scanLineComment = function (){
    var text = "";
    var d = this.reader.nextChar();
    while (d !== -1 && d !== "\r" && d !== "\n"){
        text += d;
        d = this.reader.nextChar();
    }
    this._unread(d);
    return this.makeToken(Token.tokens.LINECOMMENT_TOKEN, text);
};

// Reads a block comment after "/*". An unterminated comment ends at end of
// stream and yields the text collected so far.
Scanner.prototype._scanBlockComment = function (){
    var text = "";
    while (true){
        var d = this.reader.nextChar();
        if (d === -1){
            break;
        }
        if (d === "*"){
            var e = this.reader.nextChar();
            if (e === "/"){
                break; //meet */
            }
            // Not the end: keep the '*' and re-examine e on the next pass,
            // so that "**/" closes the comment and line breaks are counted.
            this._unread(e);
            text += "*";
        }else{
            this._countLineBreak(d);
            text += d;
        }
    }
    return this.makeToken(Token.tokens.BLOCKCOMMENT_TOKEN, text);
};

// Decides what a '/' starts: a line comment, a block comment or a division.
Scanner.prototype._scanAfterSlash = function (){
    var d = this.reader.nextChar();
    if (d === "/"){
        return this._scanLineComment();
    }
    if (d === "*"){
        return this._scanBlockComment();
    }
    this._unread(d);
    return this.makeToken(Token.tokens.DIV_TOKEN);
};

/**
 * Reads the next token, much like the reader reads the next character.
 *
 * The token is stored in this.currentToken. Whitespace is skipped and line
 * breaks advance this.currLine. The scanner is always back in START_STATE
 * when this function returns.
 *
 * @returns {number} the type of the token read; Token.tokens.EOS_TOKEN once
 *     the input is exhausted
 */
Scanner.prototype.nextToken = function(){
    var bufferStr = "";
    var c;
    while (true){
        switch (this.state){
            case Scanner.START_STATE:
                c = this.reader.nextChar();
                if (Scanner._isLetter(c)){
                    // words first: remember what the token's text is
                    this.state = Scanner.IDENTIFIER_STATE;
                    bufferStr = c;
                }else if (Scanner._isDigit(c)){
                    return this._scanNumber(c);
                }else if (c === "/"){
                    this.state = Scanner.SLASH_STATE;
                }else if (this._scanSymbol(c)){
                    return this.currentToken.type;
                }
            break;
            // Once a letter has been read the scanner is in the identifier
            // state and further characters are handled here. Words are runs
            // of letters, so a space, any other character or the end of the
            // input ends the word.
            case Scanner.IDENTIFIER_STATE:
                c = this.reader.nextChar();
                if (Scanner._isLetter(c)){
                    bufferStr += c;
                }else{
                    this._unread(c);
                    this.state = Scanner.START_STATE;
                    return this._makeWordToken(bufferStr);
                }
            break;
            case Scanner.SLASH_STATE:
                this.state = Scanner.START_STATE;
                return this._scanAfterSlash();
        }
    }
};

3,注意到代码里用到了一堆Token的常量,这里直接贴下来,不做解释。常量的值采用 +1的方式很讲究,这样你随时可以添加新的常量。

token.js

//Token class

/**
 * A lexical token.
 *
 * @param {number} type - Token's type, one of the Token.tokens constants
 * @param {string} [text] - the actual text that makes this token, may be
 *     left out if it is not important
 */
function Token(type, text){
    this.type = type;
    this.text = text;
}

// Token type constants. Each value is defined as "previous + 1", so a new
// token can be inserted anywhere by re-pointing a single neighbour.
Token.tokens = {};
Token.tokens.EOS_TOKEN = 1; //end of stream
// using + 1 allows adding a new token easily later
Token.tokens.COLON_TOKEN = Token.tokens.EOS_TOKEN + 1;
Token.tokens.SEMICOLON_TOKEN = Token.tokens.COLON_TOKEN + 1;
Token.tokens.LEFTPAREN_TOKEN = Token.tokens.SEMICOLON_TOKEN + 1;
Token.tokens.RIGHTPAREN_TOKEN = Token.tokens.LEFTPAREN_TOKEN + 1;
Token.tokens.LEFTBRACE_TOKEN = Token.tokens.RIGHTPAREN_TOKEN + 1;
Token.tokens.RIGHTBRACE_TOKEN = Token.tokens.LEFTBRACE_TOKEN + 1;
Token.tokens.MOD_TOKEN = Token.tokens.RIGHTBRACE_TOKEN + 1;

// keywords, literals and identifiers
Token.tokens.VAR_TOKEN = Token.tokens.MOD_TOKEN + 1;
Token.tokens.TYPE_TOKEN = Token.tokens.VAR_TOKEN + 1;
Token.tokens.BOOLLITERAL_TOKEN = Token.tokens.TYPE_TOKEN + 1;
Token.tokens.INTLITERAL_TOKEN = Token.tokens.BOOLLITERAL_TOKEN + 1;
Token.tokens.IF_TOKEN = Token.tokens.INTLITERAL_TOKEN + 1;
Token.tokens.ELSE_TOKEN = Token.tokens.IF_TOKEN + 1;
Token.tokens.WHILE_TOKEN = Token.tokens.ELSE_TOKEN + 1;
Token.tokens.PRINT_TOKEN = Token.tokens.WHILE_TOKEN + 1;
Token.tokens.IDENTIFIER_TOKEN = Token.tokens.PRINT_TOKEN + 1;

// operators
Token.tokens.PLUS_TOKEN = Token.tokens.IDENTIFIER_TOKEN + 1;
Token.tokens.PLUSPLUS_TOKEN = Token.tokens.PLUS_TOKEN + 1;
Token.tokens.PLUSASSIGN_TOKEN = Token.tokens.PLUSPLUS_TOKEN + 1;
Token.tokens.MINUS_TOKEN = Token.tokens.PLUSASSIGN_TOKEN + 1;
Token.tokens.MINUSMINUS_TOKEN = Token.tokens.MINUS_TOKEN + 1;
Token.tokens.MINUSASSIGN_TOKEN = Token.tokens.MINUSMINUS_TOKEN + 1;
Token.tokens.MULT_TOKEN = Token.tokens.MINUSASSIGN_TOKEN + 1;
Token.tokens.DIV_TOKEN = Token.tokens.MULT_TOKEN + 1;
Token.tokens.ASSIGN_TOKEN = Token.tokens.DIV_TOKEN + 1;
Token.tokens.EQUAL_TOKEN = Token.tokens.ASSIGN_TOKEN + 1;
Token.tokens.NOTEQUAL_TOKEN = Token.tokens.EQUAL_TOKEN + 1;
Token.tokens.GREATER_TOKEN = Token.tokens.NOTEQUAL_TOKEN + 1;
Token.tokens.GREATEREQUAL_TOKEN = Token.tokens.GREATER_TOKEN + 1;
Token.tokens.LESS_TOKEN = Token.tokens.GREATEREQUAL_TOKEN + 1;
Token.tokens.LESSEQUAL_TOKEN = Token.tokens.LESS_TOKEN + 1;
Token.tokens.AND_TOKEN = Token.tokens.LESSEQUAL_TOKEN + 1;
Token.tokens.OR_TOKEN = Token.tokens.AND_TOKEN + 1;
Token.tokens.NOT_TOKEN = Token.tokens.OR_TOKEN + 1;

// comments
Token.tokens.LINECOMMENT_TOKEN = Token.tokens.NOT_TOKEN + 1;
Token.tokens.BLOCKCOMMENT_TOKEN = Token.tokens.LINECOMMENT_TOKEN + 1;


// Inverse look-up: token type number -> constant name.
// Built with Object.keys so that only own properties are visited and no loop
// variable is left behind in the global scope.
Token.backwardMap = {};
Object.keys(Token.tokens).forEach(function (name){
    Token.backwardMap[Token.tokens[name]] = name;
});
posted @ 2012-09-11 11:00  yunfan85  阅读(524)  评论(0)  编辑  收藏  举报