自己写个JavaScript parser （分析器）系列（2）

注：参考自http://dukeland.hk，本博客系列内容为自己解读的成果，以备将来自己回顾使用。所有版权归原作者所有，如有任何问题，请联系原作者。

考虑到“字”分析比较简单，所以和“词”分析写到一起，形成这篇文章：

1, reader.js

//Character reader used by the scanner. It does three things: holds the
//source text, tracks its own read position, and (through its prototype
//methods) lets callers read the next character or step back.
//  str: the source text to read; its length is captured once, here.
function Reader(str){
    var totalLength = str.length;
    this.data = str;               //the text being read
    this.currPos = 0;              //index of the next character to hand out
    this.dataLength = totalLength; //cached length of the text
}

//Return the character at the read position and advance, or -1 once every
//character has been consumed.
//The position advances even when -1 is returned, so that a following
//retract() undoes exactly this read. Previously the -1 read left the
//position untouched, so "peek, then retract" at end of input stepped back
//onto the last real character and made the caller read it a second time.
Reader.prototype.nextChar = function (){
    var pos = this.currPos++;
    if (pos >= this.dataLength){
        return -1; //end of stream
    }
    return this.data[pos];
};

//Step the read position back; this is what lets the scanner peek ahead when
//a character could begin more than one kind of token.
//  n: how many positions to step back; 1 when omitted (undefined or null).
//The position never goes below 0.
Reader.prototype.retract = function (n){
    var steps = (n == null) ? 1 : n;
    var target = this.currPos - steps;
    this.currPos = (target < 0) ? 0 : target;
};

2，scanner.js

//Token scanner. Like the character reader it holds its input, tracks its own
//state, and (through its prototype methods) lets callers build a token and
//read the next one.
//  reader: the character source; nextToken calls its nextChar() and retract().
function Scanner(reader){
    var blankToken = new Token();
    var initialState = Scanner.START_STATE;
    this.reader = reader;
    this.currentToken = blankToken; //the most recently analysed token
    this.currLine = 0;              //line number of the line being read
    this.state = initialState;      //current state of the scanning state machine
}

//Scanner states, numbered consecutively from 1 in the order listed.
//START_STATE is both where scanning begins and where it returns after each
//token; a letter moves the scanner into IDENTIFIER_STATE. An ambiguous
//character such as '/' gets a dedicated state (SLASH_STATE) so the logic
//for it stays separate and easy to follow.
["START_STATE", "IDENTIFIER_STATE", "SLASH_STATE"].forEach(function (stateName, index){
    Scanner[stateName] = index + 1;
});

//Record a token by overwriting the scanner's current token in place.
//  type: the token's type code
//  text: the token's text; left undefined when the caller omits it
//Returns the type that was passed in.
Scanner.prototype.makeToken = function (type, text){
    var token = this.currentToken;
    token.type = type;
    token.text = text;
    return type;
};

//和读取下一个字符类似，这里是读取下一个词（token）
//Scan the next token from the reader and return its type code (a value from
//Token.tokens). The type and, where it matters, the text are also stored on
//this.currentToken through makeToken. At end of input EOS_TOKEN is returned.
//
//Characters that start no token (spaces, unknown symbols) are skipped. A
//lone '&' or '|' is reported by pushing an entry onto the global Errors list
//and is then skipped. Line breaks increment this.currLine; note that each
//of "\r" and "\n" is counted on its own, so a "\r\n" pair counts twice.
//
//Fixes over the earlier version:
//  * the lookahead character is only pushed back when it is a real
//    character, so peeking at end of input no longer re-reads the last one;
//  * an identifier or keyword that ends exactly at end of input is returned
//    instead of being dropped;
//  * a line comment that ends at end of input terminates instead of looping
//    forever, and an empty line comment no longer swallows its line break;
//  * inside a block comment the character after '*' is re-examined, so a
//    comment closed with "**/" is terminated correctly;
//  * ';', '(', ')', '{', '}' and '%' produce their tokens.
Scanner.prototype.nextToken = function(){
    var self = this;
    var reader = this.reader;
    var tokens = Token.tokens;
    var bufferStr = ""; //text collected so far for the token being scanned

    function isLetter(ch){
        return (ch >= "a" && ch <= "z") || (ch >= "A" && ch <= "Z");
    }

    function isDigit(ch){
        return ch >= "0" && ch <= "9";
    }

    //Push the lookahead character back so the next read sees it again.
    //The end-of-stream marker is never pushed back.
    function unread(ch){
        if (ch !== -1){
            reader.retract();
        }
    }

    //Handle an operator that is either one character or that character
    //followed by `second`.
    function pairOrSingle(second, pairType, singleType){
        var ahead = reader.nextChar();
        if (ahead === second){
            return self.makeToken(pairType);
        }
        unread(ahead);
        return self.makeToken(singleType);
    }

    //Record a syntax error for a '&' or '|' that is not doubled.
    function reportLoneOperator(symbol){
        Errors.push({
            type: Errors.SYNTAX_ERROR,
            msg: "You have only one " + symbol,
            line: self.currLine
        });
    }

    while (true){
        switch (this.state){
            case Scanner.START_STATE:
                var c = reader.nextChar();

                if (isLetter(c)){
                    //start of a word; remember its first letter
                    this.state = Scanner.IDENTIFIER_STATE;
                    bufferStr = c;
                }else if (isDigit(c)){
                    //integer literal: collect the whole run of digits
                    bufferStr = c;
                    var digit = reader.nextChar();
                    while (isDigit(digit)){
                        bufferStr += digit;
                        digit = reader.nextChar();
                    }
                    unread(digit);
                    return this.makeToken(tokens.INTLITERAL_TOKEN, bufferStr);
                }else{
                    switch (c){
                        //single-character tokens
                        case ":":
                            return this.makeToken(tokens.COLON_TOKEN);
                        case ";":
                            return this.makeToken(tokens.SEMICOLON_TOKEN);
                        case "(":
                            return this.makeToken(tokens.LEFTPAREN_TOKEN);
                        case ")":
                            return this.makeToken(tokens.RIGHTPAREN_TOKEN);
                        case "{":
                            return this.makeToken(tokens.LEFTBRACE_TOKEN);
                        case "}":
                            return this.makeToken(tokens.RIGHTBRACE_TOKEN);
                        case "%":
                            return this.makeToken(tokens.MOD_TOKEN);
                        case "*":
                            return this.makeToken(tokens.MULT_TOKEN);

                        //symbols whose meaning depends on the next character
                        case "!":
                            return pairOrSingle("=", tokens.NOTEQUAL_TOKEN, tokens.NOT_TOKEN);
                        case "=":
                            return pairOrSingle("=", tokens.EQUAL_TOKEN, tokens.ASSIGN_TOKEN);
                        case ">":
                            return pairOrSingle("=", tokens.GREATEREQUAL_TOKEN, tokens.GREATER_TOKEN);
                        case "<":
                            return pairOrSingle("=", tokens.LESSEQUAL_TOKEN, tokens.LESS_TOKEN);
                        case "+":
                            var afterPlus = reader.nextChar();
                            if (afterPlus === "="){
                                return this.makeToken(tokens.PLUSASSIGN_TOKEN);
                            }
                            if (afterPlus === "+"){
                                return this.makeToken(tokens.PLUSPLUS_TOKEN);
                            }
                            unread(afterPlus);
                            return this.makeToken(tokens.PLUS_TOKEN);
                        case "-":
                            var afterMinus = reader.nextChar();
                            if (afterMinus === "="){
                                return this.makeToken(tokens.MINUSASSIGN_TOKEN);
                            }
                            if (afterMinus === "-"){
                                return this.makeToken(tokens.MINUSMINUS_TOKEN);
                            }
                            unread(afterMinus);
                            return this.makeToken(tokens.MINUS_TOKEN);

                        case "/":
                            //division or the start of a comment; decided in SLASH_STATE
                            this.state = Scanner.SLASH_STATE;
                            break;

                        case "&":
                            var afterAmp = reader.nextChar();
                            if (afterAmp === "&"){
                                return this.makeToken(tokens.AND_TOKEN);
                            }
                            unread(afterAmp);
                            reportLoneOperator("&");
                            break;
                        case "|":
                            var afterBar = reader.nextChar();
                            if (afterBar === "|"){
                                return this.makeToken(tokens.OR_TOKEN);
                            }
                            unread(afterBar);
                            reportLoneOperator("|");
                            break;

                        case -1:
                            return this.makeToken(tokens.EOS_TOKEN);
                        case "\r": case "\n":
                            this.currLine++;
                            break;
                        default:
                            //ignore anything else
                    }
                }
                break;

            //Reached after a letter was read in START_STATE. Words are runs
            //of letters only, so any other character (or end of input)
            //ends the word.
            case Scanner.IDENTIFIER_STATE:
                var next = reader.nextChar();
                if (isLetter(next)){
                    bufferStr += next;
                    break;
                }

                unread(next);
                this.state = Scanner.START_STATE;
                switch (bufferStr){
                    case "var":
                        return this.makeToken(tokens.VAR_TOKEN);
                    case "true": case "false":
                    case "TRUE": case "FALSE":
                        return this.makeToken(tokens.BOOLLITERAL_TOKEN, bufferStr);
                    case "if":
                        return this.makeToken(tokens.IF_TOKEN);
                    case "else":
                        return this.makeToken(tokens.ELSE_TOKEN);
                    case "while":
                        return this.makeToken(tokens.WHILE_TOKEN);
                    case "print":
                        return this.makeToken(tokens.PRINT_TOKEN);
                    default:
                        return this.makeToken(tokens.IDENTIFIER_TOKEN, bufferStr);
                }

            //Reached after a '/' was read in START_STATE.
            case Scanner.SLASH_STATE:
                var d = reader.nextChar();
                //every branch below finishes a token, so go back to the start state
                this.state = Scanner.START_STATE;

                if (d === "/"){
                    //line comment: runs to the line break or to end of input
                    bufferStr = "";
                    d = reader.nextChar();
                    while (d !== "\r" && d !== "\n" && d !== -1){
                        bufferStr += d;
                        d = reader.nextChar();
                    }
                    //leave the line break unread so START_STATE counts the line
                    unread(d);
                    return this.makeToken(tokens.LINECOMMENT_TOKEN, bufferStr);
                }

                if (d === "*"){
                    //block comment: runs to the closing "*/"; if the input
                    //ends first, whatever was collected is returned
                    bufferStr = "";
                    while (true){
                        d = reader.nextChar();
                        if (d === -1){
                            break;
                        }
                        if (d === "\r" || d === "\n"){
                            this.currLine++;
                        }
                        if (d === "*"){
                            var e = reader.nextChar();
                            if (e === "/"){
                                break;
                            }
                            //not the terminator: keep the '*' and look at
                            //the following character again on the next pass
                            bufferStr += "*";
                            unread(e);
                        }else{
                            bufferStr += d;
                        }
                    }
                    return this.makeToken(tokens.BLOCKCOMMENT_TOKEN, bufferStr);
                }

                //a plain division sign
                unread(d);
                return this.makeToken(tokens.DIV_TOKEN);
        }
    }
};

3，注意到代码里用到了一堆Token的常量，这里直接贴下来，不做解释。常量的值采用 +1的方式很讲究，这样你随时可以添加新的常量。

token.js

//Token class: one lexical token.
//  type: the token's type
//  text: the actual text that makes up this token; may be omitted (left
//        undefined) when it is not important
function Token(type, text){
    var token = this;
    token.type = type;
    token.text = text;
}

//Token type codes and the reverse (code -> name) table.
//Codes are consecutive integers starting at 1 (EOS_TOKEN, end of stream),
//assigned in the order of the list below, so a new token type is added by
//inserting its name into the list. Both tables are filled from the same
//list inside a function scope, so no loop variable is left behind at the
//top level and no inherited property can end up in the reverse table.
Token.tokens = {};
Token.backwardMap = {}; //for inverse look-up
(function (){
    var names = [
        "EOS_TOKEN",
        "COLON_TOKEN",
        "SEMICOLON_TOKEN",
        "LEFTPAREN_TOKEN",
        "RIGHTPAREN_TOKEN",
        "LEFTBRACE_TOKEN",
        "RIGHTBRACE_TOKEN",
        "MOD_TOKEN",

        "VAR_TOKEN",
        "TYPE_TOKEN",
        "BOOLLITERAL_TOKEN",
        "INTLITERAL_TOKEN",
        "IF_TOKEN",
        "ELSE_TOKEN",
        "WHILE_TOKEN",
        "PRINT_TOKEN",
        "IDENTIFIER_TOKEN",

        "PLUS_TOKEN",
        "PLUSPLUS_TOKEN",
        "PLUSASSIGN_TOKEN",
        "MINUS_TOKEN",
        "MINUSMINUS_TOKEN",
        "MINUSASSIGN_TOKEN",
        "MULT_TOKEN",
        "DIV_TOKEN",
        "ASSIGN_TOKEN",
        "EQUAL_TOKEN",
        "NOTEQUAL_TOKEN",
        "GREATER_TOKEN",
        "GREATEREQUAL_TOKEN",
        "LESS_TOKEN",
        "LESSEQUAL_TOKEN",
        "AND_TOKEN",
        "OR_TOKEN",
        "NOT_TOKEN",

        "LINECOMMENT_TOKEN",
        "BLOCKCOMMENT_TOKEN"
    ];
    names.forEach(function (name, index){
        var code = index + 1;
        Token.tokens[name] = code;
        Token.backwardMap[code] = name;
    });
})();

posted @ 2012-09-11 11:00 yunfan85 阅读(524) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

自己写个JavaScript parser （分析器）系列 （2）

公告

自己写个JavaScript parser （分析器）系列（2）