暴力词法分析

词法(类python):

\w+\b  -->NAME

'[^']*'|"[^"]*"  -->STRING

^\s+  -->TAB

(    -->TUPLELEFT

)    -->TUPLERIGHT

[^\s\w]+  -->SYMBOL

如此简单的词法,暴力模拟就好啦 : )

语法和翻译还没想好怎么处理。囧

#include <iostream>
#include <vector>
#include <string>
#include <map>
#include <ctype.h>
#include <stack>
#include <stdlib.h>
using namespace std;
// Dict maps a name to an integer code; Stack is a plain stack of integer codes.
typedef map<string, int> Dict;
typedef stack<int> Stack;

// Filled by load_keyword() with keyword text -> index in the keyword table.
Dict vardict;
// deal_tokens() takes its current state from the top of this stack when it
// is not empty.
Stack varstack;
// NOTE(review): not referenced by any code visible in this file.
Stack varerror;

// Keyword table. The order matches the IF..EXCEPT enumerators declared
// further down, so a keyword's index here equals its enumerator value.
// The final entry starts with '\0' and acts as the end-of-table sentinel
// that load_keyword() stops on.
char keyword[][8] = {"if", "else","elif","while", "for", "in", "and", "or", 
    "def", "class","print", "raise", "except","\0"};

// Forward declarations for the functions defined after main().
// Tokenizes each source line in turn and hands the tokens to deal_tokens().
int exec(vector<string>& codes);
// Splits one source line into its lexical tokens.
vector<string> gettoken(const string& code);
// Processes the tokens of one line; exec() stops at the first line for
// which this returns false.
bool deal_tokens(vector<string>&);
// Registers every entry of the keyword table in vardict.
void load_keyword();
// Debug helper: logs each token wrapped in the delimiter d (default "#").
void _print_tokens(vector<string> &tokens,string d= "#");

// Writes buf followed by a newline to standard output.
//
// Returns 0. The original fell off the end of a function declared to return
// int, which is undefined behaviour in C++.
int print(string buf) {
    // Compile-time switch: change to false to silence this output.
    if (true) {
        cout << buf << endl;
    }
    return 0;
}
// Writes a log line (buf followed by a newline) to standard output.
//
// Returns 0. The original fell off the end of a function declared to return
// int, which is undefined behaviour in C++.
int printlog(string buf) {
    // Compile-time switch: change to false to silence log output.
    if (true) {
        cout << buf << endl;
    }
    return 0;
}

// Demo driver: registers the keywords, then runs a small hard-coded program
// through exec().
int main() {
    load_keyword();

    // Sample source, one entry per line of code.
    const char* const sample[] = {
        "print 'haha'",
        "if  (x +   1) == y:",
        "    s = \"sad\".pop()",
    };
    const int sample_count = sizeof(sample) / sizeof(sample[0]);

    vector<string> codes;
    for (int line = 0; line < sample_count; ++line) {
        codes.push_back(sample[line]);
    }

    exec(codes);
    return 0;
}

// Runs the given source lines in order.
//
// Each line is tokenized, the tokens are logged, and then they are passed to
// deal_tokens(). Processing stops at the first line deal_tokens() rejects.
// Always returns 0.
int exec(vector<string>& codes) {
    vector<string>::iterator line = codes.begin();
    while (line != codes.end()) {
        vector<string> tokens = gettoken(*line);
        _print_tokens(tokens);
        const bool accepted = deal_tokens(tokens);
        if (!accepted) {
            // TODO: raise something here instead of stopping silently.
            break;
        }
        ++line;
    }
    return 0;
}

// State codes for deal_tokens(). IF..EXCEPT are numbered 0..12 in the same
// order as the entries of the keyword table, so they equal the values
// load_keyword() stores in vardict. The remaining codes start at 30; NONE is
// the state deal_tokens() starts from when varstack is empty.
enum {    // deal_tokens state
    IF = 0, ELSE, ELIF, WHILE, FOR, IN, AND, OR, DEF, CLASS, PRINT, RAISE, EXCEPT, 
    NONE=30, ERROR, NOTDEF, WORD,
};
// Token categories produced by getsymbol(). Only STRING and TUPLELEFT are
// emitted by the code visible in this file.
enum {
    STRING, NAME, EQUAL, TUPLELEFT, TUPLERIGHT, COLON, TAB, KEYWORD, SYMBOL
};


// Returns true when token is a complete string literal: at least two
// characters long, beginning and ending with the same quote character
// (' or ").
//
// Tokens shorter than two characters are rejected up front. The original
// indexed token[len - 1] on an empty token (out of bounds) and accepted a
// lone quote character as a full literal.
bool isstring(string token) {
    const string::size_type len = token.size();
    if (len < 2) return false;

    const char opening = token[0];
    if (opening != '\'' && opening != '\"') return false;

    // The closing quote must be the same kind as the opening one.
    return token[len - 1] == opening;
}
// Returns true only when token is exactly the opening parenthesis "(".
bool istupleleft(string token) {
    return token == "(";
}

// Classifies tokens into token-category codes.
//
// Appends STRING for every token isstring() accepts and TUPLELEFT for every
// token istupleleft() accepts. Tokens matching neither test are not
// classified yet and contribute nothing, so the result is NOT index-aligned
// with `tokens`.
//
// The original wrote `else if (istupleleft)`, which tests the function's
// address (always true) and therefore tagged every non-string token as
// TUPLELEFT.
vector<int> getsymbol(vector<string>& tokens) {
    vector<int> symbol;
    const vector<string>::size_type len = tokens.size();
    for (vector<string>::size_type i = 0; i < len; i++) {
        if (isstring(tokens[i])) symbol.push_back(STRING);
        else if (istupleleft(tokens[i])) symbol.push_back(TUPLELEFT);
        // TODO: classify the remaining categories (NAME, KEYWORD, ...).
    }
    return symbol;
}

bool deal_tokens(vector<string>& tokens) {
    int len = tokens.size();
    int state = NONE;
    if (!varstack.empty())state = varstack.top();
    
    vector<int> symbol = getsymbol(tokens);

    return true;
}







/*****************************************/

// Registers every keyword in vardict, mapping its text to its index in the
// keyword table. Stops at the sentinel entry whose first character is '\0'.
void load_keyword() {
    int index = 0;
    while (keyword[index][0] != '\0') {
        vardict[keyword[index]] = index;
        ++index;
    }
}
// Debug helper: logs each token on its own line, wrapped in the delimiter d
// on both sides so that leading and trailing whitespace is visible.
void _print_tokens(vector<string> &tokens,string d) {
    vector<string>::const_iterator tok = tokens.begin();
    for (; tok != tokens.end(); ++tok) {
        const string framed = d + *tok + d;
        printlog(framed);
    }
}

// Character-class helpers for gettoken(). The casts to unsigned char keep the
// <ctype.h> calls well defined for bytes with the high bit set.

// Whitespace: separates tokens; a leading run forms the indentation token.
static bool token_isspace(char c) {
    return isspace(static_cast<unsigned char>(c)) != 0;
}

// Word character (regex \w): letters, digits and underscore.
static bool token_isword(char c) {
    return c == '_' || isalnum(static_cast<unsigned char>(c)) != 0;
}

// Quote character that opens or closes a string literal.
static bool token_isquote(char c) {
    return c == '\'' || c == '\"';
}

// Parenthesis: always a one-character token of its own.
static bool token_isparen(char c) {
    return c == '(' || c == ')';
}

// Splits one source line into tokens, returned in order of appearance.
//
// Token kinds:
//   - indentation: a run of whitespace at the very start of the line
//     (whitespace anywhere else only separates tokens and is dropped);
//   - string: from a quote up to and including the next identical quote;
//     an unterminated string runs to the end of the line;
//   - word: a run of letters, digits and underscores;
//   - "(" and ")": always single-character tokens;
//   - symbol: a run of any other non-whitespace characters.
//
// Differences from the previous state-machine version:
//   - the character after "()" is no longer skipped;
//   - a parenthesis no longer absorbs following symbol characters
//     (e.g. "):" is now ")" and ":");
//   - '_' is part of a word, as the \w+ rule requires;
//   - character classification is safe for negative char values.
vector<string> gettoken(const string& code) {
    vector<string> tokens;
    const string::size_type len = code.size();
    string::size_type pos = 0;

    // Leading whitespace is significant (indentation) and kept as one token.
    if (len > 0 && token_isspace(code[0])) {
        while (pos < len && token_isspace(code[pos])) ++pos;
        tokens.push_back(code.substr(0, pos));
    }

    while (pos < len) {
        const char c = code[pos];
        if (token_isspace(c)) {
            ++pos;    // separator only
            continue;
        }

        const string::size_type start = pos;
        if (token_isquote(c)) {
            // Scan to the matching quote; keep both quotes in the token.
            ++pos;
            while (pos < len && code[pos] != c) ++pos;
            if (pos < len) ++pos;
        } else if (token_isparen(c)) {
            ++pos;
        } else if (token_isword(c)) {
            while (pos < len && token_isword(code[pos])) ++pos;
        } else {
            // Symbol run: stops at whitespace, a word character, a quote or
            // a parenthesis.
            while (pos < len && !token_isspace(code[pos]) &&
                   !token_isword(code[pos]) && !token_isquote(code[pos]) &&
                   !token_isparen(code[pos])) {
                ++pos;
            }
        }
        tokens.push_back(code.substr(start, pos - start));
    }
    return tokens;
}

 

posted @ 2016-10-26 16:00  backinfile  阅读(253)  评论(0编辑  收藏  举报