暴力词法分析
词法(类python):
\w+\b -->NAME
'[^']*'|"[^"]*" -->STRING
^\t+ -->TAB
( -->TUPLELEFT
) -->TUPLERIGHT
[^\s\w]+ -->SYMBOL
如此简单的词法,暴力模拟就好啦 : )
语法和翻译还没想好怎么处理。囧
#include <iostream>
#include <vector>
#include <string>
#include <map>
#include <ctype.h>
#include <stack>
#include <stdlib.h>
using namespace std;

typedef map<string, int> Dict;
typedef stack<int> Stack;

// Keyword text -> index into `keyword`; populated by load_keyword().
Dict vardict;
// State stack; deal_tokens() uses its top element as the current state.
Stack varstack;
// Declared for error bookkeeping; not used by any code in this file yet.
Stack varerror;

// Reserved words. The list is terminated by an empty-string sentinel, and its
// order matches the IF..EXCEPT enumerators declared further down.
char keyword[][8] = {"if", "else", "elif", "while", "for", "in", "and", "or",
                     "def", "class", "print", "raise", "except", "\0"};

int exec(vector<string>& codes);
vector<string> gettoken(const string& code);
bool deal_tokens(vector<string>&);
void load_keyword();
void _print_tokens(vector<string> &tokens, string d = "#");

// --- character-class helpers -------------------------------------------------
// The <ctype.h> predicates have undefined behaviour for negative `char`
// values, so every call goes through an unsigned char cast.

// True for characters that may appear inside a NAME token: [A-Za-z0-9_].
static bool is_word_char(char c) {
    return c == '_' || isalnum(static_cast<unsigned char>(c)) != 0;
}

// True for whitespace as defined by isspace().
static bool is_space_char(char c) {
    return isspace(static_cast<unsigned char>(c)) != 0;
}

// True for characters that belong to a SYMBOL run: anything that is not
// whitespace, a word character, a quote, or a parenthesis.
static bool is_symbol_char(char c) {
    return !is_space_char(c) && !is_word_char(c) &&
           c != '\'' && c != '\"' && c != '(' && c != ')';
}

// True when `text` is non-empty and every character satisfies `pred`.
static bool all_chars(const string& text, bool (*pred)(char)) {
    if (text.empty()) return false;
    for (size_t i = 0; i < text.size(); ++i) {
        if (!pred(text[i])) return false;
    }
    return true;
}

// Writes `buf` followed by a newline to stdout. Always returns 0.
int print(string buf) {
    cout << buf << endl;
    return 0;
}

// Writes a diagnostic line to stdout. Always returns 0.
int printlog(string buf) {
    cout << buf << endl;
    return 0;
}

// Demo driver: tokenizes three hard-coded source lines and prints the tokens.
int main() {
    load_keyword();
    vector<string> codes;
    codes.push_back("print 'haha'");
    codes.push_back("if (x + 1) == y:");
    codes.push_back(" s = \"sad\".pop()");
    exec(codes);
    return 0;
}

// Tokenizes each line of `codes` in order, prints its tokens, and hands them
// to deal_tokens(). Stops at the first line deal_tokens() rejects.
// Always returns 0.
int exec(vector<string>& codes) {
    for (size_t i = 0; i < codes.size(); ++i) {
        vector<string> tokens = gettoken(codes[i]);
        _print_tokens(tokens);
        if (!deal_tokens(tokens)) {
            break; // TODO: raise something
        }
    }
    return 0;
}

enum { // deal_tokens state
    IF = 0,
    ELSE,
    ELIF,
    WHILE,
    FOR,
    IN,
    AND,
    OR,
    DEF,
    CLASS,
    PRINT,
    RAISE,
    EXCEPT,
    NONE = 30,
    ERROR,
    NOTDEF,
    WORD,
};

// Token categories produced by getsymbol().
enum {
    STRING,
    NAME,
    EQUAL,
    TUPLELEFT,
    TUPLERIGHT,
    COLON,
    TAB,
    KEYWORD,
    SYMBOL
};

// True when `token` starts and ends with the same quote character (' or ").
// A token shorter than two characters cannot hold both delimiters, so it is
// never a string; this also keeps the index arithmetic in bounds for "".
bool isstring(string token) {
    const size_t len = token.size();
    if (len < 2) return false;
    const char first = token[0];
    const char last = token[len - 1];
    return (first == '\'' && last == '\'') || (first == '\"' && last == '\"');
}

// True when `token` is exactly "(".
bool istupleleft(string token) {
    return token == "(";
}

// Maps every token to one category from the STRING..SYMBOL enum. The result
// has exactly one entry per token, in the same order.
// KEYWORD detection looks the token up in `vardict`, so load_keyword() must
// have run for keywords to be recognised; otherwise they are reported as NAME.
vector<int> getsymbol(vector<string>& tokens) {
    vector<int> symbol;
    symbol.reserve(tokens.size());
    for (size_t i = 0; i < tokens.size(); ++i) {
        const string& tok = tokens[i];
        int kind = SYMBOL;
        if (isstring(tok))                          kind = STRING;
        else if (istupleleft(tok))                  kind = TUPLELEFT;
        else if (tok == ")")                        kind = TUPLERIGHT;
        else if (tok == "=")                        kind = EQUAL;
        else if (tok == ":")                        kind = COLON;
        else if (all_chars(tok, is_space_char))     kind = TAB;
        else if (vardict.find(tok) != vardict.end()) kind = KEYWORD;
        else if (all_chars(tok, is_word_char))      kind = NAME;
        symbol.push_back(kind);
    }
    return symbol;
}

// Placeholder for syntax handling: reads the current state from `varstack`
// (NONE when the stack is empty) and classifies the tokens, but does not act
// on either yet. Always returns true.
bool deal_tokens(vector<string>& tokens) {
    int state = NONE;
    if (!varstack.empty()) state = varstack.top();
    vector<int> symbol = getsymbol(tokens);
    (void)state;
    (void)symbol;
    return true;
}

/*****************************************/

// Registers every entry of `keyword` in `vardict`, keyed by its text and
// valued by its index. Stops at the empty-string sentinel.
void load_keyword() {
    for (int i = 0; keyword[i][0] != '\0'; ++i) {
        vardict[keyword[i]] = i;
    }
}

// Logs each token on its own line, wrapped in the delimiter `d` on both sides.
void _print_tokens(vector<string> &tokens, string d) {
    for (size_t j = 0; j < tokens.size(); ++j) {
        printlog(d + tokens[j] + d);
    }
}

// Splits one source line into tokens:
//   * whitespace at the very start of the line -> one indentation token
//   * '...' or "..."  -> one string token including its quotes; an
//     unterminated string runs to the end of the line
//   * [A-Za-z0-9_]+   -> one word token
//   * ( and )         -> always a single-character token of their own
//   * any other run of non-space punctuation -> one symbol token
// Whitespace anywhere else only separates tokens and is discarded.
vector<string> gettoken(const string& code) {
    vector<string> tokens;
    const size_t len = code.size();
    size_t pos = 0;

    // Leading whitespace is significant (indentation), so keep it as a token.
    while (pos < len && is_space_char(code[pos])) ++pos;
    if (pos > 0) tokens.push_back(code.substr(0, pos));

    while (pos < len) {
        const char c = code[pos];
        if (is_space_char(c)) {
            ++pos;
            continue;
        }

        const size_t start = pos;
        if (c == '\'' || c == '\"') {
            // Scan to the first matching quote; there is no escape handling.
            ++pos;
            while (pos < len && code[pos] != c) ++pos;
            if (pos < len) ++pos; // include the closing quote
        } else if (is_word_char(c)) {
            while (pos < len && is_word_char(code[pos])) ++pos;
        } else if (c == '(' || c == ')') {
            // Parentheses never merge with neighbouring punctuation.
            ++pos;
        } else {
            while (pos < len && is_symbol_char(code[pos])) ++pos;
        }
        tokens.push_back(code.substr(start, pos - start));
    }
    return tokens;
}