编译原理之正则表达式转NFA
本文转载自http://chriszz.sinaapp.com/?p=257
输入一个正则表达式,输出一个NFA。
我的做法:输入一个字符串表示正则,输出则是把输出到一个.dot文件中并将dot文件编译成pdf,fedora需要sudo yum install dot,然后evince XXX.pdf就可以查看生成的NFA了。
具体算法是按照龙书上的Tompson算法来的。
废话不多说,放码过来:
/* Author:ChrisZZ(zchrissirhcz@gmail.com) Time:2013-12-25 14:13:09 输入:正则表达式 输出:自动机 算法步骤: 1.把正则表达式转化为后缀表达式 2.把后缀表达式转化为NFA 3.用dot语言把NFA输出到PDF 参考: 1.Regular Expression Matching Can Be Simple And Fast http://swtch.com/~rsc/regexp/regexp1.html 2.龙书 chap3.7.4 从正则表达式构造NFA 3.YCC学长的project中dot语言的使用 其他说明: 1.需要安装dot,并添加到系统path中 2.在windows下运行时,控制台因为编码不支持可能导致中文提示无法显示 */ #include <iostream> #include <string> #include <stdio.h> #include <stack> #include <string.h> #include <stdexcept> #include <stdlib.h> using namespace std; const int Match = 256; const int Split = 257;//表示epsilon分支 struct Paren{//括号结构体 int natom; int nalt; }; string re2post(string re){ Paren paren;//括号 stack<struct Paren>parenStk; string postExpr=""; int i, len=re.length(); int nalt=0, natom=0; const string invalidRegExp = "非法的正则表达式"; for(i=0; i<len; i++){ if(isspace(re[i])) continue; if(isalpha(re[i])){ if(natom>1){ natom--; postExpr = postExpr + '.'; } natom++; postExpr = postExpr + re[i]; } else if(re[i]=='('){ if(natom>1){ postExpr = postExpr + '.'; } paren.natom = natom; paren.nalt = nalt; parenStk.push(paren); nalt = 0; natom = 0; } else if(re[i]==')'){ if(natom==0 || parenStk.empty()) throw runtime_error(invalidRegExp+":括号不匹配"); while(--natom>0){//比如((a|b)(c|d))模式,当上一次匹配完倒数第二个右括号后,natom为2,需要添加'.' postExpr = postExpr + '.'; } while(nalt-->0){ postExpr = postExpr + '|'; } paren=parenStk.top(); parenStk.pop(); natom = paren.natom; nalt = paren.nalt; natom++; } else if(re[i]=='*'){ if(natom==0) throw runtime_error(invalidRegExp+":提前出现'*'"); postExpr = postExpr + re[i]; } else if(re[i]=='|'){ if(natom==0) throw runtime_error(invalidRegExp+":提前出现'|'"); while(--natom>0){ postExpr = postExpr + '.'; } nalt++; } else throw runtime_error(invalidRegExp); } if(!parenStk.empty()) throw runtime_error(invalidRegExp+":括号不匹配"); while(--natom>0){ postExpr = postExpr + '.'; } while(nalt-->0){ postExpr = postExpr + '|'; } return postExpr; } class NFA; /* * c<256表示edge权重为c; * c=256表示终结状态,匹配成功 * c=257表示分支(split) */ class State{ friend class NFA; friend void nfa2graph(State* head, const string& re); public: State(int c=256, State* out=NULL, State* out1=NULL){ this->c = c; this->out = out; this->out1 = out1; this->id = 0; } void setId(int id){ this->id = id; } private: int c; int id;//状态的编号 State* out;//从本状态出去的状态集合的头指针 State* out1;//两个分支的情况 }; class NFA{ public: NFA(){ head = NULL; tail = NULL; } NFA(const int& c){ tail = new State(Match, NULL, NULL); head = new State(c, tail, NULL); } void doCat(NFA& nfa){ tail->out = nfa.head; tail->c = Split; tail = nfa.tail; nfa.head = NULL; nfa.tail = NULL; } void doUnion(NFA& nfa){ State* newHead = new State(Split, head, nfa.head); State* newTail = new State(Match, NULL, NULL); tail->c = Split; tail->out = newTail; nfa.tail->c = Split; nfa.tail->out = newTail; tail = newTail; head = newHead; nfa.head = NULL; nfa.tail = NULL; } void doStar(){ State* newTail = new State(Match, NULL, NULL); State* newHead = new State(Split, head, newTail); tail->c = Split; tail->out = newTail; tail->out1 = head; tail = newTail; head = newHead; } void nfa2graph(const string& re){ char myfile[100]; printf("请输入一个文件名,用来保存生成的NFA-graph(不必提供后缀):\n"); scanf("%s", myfile); printf("已将DOT文件存储在\"%s.dot\",\n", myfile); printf("PDF文件则存储在\"%s.dot.pdf\".\n", myfile); int i; while(myfile[i]!='\0') i++; myfile[i] = '.'; myfile[i+1] = 'd'; myfile[i+2] = 'o'; myfile[i+3] = 't'; myfile[i+4] = '\0'; FILE *file = fopen(myfile, "w"); fputs("digraph {\n", file); fputs("\t\"", file); int len=re.length(); for(i=0; i<len; i++){ fprintf(file, "%c", re[i]); } fputs("\" [shape = plaintext]\n", file); fputs("\trankdir = LR\n", file); fputs("\t\"\" [shape = point]\n", file); fputs("\t\"\" -> 1 [label = Start]\n\n", file); int id = 1; char circle[2000]; memset(circle, 0, sizeof(circle)); State* p; stack<State*> staStk; head->setId(id++); staStk.push(head); while(!staStk.empty()){ p = staStk.top(); staStk.pop(); char flag = 1; cout << "p->c=" << p->c << endl; if(p->c < Match){ cout << "p->out->id=" << p->out->id << endl; if(p->out->id==0){ p->out->id = id++; cout << "id=" << id << endl; } else flag = 0; fprintf(file, "\t%d -> %d [label = \"%c\"]\n", p->id, (p->out)->id, p->c); State *what = p->out; if(flag) //push(*what); staStk.push(what); } else if(p->c == Match){ circle[p->id] = 1; } else{ //对应Split的情形 if(p->out->id==0) p->out->id = id++; else flag = 0; fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out->id); State *what = p->out; if(flag) staStk.push(what); if(p->out1!=NULL){ flag = 1; if(p->out1->id==0) p->out1->id = id++; else flag = 0; fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out1->id); what = p->out1; if(flag) staStk.push(what); } } } for(i=1; i<id; i++){ fprintf(file, "\t%d [shape = circle", i); if(circle[i]) fputs(", peripheries = 2", file); fprintf(file, "]\n"); } fputs("}", file); fclose(file); char cmd[108]; sprintf(cmd, "dot %s -O -Tpdf", myfile); if(system(cmd)==0){ printf("成功生成pdf图像!\n"); //printf("Linux用户可以使用evince file.pdf &命令打开~\n"); } else printf("悲剧!生成pdf图像时出现错误..\n"); } private: State* head; State* tail; }; NFA post2nfa(const string& postExpr){ stack<NFA> nfaStk; NFA e1, e2, e; int i, len=postExpr.length(); for(i=0; i<len; i++){ switch(postExpr[i]){ case '.': e2 = nfaStk.top(); nfaStk.pop(); e1 = nfaStk.top(); nfaStk.pop(); e1.doCat(e2); nfaStk.push(e1); break; case '|': e2 = nfaStk.top(); nfaStk.pop(); e1 = nfaStk.top(); nfaStk.pop(); e1.doUnion(e2); nfaStk.push(e1); break; case '*': e = nfaStk.top(); nfaStk.pop(); e.doStar(); nfaStk.push(e); break; default:// NFA alpha(postExpr[i]); nfaStk.push(alpha); } } e = nfaStk.top(); nfaStk.pop(); if(!nfaStk.empty()) throw runtime_error("未知错误"); return e; } int main(){ string re; while(true){ cout << "请输入一个正则表达式:\n"; cin >> re; string postExpr = re2post(re); cout << "postExpr is : " << postExpr << endl; NFA nfa = post2nfa(postExpr); nfa.nfa2graph(re); cout << "继续吗?(y/n)\n" << endl; char c; cin >> c; while(c!='y' && c!='n'){ cout << "请输入'y'或'n'.\n"; c=getchar(); } if(c=='n') break; } cout << "Bye~\n"; return 0; }
Greatness is never a given, it must be earned.