[Compiling Principles] LEX基本功能的实现
学习编译原理的时候做的一个小型LEX,及词法分析产生器,可以根据输入的正则表达式生成自动机,从中识别出代码中的token串,存入一个二维表,也就是SymbolTable。
主体代码如下:
代码
1 #include<iostream>
2 #include "stdio.h"
3 using namespace std;
4 #include<String>
5 #include<Stack>
6 #include<set>
7 #include<queue>
8 #include<fstream>
9 #include<iomanip>
10 struct path
11 {
12 int from; //一个path代表从一个状态到另一个状态的路径
13 int to;
14 string strPath; //strPath代表这个结构体处理的路径
15 };
16 struct dealStruct
17 {
18 int stCount;
19 struct path* resPtr;
20 };
21 struct setNode
22 {
23 set<int> *Dset;
24 int num;
25 struct setNode * next;
26 };
27 bool judgeLD( char v ) //判断正则表达式中是否含有非法字符
28 {
29 if( ( int(v) > 47 && int(v) <= 57 ) ||
30 ( int(v) >= 65 && int(v) <= 90 )||
31 ( int(v) >= 97 && int(v) <= 122 ) ||
32 v == '>' || v == '<' || v == '=' ||
33 v == '`' || v == '+' || v == '-' )
34 return true;
35 else
36 return false;
37 }
38 struct dealStruct * dealWithStr( string, int ); //通过正则表达式生成NFA
39 int jihe( int, stack<struct path*> & ); //处理正则表达式时'*’的运算函数
40 int lianjie( int, stack<struct path*> & ); //处理正则表达式时'&’(自定义运算符ab相当于a&b)的运算函数
41 int huo( int, stack<struct path*> & );; //处理正则表达式时'|’的运算函数
42 int reduceNFA( int, int*, int ); //NFA->DFA函数
43 void BFS( int, int ); //计算一个状态的ξ闭包
44 void produceC( int, string * ); //生成词法分析器的.cpp文件
45 int arrForBFS[100] = {0}; //存放BFS函数生成的闭包的数组
46 string T[2000][2000]; //NFA的数组表示
47 string recT[2000][2000]; //DFA的数组表示
48 int stateCountDFA; //DFA状态数量
49 int reduceState[2][2000]; //化简DFA用的数组
50 set<char> tv; //所有终结符
51 set<char>::iterator Vt;
52 void reduceDFA(); //化简DFA
53 void combine(int,int); //化简DFA时合并两个状态
54 int regCount; //输入正则表达式的个数
55
56 int main()
57 {
58 regCount = 0;
59 int i,j,k;
60 int temp = 0;
61
62 string * regArr;
63 string * typeName;
64 int * typeIndex;
65 int **resultEnterExt;
66
67
68 ifstream inRegular;
69 inRegular.open("RegularExpressions.txt");
70 inRegular>>regCount; //正则表达式个数
71 typeName = new string[regCount];
72 typeIndex = new int[regCount];
73 resultEnterExt = new int*[regCount]; //定义一个二位动态数组用于存放每个正则表达式所生成的NFA的起始状态和终结状态
74 for( i = 0;i < regCount;i++ )
75 resultEnterExt[i] = new int[2];
76 regArr = new string[regCount];
77 for( i = 0;i < regCount;i++ )
78 {
79 inRegular>>typeName[i];
80 inRegular>>regArr[i];
81 } //读入正则表达式
82
83 int stateCount = -1;
84 for( i = 0;i < regCount;i++ ) //遍历每个正则表达式
85 {
86 for( j = 0;j < regArr[i].length()-1;j++ )
87 if( ( judgeLD(regArr[i].at(j) ) && ( judgeLD(regArr[i].at(j+1)) || regArr[i].at(j+1) == '(' ) ) ||
88 ( regArr[i].at(j) == ')' && ( regArr[i].at(j+1) == '(' || judgeLD(regArr[i].at(j+1)) ) ) ||
89 ( regArr[i].at(j) == '*' && ( regArr[i].at(j+1) == '(' || judgeLD(regArr[i].at(j+1)) ) ) )
90 regArr[i].insert(j+1,"&"); //预处理正则表达式,添加自定义运算符'&'
91
92
93 regArr[i].insert(regArr[i].length(),"#%"); //在末尾添加结束符
94 cout<<regArr[i]<<endl; //输出处理后的正则表达式
95
96
97 j = 0;
98 struct path *p = NULL;
99 struct dealStruct * result = dealWithStr( regArr[i],stateCount ); //对于每个正则表达式调用NFA生成函数
100
101 stateCount = result->stCount;
102 resultEnterExt[i][0] = result->resPtr->from; //记录该正则表达式的起始状态和终结状态
103 resultEnterExt[i][1] = result->resPtr->to;
104 typeIndex[i] = resultEnterExt[i][1];
105 /* cout<<"The number of states now is "<<stateCount<<endl;
106 cout<<"The regular perform dealt with just now is "<<result->resPtr->strPath<<endl;
107 cout<<"The from dealt with just now is "<<resultEnterExt[i][0]<<endl;
108 cout<<"The to dealt with just now is "<<resultEnterExt[i][1]<<endl;*/
109 }
110
111 stateCount++; //汇总所有正则表达式的NFA
112 cout<<"NFA合并完毕!"<<endl;
113
114 for( i = 0;i < regCount;i++ )
115 T[stateCount][resultEnterExt[i][0]] = '`'; //从公共起始状态出发
116
117 /* for( i = 0;i <= stateCount;i++ ) //输出产生的NFA以检验
118 {
119 for( j = 0;j <= stateCount;j++ )
120 {
121 if( T[i][j] == "" ) cout<<" |";
122 else cout<<T[i][j]<<'|';
123 }
124 cout<<endl;
125 }*/
126
127 stateCountDFA = reduceNFA( stateCount, typeIndex, regCount ); //调用NFA->DFA函数生成DFA
128 reduceDFA();
129
130 produceC( stateCountDFA, typeName ); //产生.cpp代码
131
132 system("pause");
133
134 return 0;
135 }
136
137 /*以下为NFA的生成函数,采用三个栈实现Thompson算法,第一个栈用于读入字符和运算符
138 第二个栈用于逆序,第三个栈专门用于处理一对括号内部的表达式,其实用两个栈就可以
139 实现,及把第三个栈舍弃,直接把括号的计算放在第一个栈中进行也可。为了计算更加
140 清楚不易出错故而多加了一个栈*
141 算法思路:读入正则表达式放入栈A中,*,&,|的优先级分别定为3,2,1,而表达式末
142 尾的'#'优先级最低,看做0。用一个三元数组存放当各个运算符的个数,字符均放入栈A中,
143 读到运算符时,当栈外的运算符的优先级大于等于站内所有运算符时,该运算符进栈,
144 如果低于所有站内运算符,计算站内所有运算符表的运算,直到栈内没有比栈外更高优先
145 级的运算符。
146 读到(的时候,(进栈,之后不论读到什么均进栈直到读到第一个),将这对()中的字符串
147 提出,用栈B逆序后放入栈C,在栈C中按上述方式处理直到只有一个元素,将元素返回栈A
148 栈中存储的是指向path结构体的指针*/
149 struct dealStruct * dealWithStr( string str, int stateCountV )
150 {
151 int j = 0;int temp = 0;
152 struct path *p = NULL;
153 int currentPri[3] = {0}; //用于储存栈中三种运算符(*,&,|)的个数,便于计算
154 stack<struct path *> stackA;
155 while( str.at(j) != '%' )
156 {
157 if( judgeLD(str.at(j)) ) //如果是字符,放入栈A中
158 {
159 tv.insert(str.at(j)); //放入tv中
160 stateCountV++;
161 temp = stateCountV;
162 stateCountV++;
163 T[temp][stateCountV] += str.at(j);
164 p = new path();
165 p->from = temp;
166 p->to = stateCountV;
167 p->strPath += str.at(j);
168 stackA.push(p);
169 //cout<<"push a new letter "<<p->strPath<<endl;
170 j++;
171 }
172 else if( str.at(j) == '*' ) //如果是'*'由于没有比它优先级更高的了,
173 { //故必然是入栈
174 currentPri[0]++;
175 p = new path();
176 p->from = -1;
177 p->to = -1;
178 p->strPath += "*";
179 stackA.push(p);
180 //cout<<"push *"<<endl;
181 j++;
182 }
183 else if( str.at(j) == '&' ) //如果是'&',检查之前有没有'*'
184 {
185 if( currentPri[0] == 0 ) //没有的话,入栈
186 currentPri[1]++;
187 else
188 {
189 currentPri[0]--; //有,计算*
190 currentPri[1]++;
191
192 stateCountV = jihe( stateCountV, stackA );
193 }
194 p = new path();
195 p->from = -2;
196 p->to = -2;
197 p->strPath = "&";
198 stackA.push(p); //不论有没有'*','&'必然会入栈
199 //cout<<"push &"<<endl;
200 j++;
201 }
202 else if( str.at(j) == '|' ) //如果是'|',查看前面有没有*,&
203 {
204 while( currentPri[0] > 0 || currentPri[1] > 0 )//一直算直到没有
205 {
206 if( currentPri[0] > 0 )
207 {
208 currentPri[0]--;
209 stateCountV = jihe( stateCountV, stackA );
210 }
211 else
212 {
213 currentPri[1]--;
214 stateCountV = lianjie( stateCountV, stackA );
215 }
216 }
217 currentPri[2]++; //不论有没有*,&,最后'|'必然还是要入栈
218 p = new path();
219 p->from = -3;
220 p->to = -3;
221 p->strPath = "|";
222 stackA.push(p);
223 //cout<<"push |"<<endl;
224 j++;
225 }
226 else if( str.at(j) == '#' ) //如果是'#',则计算前面所有的运算
227 {
228 while( currentPri[0] > 0 || currentPri[1] > 0 || currentPri[2] > 0 )
229 {
230 if( currentPri[0] > 0 )
231 {
232 currentPri[0]--;
233 stateCountV = jihe( stateCountV, stackA );
234 }
235 else if( currentPri[1] > 0 )
236 {
237 currentPri[1]--;
238 stateCountV = lianjie( stateCountV, stackA );
239 }
240 else
241 {
242 currentPri[2]--;
243 stateCountV = huo( stateCountV, stackA );
244 }
245
246 }
247 //cout<<"The End, there is one struct path in stackA: "<<stackA.size()<<endl;
248 j++;
249 //cout<<"It is "<<(stackA.top())->strPath<<endl;
250 }
251 else if( str.at(j) == '(' ) //发现左括号,进入括号处理过程
252 {
253 //cout<<"发现一个最外层括号"<<endl;
254 int parenthCount = 1;
255 bool flag = true;
256 while( parenthCount > 0 )
257 {
258 if( str.at(j) == '(' )
259 {
260 p = new path();
261 p->from = -5;
262 p->to = -5;
263 p->strPath += "(";
264 stackA.push(p);
265 //cout<<"放入一左个括号"<<endl;
266 if( !flag )parenthCount++;
267 flag = false;
268 }
269 else if( str.at(j) == ')' ) //发现第一个右括号
270 {
271 //cout<<"发现一个内层右括号"<<endl;
272 parenthCount--;
273
274 stack<struct path *> stackB; //定义另两个栈
275 stack<struct path *> stackC;
276
277 p = new path();
278 p->from = -6;
279 p->to = -6;
280 p->strPath += "#";
281 stackB.push(p);
282 while( (stackA.top())->from != -5 ) //栈Apop到第一个(
283 {
284 stackB.push( stackA.top() ); //将栈Apop出的放入B
285 // cout<<"stackA弹出一个"<<stackA.top()->strPath<<endl;
286 stackA.pop();
287
288 }
289 //cout<<"stackA想弹出一个左小括号"<<stackA.top()->strPath<<endl;
290 stackA.pop(); //去掉左小括号
291
292 int pri[3] = {0};
293 while( !stackB.empty() ) //开始处理栈B内的表达式
294 {
295 p = stackB.top();
296 stackB.pop();
297 //cout<<"stackB弹出一个"<<p->from<<endl;
298 if( p->from >= 0 )
299 stackC.push(p); //用栈C处理栈B的表达式
300 else if( p->from == -1 )
301 {
302 pri[0]++;
303 stackC.push(p);
304 }
305 else if( p->from == -2 )
306 {
307 if( pri[0] > 0 )
308 {
309 pri[0]--;
310 stateCountV = jihe( stateCountV, stackC );
311 }
312
313 stackC.push(p);
314 pri[1]++;
315 }
316 else if( p->from == -3 )
317 {
318 while( pri[0] > 0 || pri[1] > 0 )
319 {
320 if( pri[0] > 0 )
321 {
322 pri[0]--;
323 stateCountV = jihe( stateCountV, stackC );
324 }
325 else
326 {
327 pri[1]--;
328 stateCountV = lianjie( stateCountV, stackC );
329 }
330
331 }
332 pri[2]++;
333 stackC.push(p);
334 }
335 else if( p->from == -6 )
336 {
337 while( pri[0] > 0 || pri[1] > 0 || pri[2] > 0 )
338 {
339 if( pri[0] > 0 )
340 {
341 pri[0]--;
342 stateCountV = jihe( stateCountV, stackC );
343 }
344 else if( pri[1] > 0 )
345 {
346 pri[1]--;
347 stateCountV = lianjie( stateCountV, stackC );
348 }
349 else
350 {
351 pri[2]--;
352 stateCountV = huo( stateCountV, stackC );
353 }
354 }
355 }
356 else
357 cout<<"括号内含非法字符!"<<endl;
358 }
359 //cout<<"stackC 中此时应该有1个东东"<<stackC.size()<<"它是"<<stackC.top()->strPath<<endl;
360
361 //cout<<"此时的parenthCount为"<<parenthCount<<endl;
362 stackA.push(stackC.top()); //处理完成,栈C弹出结果
363 stackC.pop(); //放入栈A
364 }
365 else if( str.at(j) == '*' )
366 {
367 p = new path();
368 p->from = -1;
369 p->to = -1;
370 p->strPath += "*";
371 stackA.push(p);
372 }
373 else if( str.at(j) == '&' )
374 {
375 p = new path();
376 p->from = -2;
377 p->to = -2;
378 p->strPath += "&";
379 stackA.push(p);
380 }
381 else if( str.at(j) == '|' )
382 {
383 p = new path();
384 p->from = -3;
385 p->to = -3;
386 p->strPath += "|";
387 stackA.push(p);
388 }
389 else
390 {
391 stateCountV++;
392 temp = stateCountV;
393 stateCountV++;
394 T[temp][stateCountV] += str.at(j);
395 p = new path();
396 p->from = temp;
397 p->to = stateCountV;
398 p->strPath += str.at(j);
399 stackA.push(p);
400 }
401 tv.insert(str.at(j)); //放入tv中
402 j++;
403 }
404 }
405 else
406 cout<<"含有非法字符!"<<endl;
407 }
408
409 struct dealStruct * res = new dealStruct();
410 res->stCount = stateCountV;
411 res->resPtr = stackA.top();
412 stackA.pop();
413 cout<<"NFA生成完毕!"<<endl;
414 return res;
415 }
416
417
418 int jihe( int stateV, stack<struct path*> &stackA )
419 {
420 struct path *b = stackA.top();stackA.pop();
421 struct path *a = stackA.top();stackA.pop();
422 //cout<<"pop the value for * "<<a->strPath<<endl;
423 //cout<<"pop the sign for * "<<b->strPath<<endl;
424
425 stateV++;
426 int temp = stateV;
427 stateV++;
428 T[temp][a->from] += "`";
429 T[temp][stateV] += "`";
430 T[a->to][stateV] += "`";
431 T[a->to][a->from] += "`";
432
433 struct path *p = new path();
434 p->from = temp;
435 p->to = stateV;
436 p->strPath = "(" + a->strPath + ")*";
437 stackA.push(p);
438 //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
439
440 delete a;delete b;
441 return stateV;
442 }
443 int lianjie( int stateV, stack<struct path*> &stackV )
444 {
445 struct path *a = stackV.top();stackV.pop();
446 //cout<<"pop the second value for & "<<a->strPath<<endl;
447 struct path *b = stackV.top();stackV.pop();
448 //cout<<"pop the sign for & "<<b->strPath<<endl;
449 struct path *c = stackV.top();stackV.pop();
450 //cout<<"pop the first value for & "<<c->strPath<<endl;
451 T[c->to][a->from] += "`";
452
453 struct path *p = new path();
454 p->from = c->from;
455 p->to = a->to;
456 p->strPath = "(" + c->strPath + ")&(" + a->strPath + ")";
457 stackV.push(p);
458 //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
459 delete a;delete b;delete c;
460
461 return stateV;
462 }
463 int huo( int stateV, stack<struct path*> &stackA )
464 {
465 struct path *a = stackA.top();stackA.pop();
466 struct path *b = stackA.top();stackA.pop();
467 struct path *c = stackA.top();stackA.pop();
468 //cout<<"pop the first value for | "<<c->strPath<<endl;
469 //cout<<"pop the second value for | "<<a->strPath<<endl;
470 //cout<<"pop the sign for | "<<b->strPath<<endl;
471
472 stateV++;
473 int temp = stateV;
474 stateV++;
475 T[temp][a->from] += "`";
476 T[temp][c->from] += "`";
477 T[a->to][stateV] += "`";
478 T[c->to][stateV] += "`";
479
480 struct path *p = new path();
481 p->from = temp;
482 p->to = stateV;
483 p->strPath = "(" + c->strPath + ")|(" + a->strPath + ")";
484 stackA.push(p);
485 //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
486 delete a;delete b;delete c;
487
488 return stateV;
489 }
490
491 int reduceNFA( int stateCountV, int *typeIndexV, int typeNumV ) //stateCountV为NFA的起始状态
492 {
493 int i,j,k;
494 struct setNode *head,*setPtr,*end; //用于构造链表的指针,新状态用链表存储
495 struct setNode *currentPtr; //用于遍历链表中元素的指针
496
497 set<int>::iterator ite_int; //每个新的状态为一个set,封装到setNode中
498 int countNode = 1;
499 setPtr = new setNode();
500 setPtr->num = 1;
501 setPtr->Dset = new set<int>();
502 setPtr->next = NULL;
503 BFS(stateCountV, stateCountV);
504 for( i = 0;arrForBFS[i] >= 0;i++ )//先把起始状态的ε闭包求出作为新起始状态
505 setPtr->Dset->insert(arrForBFS[i]);
506
507 head = setPtr;
508 end = setPtr;
509 currentPtr = head;
510
511 string letter1[80]; //这里定义了两个数组,一个存放读入的字符
512 set<int> letter2[80]; //一个存放读入相应字符后所到达的状态集合
513 int up = 0;
514
515
516 while( currentPtr != NULL ) //遍历新状态链表
517 {
518 //cout<<"currentPtr遍历到"<<currentPtr->num<<endl;
519 //下面一个双重循环,第一个循环表示遍历新状态集合的每一个状态
520 //第二个循环用于从原NFA找从这个状态出发到达的所有状态
521 for( ite_int = currentPtr->Dset->begin(); ite_int != currentPtr->Dset->end(); ++ite_int )
522 for( i = 0;i <= stateCountV;i++ )
523 if( T[*ite_int][i] != "`" && T[*ite_int][i] != "" )
524 {
525 bool flag2 = 0;
526 for( j = 0;j < up;j++ )
527 if( letter1[j] == T[*ite_int][i] )
528 {
529 flag2 = 1;
530 break;
531 }
532 if( !flag2 )
533 {
534 up++;
535 letter1[up-1] = T[*ite_int][i]; //读入的字符都记录下
536 }
537 BFS( i, stateCountV );
538 for( k = 0;arrForBFS[k] >= 0;k++ )
539 letter2[j].insert(arrForBFS[k]);//所到的状态也记下
540 }
541
542 /*cout<<"从新的状态"<<currentPtr->num<<"生成的矩阵是"<<endl;
543 for( i = 0;i < up;i++ )
544 {
545 cout<<letter1[i]<<" ";
546 for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
547 cout<<*ite_int<<" ";
548 cout<<endl;
549 }//最后letter1,letter2记录了这个新状态读入不同的字符后会到达的状态合集*/
550
551
552 for( i = 0;i < up;i++ ) //下面开始比较这些状态是不是以前有过的
553 {
554 struct setNode *p = head;
555 while( p != NULL )
556 {
557 if( letter2[i] == *(p->Dset) )
558 break;
559 p = p->next;
560 }
561 if( p != NULL ) //有这个状态,只要加一条路径即可
562 recT[currentPtr->num][p->num] += letter1[i];
563 else //没有这个状态,要在链表末加一个节点,并在recT中加路径
564 {
565 countNode++;
566 recT[currentPtr->num][countNode] += letter1[i];
567 setPtr = new setNode();
568 setPtr->num = countNode;
569 setPtr->Dset = new set<int>();
570 for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
571 setPtr->Dset->insert( *ite_int );
572 setPtr->next = NULL;
573 end->next = setPtr;
574 end = setPtr;
575
576 for( k = 0;k < typeNumV;k++ )
577 {
578 ite_int = setPtr->Dset->find(typeIndexV[k]);
579 if( ite_int != setPtr->Dset->end() || *(setPtr->Dset->end()) == stateCountV )
580 {
581 char temp[10];
582 string str = itoa(k,temp,10);
583 recT[setPtr->num][0] = "-"+str; //如果是终止状态,做记号
584 recT[0][setPtr->num] = "-"+str;
585 }
586 }
587 }
588 }
589 for( i = 0;i < up;i++ ) //清空letter1,letter2,用于下一轮遍历
590 {
591 letter1[i] = "";
592 letter2[i].clear();
593 }
594 up = 0;
595 currentPtr = currentPtr->next;
596
597 /*cout<<"此时的链表为"<<endl;
598 struct setNode *p2 = head;
599 while( p2 != NULL )
600 {
601 cout<<p2->num<<" ";
602 p2 = p2->next;
603 }
604 cout<<endl;*/
605 }
606
607 cout<<"DFA生成完毕!"<<endl;
608 /*cout<<"最后DFA为:"<<endl;
609 for( i = 0;i <= countNode;i++,cout<<endl )
610 for( j = 0; j <= countNode;j++ )
611 {
612 if( recT[i][j] == "" )cout<<" |";
613 else
614 cout<<setw(2)<<recT[i][j]<<"|";
615 }*/
616 return countNode;
617
618 }
619
620 void BFS( int row, int stateCountV ) //广度搜索法求一个状态的ε闭包
621 {
622 for( int i = 0;i < 100;i++ )
623 arrForBFS[i] = -1;
624 int ite = -1;
625 queue<int> que;
626 que.push(row);
627 ite++;
628 arrForBFS[ite] = row;
629 while( !que.empty() )
630 {
631 int t = que.front();
632 que.pop();
633 for( int i = 0;i <= stateCountV;i++ )
634 if( T[t][i] == "`" )
635 {
636 ite++;
637 arrForBFS[ite] = i;
638 que.push(i);
639 }
640 }
641 }
642
643 void reduceDFA() //化简DFA算法
644 {
645 /*
646 算法思想:
647 构造二维数组reduceState[2][stateCountDFA]
648 矩阵第一行存储各状态当前所属集合标记
649 矩阵第二行存储个状态读入某个字符后的目标状态所属集合标记
650 每读入某个字符后,矩阵两行相对应位置都相等的状态为同一集合
651 第一行相等而第二行不相等的两状态分到不同集合,即将后一个状态的所属集合标记+1
652 全部字符读完后,扫描矩阵的第一行,即当前所属集合标记,相等的状态合并
653 简化完成
654 */
655 int pos = 1; //非终结状态所属集合标记
656 int poe; //终结状态所属集合标记
657 for (int i=1;i<=stateCountDFA;i++) //该循环用于给所有状态的所属集合赋初值,终结状态为stateCountDFA,非终结状态为1.
658 {
659 if(recT[0][i].find('-')!=string::npos)
660 {
661 poe=stateCountDFA+atoi(&recT[0][i].at(1));//将不同正则表达式的终结状态标注为不同集合:stateCountDFA+正则表达式序号
662 reduceState[0][i]=poe;
663 reduceState[1][i]=poe;
664 }
665 else
666 {
667 reduceState[0][i]=pos;
668 reduceState[1][i]=pos;
669 }
670 }
671 poe=stateCountDFA+regCount+1;
672 bool flag; //bool标记,读每个字符,调整集合状态后,重新读入该字符,直到不发生集合改变为止
673 for(Vt=tv.begin();Vt!=tv.end();Vt++) //读入每个字符的循环
674 {
675 //cout<<*Vt;
676 flag=true;
677 while(flag) //该循环即为重新读入当前字符的循环
678 {
679 flag = false;
680 bool item = true; //用于标记某状态读入当前字符是否产生状态变化
681 for(int j=1;j<=stateCountDFA;j++)
682 {
683 item=false;
684 for(int k=1;k<=stateCountDFA;k++)//循环遍历DFA
685 {
686 if(recT[j][k].find(*Vt)!=string::npos)//若当前状态读入该字符产生状态变化
687 {
688 item=true;
689 reduceState[1][j]=reduceState[0][k];//目标集合赋值为目标状态所属集合
690 }
691 if(!item) //不产生状态变化
692 reduceState[1][j]=0; //目标集合标记为0
693 }
694
695 }
696 bool b1=false;//标记用于判断pos是否加1
697 bool b2=false;//标记用于判断poe是否加1
698 for(int p=1;p<=stateCountDFA;p++)//循环扫描reduceState矩阵.第一个标记位置
699 {
700 b1=false;b2=false;
701 for(int q=2;q<=stateCountDFA;q++)//循环扫描reduceState矩阵.第二个标记位置
702 {
703 if(reduceState[0][p]==reduceState[0][q]&&reduceState[1][p]!=reduceState[1][q])
704 {
705 //如果存在所属集合相等,目标集合不等的情况,将所属集合分开
706 if(reduceState[0][q]<stateCountDFA) //当前状态为非终结状态
707 {
708 b1=true;
709 reduceState[0][q]=pos;
710 }
711 else //当前状态为终结状态
712 {
713 b2=true;
714 reduceState[0][q]=poe;
715 }
716 flag=true; //发生集合调整,标记为true
717 }
718 }
719 if(b1)
720 pos++;
721 if(b2)
722 poe++;
723 }
724 }
725 }
726
727 // for(int i1=0;i1<stateCountDFA;i1++)
728 // cout<<reduceState[0][i1]<<" ";
729 // cout<<endl;
730 // for(i1=0;i1<stateCountDFA;i1++)
731 // cout<<reduceState[1][i1]<<" ";
732
733 for(int j=1;j<stateCountDFA;j++) //循环扫描reduceState矩阵第一行.第一个标记位置
734 {
735 if(reduceState[0][j]!=0)
736 {
737 for(int k=j+1;k<=stateCountDFA;k++) //循环扫描reduceState矩阵第一行.第二个标记位置
738 {
739 if(reduceState[0][j]==reduceState[0][k])//两个标记位置值相等,即所属集合相同
740 combine(j,k); //调用合并函数
741 }
742 }
743 }
744 /*cout<<"化简后的DFA为:"<<endl;
745 for(int m = 0;m <= stateCountDFA;m++,cout<<endl )
746 for(int n = 0; n <= stateCountDFA;n++ )
747 {
748 if( recT[m][n] == "" )cout<<" |";
749 else
750 cout<<recT[m][n]<<"|";
751 }*/
752
753 cout<<"DFA简化完毕!"<<endl;
754 }
755
756 void combine(int j,int k)
757 {
758 /*
759 函数用于合并两个状态
760 把第K状态的相应的行和列的值加到第j状态上
761 并把k状态所有的值置空
762 */
763 for(int i=1;i<=stateCountDFA;i++)
764 {
765 recT[j][i]+=recT[k][i];
766 recT[i][j]+=recT[i][k];
767 }
768 for(i=0;i<=stateCountDFA;i++)
769 {
770 recT[k][i]="";
771 recT[i][k]="";
772 }
773 reduceState[0][k]=0;
774 }
775
776 void produceC( int nodeCountV, string *typeNameV ) //产生.cpp文件
777 {
778 int i, j ,k;
779 char buf[1024]; //临时保存读取出来的文件内容
780 string message = "";
781 string remainCode = "";
782 ifstream in;
783 in.open("Remain.txt");
784 if(in.is_open()) //文件打开成功,说明曾经写入过东西
785 {
786 while(in.good() && !in.eof())
787 {
788 memset(buf,0,1024);
789 in.getline(buf,1204);
790 message = buf;
791 message += "\n";
792 remainCode += message;
793 }
794 in.close();
795 }
796
797 ofstream fout("scanner.cpp",ios::out);
798 fout<<remainCode;
799 fout<<"while(flag)\n"<<"{\n";
800 fout<<"switch(state)\n"<<"{\n";
801 for( i = 1;i <= nodeCountV;i++ ) //遍历每个新状态
802 {
803 fout<<"case "<<i<<":\n";
804 fout<<"last++;\n";
805 fout<<"c = code[last];\n";
806 if( i == 1 )
807 {
808 fout<<"if( c == '%' )\n"; //读到文件末尾,终结
809 fout<<"{\n";
810 fout<<"flag = 0;\n";
811 fout<<"break;\n}\n";
812 }
813 bool flag3 = 0;bool flag2 = 0;
814 for( j = 1;j <= nodeCountV;j++ )
815 if( recT[i][j] != "'" && recT[i][j] != "" )
816 {
817 flag3 = 1;
818 if( flag2 )
819 fout<<"else ";
820
821 fout<<"if( contain( \""<<recT[i][j]<<"\",c ) ) ";
822 fout<<"state = "<<j<<";\n";
823 flag2 = 1;
824 }
825 if( flag3 )
826 fout<<"else\n{\n";
827 //fout<<"{\n"<<"endState.find("<<i<<");\n";
828 //fout<<"if( ite_int == endState.find && *(endState.end()) != "<<i<<")\n";
829 if( recT[i][0][0] == '-' )
830 {
831 string tempIndexStr = recT[i][0].substr(1,recT[i][0].length()-1);
832 int tempIndex = atoi(tempIndexStr.c_str());
833 string type = typeNameV[tempIndex];
834 fout<<"flag2 = true;\n";
835 fout<<"for( k = 0;k < count;k++ )\n";;
836 fout<<"{\n"<<"if( SymbolTable[k][1] == code.substr(first,last-first) )\n";
837 fout<<"{\nflag2 = false;\nbreak;\n}\n}\n";
838 fout<<"if(flag2)\n{\n";
839 fout<<"count++;\n";
840 fout<<"SymbolTable[count-1][0] = \""<<type<<"\";\n";
841 fout<<"SymbolTable[count-1][1] = code.substr(first,last-first);\n";
842 fout<<"}\n";
843
844 fout<<"cout<<code.substr(first,last-first)<<endl;\n";
845 fout<<"last--;\n";
846 fout<<"first = last+1;\n";
847 fout<<"state = 1;\n";
848 }
849 else
850 {
851 fout<<"cout<<\"编译错误!\\n\";\n";
852 fout<<"flag = 0;\n";
853 }
854 if( flag3 )
855 fout<<"}\n";
856 fout<<"break;\n";
857 }
858 fout<<"default:\n"<<"break;\n"<<"}\n}\n";
859
860 fout<<"cout<<\"Symbol 表如下:\"<<endl;\n";
861 fout<<"cout<<setw(3)<<\"序号\"<<setw(10)<<\"token类型\"<<\" \"<<\"token名字\"<<endl;\n";
862 fout<<"for( k2 = 0;k2 < count;k2++ )\n";
863 fout<<"{\n"<<"cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<\" \"<<SymbolTable[k2][1]<<endl;\n";
864 fout<<"}\n";
865 fout<<"system(\"pause\");\n";
866 fout<<"return 0;\n}\n";
867
868 cout<<"代码生成完毕!见scanner.cpp文件。"<<endl;
869 }
2 #include "stdio.h"
3 using namespace std;
4 #include<String>
5 #include<Stack>
6 #include<set>
7 #include<queue>
8 #include<fstream>
9 #include<iomanip>
10 struct path
11 {
12 int from; //一个path代表从一个状态到另一个状态的路径
13 int to;
14 string strPath; //strPath代表这个结构体处理的路径
15 };
16 struct dealStruct
17 {
18 int stCount;
19 struct path* resPtr;
20 };
21 struct setNode
22 {
23 set<int> *Dset;
24 int num;
25 struct setNode * next;
26 };
27 bool judgeLD( char v ) //判断正则表达式中是否含有非法字符
28 {
29 if( ( int(v) > 47 && int(v) <= 57 ) ||
30 ( int(v) >= 65 && int(v) <= 90 )||
31 ( int(v) >= 97 && int(v) <= 122 ) ||
32 v == '>' || v == '<' || v == '=' ||
33 v == '`' || v == '+' || v == '-' )
34 return true;
35 else
36 return false;
37 }
38 struct dealStruct * dealWithStr( string, int ); //通过正则表达式生成NFA
39 int jihe( int, stack<struct path*> & ); //处理正则表达式时'*’的运算函数
40 int lianjie( int, stack<struct path*> & ); //处理正则表达式时'&’(自定义运算符ab相当于a&b)的运算函数
41 int huo( int, stack<struct path*> & );; //处理正则表达式时'|’的运算函数
42 int reduceNFA( int, int*, int ); //NFA->DFA函数
43 void BFS( int, int ); //计算一个状态的ξ闭包
44 void produceC( int, string * ); //生成词法分析器的.cpp文件
45 int arrForBFS[100] = {0}; //存放BFS函数生成的闭包的数组
46 string T[2000][2000]; //NFA的数组表示
47 string recT[2000][2000]; //DFA的数组表示
48 int stateCountDFA; //DFA状态数量
49 int reduceState[2][2000]; //化简DFA用的数组
50 set<char> tv; //所有终结符
51 set<char>::iterator Vt;
52 void reduceDFA(); //化简DFA
53 void combine(int,int); //化简DFA时合并两个状态
54 int regCount; //输入正则表达式的个数
55
56 int main()
57 {
58 regCount = 0;
59 int i,j,k;
60 int temp = 0;
61
62 string * regArr;
63 string * typeName;
64 int * typeIndex;
65 int **resultEnterExt;
66
67
68 ifstream inRegular;
69 inRegular.open("RegularExpressions.txt");
70 inRegular>>regCount; //正则表达式个数
71 typeName = new string[regCount];
72 typeIndex = new int[regCount];
73 resultEnterExt = new int*[regCount]; //定义一个二位动态数组用于存放每个正则表达式所生成的NFA的起始状态和终结状态
74 for( i = 0;i < regCount;i++ )
75 resultEnterExt[i] = new int[2];
76 regArr = new string[regCount];
77 for( i = 0;i < regCount;i++ )
78 {
79 inRegular>>typeName[i];
80 inRegular>>regArr[i];
81 } //读入正则表达式
82
83 int stateCount = -1;
84 for( i = 0;i < regCount;i++ ) //遍历每个正则表达式
85 {
86 for( j = 0;j < regArr[i].length()-1;j++ )
87 if( ( judgeLD(regArr[i].at(j) ) && ( judgeLD(regArr[i].at(j+1)) || regArr[i].at(j+1) == '(' ) ) ||
88 ( regArr[i].at(j) == ')' && ( regArr[i].at(j+1) == '(' || judgeLD(regArr[i].at(j+1)) ) ) ||
89 ( regArr[i].at(j) == '*' && ( regArr[i].at(j+1) == '(' || judgeLD(regArr[i].at(j+1)) ) ) )
90 regArr[i].insert(j+1,"&"); //预处理正则表达式,添加自定义运算符'&'
91
92
93 regArr[i].insert(regArr[i].length(),"#%"); //在末尾添加结束符
94 cout<<regArr[i]<<endl; //输出处理后的正则表达式
95
96
97 j = 0;
98 struct path *p = NULL;
99 struct dealStruct * result = dealWithStr( regArr[i],stateCount ); //对于每个正则表达式调用NFA生成函数
100
101 stateCount = result->stCount;
102 resultEnterExt[i][0] = result->resPtr->from; //记录该正则表达式的起始状态和终结状态
103 resultEnterExt[i][1] = result->resPtr->to;
104 typeIndex[i] = resultEnterExt[i][1];
105 /* cout<<"The number of states now is "<<stateCount<<endl;
106 cout<<"The regular perform dealt with just now is "<<result->resPtr->strPath<<endl;
107 cout<<"The from dealt with just now is "<<resultEnterExt[i][0]<<endl;
108 cout<<"The to dealt with just now is "<<resultEnterExt[i][1]<<endl;*/
109 }
110
111 stateCount++; //汇总所有正则表达式的NFA
112 cout<<"NFA合并完毕!"<<endl;
113
114 for( i = 0;i < regCount;i++ )
115 T[stateCount][resultEnterExt[i][0]] = '`'; //从公共起始状态出发
116
117 /* for( i = 0;i <= stateCount;i++ ) //输出产生的NFA以检验
118 {
119 for( j = 0;j <= stateCount;j++ )
120 {
121 if( T[i][j] == "" ) cout<<" |";
122 else cout<<T[i][j]<<'|';
123 }
124 cout<<endl;
125 }*/
126
127 stateCountDFA = reduceNFA( stateCount, typeIndex, regCount ); //调用NFA->DFA函数生成DFA
128 reduceDFA();
129
130 produceC( stateCountDFA, typeName ); //产生.cpp代码
131
132 system("pause");
133
134 return 0;
135 }
136
137 /*以下为NFA的生成函数,采用三个栈实现Thompson算法,第一个栈用于读入字符和运算符
138 第二个栈用于逆序,第三个栈专门用于处理一对括号内部的表达式,其实用两个栈就可以
139 实现,及把第三个栈舍弃,直接把括号的计算放在第一个栈中进行也可。为了计算更加
140 清楚不易出错故而多加了一个栈*
141 算法思路:读入正则表达式放入栈A中,*,&,|的优先级分别定为3,2,1,而表达式末
142 尾的'#'优先级最低,看做0。用一个三元数组存放当各个运算符的个数,字符均放入栈A中,
143 读到运算符时,当栈外的运算符的优先级大于等于站内所有运算符时,该运算符进栈,
144 如果低于所有站内运算符,计算站内所有运算符表的运算,直到栈内没有比栈外更高优先
145 级的运算符。
146 读到(的时候,(进栈,之后不论读到什么均进栈直到读到第一个),将这对()中的字符串
147 提出,用栈B逆序后放入栈C,在栈C中按上述方式处理直到只有一个元素,将元素返回栈A
148 栈中存储的是指向path结构体的指针*/
149 struct dealStruct * dealWithStr( string str, int stateCountV )
150 {
151 int j = 0;int temp = 0;
152 struct path *p = NULL;
153 int currentPri[3] = {0}; //用于储存栈中三种运算符(*,&,|)的个数,便于计算
154 stack<struct path *> stackA;
155 while( str.at(j) != '%' )
156 {
157 if( judgeLD(str.at(j)) ) //如果是字符,放入栈A中
158 {
159 tv.insert(str.at(j)); //放入tv中
160 stateCountV++;
161 temp = stateCountV;
162 stateCountV++;
163 T[temp][stateCountV] += str.at(j);
164 p = new path();
165 p->from = temp;
166 p->to = stateCountV;
167 p->strPath += str.at(j);
168 stackA.push(p);
169 //cout<<"push a new letter "<<p->strPath<<endl;
170 j++;
171 }
172 else if( str.at(j) == '*' ) //如果是'*'由于没有比它优先级更高的了,
173 { //故必然是入栈
174 currentPri[0]++;
175 p = new path();
176 p->from = -1;
177 p->to = -1;
178 p->strPath += "*";
179 stackA.push(p);
180 //cout<<"push *"<<endl;
181 j++;
182 }
183 else if( str.at(j) == '&' ) //如果是'&',检查之前有没有'*'
184 {
185 if( currentPri[0] == 0 ) //没有的话,入栈
186 currentPri[1]++;
187 else
188 {
189 currentPri[0]--; //有,计算*
190 currentPri[1]++;
191
192 stateCountV = jihe( stateCountV, stackA );
193 }
194 p = new path();
195 p->from = -2;
196 p->to = -2;
197 p->strPath = "&";
198 stackA.push(p); //不论有没有'*','&'必然会入栈
199 //cout<<"push &"<<endl;
200 j++;
201 }
202 else if( str.at(j) == '|' ) //如果是'|',查看前面有没有*,&
203 {
204 while( currentPri[0] > 0 || currentPri[1] > 0 )//一直算直到没有
205 {
206 if( currentPri[0] > 0 )
207 {
208 currentPri[0]--;
209 stateCountV = jihe( stateCountV, stackA );
210 }
211 else
212 {
213 currentPri[1]--;
214 stateCountV = lianjie( stateCountV, stackA );
215 }
216 }
217 currentPri[2]++; //不论有没有*,&,最后'|'必然还是要入栈
218 p = new path();
219 p->from = -3;
220 p->to = -3;
221 p->strPath = "|";
222 stackA.push(p);
223 //cout<<"push |"<<endl;
224 j++;
225 }
226 else if( str.at(j) == '#' ) //如果是'#',则计算前面所有的运算
227 {
228 while( currentPri[0] > 0 || currentPri[1] > 0 || currentPri[2] > 0 )
229 {
230 if( currentPri[0] > 0 )
231 {
232 currentPri[0]--;
233 stateCountV = jihe( stateCountV, stackA );
234 }
235 else if( currentPri[1] > 0 )
236 {
237 currentPri[1]--;
238 stateCountV = lianjie( stateCountV, stackA );
239 }
240 else
241 {
242 currentPri[2]--;
243 stateCountV = huo( stateCountV, stackA );
244 }
245
246 }
247 //cout<<"The End, there is one struct path in stackA: "<<stackA.size()<<endl;
248 j++;
249 //cout<<"It is "<<(stackA.top())->strPath<<endl;
250 }
251 else if( str.at(j) == '(' ) //发现左括号,进入括号处理过程
252 {
253 //cout<<"发现一个最外层括号"<<endl;
254 int parenthCount = 1;
255 bool flag = true;
256 while( parenthCount > 0 )
257 {
258 if( str.at(j) == '(' )
259 {
260 p = new path();
261 p->from = -5;
262 p->to = -5;
263 p->strPath += "(";
264 stackA.push(p);
265 //cout<<"放入一左个括号"<<endl;
266 if( !flag )parenthCount++;
267 flag = false;
268 }
269 else if( str.at(j) == ')' ) //发现第一个右括号
270 {
271 //cout<<"发现一个内层右括号"<<endl;
272 parenthCount--;
273
274 stack<struct path *> stackB; //定义另两个栈
275 stack<struct path *> stackC;
276
277 p = new path();
278 p->from = -6;
279 p->to = -6;
280 p->strPath += "#";
281 stackB.push(p);
282 while( (stackA.top())->from != -5 ) //栈Apop到第一个(
283 {
284 stackB.push( stackA.top() ); //将栈Apop出的放入B
285 // cout<<"stackA弹出一个"<<stackA.top()->strPath<<endl;
286 stackA.pop();
287
288 }
289 //cout<<"stackA想弹出一个左小括号"<<stackA.top()->strPath<<endl;
290 stackA.pop(); //去掉左小括号
291
292 int pri[3] = {0};
293 while( !stackB.empty() ) //开始处理栈B内的表达式
294 {
295 p = stackB.top();
296 stackB.pop();
297 //cout<<"stackB弹出一个"<<p->from<<endl;
298 if( p->from >= 0 )
299 stackC.push(p); //用栈C处理栈B的表达式
300 else if( p->from == -1 )
301 {
302 pri[0]++;
303 stackC.push(p);
304 }
305 else if( p->from == -2 )
306 {
307 if( pri[0] > 0 )
308 {
309 pri[0]--;
310 stateCountV = jihe( stateCountV, stackC );
311 }
312
313 stackC.push(p);
314 pri[1]++;
315 }
316 else if( p->from == -3 )
317 {
318 while( pri[0] > 0 || pri[1] > 0 )
319 {
320 if( pri[0] > 0 )
321 {
322 pri[0]--;
323 stateCountV = jihe( stateCountV, stackC );
324 }
325 else
326 {
327 pri[1]--;
328 stateCountV = lianjie( stateCountV, stackC );
329 }
330
331 }
332 pri[2]++;
333 stackC.push(p);
334 }
335 else if( p->from == -6 )
336 {
337 while( pri[0] > 0 || pri[1] > 0 || pri[2] > 0 )
338 {
339 if( pri[0] > 0 )
340 {
341 pri[0]--;
342 stateCountV = jihe( stateCountV, stackC );
343 }
344 else if( pri[1] > 0 )
345 {
346 pri[1]--;
347 stateCountV = lianjie( stateCountV, stackC );
348 }
349 else
350 {
351 pri[2]--;
352 stateCountV = huo( stateCountV, stackC );
353 }
354 }
355 }
356 else
357 cout<<"括号内含非法字符!"<<endl;
358 }
359 //cout<<"stackC 中此时应该有1个东东"<<stackC.size()<<"它是"<<stackC.top()->strPath<<endl;
360
361 //cout<<"此时的parenthCount为"<<parenthCount<<endl;
362 stackA.push(stackC.top()); //处理完成,栈C弹出结果
363 stackC.pop(); //放入栈A
364 }
365 else if( str.at(j) == '*' )
366 {
367 p = new path();
368 p->from = -1;
369 p->to = -1;
370 p->strPath += "*";
371 stackA.push(p);
372 }
373 else if( str.at(j) == '&' )
374 {
375 p = new path();
376 p->from = -2;
377 p->to = -2;
378 p->strPath += "&";
379 stackA.push(p);
380 }
381 else if( str.at(j) == '|' )
382 {
383 p = new path();
384 p->from = -3;
385 p->to = -3;
386 p->strPath += "|";
387 stackA.push(p);
388 }
389 else
390 {
391 stateCountV++;
392 temp = stateCountV;
393 stateCountV++;
394 T[temp][stateCountV] += str.at(j);
395 p = new path();
396 p->from = temp;
397 p->to = stateCountV;
398 p->strPath += str.at(j);
399 stackA.push(p);
400 }
401 tv.insert(str.at(j)); //放入tv中
402 j++;
403 }
404 }
405 else
406 cout<<"含有非法字符!"<<endl;
407 }
408
409 struct dealStruct * res = new dealStruct();
410 res->stCount = stateCountV;
411 res->resPtr = stackA.top();
412 stackA.pop();
413 cout<<"NFA生成完毕!"<<endl;
414 return res;
415 }
416
417
418 int jihe( int stateV, stack<struct path*> &stackA )
419 {
420 struct path *b = stackA.top();stackA.pop();
421 struct path *a = stackA.top();stackA.pop();
422 //cout<<"pop the value for * "<<a->strPath<<endl;
423 //cout<<"pop the sign for * "<<b->strPath<<endl;
424
425 stateV++;
426 int temp = stateV;
427 stateV++;
428 T[temp][a->from] += "`";
429 T[temp][stateV] += "`";
430 T[a->to][stateV] += "`";
431 T[a->to][a->from] += "`";
432
433 struct path *p = new path();
434 p->from = temp;
435 p->to = stateV;
436 p->strPath = "(" + a->strPath + ")*";
437 stackA.push(p);
438 //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
439
440 delete a;delete b;
441 return stateV;
442 }
443 int lianjie( int stateV, stack<struct path*> &stackV )
444 {
445 struct path *a = stackV.top();stackV.pop();
446 //cout<<"pop the second value for & "<<a->strPath<<endl;
447 struct path *b = stackV.top();stackV.pop();
448 //cout<<"pop the sign for & "<<b->strPath<<endl;
449 struct path *c = stackV.top();stackV.pop();
450 //cout<<"pop the first value for & "<<c->strPath<<endl;
451 T[c->to][a->from] += "`";
452
453 struct path *p = new path();
454 p->from = c->from;
455 p->to = a->to;
456 p->strPath = "(" + c->strPath + ")&(" + a->strPath + ")";
457 stackV.push(p);
458 //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
459 delete a;delete b;delete c;
460
461 return stateV;
462 }
463 int huo( int stateV, stack<struct path*> &stackA )
464 {
465 struct path *a = stackA.top();stackA.pop();
466 struct path *b = stackA.top();stackA.pop();
467 struct path *c = stackA.top();stackA.pop();
468 //cout<<"pop the first value for | "<<c->strPath<<endl;
469 //cout<<"pop the second value for | "<<a->strPath<<endl;
470 //cout<<"pop the sign for | "<<b->strPath<<endl;
471
472 stateV++;
473 int temp = stateV;
474 stateV++;
475 T[temp][a->from] += "`";
476 T[temp][c->from] += "`";
477 T[a->to][stateV] += "`";
478 T[c->to][stateV] += "`";
479
480 struct path *p = new path();
481 p->from = temp;
482 p->to = stateV;
483 p->strPath = "(" + c->strPath + ")|(" + a->strPath + ")";
484 stackA.push(p);
485 //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
486 delete a;delete b;delete c;
487
488 return stateV;
489 }
490
491 int reduceNFA( int stateCountV, int *typeIndexV, int typeNumV ) //stateCountV为NFA的起始状态
492 {
493 int i,j,k;
494 struct setNode *head,*setPtr,*end; //用于构造链表的指针,新状态用链表存储
495 struct setNode *currentPtr; //用于遍历链表中元素的指针
496
497 set<int>::iterator ite_int; //每个新的状态为一个set,封装到setNode中
498 int countNode = 1;
499 setPtr = new setNode();
500 setPtr->num = 1;
501 setPtr->Dset = new set<int>();
502 setPtr->next = NULL;
503 BFS(stateCountV, stateCountV);
504 for( i = 0;arrForBFS[i] >= 0;i++ )//先把起始状态的ε闭包求出作为新起始状态
505 setPtr->Dset->insert(arrForBFS[i]);
506
507 head = setPtr;
508 end = setPtr;
509 currentPtr = head;
510
511 string letter1[80]; //这里定义了两个数组,一个存放读入的字符
512 set<int> letter2[80]; //一个存放读入相应字符后所到达的状态集合
513 int up = 0;
514
515
516 while( currentPtr != NULL ) //遍历新状态链表
517 {
518 //cout<<"currentPtr遍历到"<<currentPtr->num<<endl;
519 //下面一个双重循环,第一个循环表示遍历新状态集合的每一个状态
520 //第二个循环用于从原NFA找从这个状态出发到达的所有状态
521 for( ite_int = currentPtr->Dset->begin(); ite_int != currentPtr->Dset->end(); ++ite_int )
522 for( i = 0;i <= stateCountV;i++ )
523 if( T[*ite_int][i] != "`" && T[*ite_int][i] != "" )
524 {
525 bool flag2 = 0;
526 for( j = 0;j < up;j++ )
527 if( letter1[j] == T[*ite_int][i] )
528 {
529 flag2 = 1;
530 break;
531 }
532 if( !flag2 )
533 {
534 up++;
535 letter1[up-1] = T[*ite_int][i]; //读入的字符都记录下
536 }
537 BFS( i, stateCountV );
538 for( k = 0;arrForBFS[k] >= 0;k++ )
539 letter2[j].insert(arrForBFS[k]);//所到的状态也记下
540 }
541
542 /*cout<<"从新的状态"<<currentPtr->num<<"生成的矩阵是"<<endl;
543 for( i = 0;i < up;i++ )
544 {
545 cout<<letter1[i]<<" ";
546 for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
547 cout<<*ite_int<<" ";
548 cout<<endl;
549 }//最后letter1,letter2记录了这个新状态读入不同的字符后会到达的状态合集*/
550
551
552 for( i = 0;i < up;i++ ) //下面开始比较这些状态是不是以前有过的
553 {
554 struct setNode *p = head;
555 while( p != NULL )
556 {
557 if( letter2[i] == *(p->Dset) )
558 break;
559 p = p->next;
560 }
561 if( p != NULL ) //有这个状态,只要加一条路径即可
562 recT[currentPtr->num][p->num] += letter1[i];
563 else //没有这个状态,要在链表末加一个节点,并在recT中加路径
564 {
565 countNode++;
566 recT[currentPtr->num][countNode] += letter1[i];
567 setPtr = new setNode();
568 setPtr->num = countNode;
569 setPtr->Dset = new set<int>();
570 for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
571 setPtr->Dset->insert( *ite_int );
572 setPtr->next = NULL;
573 end->next = setPtr;
574 end = setPtr;
575
576 for( k = 0;k < typeNumV;k++ )
577 {
578 ite_int = setPtr->Dset->find(typeIndexV[k]);
579 if( ite_int != setPtr->Dset->end() || *(setPtr->Dset->end()) == stateCountV )
580 {
581 char temp[10];
582 string str = itoa(k,temp,10);
583 recT[setPtr->num][0] = "-"+str; //如果是终止状态,做记号
584 recT[0][setPtr->num] = "-"+str;
585 }
586 }
587 }
588 }
589 for( i = 0;i < up;i++ ) //清空letter1,letter2,用于下一轮遍历
590 {
591 letter1[i] = "";
592 letter2[i].clear();
593 }
594 up = 0;
595 currentPtr = currentPtr->next;
596
597 /*cout<<"此时的链表为"<<endl;
598 struct setNode *p2 = head;
599 while( p2 != NULL )
600 {
601 cout<<p2->num<<" ";
602 p2 = p2->next;
603 }
604 cout<<endl;*/
605 }
606
607 cout<<"DFA生成完毕!"<<endl;
608 /*cout<<"最后DFA为:"<<endl;
609 for( i = 0;i <= countNode;i++,cout<<endl )
610 for( j = 0; j <= countNode;j++ )
611 {
612 if( recT[i][j] == "" )cout<<" |";
613 else
614 cout<<setw(2)<<recT[i][j]<<"|";
615 }*/
616 return countNode;
617
618 }
619
620 void BFS( int row, int stateCountV ) //广度搜索法求一个状态的ε闭包
621 {
622 for( int i = 0;i < 100;i++ )
623 arrForBFS[i] = -1;
624 int ite = -1;
625 queue<int> que;
626 que.push(row);
627 ite++;
628 arrForBFS[ite] = row;
629 while( !que.empty() )
630 {
631 int t = que.front();
632 que.pop();
633 for( int i = 0;i <= stateCountV;i++ )
634 if( T[t][i] == "`" )
635 {
636 ite++;
637 arrForBFS[ite] = i;
638 que.push(i);
639 }
640 }
641 }
642
643 void reduceDFA() //化简DFA算法
644 {
645 /*
646 算法思想:
647 构造二维数组reduceState[2][stateCountDFA]
648 矩阵第一行存储各状态当前所属集合标记
649 矩阵第二行存储个状态读入某个字符后的目标状态所属集合标记
650 每读入某个字符后,矩阵两行相对应位置都相等的状态为同一集合
651 第一行相等而第二行不相等的两状态分到不同集合,即将后一个状态的所属集合标记+1
652 全部字符读完后,扫描矩阵的第一行,即当前所属集合标记,相等的状态合并
653 简化完成
654 */
655 int pos = 1; //非终结状态所属集合标记
656 int poe; //终结状态所属集合标记
657 for (int i=1;i<=stateCountDFA;i++) //该循环用于给所有状态的所属集合赋初值,终结状态为stateCountDFA,非终结状态为1.
658 {
659 if(recT[0][i].find('-')!=string::npos)
660 {
661 poe=stateCountDFA+atoi(&recT[0][i].at(1));//将不同正则表达式的终结状态标注为不同集合:stateCountDFA+正则表达式序号
662 reduceState[0][i]=poe;
663 reduceState[1][i]=poe;
664 }
665 else
666 {
667 reduceState[0][i]=pos;
668 reduceState[1][i]=pos;
669 }
670 }
671 poe=stateCountDFA+regCount+1;
672 bool flag; //bool标记,读每个字符,调整集合状态后,重新读入该字符,直到不发生集合改变为止
673 for(Vt=tv.begin();Vt!=tv.end();Vt++) //读入每个字符的循环
674 {
675 //cout<<*Vt;
676 flag=true;
677 while(flag) //该循环即为重新读入当前字符的循环
678 {
679 flag = false;
680 bool item = true; //用于标记某状态读入当前字符是否产生状态变化
681 for(int j=1;j<=stateCountDFA;j++)
682 {
683 item=false;
684 for(int k=1;k<=stateCountDFA;k++)//循环遍历DFA
685 {
686 if(recT[j][k].find(*Vt)!=string::npos)//若当前状态读入该字符产生状态变化
687 {
688 item=true;
689 reduceState[1][j]=reduceState[0][k];//目标集合赋值为目标状态所属集合
690 }
691 if(!item) //不产生状态变化
692 reduceState[1][j]=0; //目标集合标记为0
693 }
694
695 }
696 bool b1=false;//标记用于判断pos是否加1
697 bool b2=false;//标记用于判断poe是否加1
698 for(int p=1;p<=stateCountDFA;p++)//循环扫描reduceState矩阵.第一个标记位置
699 {
700 b1=false;b2=false;
701 for(int q=2;q<=stateCountDFA;q++)//循环扫描reduceState矩阵.第二个标记位置
702 {
703 if(reduceState[0][p]==reduceState[0][q]&&reduceState[1][p]!=reduceState[1][q])
704 {
705 //如果存在所属集合相等,目标集合不等的情况,将所属集合分开
706 if(reduceState[0][q]<stateCountDFA) //当前状态为非终结状态
707 {
708 b1=true;
709 reduceState[0][q]=pos;
710 }
711 else //当前状态为终结状态
712 {
713 b2=true;
714 reduceState[0][q]=poe;
715 }
716 flag=true; //发生集合调整,标记为true
717 }
718 }
719 if(b1)
720 pos++;
721 if(b2)
722 poe++;
723 }
724 }
725 }
726
727 // for(int i1=0;i1<stateCountDFA;i1++)
728 // cout<<reduceState[0][i1]<<" ";
729 // cout<<endl;
730 // for(i1=0;i1<stateCountDFA;i1++)
731 // cout<<reduceState[1][i1]<<" ";
732
733 for(int j=1;j<stateCountDFA;j++) //循环扫描reduceState矩阵第一行.第一个标记位置
734 {
735 if(reduceState[0][j]!=0)
736 {
737 for(int k=j+1;k<=stateCountDFA;k++) //循环扫描reduceState矩阵第一行.第二个标记位置
738 {
739 if(reduceState[0][j]==reduceState[0][k])//两个标记位置值相等,即所属集合相同
740 combine(j,k); //调用合并函数
741 }
742 }
743 }
744 /*cout<<"化简后的DFA为:"<<endl;
745 for(int m = 0;m <= stateCountDFA;m++,cout<<endl )
746 for(int n = 0; n <= stateCountDFA;n++ )
747 {
748 if( recT[m][n] == "" )cout<<" |";
749 else
750 cout<<recT[m][n]<<"|";
751 }*/
752
753 cout<<"DFA简化完毕!"<<endl;
754 }
755
756 void combine(int j,int k)
757 {
758 /*
759 函数用于合并两个状态
760 把第K状态的相应的行和列的值加到第j状态上
761 并把k状态所有的值置空
762 */
763 for(int i=1;i<=stateCountDFA;i++)
764 {
765 recT[j][i]+=recT[k][i];
766 recT[i][j]+=recT[i][k];
767 }
768 for(i=0;i<=stateCountDFA;i++)
769 {
770 recT[k][i]="";
771 recT[i][k]="";
772 }
773 reduceState[0][k]=0;
774 }
775
776 void produceC( int nodeCountV, string *typeNameV ) //产生.cpp文件
777 {
778 int i, j ,k;
779 char buf[1024]; //临时保存读取出来的文件内容
780 string message = "";
781 string remainCode = "";
782 ifstream in;
783 in.open("Remain.txt");
784 if(in.is_open()) //文件打开成功,说明曾经写入过东西
785 {
786 while(in.good() && !in.eof())
787 {
788 memset(buf,0,1024);
789 in.getline(buf,1204);
790 message = buf;
791 message += "\n";
792 remainCode += message;
793 }
794 in.close();
795 }
796
797 ofstream fout("scanner.cpp",ios::out);
798 fout<<remainCode;
799 fout<<"while(flag)\n"<<"{\n";
800 fout<<"switch(state)\n"<<"{\n";
801 for( i = 1;i <= nodeCountV;i++ ) //遍历每个新状态
802 {
803 fout<<"case "<<i<<":\n";
804 fout<<"last++;\n";
805 fout<<"c = code[last];\n";
806 if( i == 1 )
807 {
808 fout<<"if( c == '%' )\n"; //读到文件末尾,终结
809 fout<<"{\n";
810 fout<<"flag = 0;\n";
811 fout<<"break;\n}\n";
812 }
813 bool flag3 = 0;bool flag2 = 0;
814 for( j = 1;j <= nodeCountV;j++ )
815 if( recT[i][j] != "'" && recT[i][j] != "" )
816 {
817 flag3 = 1;
818 if( flag2 )
819 fout<<"else ";
820
821 fout<<"if( contain( \""<<recT[i][j]<<"\",c ) ) ";
822 fout<<"state = "<<j<<";\n";
823 flag2 = 1;
824 }
825 if( flag3 )
826 fout<<"else\n{\n";
827 //fout<<"{\n"<<"endState.find("<<i<<");\n";
828 //fout<<"if( ite_int == endState.find && *(endState.end()) != "<<i<<")\n";
829 if( recT[i][0][0] == '-' )
830 {
831 string tempIndexStr = recT[i][0].substr(1,recT[i][0].length()-1);
832 int tempIndex = atoi(tempIndexStr.c_str());
833 string type = typeNameV[tempIndex];
834 fout<<"flag2 = true;\n";
835 fout<<"for( k = 0;k < count;k++ )\n";;
836 fout<<"{\n"<<"if( SymbolTable[k][1] == code.substr(first,last-first) )\n";
837 fout<<"{\nflag2 = false;\nbreak;\n}\n}\n";
838 fout<<"if(flag2)\n{\n";
839 fout<<"count++;\n";
840 fout<<"SymbolTable[count-1][0] = \""<<type<<"\";\n";
841 fout<<"SymbolTable[count-1][1] = code.substr(first,last-first);\n";
842 fout<<"}\n";
843
844 fout<<"cout<<code.substr(first,last-first)<<endl;\n";
845 fout<<"last--;\n";
846 fout<<"first = last+1;\n";
847 fout<<"state = 1;\n";
848 }
849 else
850 {
851 fout<<"cout<<\"编译错误!\\n\";\n";
852 fout<<"flag = 0;\n";
853 }
854 if( flag3 )
855 fout<<"}\n";
856 fout<<"break;\n";
857 }
858 fout<<"default:\n"<<"break;\n"<<"}\n}\n";
859
860 fout<<"cout<<\"Symbol 表如下:\"<<endl;\n";
861 fout<<"cout<<setw(3)<<\"序号\"<<setw(10)<<\"token类型\"<<\" \"<<\"token名字\"<<endl;\n";
862 fout<<"for( k2 = 0;k2 < count;k2++ )\n";
863 fout<<"{\n"<<"cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<\" \"<<SymbolTable[k2][1]<<endl;\n";
864 fout<<"}\n";
865 fout<<"system(\"pause\");\n";
866 fout<<"return 0;\n}\n";
867
868 cout<<"代码生成完毕!见scanner.cpp文件。"<<endl;
869 }
正则表达式在RegularExpressions.txt中写明,示例如下:
4
Identity
a(bc)*|bc*|d
Number
(1|2|3|4|5|6|7|8|9|0)*
key
if|else|then
operator
>(`|=)|<(`|=)|=(=|`)
Identity
a(bc)*|bc*|d
Number
(1|2|3|4|5|6|7|8|9|0)*
key
if|else|then
operator
>(`|=)|<(`|=)|=(=|`)
第一行为正则表达式的个数,之后输入表达式代表的类型名,回车,表达式,回车,以此类推。
remain.txt存放着生成的代码文件中固定的代码,生成代码时会先从此文件读取。
里面的代码如下:
remain.txt
#include<iostream>
using namespace std;
#include<string>
#include<fstream>
#include<set>
#include<iomanip>
string SymbolTable[1000][2] = {""};
bool contain( string str, char v )
{
for( int i = 0;i < str.length();i++ )
if( str[i] == v )
return true;
return false;
}
int main()
{
string tempCode = "";
string code = "";
int i,j,k,k2=0;
char buf[1024]; //临时保存读取出来的文件内容
string message = "";
ifstream infile;
infile.open("code.txt");
if(infile.is_open()) //文件打开成功,说明曾经写入过东西
{
while(infile.good() && !infile.eof())
{
memset(buf,0,1024);
infile.getline(buf,1204);
message = buf;
tempCode += message;
}
infile.close();
}
for( i = 0;i < tempCode.length();i++ )
{
if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
code += tempCode[i];
}
code += "%";
cout<<code<<endl;
int first = 0, last = -1;
char c;
string tempStr = "";
int state = 1;
bool flag = 1;
bool flag2 = true;
int count = 0;
//set<int> endState;
//set<int>::iterator ite_int;
///////////////////////////////////////////保留代码///////////////////////////////////////////////////
using namespace std;
#include<string>
#include<fstream>
#include<set>
#include<iomanip>
string SymbolTable[1000][2] = {""};
bool contain( string str, char v )
{
for( int i = 0;i < str.length();i++ )
if( str[i] == v )
return true;
return false;
}
int main()
{
string tempCode = "";
string code = "";
int i,j,k,k2=0;
char buf[1024]; //临时保存读取出来的文件内容
string message = "";
ifstream infile;
infile.open("code.txt");
if(infile.is_open()) //文件打开成功,说明曾经写入过东西
{
while(infile.good() && !infile.eof())
{
memset(buf,0,1024);
infile.getline(buf,1204);
message = buf;
tempCode += message;
}
infile.close();
}
for( i = 0;i < tempCode.length();i++ )
{
if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
code += tempCode[i];
}
code += "%";
cout<<code<<endl;
int first = 0, last = -1;
char c;
string tempStr = "";
int state = 1;
bool flag = 1;
bool flag2 = true;
int count = 0;
//set<int> endState;
//set<int>::iterator ite_int;
///////////////////////////////////////////保留代码///////////////////////////////////////////////////
表达式读入生成的DFA简化后生成.cpp文件,命名为scanner.cpp
根据上面的那个示例,生成的scanner.cpp文件内容为:
scanner.txt
#include<iostream>
using namespace std;
#include<string>
#include<fstream>
#include<set>
#include<iomanip>
string SymbolTable[1000][2] = {""};
bool contain( string str, char v )
{
for( int i = 0;i < str.length();i++ )
if( str[i] == v )
return true;
return false;
}
int main()
{
string tempCode = "";
string code = "";
int i,j,k,k2=0;
char buf[1024]; //临时保存读取出来的文件内容
string message = "";
ifstream infile;
infile.open("code.txt");
if(infile.is_open()) //文件打开成功,说明曾经写入过东西
{
while(infile.good() && !infile.eof())
{
memset(buf,0,1024);
infile.getline(buf,1204);
message = buf;
tempCode += message;
}
infile.close();
}
for( i = 0;i < tempCode.length();i++ )
{
if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
code += tempCode[i];
}
code += "%";
cout<<code<<endl;
int first = 0, last = -1;
char c;
string tempStr = "";
int state = 1;
bool flag = 1;
bool flag2 = true;
int count = 0;
//set<int> endState;
//set<int>::iterator ite_int;
///////////////////////////////////////////保留代码///////////////////////////////////////////////////
while(flag)
{
switch(state)
{
case 1:
last++;
c = code[last];
if( c == '%' )
{
flag = 0;
break;
}
if( contain( "a",c ) ) state = 2;
else if( contain( "b",c ) ) state = 3;
else if( contain( "d",c ) ) state = 4;
else if( contain( "1234567890",c ) ) state = 5;
else if( contain( "i",c ) ) state = 15;
else if( contain( "e",c ) ) state = 16;
else if( contain( "t",c ) ) state = 17;
else if( contain( "><=",c ) ) state = 18;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 2:
last++;
c = code[last];
if( contain( "bb",c ) ) state = 21;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 3:
last++;
c = code[last];
if( contain( "c",c ) ) state = 3;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 4:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 5:
last++;
c = code[last];
if( contain( "1121233123444123455551234566666123456777777123456788888881234567899999999123456789000000000",c ) ) state = 5;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Number";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 6:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 7:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 8:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 9:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 10:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 11:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 12:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 13:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 14:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 15:
last++;
c = code[last];
if( contain( "f",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 16:
last++;
c = code[last];
if( contain( "l",c ) ) state = 24;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 17:
last++;
c = code[last];
if( contain( "h",c ) ) state = 25;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 18:
last++;
c = code[last];
if( contain( "===",c ) ) state = 26;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "operator";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 19:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 20:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 21:
last++;
c = code[last];
if( contain( "c",c ) ) state = 2;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 22:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 23:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "key";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 24:
last++;
c = code[last];
if( contain( "s",c ) ) state = 30;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 25:
last++;
c = code[last];
if( contain( "e",c ) ) state = 31;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 26:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "operator";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 27:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 28:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 29:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 30:
last++;
c = code[last];
if( contain( "e",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 31:
last++;
c = code[last];
if( contain( "n",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 32:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 33:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
default:
break;
}
}
cout<<"Symbol 表如下:"<<endl;
cout<<setw(3)<<"序号"<<setw(10)<<"token类型"<<" "<<"token名字"<<endl;
for( k2 = 0;k2 < count;k2++ )
{
cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<" "<<SymbolTable[k2][1]<<endl;
}
system("pause");
return 0;
}
using namespace std;
#include<string>
#include<fstream>
#include<set>
#include<iomanip>
string SymbolTable[1000][2] = {""};
bool contain( string str, char v )
{
for( int i = 0;i < str.length();i++ )
if( str[i] == v )
return true;
return false;
}
int main()
{
string tempCode = "";
string code = "";
int i,j,k,k2=0;
char buf[1024]; //临时保存读取出来的文件内容
string message = "";
ifstream infile;
infile.open("code.txt");
if(infile.is_open()) //文件打开成功,说明曾经写入过东西
{
while(infile.good() && !infile.eof())
{
memset(buf,0,1024);
infile.getline(buf,1204);
message = buf;
tempCode += message;
}
infile.close();
}
for( i = 0;i < tempCode.length();i++ )
{
if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
code += tempCode[i];
}
code += "%";
cout<<code<<endl;
int first = 0, last = -1;
char c;
string tempStr = "";
int state = 1;
bool flag = 1;
bool flag2 = true;
int count = 0;
//set<int> endState;
//set<int>::iterator ite_int;
///////////////////////////////////////////保留代码///////////////////////////////////////////////////
while(flag)
{
switch(state)
{
case 1:
last++;
c = code[last];
if( c == '%' )
{
flag = 0;
break;
}
if( contain( "a",c ) ) state = 2;
else if( contain( "b",c ) ) state = 3;
else if( contain( "d",c ) ) state = 4;
else if( contain( "1234567890",c ) ) state = 5;
else if( contain( "i",c ) ) state = 15;
else if( contain( "e",c ) ) state = 16;
else if( contain( "t",c ) ) state = 17;
else if( contain( "><=",c ) ) state = 18;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 2:
last++;
c = code[last];
if( contain( "bb",c ) ) state = 21;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 3:
last++;
c = code[last];
if( contain( "c",c ) ) state = 3;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 4:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 5:
last++;
c = code[last];
if( contain( "1121233123444123455551234566666123456777777123456788888881234567899999999123456789000000000",c ) ) state = 5;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Number";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 6:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 7:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 8:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 9:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 10:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 11:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 12:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 13:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 14:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 15:
last++;
c = code[last];
if( contain( "f",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 16:
last++;
c = code[last];
if( contain( "l",c ) ) state = 24;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 17:
last++;
c = code[last];
if( contain( "h",c ) ) state = 25;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 18:
last++;
c = code[last];
if( contain( "===",c ) ) state = 26;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "operator";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 19:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 20:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 21:
last++;
c = code[last];
if( contain( "c",c ) ) state = 2;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 22:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 23:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "key";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 24:
last++;
c = code[last];
if( contain( "s",c ) ) state = 30;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 25:
last++;
c = code[last];
if( contain( "e",c ) ) state = 31;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 26:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "operator";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 27:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 28:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 29:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 30:
last++;
c = code[last];
if( contain( "e",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 31:
last++;
c = code[last];
if( contain( "n",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 32:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 33:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
default:
break;
}
}
cout<<"Symbol 表如下:"<<endl;
cout<<setw(3)<<"序号"<<setw(10)<<"token类型"<<" "<<"token名字"<<endl;
for( k2 = 0;k2 < count;k2++ )
{
cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<" "<<SymbolTable[k2][1]<<endl;
}
system("pause");
return 0;
}
等待编译的代码写在code.txt中,这个txt相当于代码输入界面,这里的内容对应着上面的那些正则表实例,内容如下:
if a >= 3
bcc = 123;
else if d < 9
abc = 12;
bcc = 123;
else if d < 9
abc = 12;
运行主体代码后,scanner.cpp里的代码编译后的程序会读入code.txt中的代码,再在运行界面上输出token串和Symbol Table,大功告成!
------------------------------------------------
Felix原创,转载请注明出处,感谢博客园!
posted on 2010-01-15 19:51 Felix Fang 阅读(499) 评论(0) 编辑 收藏 举报