语言:C/C++
关键字:词法分析,哈希表,链表法处理哈希冲突
摘要:开发一个简单的编译器。编译一个程序的步骤:词法分析,语法分析,语义分析,中间代码生成,目标代码生成。本文目的是实现词法分析。
1 项目需求描述
分析一个C语言程序的内容:将函数名和变量名设置为灰色,将关键字设置为绿色,将整型常量、浮点型常量、字符常量、字符串常量等设置为褐色,将运算符设置为红色。
2 设计思路
2.1 预处理
2.1.1 定义存储字符串的结构体。
/* Record holding one spelling together with its token code.
   The next pointer lets records be chained into a singly linked list. */
struct TKWord {
    int tkcode;       /* token code (an e_TokenCode value) */
    string spelling;  /* the text of the token */
    TKWord* next;     /* following record in the chain, or NULL */
};
typedef TKWord* pTKWord;
2.1.2 通过枚举对预置的字符串进行编号。
/* Numbering of the preset spellings. The enumerators are consecutive and
   start at 0; the groups appear in the order operators/separators,
   constants, keywords, identifier. */
enum e_TokenCode {
    /* operators and separators */
    TK_PLUS,       /* +   plus */
    TK_MINUS,      /* -   minus */
    TK_STAR,       /* *   star */
    TK_DIVIDE,     /* /   divide */
    TK_MOD,        /* %   remainder */
    TK_EQ,         /* ==  equal */
    TK_NEQ,        /* !=  not equal */
    TK_LT,         /* <   less than */
    TK_LEQ,        /* <=  less than or equal */
    TK_GT,         /* >   greater than */
    TK_GEQ,        /* >=  greater than or equal */
    TK_ASSIGN,     /* =   assignment */
    TK_POINTSTO,   /* ->  struct member access through a pointer */
    TK_DOT,        /* .   struct member access */
    TK_AND,        /* &   address-of / and */
    TK_OPENPA,     /* (   left parenthesis */
    TK_CLOSEPA,    /* )   right parenthesis */
    TK_OPENBR,     /* [   left bracket */
    TK_CLOSEBR,    /* ]   right bracket */
    TK_BEGIN,      /* {   left brace */
    TK_END,        /* }   right brace */
    TK_SEMICOLON,  /* ;   semicolon */
    TK_COMMA,      /* ,   comma */
    TK_ELLIPSIS,   /* ... ellipsis */
    TK_EOF,        /* end of file */

    /* constants */
    TK_CINT,       /* integer constant */
    TK_CFLOAT,     /* floating-point constant */
    TK_CCHAR,      /* character constant */
    TK_CSTR,       /* string constant */

    /* keywords */
    KW_CHAR,       /* char */
    KW_SHORT,      /* short */
    KW_INT,        /* int */
    KW_VOID,       /* void */
    KW_STRUCT,     /* struct */
    KW_IF,         /* if */
    KW_ELSE,       /* else */
    KW_FOR,        /* for */
    KW_CONTINUE,   /* continue */
    KW_BREAK,      /* break */
    KW_RETURN,     /* return */

    /* identifier */
    TK_IDENT       /* function or variable name */
};
typedef enum e_TokenCode tokencode;
2.1.3 定义一个静态结构数组,其中存放各种预置的字符串及其token。然后将它们分别通过哈希公式映射并添加到哈希表中。
/* Static table of the preset spellings and their token codes.
   Every entry starts with a NULL chain pointer. */
static TKWord keywords[] = {
    /* operators and separators */
    {TK_PLUS, "+", NULL},        {TK_MINUS, "-", NULL},
    {TK_STAR, "*", NULL},        {TK_DIVIDE, "/", NULL},
    {TK_MOD, "%", NULL},         {TK_EQ, "==", NULL},
    {TK_NEQ, "!=", NULL},        {TK_LT, "<", NULL},
    {TK_LEQ, "<=", NULL},        {TK_GT, ">", NULL},
    {TK_GEQ, ">=", NULL},        {TK_ASSIGN, "=", NULL},
    {TK_POINTSTO, "->", NULL},   {TK_DOT, ".", NULL},
    {TK_AND, "&", NULL},         {TK_OPENPA, "(", NULL},
    {TK_CLOSEPA, ")", NULL},     {TK_OPENBR, "[", NULL},
    {TK_CLOSEBR, "]", NULL},     {TK_BEGIN, "{", NULL},
    {TK_END, "}", NULL},         {TK_SEMICOLON, ";", NULL},
    {TK_COMMA, ",", NULL},       {TK_ELLIPSIS, "...", NULL},
    {TK_EOF, "End Of File", NULL},

    /* constants (descriptive names rather than real spellings) */
    {TK_CINT, "整形常量", NULL},  {TK_CFLOAT, "浮点型常量", NULL},
    {TK_CCHAR, "字符常量", NULL}, {TK_CSTR, "字符串常量", NULL},

    /* keywords */
    {KW_CHAR, "char", NULL},     {KW_SHORT, "short", NULL},
    {KW_INT, "int", NULL},       {KW_VOID, "void", NULL},
    {KW_STRUCT, "struct", NULL}, {KW_IF, "if", NULL},
    {KW_ELSE, "else", NULL},     {KW_FOR, "for", NULL},
    {KW_CONTINUE, "continue", NULL},
    {KW_BREAK, "break", NULL},   {KW_RETURN, "return", NULL},

    /* trailing entry: code 40 (TK_IDENT) with an empty spelling */
    {TK_IDENT, "", NULL}
};
/*
 * ELF-style string hash, reduced to a bucket index.
 *
 * key: spelling to hash; hashing stops at the first '\0' byte.
 * Returns an index in the range [0, MAXKEY).
 *
 * The accumulator is unsigned and each byte is read as unsigned char.
 * With a signed accumulator, bytes >= 0x80 (e.g. the non-ASCII preset
 * spellings) could make the hash negative, which made `h % MAXKEY`
 * negative and indexed the table out of bounds; the left shift could also
 * overflow a signed int.
 */
int elf_hash(string key) {
    unsigned int h = 0;
    for (string::size_type i = 0; i < key.size() && key[i] != '\0'; ++i) {
        h = (h << 4) + static_cast<unsigned char>(key[i]);
        const unsigned int g = h & 0xf0000000u;  /* top four bits */
        if (g) {
            h ^= g >> 24;                        /* fold them back into the low bits */
        }
        h &= ~g;                                 /* and clear them */
    }
    return static_cast<int>(h % MAXKEY);
}
/*定义哈希表*/ TKWord* tk_Hashtable[MAXKEY] = { 0 };
/*
 * Registers every entry of a preset table in the hash table.
 *
 * keywords:     array of preset entries (spelling + token code).
 * keywordsLen:  number of entries in keywords.
 * tk_Hashtable: array of bucket heads indexed by elf_hash(spelling).
 *
 * Each spelling is stored once, in a freshly allocated node appended to the
 * tail of its bucket's chain. The earlier version assigned the new node to
 * the bucket head when the bucket was already occupied, which discarded
 * (and leaked) every entry that had been stored in that bucket before.
 */
void InitKeywords(pTKWord keywords, int keywordsLen, pTKWord* tk_Hashtable) {
    for (int i = 0; i < keywordsLen; i++) {
        const TKWord& entry = keywords[i];

        /* link points at the pointer that either refers to the matching
           node or is the NULL slot where a new node has to be attached. */
        pTKWord* link = &tk_Hashtable[elf_hash(entry.spelling)];
        while (*link != NULL && (*link)->spelling != entry.spelling) {
            link = &(*link)->next;
        }
        if (*link != NULL) {
            continue;  /* spelling already registered */
        }

        TKWord* pnew = new TKWord;
        pnew->tkcode = entry.tkcode;
        pnew->spelling = entry.spelling;
        pnew->next = NULL;
        *link = pnew;
    }
}
/* Entry count of the preset table; every element of the array is counted. */
int keywordsLen = sizeof(keywords) / sizeof(keywords[0]);
/* Put all preset spellings into the hash table. */
InitKeywords(keywords, keywordsLen, tk_Hashtable);
2.2 打开文件test.txt,每次取一行放入str中。
/* Open test.txt for reading. */
ifstream file; file.open("test.txt", ios::in);
/* Give up if the file could not be opened. */
if (!file.is_open()) { return; }
/* Read the file line by line; each line is placed in str. */
string str; while (getline(file, str)) { ... }
2.3 (在while循环中操作)将这一行str分割成一个个具有独立意义的字符串,并分别存放到向量(动态数组)中。
/* Vector that receives the tokens produced by splitting one line. */
std::vector<TKWord> word;
/*定义函数,用于将str分割成一个个有独立意义的字符串,并存放在向量word中*/ void split(vector<TKWord>& word,const string str) { for (int i = 0; i < str.length(); i++) { if (isalpha(str[i]) || str[i] == '_') {//函数名或变量名 TKWord tmp; tmp.tkcode = TK_IDENT; tmp.next = NULL; while (isalnum(str[i])||str[i]=='_') { tmp.spelling.push_back(str[i]); i++; } word.push_back(tmp); i--; } else if(str[i]=='"') {//字符串常量 TKWord tmp; tmp.tkcode = TK_CSTR; tmp.next = NULL; tmp.spelling.push_back(str[i]); i++; while (str[i] != '"') { tmp.spelling.push_back(str[i]); i++; } tmp.spelling.push_back('"'); word.push_back(tmp); } else if (str[i] == '\'') {//字符常量 TKWord tmp; tmp.tkcode = TK_CCHAR; tmp.next = NULL; tmp.spelling.push_back(str[i]); i++; while (str[i] != '\'') { tmp.spelling.push_back(str[i]); i++; } tmp.spelling.push_back('\''); word.push_back(tmp); } else if (ispunct(str[i])) {//运算符 TKWord tmp; tmp.next = NULL; tmp.spelling.push_back(str[i]); word.push_back(tmp); } else if (str[i] == ' ') {//空格 TKWord tmp; tmp.next = NULL; tmp.spelling.push_back(str[i]); word.push_back(tmp); } else if(isdigit(str[i])){//纯数字(目前将整型常量和浮点型常量考虑在一起) TKWord tmp; tmp.tkcode = TK_CINT; tmp.next = NULL; while (isdigit(str[i]) || str[i] == '.') { tmp.spelling.push_back(str[i]); i++; } word.push_back(tmp); i--; } else if(str[i]=='\t') {//tab键 TKWord tmp; tmp.next = NULL; tmp.spelling = " "; word.push_back(tmp); } } }
/* Blank lines are skipped; any other line is split into a freshly emptied word. */
if (str.length() == 0) {
    continue;
}
word.clear();
split(word, str);
2.4 (在while循环中操作)逐个处理word中存放好的字符串。首先,在哈希表中查找该字符串:通过哈希函数得到哈希值,并据此在哈希表上定位,判断对应位置是否为NULL。若为NULL,就申请一个新的结构体,将该字符串所在结构体的信息拷贝进去,并链接到该位置;若不为NULL,由于采用链表法处理哈希冲突,此时应在该位置所链接的链表中逐个比较字符串,直到找到匹配的字符串所在的结构体并取得其token;若最终没有匹配成功,就申请一个新的结构体,将该字符串所在结构体的所有信息拷贝到新的结构体中,并将这个新结构体链接到链表表尾。然后,根据该字符串的token所对应的颜色打印该字符串。
/*颜色打印函数*/ void printColor(string str, int token) { HANDLE h = GetStdHandle(STD_OUTPUT_HANDLE); if (token >= TK_IDENT) {//变量名或者函数名为灰色 SetConsoleTextAttribute(h, FOREGROUND_INTENSITY); } else if (token >= KW_CHAR) {//关键字为绿色 SetConsoleTextAttribute(h, FOREGROUND_GREEN | FOREGROUND_INTENSITY); } else if (token >= TK_CINT) {//整型常量、浮点型常量、字符常量、字符串常量等为褐色 SetConsoleTextAttribute(h, FOREGROUND_RED | FOREGROUND_GREEN); } else {//运算符为红色 SetConsoleTextAttribute(h, FOREGROUND_RED | FOREGROUND_INTENSITY); } if (-1 == str[0]) { printf("\n ENd Of File"); SetConsoleTextAttribute(h, FOREGROUND_RED | FOREGROUND_INTENSITY | FOREGROUND_BLUE | FOREGROUND_INTENSITY); } else { cout << str; } }
/* For every token of the line: look its spelling up in the hash table,
   take over the stored token code on a match, otherwise append a copy of
   the token to the tail of its bucket's chain; then print it in colour.
   The bucket is located once per token, and the loop body is closed here
   (the excerpt previously ended without the loop's closing brace). */
for (vector<TKWord>::size_type i = 0; i < word.size(); i++) {
    TKWord& tok = word[i];

    /* link ends up at the pointer to the matching node, or at the NULL
       slot (bucket head or last next) where a new node belongs. */
    pTKWord* link = &tk_Hashtable[elf_hash(tok.spelling)];
    while (*link != NULL && (*link)->spelling != tok.spelling) {
        link = &(*link)->next;
    }

    if (*link != NULL) {
        tok.tkcode = (*link)->tkcode;
    } else {
        TKWord* pnew = new TKWord;
        pnew->tkcode = tok.tkcode;
        pnew->spelling = tok.spelling;
        pnew->next = NULL;
        *link = pnew;
    }

    printColor(tok.spelling, tok.tkcode);
}
3 完整代码
#include <string> #include <Windows.h> #include <fstream> #include <iostream> #include <vector> #include <ctype.h> using namespace std; /*通过枚举对预置的字符串进行编号*/ enum e_TokenCode { /* 运算符及分隔符 */ TK_PLUS, // + 加号 TK_MINUS, // - 减号 TK_STAR, // * 星号 TK_DIVIDE, // / 除号 TK_MOD, // % 求余运算符 TK_EQ, // == 等于号 TK_NEQ, // != 不等于号 TK_LT, // < 小于号 TK_LEQ, // <= 小于等于号 TK_GT, // > 大于号 TK_GEQ, // >= 大于等于号 TK_ASSIGN, // = 赋值运算符 TK_POINTSTO, // -> 指向结构体成员运算符 TK_DOT, // . 结构体成员运算符 TK_AND, // & 地址与运算符 TK_OPENPA, // ( 左圆括号 TK_CLOSEPA, // ) 右圆括号 TK_OPENBR, // [ 左中括号 TK_CLOSEBR, // ] 右中括号 TK_BEGIN, // { 左大括号 TK_END, // } 右大括号 TK_SEMICOLON, // ; 分号 TK_COMMA, // , 逗号 TK_ELLIPSIS, // ... 省略号 TK_EOF, // 文件结束符 /* 常量 */ TK_CINT, // 整型常量 TK_CFLOAT, // 浮点型常量 TK_CCHAR, // 字符常量 TK_CSTR, // 字符串常量 /* 关键字 */ KW_CHAR, // char关键字 KW_SHORT, // short关键字 KW_INT, // int关键字 KW_VOID, // void关键字 KW_STRUCT, // struct关键字 KW_IF, // if关键字 KW_ELSE, // else关键字 KW_FOR, // for关键字 KW_CONTINUE, // continue关键字 KW_BREAK, // break关键字 KW_RETURN, // return关键字 /* 标识符 */ TK_IDENT // 函数名或变量名 }; typedef enum e_TokenCode tokencode; /*存储字符串的结构体定义*/ typedef struct TKWord { int tkcode; string spelling; struct TKWord* next; }TKWord, *pTKWord; #define MAXKEY 50 void printColor(string str, int token); int elf_hash(string key); void InitKeywords(pTKWord keywords, int keywordsLen, pTKWord* tk_Hashtable); void split(vector<TKWord>& word, const string str);
#include "complier.h"

/*
 * ELF-style string hash, reduced to a bucket index.
 *
 * key: spelling to hash; hashing stops at the first '\0' byte.
 * Returns an index in the range [0, MAXKEY).
 *
 * The accumulator is unsigned and each byte is read as unsigned char.
 * With a signed accumulator, bytes >= 0x80 (e.g. the non-ASCII preset
 * spellings) could make the hash negative, which made `h % MAXKEY`
 * negative and indexed the table out of bounds; the left shift could also
 * overflow a signed int.
 */
int elf_hash(string key) {
    unsigned int h = 0;
    for (string::size_type i = 0; i < key.size() && key[i] != '\0'; ++i) {
        h = (h << 4) + static_cast<unsigned char>(key[i]);
        const unsigned int g = h & 0xf0000000u;  /* top four bits */
        if (g) {
            h ^= g >> 24;                        /* fold them back into the low bits */
        }
        h &= ~g;                                 /* and clear them */
    }
    return static_cast<int>(h % MAXKEY);
}

/*
 * Registers every entry of a preset table in the hash table.
 *
 * keywords:     array of preset entries (spelling + token code).
 * keywordsLen:  number of entries in keywords.
 * tk_Hashtable: array of bucket heads indexed by elf_hash(spelling).
 *
 * Each spelling is stored once, in a freshly allocated node appended to the
 * tail of its bucket's chain. The earlier version assigned the new node to
 * the bucket head when the bucket was already occupied, which discarded
 * (and leaked) every entry that had been stored in that bucket before.
 */
void InitKeywords(pTKWord keywords, int keywordsLen, pTKWord* tk_Hashtable) {
    for (int i = 0; i < keywordsLen; i++) {
        const TKWord& entry = keywords[i];

        /* link points at the pointer that either refers to the matching
           node or is the NULL slot where a new node has to be attached. */
        pTKWord* link = &tk_Hashtable[elf_hash(entry.spelling)];
        while (*link != NULL && (*link)->spelling != entry.spelling) {
            link = &(*link)->next;
        }
        if (*link != NULL) {
            continue;  /* spelling already registered */
        }

        TKWord* pnew = new TKWord;
        pnew->tkcode = entry.tkcode;
        pnew->spelling = entry.spelling;
        pnew->next = NULL;
        *link = pnew;
    }
}

/* Appends a token with the given code and spelling to word. */
static void push_token(vector<TKWord>& word, int tkcode, const string& spelling) {
    TKWord tok;
    tok.tkcode = tkcode;
    tok.spelling = spelling;
    tok.next = NULL;
    word.push_back(tok);
}

/*
 * Copies the quoted literal that starts at str[start] (the opening quote)
 * into out and returns the index of the first character after it.
 * A backslash keeps the following character inside the literal, so an
 * escaped quote does not end it. If the line ends before the closing quote,
 * the literal simply ends at the end of the line.
 */
static string::size_type scan_quoted(const string& str, string::size_type start, string& out) {
    const char quote = str[start];
    string::size_type i = start + 1;
    out.push_back(quote);
    while (i < str.length() && str[i] != quote) {
        if (str[i] == '\\' && i + 1 < str.length()) {
            out.push_back(str[i++]);
        }
        out.push_back(str[i++]);
    }
    if (i < str.length()) {
        out.push_back(str[i++]);  /* closing quote */
    }
    return i;
}

/* Length of the operator starting at str[pos]: the longest preset
   multi-character operator that matches there, otherwise 1. */
static string::size_type operator_length(const string& str, string::size_type pos) {
    static const string multi[] = { "...", "==", "!=", "<=", ">=", "->" };
    for (string::size_type k = 0; k < sizeof(multi) / sizeof(multi[0]); ++k) {
        if (str.compare(pos, multi[k].size(), multi[k]) == 0) {
            return multi[k].size();
        }
    }
    return 1;
}

/*
 * Splits one source line into tokens and appends them to word.
 *
 * word: receives the tokens in order of appearance.
 * str:  the line to split.
 *
 * Token codes assigned here:
 *   - identifiers (letters, digits, '_'):   TK_IDENT
 *   - "..." literals:                       TK_CSTR
 *   - '...' literals:                       TK_CCHAR
 *   - digit runs:                           TK_CINT, or TK_CFLOAT when a
 *                                           '.' follows the integer part
 *   - punctuation, space and tab:           TK_PLUS as a provisional code
 * The provisional code only guarantees a defined value in the operator
 * range (below TK_CINT); the exact code of a preset operator comes from the
 * hash-table lookup done on the tokens afterwards. A tab is stored with the
 * spelling " ". Any other character is skipped.
 *
 * Fixes over the earlier version: literals that are not closed on the line
 * no longer read past the end of str, escaped quotes no longer end a
 * literal, every token has an initialized tkcode, the <ctype.h> predicates
 * are only called with unsigned char values, and the multi-character
 * operators of the preset table are emitted as single tokens.
 */
void split(vector<TKWord>& word, const string str) {
    const string::size_type len = str.length();
    string::size_type i = 0;
    while (i < len) {
        const unsigned char ch = static_cast<unsigned char>(str[i]);
        if (isalpha(ch) || ch == '_') {
            const string::size_type start = i;
            while (i < len && (isalnum(static_cast<unsigned char>(str[i])) || str[i] == '_')) {
                ++i;
            }
            push_token(word, TK_IDENT, str.substr(start, i - start));
        } else if (ch == '"' || ch == '\'') {
            string literal;
            i = scan_quoted(str, i, literal);
            push_token(word, ch == '"' ? TK_CSTR : TK_CCHAR, literal);
        } else if (isdigit(ch)) {
            const string::size_type start = i;
            int code = TK_CINT;
            while (i < len && isdigit(static_cast<unsigned char>(str[i]))) {
                ++i;
            }
            if (i < len && str[i] == '.') {
                code = TK_CFLOAT;
                ++i;
                while (i < len && isdigit(static_cast<unsigned char>(str[i]))) {
                    ++i;
                }
            }
            push_token(word, code, str.substr(start, i - start));
        } else if (ispunct(ch)) {
            const string::size_type n = operator_length(str, i);
            push_token(word, TK_PLUS, str.substr(i, n));
            i += n;
        } else if (ch == ' ' || ch == '\t') {
            push_token(word, TK_PLUS, " ");
            ++i;
        } else {
            ++i;  /* not part of any token */
        }
    }
}

/*
 * Prints str on the console in the colour that belongs to its token class.
 *
 * str:   text to print.
 * token: token code; the class is derived from its position in e_TokenCode.
 *
 * Classes, from lowest code to highest: operators (red), constants
 * (red + green, described as brown), keywords (green), identifiers
 * (intensity only, described as gray).
 */
void printColor(string str, int token) {
    HANDLE h = GetStdHandle(STD_OUTPUT_HANDLE);

    WORD attr;
    if (token < TK_CINT) {
        attr = FOREGROUND_RED | FOREGROUND_INTENSITY;
    } else if (token < KW_CHAR) {
        attr = FOREGROUND_RED | FOREGROUND_GREEN;
    } else if (token < TK_IDENT) {
        attr = FOREGROUND_GREEN | FOREGROUND_INTENSITY;
    } else {
        attr = FOREGROUND_INTENSITY;
    }
    SetConsoleTextAttribute(h, attr);

    if (str[0] != -1) {
        cout << str;
        return;
    }

    /* A first character equal to -1 is treated as the end-of-file mark:
       print the notice, then switch to red + blue with intensity. */
    printf("\n ENd Of File");
    SetConsoleTextAttribute(h, FOREGROUND_RED | FOREGROUND_BLUE | FOREGROUND_INTENSITY);
}
#include "complier.h"

/* Static table of the preset spellings and their token codes.
   Every entry starts with a NULL chain pointer. */
static TKWord keywords[] = {
    /* operators and separators */
    {TK_PLUS, "+", NULL},        {TK_MINUS, "-", NULL},
    {TK_STAR, "*", NULL},        {TK_DIVIDE, "/", NULL},
    {TK_MOD, "%", NULL},         {TK_EQ, "==", NULL},
    {TK_NEQ, "!=", NULL},        {TK_LT, "<", NULL},
    {TK_LEQ, "<=", NULL},        {TK_GT, ">", NULL},
    {TK_GEQ, ">=", NULL},        {TK_ASSIGN, "=", NULL},
    {TK_POINTSTO, "->", NULL},   {TK_DOT, ".", NULL},
    {TK_AND, "&", NULL},         {TK_OPENPA, "(", NULL},
    {TK_CLOSEPA, ")", NULL},     {TK_OPENBR, "[", NULL},
    {TK_CLOSEBR, "]", NULL},     {TK_BEGIN, "{", NULL},
    {TK_END, "}", NULL},         {TK_SEMICOLON, ";", NULL},
    {TK_COMMA, ",", NULL},       {TK_ELLIPSIS, "...", NULL},
    {TK_EOF, "End Of File", NULL},

    /* constants (descriptive names rather than real spellings) */
    {TK_CINT, "整形常量", NULL},  {TK_CFLOAT, "浮点型常量", NULL},
    {TK_CCHAR, "字符常量", NULL}, {TK_CSTR, "字符串常量", NULL},

    /* keywords */
    {KW_CHAR, "char", NULL},     {KW_SHORT, "short", NULL},
    {KW_INT, "int", NULL},       {KW_VOID, "void", NULL},
    {KW_STRUCT, "struct", NULL}, {KW_IF, "if", NULL},
    {KW_ELSE, "else", NULL},     {KW_FOR, "for", NULL},
    {KW_CONTINUE, "continue", NULL},
    {KW_BREAK, "break", NULL},   {KW_RETURN, "return", NULL},

    /* trailing entry: code 40 (TK_IDENT) with an empty spelling */
    {40}
};

/* The hash table: MAXKEY bucket heads, all initially NULL. */
TKWord* tk_Hashtable[MAXKEY] = { 0 };

/* Vector that receives the tokens produced by splitting one line. */
vector<TKWord> word;

/*
 * Looks tok.spelling up in table. On a match tok takes over the stored
 * token code; otherwise a copy of tok is appended to the tail of its
 * bucket's chain, so the same spelling is found on its next occurrence.
 */
static void resolve_token(TKWord& tok, pTKWord* table) {
    /* link ends up at the pointer to the matching node, or at the NULL
       slot (bucket head or last next) where a new node belongs. */
    pTKWord* link = &table[elf_hash(tok.spelling)];
    while (*link != NULL && (*link)->spelling != tok.spelling) {
        link = &(*link)->next;
    }
    if (*link != NULL) {
        tok.tkcode = (*link)->tkcode;
        return;
    }
    TKWord* pnew = new TKWord;
    pnew->tkcode = tok.tkcode;
    pnew->spelling = tok.spelling;
    pnew->next = NULL;
    *link = pnew;
}

/*
 * Reads test.txt line by line, splits every non-empty line into tokens and
 * prints each token in the colour of its class, followed by a newline.
 *
 * Returns 0 on success and 1 when test.txt cannot be opened (the failure
 * is reported on standard error instead of exiting silently). main is
 * declared with the standard int return type instead of void.
 */
int main() {
    const int keywordsLen = sizeof(keywords) / sizeof(keywords[0]);
    InitKeywords(keywords, keywordsLen, tk_Hashtable);

    ifstream file;
    file.open("test.txt", ios::in);
    if (!file.is_open()) {
        cerr << "cannot open test.txt" << endl;
        return 1;
    }

    string str;
    while (getline(file, str)) {
        if (str.empty()) {
            continue;
        }
        word.clear();
        split(word, str);
        for (vector<TKWord>::size_type i = 0; i < word.size(); i++) {
            resolve_token(word[i], tk_Hashtable);
            printColor(word[i].spelling, word[i].tkcode);
        }
        cout << endl;
    }
    return 0;
}
4 测试文件及显示结果
int split(vector<string> &word,const string str){ for(int i=0;i<str.length();i++){ if(str[i]==34){ string temp; temp.push_back(str[i]); i++; while(str[i]!=34){ temp.push_back(str[i]); i++; } temp.push_back(str[i]); word.push_back(temp); }else if(ispunct(str[i])||str[i]==' \b'){ string temp; temp.push_back(str[i]); word.push_back(temp); }else if(isdigit(str[i])){ string temp; while(isdigit(str[i])){ temp.push_back(str[i]); ++i; } if(str[i]=='.'){ ++i; if(isdigit(str[i])){ temp.push_back('.'); while(isdigit(str[i])){ temp.push_back(str[i]); i++; } }else{ return -1; } } word.push_back(temp); --i; }else if(isalpha(str[i])){ string temp; while(isalnum(str[i])){ temp.push_back(str[i]); i++; } word.push_back(temp); --i; }else if(str[i]=='\t'){ string temp=" "; word.push_back(temp); } } }
5 尚需改进的地方
分析不完全。如没有将所有运算符考虑进去;在分割函数中,函数名和变量名、数字的整型和浮点型也都没有分开考虑。
6 经验和教训
bug: 前面定义了含有string成员的结构体,后面用malloc申请一个新的结构体,并将旧结构体的内容拷贝到新结构体中,但拷贝string时总是报错;多次尝试之后,发现将malloc换成new后程序可以正确运行。
教训:不要将C和C++的操作混用!(这里用C语言的malloc函数为包含C++特有的string类型的结构体申请内存;malloc只分配内存而不调用构造函数,其中的string成员并未被构造,因此后面对它赋值时会出错。new则会调用构造函数。)