2019/10/09-作业05
词法分析程序(Lexical Analyzer)要求:
- 从左至右扫描构成源程序的字符流
- 识别出有词法意义的单词(Lexemes)
- 返回单词记录(单词类别,单词本身)
- 滤掉空格
- 跳过注释
- 发现词法错误
程序结构:
输入:字符流(什么输入方式,什么数据结构保存)
处理:
–遍历(什么遍历方式)
–词法规则
输出:单词流(什么输出形式)
–二元组
单词类别:
1.标识符(10)
2.无符号数(11)
3.保留字(一词一码)
4.运算符(一词一码)
5.界符(一词一码)
| 单词符号 | 种别码 | 单词符号 | 种别码 |
| --- | --- | --- | --- |
| begin | 1 | : | 17 |
| if | 2 | := | 18 |
| then | 3 | < | 20 |
| while | 4 | <= | 21 |
| do | 5 | <> | 22 |
| end | 6 | > | 23 |
| l(l\|d)* | 10 | >= | 24 |
| dd* | 11 | = | 25 |
| + | 13 | ; | 26 |
| - | 14 | ( | 27 |
| * | 15 | ) | 28 |
| / | 16 | # | 0 |
答案
1.词法分析程序源码
注:程序使用了POSIX的regex.h正则库,并使用了C99特性(例如在for语句中声明循环变量),因此必须在提供regex.h的环境下使用支持C99标准的编译器,否则无法编译成功(不支持VC6.0和dev-c++)。本次作业使用的是gcc 9.1编译器。
/*
 * Lexical analyzer for the small Pascal-like language of assignment 05.
 *
 * Reads the source file named on the command line and prints one
 * "(lexeme, code)" pair per token.  White space is skipped; anything that
 * is not a known token is printed with the code -1 (lexical error).
 *
 * Requires a POSIX environment (<regex.h>) and a C99-or-later compiler.
 */
#include <limits.h>
#include <regex.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void analyzer(const char *code_string, int len);
int table(const char *res);
int8_t is_char_or_digit(char res);
char *file_to_string(const char *path, int *len);
int re_match(const char *pattern, const char *res);

/*
 * Define LEXER_NO_MAIN to leave out the command-line driver, e.g. when the
 * functions below are linked into a test harness that has its own main().
 */
#ifndef LEXER_NO_MAIN
/*
 * Entry point: args[1] is the path of the source file to tokenize.
 * Returns 0 on success, EXIT_FAILURE on a usage or read error.
 */
int main(int argc, char *args[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <source-file>\n",
                argc > 0 ? args[0] : "lexer");
        return EXIT_FAILURE;
    }

    int content_len = 0;
    char *content = file_to_string(args[1], &content_len);
    if (content == NULL) {
        fprintf(stderr, "cannot read '%s'\n", args[1]);
        return EXIT_FAILURE;
    }

    analyzer(content, content_len);

    free(content);
    return 0;
}
#endif /* LEXER_NO_MAIN */

/* Returns 1 when `pair` is one of the two-character operators, else 0. */
static int is_two_char_operator(const char *pair)
{
    static const char *const operators[] = { ":=", "<>", "<=", ">=", "+=" };

    for (size_t k = 0; k < sizeof operators / sizeof operators[0]; ++k) {
        if (strcmp(pair, operators[k]) == 0) {
            return 1;
        }
    }
    return 0;
}

/*
 * Scans `len` bytes of `code_string` from left to right and prints every
 * token to stdout as "(lexeme, code)".
 *
 * code_string  the source text; it does not need to be NUL-terminated
 * len          number of bytes of code_string to scan
 */
void analyzer(const char *code_string, int len)
{
    if (code_string == NULL) {
        return;
    }

    for (int i = 0; i < len; ++i) {
        const char current = code_string[i];
        /* Look-ahead never reads past the buffer; '\0' matches no operator. */
        const char next = (i + 1 < len) ? code_string[i + 1] : '\0';

        /* Skip white space. */
        if (current == ' ' || current == '\n' || current == '\r' || current == '\t') {
            continue;
        }

        /* Reserved word, identifier or number: take the longest alnum run. */
        if (is_char_or_digit(current)) {
            int end = i;
            while (end < len && is_char_or_digit(code_string[end])) {
                ++end;
            }
            /* A run that reaches end of input stops here with end == len. */
            const size_t word_len = (size_t)(end - i);

            /* +1 so the zero-filled buffer is always NUL-terminated. */
            char *word = calloc(word_len + 1, sizeof *word);
            if (word == NULL) {
                fprintf(stderr, "out of memory\n");
                return;
            }
            memcpy(word, code_string + i, word_len);

            printf("(%s, %d)\n", word, table(word));
            free(word);

            i = end - 1; /* The loop's ++i moves to the first unread byte. */
            continue;
        }

        /* Two-character operators take precedence over their first char. */
        const char pair[3] = { current, next, '\0' };
        if (is_two_char_operator(pair)) {
            printf("(%s, %d)\n", pair, table(pair));
            ++i; /* Consume the second character as well. */
            continue;
        }

        /* Single-character operator, delimiter, or an unknown symbol. */
        const char single[2] = { current, '\0' };
        printf("(%c, %d)\n", current, table(single));
    }
}

/*
 * Maps a lexeme to its token code.
 *
 * Codes follow the assignment's token table.  "+=", "{" and "}" are not in
 * that table; they are extensions and use the otherwise unused codes 29-31.
 *
 * res  NUL-terminated lexeme
 * Returns the token code, 11 for an unsigned number (dd*), 10 for an
 * identifier (l(l|d)*), or -1 when the lexeme is not a valid token.
 */
int table(const char *res)
{
    static const struct {
        const char *lexeme;
        int code;
    } fixed[] = {
        { "begin", 1 }, { "if", 2 },  { "then", 3 }, { "while", 4 },
        { "do", 5 },    { "end", 6 },
        { "+", 13 },    { "-", 14 },  { "*", 15 },   { "/", 16 },
        { ":", 17 },    { ":=", 18 },
        { "<", 20 },    { "<=", 21 }, { "<>", 22 },
        { ">", 23 },    { ">=", 24 },
        { "=", 25 },    { ";", 26 },  { "(", 27 },   { ")", 28 },
        { "+=", 29 },   { "{", 30 },  { "}", 31 },
        { "#", 0 },
    };
    /* Both patterns are anchored so the WHOLE lexeme has to match. */
    static const char dd_pattern[] = "^[0-9]+$";
    static const char ld_pattern[] = "^[a-zA-Z][0-9a-zA-Z]*$";

    if (res == NULL) {
        return -1;
    }

    for (size_t k = 0; k < sizeof fixed / sizeof fixed[0]; ++k) {
        if (strcmp(res, fixed[k].lexeme) == 0) {
            return fixed[k].code;
        }
    }

    if (re_match(dd_pattern, res) == 0) {
        return 11;
    }
    if (re_match(ld_pattern, res) == 0) {
        return 10;
    }
    return -1;
}

/* Returns 1 when `res` is an ASCII letter or decimal digit, otherwise 0. */
int8_t is_char_or_digit(char res)
{
    const int is_digit = (res >= '0' && res <= '9');
    const int is_lower = (res >= 'a' && res <= 'z');
    const int is_upper = (res >= 'A' && res <= 'Z');

    return (int8_t)(is_digit || is_lower || is_upper);
}

/*
 * Reads the whole file at `path` into a freshly allocated buffer.
 *
 * path  file to read (opened in binary mode)
 * len   out: number of bytes read; written only on success
 *
 * Returns the buffer, which holds *len bytes followed by a NUL terminator
 * and must be released with free(), or NULL when the file cannot be
 * opened, sized, read, or is too large for an int length.
 */
char *file_to_string(const char *path, int *len)
{
    if (path == NULL || len == NULL) {
        return NULL;
    }

    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        return NULL;
    }

    char *content = NULL;
    long size = 0;

    if (fseek(fp, 0, SEEK_END) != 0) {
        goto out;
    }
    size = ftell(fp);
    if (size < 0 || size > (long)INT_MAX - 1) {
        goto out;
    }
    if (fseek(fp, 0, SEEK_SET) != 0) {
        goto out;
    }

    content = malloc((size_t)size + 1);
    if (content == NULL) {
        goto out;
    }

    const size_t got = fread(content, sizeof *content, (size_t)size, fp);
    if (got != (size_t)size && ferror(fp)) {
        free(content);
        content = NULL;
        goto out;
    }
    content[got] = '\0';
    *len = (int)got;

out:
    fclose(fp);
    return content;
}

/*
 * Tests `res` against the POSIX extended regular expression `pattern`.
 *
 * Returns 0 when `res` matches and a non-zero value otherwise (REG_NOMATCH
 * from regexec, or the regcomp error code when the pattern is invalid).
 */
int re_match(const char *pattern, const char *res)
{
    regex_t reg;

    /* Only match/no-match is needed, so no sub-match offsets are requested. */
    const int compile_status = regcomp(&reg, pattern, REG_EXTENDED | REG_NOSUB);
    if (compile_status != 0) {
        return compile_status;
    }

    const int status = regexec(&reg, res, 0, NULL, 0);
    regfree(&reg);

    return status;
}
2.运行结果
准备好测试用的伪代码文件test.txt如下:
运行可执行文件时,将test.txt的文件路径作为运行参数,程序会按照指定路径读取,运行结果如下: