手撕正则表达式
我们先撕简单的:a、ab、a|b、aa*、a(a|b)*。先不管匹配任意字符的 .、重复>=1次的 +、表示除0-9外任意字符的 [^0-9]、表示数字的 \d 等。
正则表达式(regular expression, re)为啥叫表达式,不叫正则字符串之类?因为它是个表达式。3+5*2是个表达式;两个字符串可以有连接运算,如"a"+"b"或"a"."b"得到"ab"。
在正则表达式里,a,b,c就像2,3,5,是被运算的数,. | * ()是运算符。请注意:ab是a和b拼接,人们为了省事不把拼接运算符写出来。
(3+5)*2=16,3+(5*2)=13。如果没有四则运算优先级和括号,3+5*2等于16还是13?运算符后置(后缀表达式)没有歧义,例如35+2*是mul(add(3,5), 2),352+*是mul(3, add(5,2))。mul: multiply. What are infix, postfix and prefix expressions?
我们分3步走:
- 把a(a|b)*变成aab|*.这样的后缀表达式,40行程序。ab是a和b拼接,是a.b的缩写(中间有个.)
- 用Thompson算法把后缀表达式变成NFA,号称4行 (case, case, case, default)
- 用NFA检查是否匹配,号称10行
第1步中缀变后缀请看代码。
第2步后缀变NFA。NFA可以像积木一样拼起来。下面分别是a, ab, a|b, a*的NFA:
图片是用dot - graphviz version 2.49.0画的。如 dot -o ab.png -Tpng todot.txt 或 dot -Tpng todot.txt >ab.png 。dot -h看帮助。
https://files.cnblogs.com/files/blogs/714801/Graphviz.7z 1996KB 可能是最小的了,带grep.exe
拼接NFA的代码:
// Builds an NFA from a postfix regular expression (Thompson's construction).
//
// pfstr: NUL-terminated postfix regex. '.' is concatenation, '|' is
//        alternation, '*' is Kleene star; every other character is a literal.
// Returns the NFA for the whole expression.
// Throws (via `error`) when the postfix string is malformed: an operator
// without enough operands, or operands left over at the end.
NFA postfix_to_nfa(const char* pfstr) {
    Stack<NFA> stk;
    for (const char* p = pfstr; *p; p++) {
        switch (*p) {
        case '.': {
            if (stk.size() < 2) error;
            // Pop into named locals: the order in which the two operands of
            // `x + y` are evaluated is unspecified, so `pop() + pop()` could
            // build "ba" for the postfix string "ab.".
            NFA rhs = stk.pop();
            NFA lhs = stk.pop();
            stk.push(lhs + rhs);
            break;
        }
        case '|': {
            if (stk.size() < 2) error;
            NFA rhs = stk.pop();
            NFA lhs = stk.pop();
            stk.push(lhs | rhs);
            break;
        }
        case '*': {
            if (stk.empty()) error;
            NFA operand = stk.pop();
            stk.push(*operand);
            break;
        }
        default:
            stk.push(NFA(*p));
        }
    }
    if (stk.empty()) error;      // empty postfix string: nothing to return
    NFA nfa = stk.pop();
    if (!stk.empty()) error;     // operands left over: a '.' or '|' is missing
    return nfa;
}
运算符函数也不长,含打印,匹配等全部代码180行:

// Adapted from a program by ChrisZZ (zchrissirhcz@gmail.com).
//
// A minimal regular-expression engine supporting literals, concatenation,
// '|', '*' and parentheses.  Pipeline:
//   1. re_to_postfix  : infix regex -> postfix with an explicit '.' operator
//   2. postfix_to_nfa : postfix     -> NFA (Thompson's construction)
//   3. NFA::match     : longest prefix of the input accepted by the NFA
//
// Define REGEX_NFA_NO_MAIN before this code to leave out the demo main().
#include <stdio.h>
#include <string.h>

#include <set>
#include <stack>
#include <string>
#include <utility>
using namespace std;

// Every failure is reported by throwing the source line number as an int.
#define error throw __LINE__

// std::stack whose pop() returns the removed element.
// pop() throws (via `error`) instead of invoking undefined behaviour when the
// stack is empty.
template<class T>
struct Stack : public stack<T> {
    T pop() {
        if (this->empty()) error;
        // `this->` is required: top() is a member of a dependent base class.
        T t = this->top();
        stack<T>::pop();
        return t;
    }
};

// END labels the accepting state; EPSILON labels edges that consume no input.
const char END = '\0', EPSILON = '\001';

// One NFA state (a node of a linked graph).
// Both outgoing edges carry the label `ch`. The constructions below only set
// next[0] on EPSILON states; next[1] is the single edge of a literal state.
struct State {
    int id;          // auto-incremented serial number
    State* next[2];
    char ch;
    State(int ch_ = 256, State* p1 = 0, State* p0 = 0)
        : id(_id++), ch(static_cast<char>(ch_)) {
        next[0] = p0;
        next[1] = p1;
    }
    static int _id;
    static char _visited[256];   // indexed by State::id; used only by print
};
int State::_id;
char State::_visited[256];

// An NFA fragment with a single start and a single end state.
// The operators mutate the left operand's states and return a copy of it.
// States are allocated with `new` and never freed.
struct NFA {
    State *start, *end;
    NFA() : start(0), end(0), elm(0) {}
    NFA(char ch) : elm(0) {
        end = new State(END);
        start = new State(ch, end);
    }
    // Concatenation: this followed by nfa.
    NFA operator + (NFA nfa) {
        end->ch = EPSILON;
        end->next[1] = nfa.start;
        end = nfa.end;
        return *this;
    }
    // Alternation: this or nfa.
    NFA operator | (NFA nfa) {
        State *head = new State(EPSILON, start, nfa.start), *tail = new State(END);
        end->ch = EPSILON;
        end->next[1] = tail;
        end = tail;
        start = head;
        nfa.end->ch = EPSILON;
        nfa.end->next[1] = tail;
        return *this;
    }
    // Kleene star: zero or more repetitions of this.
    NFA operator * () {
        State *tail = new State(END), *head = new State(EPSILON, start, tail);
        end->ch = EPSILON;
        end->next[0] = start;
        end->next[1] = tail;
        end = tail;
        start = head;
        return *this;
    }
    // Prints the edges to stdout and writes a Graphviz DOT file.
    void print(const char* file_name);
    const char* elm;   // points to the end of the longest match
    // Returns a pointer one past the longest prefix of str accepted by the
    // NFA, or str itself when not even the empty prefix is accepted.
    const char* match(const char* str) {
        elm = str;
        seen.clear();
        if (start) visit4m(start, str);
        return elm;
    }
    void visit4p(const State* s, FILE* fp);          // visit for print
    void visit4m(const State* s, const char* str);   // visit for match
    // (state, input position) pairs already explored by the current match().
    set<pair<const State*, const char*> > seen;
};

// Builds an NFA from a postfix regular expression (Thompson's construction).
// '.' is concatenation, '|' alternation, '*' Kleene star; every other
// character is a literal.
// Throws (via `error`) when an operator lacks operands, when the string is
// empty, or when operands are left over.
NFA postfix_to_nfa(const char* pfstr) {
    Stack<NFA> stk;
    for (const char* p = pfstr; *p; p++) {
        switch (*p) {
        case '.': {
            // Pop into named locals: the evaluation order of the operands of
            // `pop() + pop()` is unspecified and could reverse the result.
            NFA rhs = stk.pop();
            NFA lhs = stk.pop();
            stk.push(lhs + rhs);
            break;
        }
        case '|': {
            NFA rhs = stk.pop();
            NFA lhs = stk.pop();
            stk.push(lhs | rhs);
            break;
        }
        case '*': {
            NFA operand = stk.pop();
            stk.push(*operand);
            break;
        }
        default:
            stk.push(NFA(*p));
        }
    }
    NFA nfa = stk.pop();
    if (!stk.empty()) error;
    return nfa;
}

// Writes the representation of `ch` inside a Graphviz HTML-like label to
// `out`, escaping '<', '>' and '&', which are markup there.
// `out` needs room for at least 6 bytes ("&amp;" plus the terminator).
static void html_escape_char(char ch, char* out) {
    switch (ch) {
    case '<': strcpy(out, "&lt;"); break;
    case '>': strcpy(out, "&gt;"); break;
    case '&': strcpy(out, "&amp;"); break;
    default: out[0] = ch; out[1] = '\0';
    }
}

// Prints every edge to stdout and writes the graph to a DOT file.
// Returns silently when the file cannot be opened.
void NFA::print(const char* file_name) {
    puts("");
    FILE* fp = fopen(file_name, "wt");
    if (!fp) return;
    fputs("digraph {\n\"\"\n", fp);
    fputs("[shape = plaintext]\n", fp);
    fputs("\trankdir = LR\n", fp);
    memset(State::_visited, 0, sizeof(State::_visited));
    try {
        visit4p(start, fp);
    } catch (...) {
        fclose(fp);   // do not leak the file when visit4p reports an error
        throw;
    }
    fputs("}", fp);
    fclose(fp);
}

// Depth-first walk that emits each edge once. _visited stops the recursion
// on cycles such as the back edge of a*.
void NFA::visit4p(const State* st, FILE* fp) {
    // _visited has a fixed size; refuse ids that would index past its end.
    if (st->id < 0 || st->id >= static_cast<int>(sizeof(State::_visited))) error;
    if (State::_visited[st->id]) return;
    State::_visited[st->id] = 1;
    for (int i = 0; i < 2; i++) {
        if (State* p = st->next[i]) {
            char label[16];
            char dot_label[16];
            if (st->ch == EPSILON) {
                strcpy(label, "''");
                strcpy(dot_label, "''");
            } else {
                char escaped[8];
                sprintf(label, "'%c'", st->ch);
                html_escape_char(st->ch, escaped);
                sprintf(dot_label, "'%s'", escaped);
            }
            // DOT accepts UTF-8 files without a BOM; the UTF-8 encoding of
            // the epsilon character is \xce\xb5.
            printf("%d - %s -> %d\n", st->id, label, p->id);
            fprintf(fp, "%d -> %d [label = <%s>]\n", st->id, p->id, dot_label);
            visit4p(p, fp);
        }
    }
}

// Depth-first search over (state, input position). Reaching `end` records the
// position in elm if it is further than any seen so far.
void NFA::visit4m(const State* st, const char* str) {
    if (st == end) {
        if (str > elm) elm = str;
        return;
    }
    // A repeated (state, position) pair cannot find anything new. Skipping it
    // also terminates epsilon cycles such as the one built for (a*)*.
    if (!seen.insert(make_pair(st, str)).second) return;
    for (int i = 0; i < 2; i++) {
        State* p = st->next[i];
        if (!p) continue;
        if (st->ch == EPSILON) visit4m(p, str);            // consumes nothing
        else if (st->ch == *str) visit4m(p, str + 1);      // consumes one char
    }
}

// Bookkeeping for one parenthesis nesting level in re_to_postfix.
struct CountOf {
    int opnd;   // operands waiting for a '.': a, b, and ab. each count as one
    int alts;   // '|' operators seen at this level and not yet emitted
};

// Converts an infix regex to postfix with an explicit '.' for concatenation,
// e.g. "a(a|b)*" -> "aab|*.".  Prints a trace line per input character.
// Throws (via `error`) on an empty regex, unbalanced parentheses, an operator
// without an operand (such as "|a", "a|", "*", "()"), or a literal '.' or
// '\001', which are reserved for the postfix operator and for EPSILON.
string re_to_postfix(const char* re) {
    string out;
    CountOf cntof = { 0, 0 };
    stack<CountOf> khdz;   // one saved CountOf per open parenthesis
    const char* p;
    for (p = re; *p; p++) {
        const char c = *p;
        switch (c) {
        case '(':
            // Two operands are pending (e.g. "ab(" ): join them now. The
            // counter must drop as well, otherwise a surplus '.' is emitted
            // when the group closes.
            if (cntof.opnd > 1) { --cntof.opnd; out += '.'; }
            khdz.push(cntof);
            cntof.alts = cntof.opnd = 0;
            break;
        case ')':
            if (cntof.opnd == 0 || khdz.empty()) error;   // ")" or "()"
            while (--cntof.opnd > 0) out += '.';   // one operand needs no '.'
            while (cntof.alts-- > 0) out += '|';
            cntof = khdz.top();
            khdz.pop();
            ++cntof.opnd;   // the whole group is one operand of the outer level
            break;
        case '*':
            if (cntof.opnd == 0) error;
            out += c;
            break;
        case '|':   // a|b -> ab|    a|b|c -> abc||    ab|c -> ab.c|
            if (cntof.opnd == 0) error;
            while (--cntof.opnd > 0) out += '.';
            ++cntof.alts;
            break;
        case '.':
        case EPSILON:
            error;   // reserved characters cannot be used as literals
        default:    // a -> a    ab -> ab.    abc -> ab.c.
            if (cntof.opnd > 1) { --cntof.opnd; out += '.'; }
            out += c;
            ++cntof.opnd;
        }
        // "%*c" with a width of n prints n-1 spaces followed by the character.
        printf("%*c%s %d %d %s\n", static_cast<int>(1 + p - re), ' ', p,
               cntof.opnd, cntof.alts, out.c_str());
    }
    if (!khdz.empty()) error;      // unclosed '('
    if (cntof.opnd == 0) error;    // empty regex or trailing '|'
    while (--cntof.opnd > 0) out += '.';
    while (cntof.alts-- > 0) out += '|';
    printf("%*c%s %s\n", static_cast<int>(1 + p - re), ' ', p, out.c_str());
    return out;
}

#ifndef REGEX_NFA_NO_MAIN
// Demo: compile one regex, dump its NFA to todot.txt, and match one string.
int main() {
    try {
        //const char* re = "a";
        //const char* re = "a*";
        //const char* re = "ab";
        //const char* re = "a|b";
        const char* re = "((a|b)(c|d))*";
        NFA nfa = postfix_to_nfa(re_to_postfix(re).c_str());
        nfa.print("todot.txt");
        const char* s = "bdabc";
        const char* p = nfa.match(s);
        printf("\nmatch: %.*s\n", static_cast<int>(p - s), s);
    } catch (int n) {
        printf("Error at line %d.\n", n);
    }
    getchar();
    return 0;
}
#endif
print和match都是递归遍历图。print把visited去掉可能陷入无限递归(如a*)。match可以拽下名词:guided tour.
- 正则表达式转NFA - ChrisZZ
- Brief intro to NFA, DFA and regexes
- Programming Thompson's algorithm: How to represent a NFA?
- Can any NFA be converted to a DFA? | NFA转DFA
- How to create DFA from regular expression without using NFA? I asked this question to our professor but he told me that we can use intuition and kindly refused to provide any explanation. :-)
- Regular Expression to DFA a*怎么转DFA?或者说带有epsilon的NFA怎么转DFA?我想首先要定义啥叫空。"a"是a和\0,""是\0,自动机的输入总是“普通”字符,没有epsilon.
- Hopcroft's DFA minimization algorithm | Generates Regular Expressions That Match A Set of Strings
- Brzozowski's algebraic method to convert a DFA into a regular expression
- Grep - GNU Project regex.c getopt.c ... This is GNU grep 2.0, the "fastest grep in the west" (we hope)... GNU grep is based on a fast lazy-state deterministic matcher (about twice as fast as stock Unix egrep) hybridized with a Boyer-Moore-Gosper search for a fixed string that eliminates impossible text from being considered by the full regexp matcher without necessarily having to look at every character. The result is typically many times faster than Unix grep or egrep. (Regular expressions containing backreferencing will run more slowly, however.)
- The Difference Between grep, egrep, and fgrep
- Flex - a scanner generator (gnu.org) | Bison - GNU Project | PLY (Python Lex-Yacc)
- DOT Language
- 话相当糙理糙不糙?老外看中国人像搞IT的看互联网公司?我们搞底层你们搞钱huh? 前IT人现坐家的我,脸有点红。中国的航天、军工、特高压、基建、通信(不懂所以列不全)才是真厉害。
- python写的多项式符号乘法
圆圈版:

void NFA::print(const char* file_name) { // 同时输出到屏幕和DOT文件 puts(""); FILE* fp = fopen(file_name, "wt"); if (!fp) return; fputs("digraph {\n", fp); // graph不允许有向边-> fputs("rankdir=LR\n", fp); // Left-Right, default: TB (Top-Bottom) // https://www.graphviz.org/doc/info/attrs.html // 为所有node指定默认值 fputs("node [shape=circle style=filled fillcolor=\"#000080\" color=red fontcolor=yellow]\n", fp); // By default, DOT assumes the UTF-8 character encoding. 不需要也不认BOM. // Another way to avoid non-ASCII characters in labels is to use HTML entities for special characters. // <ε>是用<>括起来的an HTML entity, "", "\xce\xb5",123, _123等也行 fputs("<> [shape=none width=0.0 height=0.0]\n", fp); // 没有圆圈的- start -> fprintf(fp, "<> -> %d [label=start]\n", start->id); fprintf(fp, "%d [peripheries=2]\n", end->id); // end2个圆圈,n个也行 memset(State::_visited, 0, sizeof(State::_visited)), visit4p(start, fp); fputs("}", fp), fclose(fp); } void NFA::visit4p(const State* st, FILE* fp) { if (State::_visited[st->id]) return; State::_visited[st->id] = 1; for (int i = 0; i < 2; i++) { if (State* p = st->next[i]) { char label[16]; if (st->ch == EPSILON) strcpy(label, ""); else sprintf(label, "%c", st->ch); printf("%d - %s -> %d\n", st->id, label, p->id); if (st->ch == EPSILON) strcpy(label, "ε"); // https://www.graphviz.org/docs/attr-types/arrowType/ fprintf(fp, "%d -> %d [label=<%s> arrowhead=vee]\n", st->id, p->id, label); visit4p(p, fp); } } }
DOT有动画版多好。ffmpeg可以把一系列图片转换成视频或动画GIF. style=invis可以隐藏元素,但依然占位。invisible.
github.com/gdevic/minix1 有较短的regexp.c和regsub.c. re编译后也可以叫做一个程序,由虚拟机来执行。
也许可以用bison写个解析正则表达式语法并生成自动机的程序。:-) [:alnum:] \w [^\040]{1,5} ...
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 展开说说关于C#中ORM框架的用法!
2021-12-22 Logic synthesis和仿真