从文法到解析器的所有算法
从文法到解析器的所有算法
最近完成了替代Lex+YACC的自动生成词法分析器+语法分析器的项目,暂且命名为bitParser。想拥有自己的解析器的小伙伴可以将文法给我,送解析器。
本文以下面是一个支持加减乘除和括号的四则运算的文法Calc.st为例:
// 输入文件Calc.st
Exp : Exp '+' Term
| Exp '-' Term
| Term ;
Term : Term '*' Factor
| Term '/' Factor
| Factor ;
Factor : '(' Exp ')'
| 'number' ;
%%[0-9]+%% 'number' // 示例只处理非负整数
//无须书写 %%[+]%% '+' 等
点击查看 Calc.st 显式版
#extractor <Calc.st.ext>
// 8 regulations:
Exp :
Exp '+' Term // [0] [0]
| Exp '-' Term // [1] [1]
| Term ; // [2] [2]
Term :
Term '*' Factor // [0] [3]
| Term '/' Factor // [1] [4]
| Factor ; // [2] [5]
Factor :
'(' Exp ')' // [0] [6]
| 'number' ; // [1] [7]
// 1 token statements:
%%[0-9]+%% 'number' // [0]
// 0 precedences
// options
%grammarName Exp
%start Exp
%blockComment off
%inlineComment off
%validScopeChars [\u0001-\uFFFF]
%validGlobalChars [\u0001-\uFFFF]
编译原理中的语法分析器
基础结构
nullable
判断一个Symbol[]
是否可能推导出空串 ε(即什么都没有推导出来)?
点击查看 nullable
// 计算所有可能推导出空(ε)empty的结点。
/// get the dictionary that tells if a symbol(Vn/Vt) can refer to ε.
/// <summary>
/// Builds a dictionary telling, for every symbol (Vn and Vt), whether it can derive ε.
/// Terminals never derive ε; nonterminals are discovered by fixed-point iteration.
/// </summary>
static Dictionary<Symbol, bool> GetNullableDict(YieldContext context) {
    var nullableDict = new Dictionary<Symbol, bool>();
    // Seed: nothing is nullable until proven otherwise.
    // All Vn symbols (including S').
    foreach (var vn in context.eVns) { nullableDict.Add(vn, false); }
    // All Vt symbols (including Symbol.EOF '¥') valid for syntax-parse.
    foreach (var vt in context.eSyntaxVts) { nullableDict.Add(vt, false); }
    // Fixed point: if a production's whole right side is nullable, its left side is nullable too.
    bool dirty;
    do {
        dirty = false;
        foreach (var regulation in context.extendedRegulationDrafts) {
            var left = regulation.left;
            if (nullableDict[left]) { continue; } // already known to be nullable
            if (CanBeEmpty(regulation.right, nullableDict)) {
                nullableDict[left] = true;
                dirty = true;
            }
        }
    } while (dirty);
    return nullableDict;
}
// list是否都能产生ε?
// Can every node of the whole list derive ε (i.e. can the sequence vanish)?
static bool CanBeEmpty(VNodesKey nodeList, Dictionary<Symbol, bool> nullableDict)
    => CanBeEmpty(nodeList, 0, nodeList.Length, nullableDict);
// list是否都能产生ε?
// Can every node of the whole list derive ε (i.e. can the sequence vanish)?
static bool CanBeEmpty(IReadOnlyList<Symbol> nodeList, Dictionary<Symbol, bool> nullableDict)
    => CanBeEmpty(nodeList, 0, nodeList.Count, nullableDict);
// list中指定的某一段结点是否都能产生ε?
// Can the segment [checkIndex, checkIndex + checkCount) of the list derive ε?
// An empty segment (checkCount == 0) trivially can.
static bool CanBeEmpty(VNodesKey nodeList, int checkIndex, int checkCount, Dictionary<Symbol, bool> nullableDict) {
    for (int offset = 0; offset < checkCount; offset++) {
        // one non-nullable node makes the whole segment non-nullable
        if (!nullableDict[nodeList[checkIndex + offset]]) { return false; }
    }
    return true;
}
// list中指定的某一段结点是否都能产生ε?
// Can the segment [checkIndex, checkIndex + checkCount) of the list derive ε?
// An empty segment (checkCount == 0) trivially can.
static bool CanBeEmpty(IReadOnlyList<Symbol> nodeList, int checkIndex, int checkCount, Dictionary<Symbol, bool> nullableDict) {
    for (int offset = 0; offset < checkCount; offset++) {
        // one non-nullable node makes the whole segment non-nullable
        if (!nullableDict[nodeList[checkIndex + offset]]) { return false; }
    }
    return true;
}
FIRST
这是天书般的解释:若文法G为二型文法且不含左递归,则G的非终结符的每个候选式α的终结首符集FIRST(α)为FIRST(α) = { a | α经过0或多步推导为a...的形式,其中a∈Vt }
这是我的理解:FIRST集的含义是:候选式经过推导,最后就是一个终结符的串,推导过程不同,会有多个不同的串(可能是无限个),这些串里的第一个字符组成的集合就是这个候选式的FIRST集。有了这个FIRST集,就可以知道这个候选式是否能匹配接下来要解析的单词流了。
也就是说,给定Symbol
数组,在它能推导出的所有产生式中,第一个Symbol
都有哪些?
算法的基本思路是:所有的Vt,其FIRST都是Vt自己。以此为基础,一遍一遍地找到全部FIRST,直至找不到新的a。
点击查看 计算文法的FIRST集
// returns the dictionary of FIRST.target -> FIRST
// returns the dictionary of FIRST.target -> FIRST
// Single symbols are computed first; production right-hand sides build on top of them.
internal static Dictionary<VNodesKey, FIRST> CalcFIRSTDict(
    YieldContext context, Dictionary<Symbol, bool> nullableDict) {
    var firstDict = new Dictionary<VNodesKey, FIRST>();
    CalcFIRSTDict4Node(firstDict, context, nullableDict);
    CalcFIRSTDict4Right(firstDict, context, nullableDict);
    return firstDict;
}
// 计算各个文法的right的FIRST集
private static void CalcFIRSTDict4Right(Dictionary<VNodesKey, FIRST> result,
YieldContext context, Dictionary<Symbol, bool> nullableDict) {
// allocate space for every regulationDraft.right
var rightDict = new Dictionary<VNodesKey, FIRST>();
var eVtKinds = context.eSyntaxVts.Count;// all Vt symbols(include Symbol.EOF('¥')) valid for sytax-parse.
foreach (var regulationDraft in context.extendedRegulationDrafts) {
var target = new VNodesKey(regulationDraft.right);
if (!result.TryGetValue(target, out var _)) {
var values = new Bits(length: eVtKinds); // not filled up yet.
var first = new FIRST(target, hasEmpty: false, values);
result.Add(first.target, first);
rightDict.Add(first.target, first);
}
}
bool changed = false;
do {
changed = false;
foreach (var first in rightDict.Values) {
var target = first.target; var count = target.Length; var allEmpty = true;
for (int checkLength = 0; checkLength < count; checkLength++) {
// 如果前checkLength个结点都可为ε,
// 就说明 FIRST( target ) 包含 FIRST( target[checkLength] ),ε除外。
const int checkIndex = 0;
if (CanBeEmpty(target, checkIndex, checkLength, nullableDict)) {
Symbol refKey = target[checkLength];
if (!result.TryGetValue(new VNodesKey(refKey), out var refFirst)) { throw new Exception(Consts.algorithmError); }
if (first.TryInsert(refFirst.values)) { changed = true; }
}
else { allEmpty = false; break; }
}
if (allEmpty) {
if (!first.hasEmpty) {
// 如果target的全部结点都可为ε,就说明FIRST( target ) 包含ε。
if (CanBeEmpty(target, nullableDict)) {
first.hasEmpty = true;
changed = true;
}
}
}
}
} while (changed);
}
// 计算文法的所有单个的结点的FIRST
// Computes FIRST for every single symbol (each Vn and each Vt) of the grammar.
// FIX: the original body referenced undeclared locals 'first'/'refFirst' inside the
// fixed-point loop (it could not compile); both are now looked up from 'result',
// mirroring the pattern used in CalcFIRSTDict4Right.
private static void CalcFIRSTDict4Node(Dictionary<VNodesKey, FIRST> result,
    YieldContext context, Dictionary<Symbol, bool> nullableDict) {
    IReadOnlyList<RegulationDraft> eRegulationDrafts = context.extendedRegulationDrafts;
    // allocate space for every single symbol.
    var eVtKinds = context.eSyntaxVts.Count;
    // initialize FIRST( Vn ): no Vt bits yet; ε-membership comes straight from nullableDict.
    foreach (var Vn in context.eVns) {
        var values = new Bits(length: eVtKinds);
        var containsEmpty = nullableDict[Vn];
        var first = new FIRST(target: Vn, containsEmpty, values);
        result.Add(first.target, first);
    }
    // initialize FIRST( Vt ) = { Vt } (these are already complete).
    foreach (var Vt in context.eSyntaxVts) {
        var values = new Bits(length: eVtKinds);
        var index = context.GeteSyntaxVtIndex(Vt); values.Set(index, true);
        var first = new FIRST(target: Vt, hasEmpty: false, values);
        result.Add(first.target, first);
    }
    // iterate until nothing changes (fixed point).
    bool changed = false;
    do {
        changed = false;
        foreach (var regulationDraft in eRegulationDrafts) {
            var left = regulationDraft.left; var right = regulationDraft.right;
            if (!result.TryGetValue(new VNodesKey(left), out var first)) { throw new Exception(Consts.algorithmError); }
            // try to collect FIRST( left )
            var allEmpty = true;
            for (int endPosition = 0; endPosition < right.Count; endPosition++) {
                // if the first endPosition nodes can all be ε,
                // then FIRST(left) includes FIRST(right[endPosition]), ε excepted.
                if (CanBeEmpty(right, 0, endPosition, nullableDict)) {
                    var refKey = right[endPosition];
                    if (left != refKey) { // FIRST(X) trivially includes itself; skip the self-reference
                        if (!result.TryGetValue(new VNodesKey(refKey), out var refFirst)) { throw new Exception(Consts.algorithmError); }
                        if (first.TryInsert(refFirst.values)) { changed = true; }
                    }
                }
                else { allEmpty = false; break; }
            }
            if (allEmpty && !first.hasEmpty) {
                // if the whole right side can be ε, then ε ∈ FIRST(left).
                if (CanBeEmpty(right, nullableDict)) {
                    first.hasEmpty = true;
                    changed = true;
                }
            }
        }
    } while (changed);
}
FOLLOW
这是天书般的解释:设上下文无关文法(二型文法)G,开始符号为S,对于G中的任意非终结符A,其FOLLOW(A) = { a | S 经过0或多步推导会出现 ...Aa...的形式,其中a∈Vt或 '¥'
号 }
这是我的理解:非终结符Vn的FOLLOW集:在所有可能的推导中,可能紧跟在Vn后面出现的所有终结符(包括'¥')。
这是天书般的算法:
令'¥'∈FOLLOW(S)
若文法G中有形如A –> αBβ的规则,且β≠ε,则将FIRST(β)中的一切终结符Vt(ε除外)加入FOLLOW(B)
若文法G中有形如A -> αB或A -> αBβ的规则,且ε∈FIRST(β),则将FOLLOW(A)中的全部元素加入FOLLOW(B)
反复使用前两条规则,直到所有的FOLLOW集都没有改变。
这是我的代码:
点击查看 计算文法的FOLLOW集
// 计算文法的FOLLOW集
// returns the dictionary of FOLLOW.Vn -> FOLLOW
// Computes the FOLLOW set of every nonterminal.
// returns the dictionary of FOLLOW.Vn -> FOLLOW
// FIX: the original body referenced undeclared locals 'follow', 'first' and 'refFollow'
// (it could not compile); they are now resolved from 'result' and 'firstDict'.
internal static Dictionary<Symbol/*FOLLOW.Vn*/, FOLLOW> CalcFOLLOWDict(
    YieldContext context,
    Dictionary<Symbol, bool> emptyDict, Dictionary<VNodesKey, FIRST> firstDict) {
    var result = new Dictionary<Symbol/*FOLLOW.Vn*/, FOLLOW>();
    // Initialize the FOLLOW dict: one empty FOLLOW per Vn; FOLLOW(S') starts with '¥'.
    var EOFIndex = context.GeteSyntaxVtIndex(Symbol.EOF);
    var eVtKinds = context.eSyntaxVts.Count;// all Vt symbols(include Symbol.EOF('¥')) valid for syntax-parse.
    foreach (var Vn in context.eVns) {
        var follow = new FOLLOW(Vn, eVtKinds);
        if (Vn == context.extendedStartNode) {
            // add '¥' to S' : S ; '¥'
            follow.TryInsert(EOFIndex);
        }
        result.Add(follow.Vn, follow);
    }
    // iterate until nothing changes (fixed point).
    bool changed = false;
    do {
        changed = false;
        foreach (var regulationDraft in context.extendedRegulationDrafts) {
            var right = regulationDraft.right; var count = right.Count;
            // FOLLOW(left) flows into FOLLOW(B) whenever everything after B can be ε.
            var refFollow = result[regulationDraft.left];
            for (var endPosition = 0; endPosition < count; endPosition++) {
                Symbol target = right[endPosition];
                if (target.kind == Symbol.Kind.Vt) { continue; } // terminals have no FOLLOW
                // collect FOLLOW elements for 'target'
                var follow = result[target];
                var checkIndex = endPosition + 1; var allEmpty = true;
                for (var checkCount = 0; checkCount < count - checkIndex; checkCount++) {
                    // if right[checkIndex -> (checkIndex+checkCount-1)] can all be ε,
                    // then FOLLOW( target ) includes FIRST( right[checkIndex+checkCount] ) except ε.
                    if (CanBeEmpty(right, checkIndex, checkCount, emptyDict)) {
                        Symbol key = right[checkIndex + checkCount];
                        if (!firstDict.TryGetValue(new VNodesKey(key), out var first)) { throw new Exception(Consts.algorithmError); }
                        if (follow.TryInsert(first.values)) { changed = true; }
                    }
                    else { allEmpty = false; break; }
                }
                if (allEmpty) {
                    // if every node after 'target' can be ε, then FOLLOW( target ) includes FOLLOW( regulation.left )
                    if (follow != refFollow) {
                        var checkCount = count - checkIndex;
                        if (CanBeEmpty(right, checkIndex, checkCount, emptyDict)) {
                            if (follow.TryInsert(refFollow.values)) { changed = true; }
                        }
                    }
                }
            }
        }
    } while (changed);
    return result;
}
从产生式到LL(1)语法分析表
这是天书般的解释:
点击查看 构造LL(1)分析表的算法
输入:文法G
输出:G的LL(1)分析表M(Ax, ay),其中A为非终结符,a为终结符
算法:
求出G的FIRST集和FOLLOW集
for (G的每个产生式 A -> γ1 | γ2 | ... | γm) {
if ( a ∈ FIRST(γi)) 置 M(A, a) 为 “A -> γi”
if ( ε ∈ FIRST(γi))
for (每个 a ∈ FOLLOW(A))
置 M(A, a)为 “A -> γi”(实际上此处的γi都是ε)
}
置所有无定义的 M(A, a)为出错。
这是我的代码:
点击查看 用LL(1)分析法得到分析表
// 用LL(1)分析法得到分析表
// Builds the LL(1) parse table from the FIRST and FOLLOW sets:
// M(Vn, a) = production for every a in FIRST(right); if ε ∈ FIRST(right),
// also for every a in FOLLOW(Vn).
internal static LL1SyntaxInfo CalcLL1SyntaxInfo(YieldContext context,
    Dictionary<Symbol, FOLLOW> eFOLLOWDict, Dictionary<VNodesKey, FIRST> eFIRSTDict) {
    var regulationDrafts = context.grammar.regulationDrafts;
    var table = new LL1ParseTableDraft();
    for (int regulationId = 0; regulationId < regulationDrafts.Count; regulationId++) {
        var regulation = regulationDrafts[regulationId];
        var Vn = regulation.left;
        var first = eFIRSTDict[new VNodesKey(regulation.right)]; // FIRST( regulation.right )
        var actionDraft = new LL1ParseActionDraft(regulation);
        FillLL1Row(table, Vn, first.values, actionDraft, context);
        if (first.hasEmpty) {
            FillLL1Row(table, Vn, eFOLLOWDict[Vn].values, actionDraft, context);
        }
    }
    return new LL1SyntaxInfo(table);
}
// Sets table[Vn, Vt] = actionDraft for every Vt whose bit is set in 'bits'.
private static void FillLL1Row(LL1ParseTableDraft table, Symbol Vn, Bits bits,
    LL1ParseActionDraft actionDraft, YieldContext context) {
    for (int t = 0; t < bits.length; t++) {
        if (bits.Get(t)) {
            table.SetAction(Vn, context.GeteSyntaxVt(t), actionDraft);
        }
    }
}
从产生式到LR(1)语法分析表
点击查看 从产生式到LR(1)语法分析表
// 用LR分析法计算stateList、edgeList、分析表LRTableDraft
// LR analysis: computes stateList, edgeList and the LR parse table (LRTableDraft).
internal static LRSyntaxInfo CalcLRSyntaxInfo(
    LRParseContext LRContext, YieldContext context) {
    var states = new CoupleList<LRState>(LRContext.stateComparer);
    var edges = new CoupleList<LREdge>(LRContext.edgeComparer);
    // build the item-set automaton, then flatten it into a table
    CalcSyntaxStates(states, edges, LRContext, context);
    var table = CalcLRTableDraft(states, edges, LRContext.vtsProvider, context);
    return new LRSyntaxInfo(states, edges, table);
}
// Builds the LR item-set automaton with a worklist:
// start from the closure of { S' : ⏳ S, '¥' }, then repeatedly Goto over every
// symbol that appears after a dot, inserting new states/edges as they are found.
private static void CalcSyntaxStates(CoupleList<LRState> stateList, CoupleList<LREdge> edgeList, LRParseContext LRContext, YieldContext context) {
var beta2FIRST = new Dictionary<VNodesKey, FIRST>();
var queue = new Queue<LRState>();
{
var eVtKinds = context.eSyntaxVts.Count;// all Vt symbols(include Symbol.EOF('¥')) valid for syntax-parse.
var bitsEOF = new Bits(length: eVtKinds);
var EOFIndex = context.GeteSyntaxVtIndex(Symbol.EOF); bitsEOF.Set(EOFIndex, true);
// extendedRegulationDrafts[0] is the augmented start production S' : S
var eRegulationDraft = context.extendedRegulationDrafts[0];
var firstState = new LRState(index: 0);
firstState.AddMerge(eRegulationDraft, dotPosition: 0, lookaheads: bitsEOF, out var _, LRContext);
Algo.Closure(firstState, beta2FIRST, LRContext, context);
var against = stateList.TryInsert(firstState);
queue.Enqueue(firstState);
}
while (queue.Count > 0) {
var subject = queue.Dequeue();
// symbols already processed for this state (each symbol is handled once)
var dealt = new List<Symbol>(subject.ItemGroups.Count);
foreach (var itemGroup in subject.ItemGroups) {
Symbol? symbol = itemGroup.symbolAfterDot;
if ((symbol is not null) // this symbol is not dealt with yet
&& (Algo.TryBinaryInsert(dealt, symbol, out var _, Symbol.Comparer) is null)) {
var to = Algo.Goto(subject, symbol, LRContext, context);
Algo.Closure(to, beta2FIRST, LRContext, context);
var against = stateList.TryInsert(to);// merge into the existing state list if an equal state is already there
LRState? whichTo = null;
if (against is not null) { whichTo = against; }
else {
// genuinely new state: assign its index and schedule it for expansion
to.index = stateList.Count - 1; queue.Enqueue(to);
whichTo = to;
}
var edge = new LREdge(subject, symbol, whichTo);
var against2 = edgeList.TryInsert(edge); // duplicate edges are silently dropped
}
}
}
}
// LR的Closure操作。
// 补全一个状态。
// The LR Closure operation: completes a state.
// For every item A : α ⏳ B β with a Vn after the dot, adds all productions
// B : ⏳ ... ; when lookaheads are wanted (LR(1)/LALR(1)), their lookahead set is
// FIRST(β), plus the item's own lookaheads when β can derive ε.
// beta2FIRST caches FIRST(β) across calls.
private static void Closure(this LRState subject,
Dictionary<VNodesKey, FIRST> beta2FIRST,
LRParseContext LRContext, YieldContext context) {
var queue = new Queue<LRItemGroup>();
// seed the worklist with every item that has a Vn after its dot
foreach (var itemGroup in subject.ItemGroups) {
Symbol? symbol = itemGroup.symbolAfterDot;
if (symbol != null && symbol.kind == Symbol.Kind.Vn) {
queue.Enqueue(itemGroup);
}
}
while (queue.Count > 0) {
var itemGroup = queue.Dequeue();
Bits lookaheads;
if (LRContext.wantLookaheads) {
VNodesKey beta = itemGroup.beta;// A : α ⏳ Symbol β ; 'z'
if (beta.Length > 0) {
// lookaheads = FIRST(β); cache the computation per β
if (!beta2FIRST.TryGetValue(beta, out var first)) {
first = CalcFIRST(beta, context);
beta2FIRST.Add(beta, first);
}
if (first.hasEmpty) { // β can vanish: FIRST(β) ∪ itemGroup's own lookaheads
lookaheads = new Bits(first.values);
lookaheads.Or(itemGroup.lookaheads);
}
else { lookaheads = first.values; }
}
else { lookaheads = itemGroup.lookaheads; } // β empty: inherit lookaheads unchanged
}
else {
// LR(0)-style: no lookaheads wanted, use an all-zero set
var bitLength = context.eSyntaxVts.Count;
lookaheads = new Bits(bitLength); // 0 lookahead
}
// expand every production of the Vn after the dot
var regulationDrafts = context.left2RegulationDrafts[itemGroup.symbolAfterDot];
foreach (var regulationDraft in regulationDrafts) {
const int dotPosition = 0;
var position = subject.AddMerge(regulationDraft, dotPosition,
lookaheads, out var updated, LRContext);
if (updated) {
// the (possibly merged) item changed: re-process it if its dot precedes a Vn
Symbol? symbol = position.symbolAfterDot;
if (symbol != null && symbol.kind == Symbol.Kind.Vn) {
if (!queue.Contains(position)) { queue.Enqueue(position); }
}
}
}
}
}
// LR的Goto操作。
// 将⏳移到所有LR项中的符号<paramref name="symbol"/>之后。
// The LR Goto operation: builds the successor state by advancing the dot (⏳)
// past <paramref name="symbol"/> in every item of <paramref name="subject"/> that expects it.
private static LRState Goto(this LRState subject, Symbol symbol,
    LRParseContext LRContext, YieldContext context) {
    var target = new LRState(index: -1); // -1 means not ready; caller assigns the real index.
    foreach (var group in subject.ItemGroups) {
        if (group.symbolAfterDot != symbol) { continue; }
        target.AddMerge(group.regulationDraft, group.dotPosition + 1,
            group.lookaheads, out var _, LRContext);
    }
    return target;
}
// organize stateList and edgeList into a table.
// Flattens stateList and edgeList into an LR parse table.
// Edges become Goto (on Vn) / Shift (on Vt) actions; completed items
// (dot at the end) become Accept (for S') or Reduce actions.
private static LRTableDraft CalcLRTableDraft(
    CoupleList<LRState> stateList, CoupleList<LREdge> edgeList,
    ILRTableVtsProvider VtsProvider, YieldContext context) {
    var table = new LRTableDraft(stateCount: stateList.Count);
    // goto / shift-in actions from edges
    foreach (var edge in edgeList) {
        var from = edge.from;
        switch (edge.symbol.kind) {
            case Symbol.Kind.Vn: // goto action
                table.SetAction(from, edge.symbol,
                    new LRParseActionDraft(LRParseActionDraft.Kind.Goto, edge.to), context);
                break;
            case Symbol.Kind.Vt: // shift in action
                table.SetAction(from, edge.symbol,
                    new LRParseActionDraft(LRParseActionDraft.Kind.Shift, edge.to), context);
                break;
            default: throw new NotImplementedException();
        }
    }
    // accept / reduce actions from completed items
    var eStart = context.extendedStartNode; // the S' in many books.
    foreach (var state in stateList) {
        foreach (var itemGroup in state.ItemGroups) {
            if (itemGroup.symbolAfterDot is not null) { continue; } // dot not at the end
            if (itemGroup.regulationDraft.left == eStart) {
                // S' : S ⏳ on '¥' means accept
                table.SetAction(state, Symbol.EOF, LRParseActionDraft.accept, context);
                continue;
            }
            var reduce = new LRParseActionDraft(itemGroup.regulationDraft);
            var bits = VtsProvider.GetVts(itemGroup); // the Vts that trigger this reduce
            for (int i = 0; i < bits.length; i++) {
                if (!bits.Get(i)) { continue; }
                table.SetAction(state, context.GeteSyntaxVt(i), reduce, context);
            }
        }
    }
    return table;
}
从LR(1)语法分析表到LALR(1)语法分析表
既然已经有了LR(1)语法分析表,那么再计算LALR(1)语法分析表就有了简便方法。原因是:LALR(1)分析法与LR(1)分析法的区别仅在于:两个LR(1)的State里,某个项中的lookAhead不同时,那就意味着这是不同的State;而两个LALR(1)的State里,各个项中的lookAhead不同时,仍旧表示同一个State。也就是说,只需将LR(1)的各个State中,对应项相同而lookAhead不同的State合并,就得到了LALR(1)分析表的State。
点击查看 LALR(1)简便算法
// 用LR(1)分析法得到的信息快速得到LALR(1)的Edge、State、分析表
// Fast path: derive the LALR(1) Edge/State/table from already-computed LR(1) info
// by merging LR(1) states whose items differ only in lookaheads.
// FIX: the original referenced 'context' (comparers, vtsProvider, Absorb argument)
// without declaring it anywhere; it is now an explicit parameter. (The original did
// not compile, so completing the signature is the minimal repair.)
static LRSyntaxInfo GetLALR1SyntaxInfo(
    LRSyntaxInfo _LRSyntaxInfo, // LR(1) syntax info
    IReadOnlyList<RegulationDraft> eRegulations,
    LRParseContext context) {
    var stateList = new CoupleList<LRState>();
    // a LALR(1) state is 1/more LR(1) states.
    // so, let's merge LR(1) states into LALR(1) state.
    var LRState2LALR1State = new Dictionary<LRState, LRState>();
    foreach (var state in _LRSyntaxInfo.stateList) {
        var mentor = Absorb(stateList, state, context);
        // 'state'(LR(1)) is merged into 'mentor'(LALR(1))
        LRState2LALR1State.Add(state, mentor);
    }
    // re-point every LR(1) edge at the merged states; TryInsert drops duplicates.
    var edgeList = new CoupleList<LREdge>(_LRSyntaxInfo.edgeList.Count, context.edgeComparer);
    foreach (var edge in _LRSyntaxInfo.edgeList) {
        var from = LRState2LALR1State[edge.from];
        var to = LRState2LALR1State[edge.to];
        // NOTE(review): the original read 'edge.V'; LREdge is built as (from, symbol, to)
        // elsewhere in this file, so 'symbol' is used here — TODO confirm the member name.
        var edge2 = new LREdge(from, edge.symbol, to);
        edgeList.TryInsert(edge2);
    }
    var eEnd = VNode.endOfTokenList;
    LRTableDraft table = GetLRTableDraft(stateList, edgeList, context.vtsProvider, eRegulations, eEnd);
    var result = new LRSyntaxInfo(stateList, edgeList, table);
    return result;
}
// either some state in LALR1StateList absorbs/merges LR1State's lookAheads,
// or a new LALR(1) state which consists of LR1State is generated.
// either some state in LALR1StateList absorbs/merges LR1State's lookAheads,
// or a new LALR(1) state which consists of LR1State is generated.
// Uses a binary search over orderList (kept sorted by context.stateComparer);
// TryAbsorb both compares AND merges on equality, so a hit during the search
// already completes the absorption.
static LRState Absorb(CoupleList<LRState> LALR1StateList, LRState LR1State, LRParseContext context) {
var addList = LALR1StateList.addList; var orderList = LALR1StateList.orderList;
LRState? mentor = null; var index = orderList.Count;
int left = 0, right = orderList.Count - 1;
if (right < 0) {
// empty list: LR1State founds the first LALR(1) state
// NOTE(review): 'mentor.index = index' repeats what the constructor arg already did — presumably harmless.
mentor = new LRState(index);
mentor.index = index;
foreach (var itemGroup in LR1State.ItemGroups) {
// NOTE(review): property is spelled 'lookAheads' here but 'lookaheads' in the LR(1) snippets — TODO confirm.
mentor.TryExpand(itemGroup.regulationDraft, itemGroup.dotPosition, itemGroup.lookAheads);
}
addList.Add(mentor); orderList.Add(mentor);
}
else {
// binary search for an equal-core state
while (left < right) {
int mid = (left + right) / 2;
var current = orderList[mid];
var result = TryAbsorb(current, LR1State, context);
if (result < 0) { right = mid; }
else if (result == 0) { left = mid; right = mid; mentor = current; } // found & absorbed
else { left = mid + 1; }
}
if (mentor is null) {
// narrowed to one candidate: check it directly
var LALR1State = orderList[left];
var result = TryAbsorb(LALR1State, LR1State, context);
if (result == 0) { /* already inserted into 'current' */ mentor = LALR1State; }
else {
// no equal state exists: create a new LALR(1) state at the sorted position
mentor = new LRState(index);
foreach (var itemGroup in LR1State.ItemGroups) {
mentor.TryExpand(itemGroup.regulationDraft, itemGroup.dotPosition, itemGroup.lookAheads);
}
addList.Add(mentor); orderList.Insert(result < 0 ? left : left + 1, mentor);
}
}
}
return mentor;
}
// try to absorb state if this is equal state to state
// only used in LALR(1) syntax parse.
// Compares the two states with context.stateComparer; when they have equal cores
// (comparer returns 0), merges every item group's lookaheads of LR1State into
// LALR1State. Returns the comparison result either way.
// only used in LALR(1) syntax parse.
static int TryAbsorb(LRState LALR1State, LRState LR1State, LRParseContext context) {
    int order = context.stateComparer(LALR1State, LR1State);
    if (order != 0) { return order; } // different cores: nothing to absorb
    // equal states should be absorbed.
    foreach (var itemGroup in LR1State.ItemGroups) {
        DoAbsorb(LALR1State.OrderList, itemGroup, context);
    }
    return 0;
}
// Finds (by binary search) the item group in 'list' with the same core
// (regulation + dot position) as 'itemGroup', then merges itemGroup's lookaheads into it.
// NOTE(review): if no matching item group is found, 'against' stays null and the final
// loop throws a NullReferenceException — presumably callers guarantee a match
// (TryAbsorb only calls this for equal-core states); TODO confirm.
static void DoAbsorb(IReadOnlyList<LRItemGroup> list, LRItemGroup itemGroup, LRParseContext context) {
LRItemGroup? against = null; var keyIndex = -1;
var left = 0; var right = list.Count - 1;
if (right >= 0) {
var result = -1;
while (left < right) {
var mid = (left + right) / 2;
var current = list[mid];
result = context.itemGroupAbsorber(current, itemGroup.regulationDraft, itemGroup.dotPosition);
if (result < 0) { right = mid; }
else if (result == 0) { left = mid; right = mid; against = current; keyIndex = left; }
else { left = mid + 1; }
}
// the loop may end without testing list[left]; check the final candidate
if (result != 0) {
var current = list[left];
result = context.itemGroupAbsorber(current, itemGroup.regulationDraft, itemGroup.dotPosition);
if (result == 0) { against = current; keyIndex = left; }
}
}
// merge the lookaheads into the matching item group
foreach (var lookAhead in itemGroup.lookAheads) {
against.lookAheads.TryInsert(lookAhead);
}
}
从LALR(1)语法分析表到语法分析器的C#代码
LALR(1)语法分析表记录的是在某个LALR(1)状态下,遇到某个V(Vn或Vt)时,应当跳转到哪个状态。在跳转过程中,有时应当用某个产生式规约,有时应当移进Vt,如果遇到'¥'
,则应当结束。
对应的语法分析器C#源代码,应当将LALR(1)语法分析表记录下来,这实际上是若干字典对象Dictionary<int/*V*/, LRParseAction>
。
点击查看 用LALR(1)分析表建立的语法分析器
// Number of LALR(1) states generated for the Calc grammar.
const int syntaxStateCount = 16;
/// <summary>
/// LALR(1) syntax parse table: one dictionary per state, mapping a node type
/// (Vn or Vt id) to the parse action to take in that state.
/// </summary>
private static readonly Dictionary<int/*Node.type*/, LRParseAction>[]
syntaxStates = new Dictionary<int/*Node.type*/, LRParseAction>[syntaxStateCount];
/// <summary>
/// Fills <see cref="syntaxStates"/> with the generated LALR(1) actions for the Calc grammar.
/// The item set of each state (with lookaheads after ☕) is listed in the comments
/// right above its entries. Generated code — do not edit by hand.
/// </summary>
private static void InitializeSyntaxStates() {
var states = CompilerExp.syntaxStates;
// 78 actions
// conflicts(0)=not solved(0)+solved(0)(0 warnings)
#region create objects of syntax states
states[0] = new(capacity: 5);
states[1] = new(capacity: 3);
states[2] = new(capacity: 6);
states[3] = new(capacity: 6);
states[4] = new(capacity: 5);
states[5] = new(capacity: 6);
states[6] = new(capacity: 4);
states[7] = new(capacity: 4);
states[8] = new(capacity: 3);
states[9] = new(capacity: 3);
states[10] = new(capacity: 3);
states[11] = new(capacity: 6);
states[12] = new(capacity: 6);
states[13] = new(capacity: 6);
states[14] = new(capacity: 6);
states[15] = new(capacity: 6);
#endregion create objects of syntax states
#region re-used actions
LRParseAction aGoto2 = new(LRParseAction.Kind.Goto, states[2]);// refered 2 times
LRParseAction aGoto3 = new(LRParseAction.Kind.Goto, states[3]);// refered 4 times
LRParseAction aShift4 = new(LRParseAction.Kind.Shift, states[4]);// refered 6 times
LRParseAction aShift5 = new(LRParseAction.Kind.Shift, states[5]);// refered 6 times
LRParseAction aShift6 = new(LRParseAction.Kind.Shift, states[6]);// refered 2 times
LRParseAction aShift7 = new(LRParseAction.Kind.Shift, states[7]);// refered 2 times
LRParseAction aShift8 = new(LRParseAction.Kind.Shift, states[8]);// refered 3 times
LRParseAction aShift9 = new(LRParseAction.Kind.Shift, states[9]);// refered 3 times
LRParseAction aReduce2 = new(regulations[2]);// refered 4 times
LRParseAction aReduce5 = new(regulations[5]);// refered 6 times
LRParseAction aReduce7 = new(regulations[7]);// refered 6 times
LRParseAction aReduce0 = new(regulations[0]);// refered 4 times
LRParseAction aReduce1 = new(regulations[1]);// refered 4 times
LRParseAction aReduce3 = new(regulations[3]);// refered 6 times
LRParseAction aReduce4 = new(regulations[4]);// refered 6 times
#endregion re-used actions
// 78 actions
// conflicts(0)=not solved(0)+solved(0)(0 warnings)
#region init actions of syntax states
// syntaxStates[0]:
// [-1] Exp' : ⏳ Exp ;☕ '¥'
// [0] Exp : ⏳ Exp '+' Term ;☕ '-' '+' '¥'
// [1] Exp : ⏳ Exp '-' Term ;☕ '-' '+' '¥'
// [2] Exp : ⏳ Term ;☕ '-' '+' '¥'
// [3] Term : ⏳ Term '*' Factor ;☕ '-' '*' '/' '+' '¥'
// [4] Term : ⏳ Term '/' Factor ;☕ '-' '*' '/' '+' '¥'
// [5] Term : ⏳ Factor ;☕ '-' '*' '/' '+' '¥'
// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' '*' '/' '+' '¥'
// [7] Factor : ⏳ 'number' ;☕ '-' '*' '/' '+' '¥'
states[0]/*0*/.Add(st.@vnExp, new(LRParseAction.Kind.Goto, states[1]));
states[0]/*1*/.Add(st.@vnTerm, aGoto2);
states[0]/*2*/.Add(st.@vnFactor, aGoto3);
states[0]/*3*/.Add(st.@LeftParenthesis符, aShift4);
states[0]/*4*/.Add(st.@number, aShift5);
// syntaxStates[1]:
// [-1] Exp' : Exp ⏳ ;☕ '¥'
// [0] Exp : Exp ⏳ '+' Term ;☕ '-' '+' '¥'
// [1] Exp : Exp ⏳ '-' Term ;☕ '-' '+' '¥'
states[1]/*5*/.Add(st.@Plus符, aShift6);
states[1]/*6*/.Add(st.@Dash符, aShift7);
states[1]/*7*/.Add(st.@终, LRParseAction.accept);
// syntaxStates[2]:
// [2] Exp : Term ⏳ ;☕ '-' ')' '+' '¥'
// [3] Term : Term ⏳ '*' Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [4] Term : Term ⏳ '/' Factor ;☕ '-' ')' '*' '/' '+' '¥'
states[2]/*8*/.Add(st.@Asterisk符, aShift8);
states[2]/*9*/.Add(st.@Slash符, aShift9);
states[2]/*10*/.Add(st.@Dash符, aReduce2);
states[2]/*11*/.Add(st.@RightParenthesis符, aReduce2);
states[2]/*12*/.Add(st.@Plus符, aReduce2);
states[2]/*13*/.Add(st.@终, aReduce2);
// syntaxStates[3]:
// [5] Term : Factor ⏳ ;☕ '-' ')' '*' '/' '+' '¥'
states[3]/*14*/.Add(st.@Dash符, aReduce5);
states[3]/*15*/.Add(st.@RightParenthesis符, aReduce5);
states[3]/*16*/.Add(st.@Asterisk符, aReduce5);
states[3]/*17*/.Add(st.@Slash符, aReduce5);
states[3]/*18*/.Add(st.@Plus符, aReduce5);
states[3]/*19*/.Add(st.@终, aReduce5);
// syntaxStates[4]:
// [6] Factor : '(' ⏳ Exp ')' ;☕ '-' ')' '*' '/' '+' '¥'
// [0] Exp : ⏳ Exp '+' Term ;☕ '-' ')' '+'
// [1] Exp : ⏳ Exp '-' Term ;☕ '-' ')' '+'
// [2] Exp : ⏳ Term ;☕ '-' ')' '+'
// [3] Term : ⏳ Term '*' Factor ;☕ '-' ')' '*' '/' '+'
// [4] Term : ⏳ Term '/' Factor ;☕ '-' ')' '*' '/' '+'
// [5] Term : ⏳ Factor ;☕ '-' ')' '*' '/' '+'
// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+'
// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+'
states[4]/*20*/.Add(st.@vnExp, new(LRParseAction.Kind.Goto, states[10]));
states[4]/*21*/.Add(st.@vnTerm, aGoto2);
states[4]/*22*/.Add(st.@vnFactor, aGoto3);
states[4]/*23*/.Add(st.@LeftParenthesis符, aShift4);
states[4]/*24*/.Add(st.@number, aShift5);
// syntaxStates[5]:
// [7] Factor : 'number' ⏳ ;☕ '-' ')' '*' '/' '+' '¥'
states[5]/*25*/.Add(st.@Dash符, aReduce7);
states[5]/*26*/.Add(st.@RightParenthesis符, aReduce7);
states[5]/*27*/.Add(st.@Asterisk符, aReduce7);
states[5]/*28*/.Add(st.@Slash符, aReduce7);
states[5]/*29*/.Add(st.@Plus符, aReduce7);
states[5]/*30*/.Add(st.@终, aReduce7);
// syntaxStates[6]:
// [0] Exp : Exp '+' ⏳ Term ;☕ '-' ')' '+' '¥'
// [3] Term : ⏳ Term '*' Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [4] Term : ⏳ Term '/' Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [5] Term : ⏳ Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' '¥'
// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' '¥'
states[6]/*31*/.Add(st.@vnTerm, new(LRParseAction.Kind.Goto, states[11]));
states[6]/*32*/.Add(st.@vnFactor, aGoto3);
states[6]/*33*/.Add(st.@LeftParenthesis符, aShift4);
states[6]/*34*/.Add(st.@number, aShift5);
// syntaxStates[7]:
// [1] Exp : Exp '-' ⏳ Term ;☕ '-' ')' '+' '¥'
// [3] Term : ⏳ Term '*' Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [4] Term : ⏳ Term '/' Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [5] Term : ⏳ Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' '¥'
// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' '¥'
states[7]/*35*/.Add(st.@vnTerm, new(LRParseAction.Kind.Goto, states[12]));
states[7]/*36*/.Add(st.@vnFactor, aGoto3);
states[7]/*37*/.Add(st.@LeftParenthesis符, aShift4);
states[7]/*38*/.Add(st.@number, aShift5);
// syntaxStates[8]:
// [3] Term : Term '*' ⏳ Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' '¥'
// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' '¥'
states[8]/*39*/.Add(st.@vnFactor, new(LRParseAction.Kind.Goto, states[13]));
states[8]/*40*/.Add(st.@LeftParenthesis符, aShift4);
states[8]/*41*/.Add(st.@number, aShift5);
// syntaxStates[9]:
// [4] Term : Term '/' ⏳ Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' '¥'
// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' '¥'
states[9]/*42*/.Add(st.@vnFactor, new(LRParseAction.Kind.Goto, states[14]));
states[9]/*43*/.Add(st.@LeftParenthesis符, aShift4);
states[9]/*44*/.Add(st.@number, aShift5);
// syntaxStates[10]:
// [6] Factor : '(' Exp ⏳ ')' ;☕ '-' ')' '*' '/' '+' '¥'
// [0] Exp : Exp ⏳ '+' Term ;☕ '-' ')' '+'
// [1] Exp : Exp ⏳ '-' Term ;☕ '-' ')' '+'
states[10]/*45*/.Add(st.@RightParenthesis符, new(LRParseAction.Kind.Shift, states[15]));
states[10]/*46*/.Add(st.@Plus符, aShift6);
states[10]/*47*/.Add(st.@Dash符, aShift7);
// syntaxStates[11]:
// [0] Exp : Exp '+' Term ⏳ ;☕ '-' ')' '+' '¥'
// [3] Term : Term ⏳ '*' Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [4] Term : Term ⏳ '/' Factor ;☕ '-' ')' '*' '/' '+' '¥'
states[11]/*48*/.Add(st.@Asterisk符, aShift8);
states[11]/*49*/.Add(st.@Slash符, aShift9);
states[11]/*50*/.Add(st.@Dash符, aReduce0);
states[11]/*51*/.Add(st.@RightParenthesis符, aReduce0);
states[11]/*52*/.Add(st.@Plus符, aReduce0);
states[11]/*53*/.Add(st.@终, aReduce0);
// syntaxStates[12]:
// [1] Exp : Exp '-' Term ⏳ ;☕ '-' ')' '+' '¥'
// [3] Term : Term ⏳ '*' Factor ;☕ '-' ')' '*' '/' '+' '¥'
// [4] Term : Term ⏳ '/' Factor ;☕ '-' ')' '*' '/' '+' '¥'
states[12]/*54*/.Add(st.@Asterisk符, aShift8);
states[12]/*55*/.Add(st.@Slash符, aShift9);
states[12]/*56*/.Add(st.@Dash符, aReduce1);
states[12]/*57*/.Add(st.@RightParenthesis符, aReduce1);
states[12]/*58*/.Add(st.@Plus符, aReduce1);
states[12]/*59*/.Add(st.@终, aReduce1);
// syntaxStates[13]:
// [3] Term : Term '*' Factor ⏳ ;☕ '-' ')' '*' '/' '+' '¥'
states[13]/*60*/.Add(st.@Dash符, aReduce3);
states[13]/*61*/.Add(st.@RightParenthesis符, aReduce3);
states[13]/*62*/.Add(st.@Asterisk符, aReduce3);
states[13]/*63*/.Add(st.@Slash符, aReduce3);
states[13]/*64*/.Add(st.@Plus符, aReduce3);
states[13]/*65*/.Add(st.@终, aReduce3);
// syntaxStates[14]:
// [4] Term : Term '/' Factor ⏳ ;☕ '-' ')' '*' '/' '+' '¥'
states[14]/*66*/.Add(st.@Dash符, aReduce4);
states[14]/*67*/.Add(st.@RightParenthesis符, aReduce4);
states[14]/*68*/.Add(st.@Asterisk符, aReduce4);
states[14]/*69*/.Add(st.@Slash符, aReduce4);
states[14]/*70*/.Add(st.@Plus符, aReduce4);
states[14]/*71*/.Add(st.@终, aReduce4);
// syntaxStates[15]:
// [6] Factor : '(' Exp ')' ⏳ ;☕ '-' ')' '*' '/' '+' '¥'
states[15]/*72*/.Add(st.@Dash符, aReduce6);
states[15]/*73*/.Add(st.@RightParenthesis符, aReduce6);
states[15]/*74*/.Add(st.@Asterisk符, aReduce6);
states[15]/*75*/.Add(st.@Slash符, aReduce6);
states[15]/*76*/.Add(st.@Plus符, aReduce6);
states[15]/*77*/.Add(st.@终, aReduce6);
#endregion init actions of syntax states
}
编译原理中的词法分析器
根据文法Grammar生成词法分析器,其基本原理是:用正则表达式描述解析器中的单词Token,然后将其转换为词法分析器代码。
从正则表达式到ε-NFA
正则表达式是若干个“字符及其重复次数”的list。因此它也可以用一个文法表示。
// Pattern is xxx in %%xxx%%
// xxx is any char between □(32) and ~(126).
// VnRegulations:
Pattern : PreRegex Regex PostRegex ;
PreRegex : 'refVt' | empty ;
PostRegex : '/' Regex | empty ;
Regex : Regex '|' Bunch | Bunch ;
Bunch : Bunch Unit | Unit ;
Unit : 'char' Repeat | '.' Repeat | 'scope' Repeat | '(' Regex ')' Repeat ;
Repeat : '?' | '+' | '*' | 'MinMax' | empty ;
// 'refVt' is <'Vt'> 'Vt' is same with which in Grammar.st
%%\<'([ -&]|\\'|[(-\[]|\\\\|[\]-~])+'>%% 'refVt'
%%\t|\n|\r|[\u0020!"#$%&']%% 'char'
%%\\[()*+]%% 'char'
%%[,-]%% 'char'
%%\\[./]%% 'char'
%%[0-9:;]%% 'char'
%%\\[<]%% 'char'
%%=%% 'char'
%%\\[>]|\\[?]%% 'char'
%%[@A-Z]%% 'char'
%%\\[\[]|\\\\|\\[\]]%% 'char'
%%^|[_`a-z]%% 'char'
%%\\[{|}]%% 'char'
%%~%% 'char'
%%\\u[0-9A-Fa-f]{4}%% 'char'
%%\[([^\]]|\\[\]])+]%% 'scope'
%%\{[,0-9 \t]+}%% 'MinMax'
当处理一个char及其次数时:
点击查看 当处理一个char及其次数时
// Builds the ε-NFA fragment for a single char repeated per 'minmax'
// (handles ?, +, *, {m,n}): a chain of states with char edges, ε shortcuts from
// every optional position to the end, and a self-loop when the max is unbounded.
eNFAFragment Parse(char value, MinMax minmax) {
ICharRange conditionCode = SingleChar.New(value);
var count = minmax.max + 1; // a chain of 'count' eNFAStates represents minmax.max repetitions
if (count <= 0) { count = minmax.min + 1; } // max < 0 means unbounded: chain covers only the mandatory part
var stateList = new eNFAState[count];
stateList[0] = new eNFAState();
for (int i = 1; i < count; i++) {
stateList[i] = new eNFAState();
// consume 'value' to advance one step along the chain
var edge = eNFAEdge.Connect(stateList[i - 1], stateList[i], conditionCode);
}
// repetitions beyond minmax.min are optional: ε edges skip straight to the end
for (int i = minmax.min; i < count - 1; i++) {
var edge = eNFAEdge.Connect(stateList[i], stateList[count - 1], null/*ε edge*/);
}
if (minmax.max < 0) { // if the maximum is unbounded
// self-loop on the last state allows arbitrarily many extra repetitions
var edge = eNFAEdge.Connect(stateList[count - 1], stateList[count - 1], conditionCode);
}
var unit = new eNFAFragment(start: stateList[0], end: stateList[count - 1]);
return unit;
}
当处理其他类型的字符集合时,与此同理,不再赘述。
完整的词法分析器,需要处理Grammar中的所有Token类型,因此,我们必须把各个正则表达式拼合为一个大的正则表达式,如下:
点击查看 拼合所有正则表达式
// get the whole complete ε-NFA of all kinds of tokens for lexical analyze
// get the whole complete ε-NFA of all kinds of tokens for lexical analyze
// Every token pattern's fragment is wired between a shared wholeStart and wholeEnd,
// and Begin/Extend/Accept token scripts are attached to its boundary edges.
public AutomatonInfo GetWholeAutomaton() {
var wholeStart = new eNFAState(0, "wholeStart");
var wholeEnd = new eNFAState(1, "wholeEnd"); wholeEnd.isEnd = true;
var wholeRegex = new eNFAInfo(wholeStart, wholeEnd);
// connect all eNFAInfo together to make a whole complete ε-NFA for lexical analyze.
var id1 = 1u; // running id handed to each copied pattern
var Vt2Pattern4Vts = this.GetVt2Pattern4Vts();
foreach (var pair in Vt2Pattern4Vts) {
var Vt = pair.Key; var pattern4Vts = pair.Value;
for (int i = 0; i < pattern4Vts.Count; i++) { //bits.length == VtInfo.pattern4Vts.Count
var pattern4Vt = pattern4Vts[i]; var preVt = pattern4Vt.pattern.preVt;
// copy the pattern so each Vt gets its own states; regex/postRegex are the two halves
pattern4Vt.pattern.xCopy(id1++, out var regex, out var postRegex);
{ // connect preENFA & tokenScript
var closeStart = eNFAEdge.Connect(wholeStart, regex.start);
var closeEnd = eNFAEdge.Connect(postRegex.end, wholeEnd);
}
foreach (var signalCondition in pattern4Vt.signalConditions) {
{
// entering the pattern starts a new token
var script = new TokenScript(TokenScript.Kind.BeginToken, signalCondition, preVt, Vt, pattern4Vt.nextSignal);
foreach (var edge in regex.start.toEdges) { edge.TryAttach(script, wholeRegex); }
}
{
// continuing into the post-regex extends the current token
var script = new TokenScript(TokenScript.Kind.ExtendToken, signalCondition, preVt, Vt, pattern4Vt.nextSignal);
foreach (var edge in postRegex.start.toEdges) { edge.TryAttach(script, wholeRegex); }
}
{
// reaching the pattern's end accepts the token
var script = new TokenScript(TokenScript.Kind.AcceptToken, signalCondition, preVt, Vt, pattern4Vt.nextSignal);
foreach (var edge in postRegex.end.fromEdges) { edge.TryAttach(script, wholeRegex); }
}
}
}
}
var validChars = this.GetValidScopeChars();
var wholeAutomaton = AutomatonInfo.New(wholeRegex, true, validChars);
return wholeAutomaton;
}
从ε-NFA到补充完整的ε-NFA
这一步的作用是:将ε-NFA中所有隐藏的ε边都画出来,以便后续处理。
基本思路是:如果A-ε->B且B-ε->C,那么应当画出A-ε->C这条ε边。
点击查看 从ε-NFA到补充完整的ε-NFA
// Materialize every edge implied by ε edges, working on a copy so the
// original ε-NFA stays untouched.
private static eNFAInfo ManifesteNFA(eNFAInfo eNFA) {
    var manifested = eNFA.Copy();
    SpreadEnds(manifested);     // propagate isEnd flags across ε edges
    CompleteEdges(manifested);  // add edges implied by ε-transitivity
    return manifested;
}
// Propagate the isEnd flag across ε edges: any state connected to an end
// state purely by ε edges becomes an end state itself.
private static void SpreadEnds(eNFAInfo eNFA) {
    var initialEnds = new Queue<eNFAState>();
    { // BFS from the start state: collect every reachable end state.
        var queue = new Queue<eNFAState>(); queue.Enqueue(eNFA.start);
        // HashSet replaces the original List: O(1) membership checks,
        // same default (reference) equality semantics.
        var visited = new HashSet<eNFAState>();
        while (queue.Count > 0) {
            var subject = queue.Dequeue();
            if (visited.Add(subject)) {
                // first visit implies `subject` cannot already be in initialEnds,
                // so no extra Contains check is needed before enqueueing.
                if (subject.isEnd) { initialEnds.Enqueue(subject); }
                foreach (var edge in subject.toEdges) {
                    var to = edge.to;
                    if (!visited.Contains(to)) { queue.Enqueue(to); }
                }
            }
        }
    }
    // spread the ends along ε edges (in both directions, as the original does)
    {
        var queue = initialEnds;
        var visited = new HashSet<eNFAState>();
        while (queue.Count > 0) {
            var subject = queue.Dequeue();
            if (visited.Add(subject)) {
                foreach (var edge in subject.fromEdges) {
                    if (edge.conditionCode is null) { // ε edge into `subject`
                        var from = edge.from; from.isEnd = true;
                        if (!visited.Contains(from)) { queue.Enqueue(from); }
                    }
                }
                foreach (var edge in subject.toEdges) {
                    if (edge.conditionCode is null) { // ε edge out of `subject`
                        var to = edge.to; to.isEnd = true;
                        if (!visited.Contains(to)) { queue.Enqueue(to); }
                    }
                }
            }
        }
    }
}
// Complete edges: if from --ε--> to and to --cond--> to2, then add
// from --cond--> to2 (transitive closure over ε edges).
private static void CompleteEdges(eNFAInfo eNFA) {
    var initialEmptyQueue = new Queue<eNFAEdge>();
    { // BFS from the start state: collect every reachable ε edge.
        var queue = new Queue<eNFAState>(); queue.Enqueue(eNFA.start);
        // HashSets replace the original List.Contains / Queue.Contains
        // scans: O(1) membership, same default equality semantics.
        var visited = new HashSet<eNFAState>();
        var seenEmptyEdges = new HashSet<eNFAEdge>();
        while (queue.Count > 0) {
            var subject = queue.Dequeue();
            if (visited.Add(subject)) {
                foreach (var edge in subject.toEdges) {
                    if (edge.conditionCode is null && seenEmptyEdges.Add(edge)) {
                        initialEmptyQueue.Enqueue(edge);
                    }
                    var to = edge.to;
                    if (!visited.Contains(to)) { queue.Enqueue(to); }
                }
            }
        }
    }
    {
        var emptyQueue = initialEmptyQueue;
        var visited = new HashSet<eNFAEdge>();
        while (emptyQueue.Count > 0) {
            var emptyEdge = emptyQueue.Dequeue();
            if (visited.Add(emptyEdge)) {
                var from = emptyEdge.from; var to = emptyEdge.to;
                foreach (var edge in to.toEdges) {
                    var to2 = edge.to;
                    // if(from -->|emptyEdge| to) { from -->|to.toEdges| to2 }
                    var newEdge = eNFAEdge.Connect(from, to2, edge.conditionCode);
                    if (newEdge is not null) {
                        // a newly created ε edge may imply further edges
                        if (newEdge.conditionCode is null) { emptyQueue.Enqueue(newEdge); }
                    }
                }
            }
        }
    }
}
从补充完整的ε-NFA到NFA
这一步的作用是:去掉ε-NFA中的ε边(即无须任何char即可跳转过去的边),顺便去掉无用的状态State,以便后续处理。
基本思路是:复制原本的ε-NFA,但是,在复制过程中,如果边是ε边,则不复制它,也不复制它指向的State。
点击查看 从补充完整的ε-NFA到NFA
// Remove empty edges (and thus useless states): copy the manifested ε-NFA,
// but skip ε edges entirely; states reachable only through ε edges are
// never copied.
private static NFAInfo ToNFA(eNFAInfo eNFAManifested) {
    // Template.state -> Copied.state
    var stateDict = new Dictionary<eNFAState, NFAState>();
    // Template.edge -> Copied.edge
    var edgeDict = new Dictionary<eNFAEdge, NFAEdge>();
    NFAInfo NFA;
    {
        var tStart = eNFAManifested.start;
        var cStart = new NFAState(tStart);
        stateDict.Add(tStart, cStart);
        NFA = new NFAInfo(cStart);
    }
    {
        var queue = new Queue<eNFAState>(); queue.Enqueue(eNFAManifested.start);
        // HashSet replaces the original List: O(1) visited checks.
        var visited = new HashSet<eNFAState>();
        while (queue.Count > 0) {
            var tSubject = queue.Dequeue();
            if (visited.Add(tSubject)) {
                // copy state on demand
                if (!stateDict.TryGetValue(tSubject, out var cSubject)) {
                    cSubject = new NFAState(tSubject); stateDict.Add(tSubject, cSubject);
                }
                // (the original also tracked an `allEmpty` flag here; it was
                // computed but never read, so it has been removed.)
                foreach (var tEdge in tSubject.toEdges) {
                    if (tEdge.conditionCode != null) { // skip ε edges
                        var tTo = tEdge.to;
                        // copy target state
                        if (!stateDict.TryGetValue(tTo, out var cTo)) {
                            cTo = new NFAState(tTo);
                            stateDict.Add(tTo, cTo);
                        }
                        // copy edge (once per template edge)
                        if (!edgeDict.TryGetValue(tEdge, out var cEdge)) {
                            cEdge = NFAEdge.Connect(cSubject, cTo, tEdge.conditionCode, tEdge.possibleVts);
                            edgeDict.Add(tEdge, cEdge);
                        }
                        if (!visited.Contains(tTo)) { queue.Enqueue(tTo); }
                    }
                }
            }
        }
    }
    return NFA;
}
从NFA到DFA
NFA的一个缺点是:可能存在这样的情况,即A-x->B和A-x->C同时存在,即一个状态A可能在经过字符x时跳转到两个不同的状态B和C上。为了消灭这种情况,我们需要将NFA转换为等价的DFA。DFA就没有这种情况了。
基本思路是:若A-x->B和A-x->C同时存在,则将B和C视为一个整体,让这个整体作为DFA的一个状态。这样,A经过字符x时,就会只跳转到一个状态上了。
这被称为子集构造法(Subset Construction Algorithm)。
点击查看 从NFA到DFA的子集构造法(Subset Construction Algorithm)
// Subset Construction Algorithm (子集构造法):
// transform from NFA to DFA. Each DFA state is a set of NFA states; on a
// given char the whole set transitions to exactly one target set.
private static DFAInfo ToDFA(NFAInfo NFA) {
    int DFAId = 0; // id in order of DFA state creation
    DFAInfo DFA;
    var stateList = new CoupleList<DFAStateDraft>();
    var edgeList = new CoupleList<DFAEdgeDraft>();
    var queue = new Queue<DFAStateDraft>();
    {
        var DFAStart = new DFAStateDraft(DFAId++, NFA.start.name, NFA.start);
        DFA = new DFAInfo(DFAStart, NFA);
        stateList.TryInsert(DFAStart);
        queue.Enqueue(DFAStart);
    }
    while (queue.Count > 0) { // find DFA states except the DFAStart
        var from = queue.Dequeue();
        // every char that can leave any NFA state inside `from`
        var chars = new List<char>();
        foreach (var NFAState in from.NFAStates) {
            foreach (var NFAEdge in NFAState.toEdges) {
                foreach (var c in NFAEdge.GetChars()) {
                    if (!chars.Contains(c)) { chars.Add(c); }
                }
            }
        }
        // target NFA edge set -> the chars leading to it
        var rawDict = new ListedDict<CoupleList<NFAEdgeDraft>, char>();
        foreach (var c in chars) {
            var NFAEdges = new CoupleList<NFAEdgeDraft>(); // -->|c| { toStates }
            foreach (var NFAState in from.NFAStates) {
                foreach (var NFAEdge in NFAState.toEdges) {
                    if (NFAEdge.Contains(c)) {
                        NFAEdges.TryInsert(NFAEdge);
                    }
                }
            }
            rawDict.TryInsert(NFAEdges, c);
        }
        foreach (var item in rawDict) {
            var NFAEdges = item.Key;
            var to = new DFAStateDraft(DFAId, from NFAEdge in NFAEdges select NFAEdge.to);
            // bug fix: the published snippet had stray `}` braces after the
            // ToCharRange calls, putting `condition` out of scope and
            // unbalancing the if/else; the brace placement is corrected here.
            if (stateList.TryInsert(to)) { // a brand-new DFA state
                DFAId++;
                var literalChars = item.Value;
                string condition = ToCharRange(literalChars);
                var edge = DFAEdgeDraft.Connect(from, to, condition);
                edgeList.TryInsert(edge);
                queue.Enqueue(to);
            }
            else { // an equal DFA state already exists: reuse it
                var t = stateList.IndexOf(to);
                var oldTo = stateList[t];
                var literalChars = item.Value;
                string condition = ToCharRange(literalChars);
                var edge = DFAEdgeDraft.Connect(from, oldTo, condition);
                edgeList.TryInsert(edge);
            }
        }
    }
    return DFA;
}
// Render a char list as a DFAEdgeDraft.condition string, merging runs of
// adjacent chars into ranges, e.g. '0'..'9' plus 'a' => "[0-9a]".
private static string/*DFAEdgeDraft.condition*/ ToCharRange(CoupleList<char> charList) {
    if (charList.Count == 1) {
        var c = charList[0];
        return c.ToString();
    }
    // group neighboring chars (|a - b| == 1) into [min, max] range items
    var rangeItems = new List<RangeItem>();
    var index = 0;
    while (index < charList.Count) {
        var min = charList[index]; var max = charList[index];
        while (index < charList.Count - 1
            && Math.Abs(charList[index] - charList[index + 1]) == 1) {
            if (charList[index + 1] < min) { min = charList[index + 1]; }
            if (max < charList[index + 1]) { max = charList[index + 1]; }
            index++;
        }
        rangeItems.Add(new RangeItem(min, max));
        index++;
    }
    // '-' must not sit inside the brackets (it would read as a range
    // separator); pull it out and emit it right after '[' instead.
    bool dashExists = false;
    for (int t = 0; t < rangeItems.Count; t++) {
        var rangeItem = rangeItems[t];
        if (rangeItem.min == '-' && rangeItem.max == '-') {
            // bug fix: Remove(item) deletes the first *equal* element, which
            // may not be slot t if duplicates exist; RemoveAt(t) is exact.
            rangeItems.RemoveAt(t);
            t--;
            dashExists = true;
        }
    }
    var b = new StringBuilder();
    b.Append("["); if (dashExists) { b.Append("-"); }
    foreach (var item in rangeItems) {
        b.Append(item.ToCondition());
    }
    b.Append("]");
    return b.ToString();
}
这一版本的效率太低,我已重写之。但这一版本更容易理解,因而放在这里。
从DFA到miniDFA
DFA的状态可能有很多,有时候是可以合并的。将其合并到状态数量最少的DFA,就是miniDFA。
基本思路是:
初始化:将代表一个Token的正则表达式的End状态分别独立划分出来,分别作为miniDFA的一个状态,将其他状态作为miniDFA的一个状态。
循环:拆分当前的各个miniDFA状态,方法是,如果这个miniDFA状态中的两个DFA状态不等价,则将其拆分到两个新的miniDFA状态中。何为不等价?miniDFA状态中的两个DFA状态A和B,若A和B对某个字符x,其跳转到不同的miniDFA状态,那么A和B就是不等价的。
收尾:无可拆分时,目前的状态就是全部miniDFA状态。最坏情况下,全部miniDFA状态与全部DFA状态的数量相同。
点击查看 从DFA到miniDFA
// minimize states of the specified FAInfo.
// Partition refinement: start from a coarse partition (InitChaos), split it
// until stable (SplitChaos), then rebuild states and edges from the result.
private miniDFAInfo TominiDFA(DFAInfo DFAInfo) {
    // every CoupleList<DFAState> is a/some potential miniDFA(s),
    // and it needs to be further split.
    List<CoupleList<DFAState>> chaos = InitChaos(DFAInfo);
    // NOTE(review): `validChars` is not declared anywhere in this snippet —
    // presumably an instance field holding the grammar's valid char range;
    // confirm against the full project source.
    List<CoupleList<DFAState>> completedChaos = SplitChaos(chaos, validChars);
    // dump minimum DFA
    miniDFAState[] miniDFAStates = ConstructminiDFAStates(completedChaos);
    miniDFAInfo miniDFA;
    {
        // the partition containing the original DFA start state becomes the new start
        miniDFAState? miniDFAStart = null;
        foreach (var state in miniDFAStates) {
            if (state.Contains(DFAInfo.start)) {
                miniDFAStart = state; break;
            }
        }
        // NOTE(review): miniDFAStart is assumed found here; it is still
        // nullable when passed to the constructor — verify that is intended.
        miniDFA = new miniDFAInfo(miniDFAStart, DFAInfo);
    }
    // DFA state -> index of the item(which is a collection) in chaos
    var DFA2Chaos = new Dictionary<DFAState, /*Collection<DFAState>*/int>();
    {
        for (int index = 0; index < completedChaos.Count; index++) {
            var DFAStates = completedChaos[index];
            foreach (var DFAState in DFAStates) { DFA2Chaos.Add(DFAState, /*DFAStates*/index); }
        }
    }
    {
        // edges of minimum DFA: walk the original DFA; every original edge
        // maps to an edge between the corresponding partitions
        // (miniEdges.TryInsert de-duplicates repeated connections).
        var miniEdges = new CoupleList<miniDFAEdge>();
        var queue = new Queue<DFAState>(); queue.Enqueue(DFAInfo.start);
        var visited = new List<DFAState>();
        while (queue.Count > 0) {
            var subject = queue.Dequeue();
            if (!visited.Contains(subject)) {
                visited.Add(subject);
                var fromIndex = DFA2Chaos[subject];
                var miniDFAFrom = miniDFAStates[fromIndex];
                foreach (var edge in subject.toEdges) {
                    var to = edge.to;
                    var toIndex = DFA2Chaos[to];
                    var newEdge = miniDFAEdge.Connect(miniDFAFrom, miniDFAStates[toIndex], edge.conditionCode);
                    miniEdges.TryInsert(newEdge);
                    if (!visited.Contains(to)) { queue.Enqueue(to); }
                }
            }
        }
    }
    return miniDFA;
}
// Wrap each finished partition (a set of equivalent DFA states) into one
// miniDFAState whose id is its position in the partition list.
private static miniDFAState[] ConstructminiDFAStates(List<CoupleList<DFAState>> completedChaos) {
    var result = new miniDFAState[completedChaos.Count];
    for (var index = 0; index < result.Length; index++) {
        result[index] = new miniDFAState(index, completedChaos[index]);
    }
    return result;
}
// Repeatedly split the partitions until no split changes the partition
// count. Two DFA states stay together only if they are "equal value":
// they transition into the same partitions through the same char ranges.
private List<CoupleList<DFAState>> SplitChaos(List<CoupleList<DFAState>> initialChaos, ICharRange validChars) {
    var currentChaos = initialChaos;
    bool updated = true;
    while (updated) {
        var nextChaos = new List<CoupleList<DFAState>>();
        foreach (var miniDFAEgg in currentChaos) {
            // greedily gather all states equivalent to the first unmerged one
            var merged = new bool[miniDFAEgg.Count];
            for (int i = 0; i < miniDFAEgg.Count; i++) {
                if (merged[i]) { continue; }
                var standard = miniDFAEgg[i];
                var newEgg = new CoupleList<DFAState>(); newEgg.TryInsert(standard);
                merged[i] = true;
                for (int j = i + 1; j < miniDFAEgg.Count; j++) {
                    if (merged[j]) { continue; }
                    var state = miniDFAEgg[j];
                    // NOTE(review): called with 4 args here, but the EqualValue
                    // shown in this article takes 3 — presumably an overload
                    // accepting validChars exists in the project; confirm.
                    if (EqualValue(standard, state, currentChaos, validChars)) {
                        newEgg.TryInsert(state);
                        merged[j] = true;
                    }
                }
                nextChaos.Add(newEgg);
            }
        }
        // splitting can only grow the partition count, so an unchanged
        // count means the partition is stable.
        updated = (nextChaos.Count != currentChaos.Count);
        currentChaos = nextChaos;
    }
    return currentChaos;
}
// index -> condition code of the miniDFAEgg in the chaos.
// For one DFA state `key`, build: index of the target partition in `chaos`
// -> the merged char range that jumps from `key` into that partition.
private Dictionary<int/*which miniDFAEgg*/, ICharRange> GetHopcroft(
    DFAState key, List<CoupleList<DFAState>> chaos) {
    var HopcroftBuilder = new Dictionary<int/*which miniDFAEgg*/, RangeListBuilder>();
    foreach (var edge in key.toEdges) {
        // locate the partition that edge.to belongs to
        var found = false;
        for (int sIndex = 0; sIndex < chaos.Count; sIndex++) {
            var miniDFAEgg = chaos[sIndex];
            foreach (var DFAState in miniDFAEgg) {
                if (DFAState == edge.to) {
                    if (!HopcroftBuilder.TryGetValue(sIndex, out var builder)) {
                        builder = new RangeListBuilder();
                        HopcroftBuilder.Add(sIndex, builder);
                    }
                    builder.Append(edge.conditionCode);
                    found = true; break;
                }
            }
            if (found) { break; }
        }
    }
    // bug fix: `Hopcroft` was assigned without ever being declared in the
    // published snippet (a compile error); declare it as a local here.
    var Hopcroft = new Dictionary<int/*which miniDFAEgg*/, ICharRange>();
    foreach (var item in HopcroftBuilder) {
        var indexOfChaos = item.Key; var builder = item.Value;
        var conditionCode = builder.Build();
        Hopcroft.Add(indexOfChaos, conditionCode);
    }
    return Hopcroft;
}
// standard and current sit in the same miniDFAEgg of chaos.
// They are of equal value when they jump into exactly the same partitions
// through exactly the same char ranges.
private bool EqualValue(DFAState standard, DFAState current, List<CoupleList<DFAState>> chaos) {
    var lhs = this.GetHopcroft(standard, chaos);
    var rhs = this.GetHopcroft(current, chaos);
    if (lhs.Count != rhs.Count) { return false; }
    foreach (var pair in lhs) {
        // both must target the same partition index...
        if (!rhs.TryGetValue(pair.Key, out var rhsRange)) { return false; }
        // ...through an identical char range.
        if (!Algo.SameRange(pair.Value, rhsRange)) { return false; }
    }
    return true;
}
// Initial partition for minimization: every end state is isolated in its own
// partition (each may accept a different token), and all non-end states
// share one partition (inserted first, at index 0).
private static List<CoupleList<DFAState>> InitChaos(DFAInfo DFAInfo) {
    var chaos = new List<CoupleList<DFAState>>();
    var nonEnds = new CoupleList<DFAState>(DFAState.Comparer);
    var queue = new Queue<DFAState>(); queue.Enqueue(DFAInfo.start);
    // HashSet replaces the original List: O(1) visited checks,
    // same default equality semantics.
    var visited = new HashSet<DFAState>();
    while (queue.Count > 0) {
        var subject = queue.Dequeue();
        if (visited.Add(subject)) {
            // just split every ends for now
            if (subject.isEnd) {
                var ends = new CoupleList<DFAState>(1); ends.TryInsert(subject);
                chaos.Add(ends);
            }
            else { nonEnds.TryInsert(subject); }
            foreach (var edge in subject.toEdges) {
                var to = edge.to;
                if (!visited.Contains(to)) { queue.Enqueue(to); }
            }
        }
    }
    if (nonEnds.Count > 0) { chaos.Insert(0, nonEnds); }
    return chaos;
}
至此,从正则表达式到miniDFA的算法就完成了。词法分析器的构造并不复杂,但是需要十分细致耐心,相关资料又少又乱,耗费的开发时间反而比语法分析器部分多。
从DFA到词法分析器的C#代码
DFA与词法分析器的核心代码是一一对应的。即,DFA的一个状态就是词法分析器里的一个LexicalState lexicalStateA;
字段,它包含一个匿名函数,此函数根据输入的字符char c
,跳转到下一个LexicalState lexicalStateB;
字段。DFA的一个状态的每个跳出边都是一个相应的else if(c == 'x'){}
分支,使其跳转到恰当的状态。词法分析器在不断地跳转过程中,收集信息,在合适的位置截断输入流string
,并为其赋予相应的Token
类型,使之成为一个Token
对象。
点击查看 Calc.st的第一个状态【lexicalState0.cs】
/// <summary>
/// lexicalState0 — the generated lexer's start state (one branch per
/// outgoing DFA edge). A matching branch begins a new token and jumps to
/// the corresponding next state; whitespace is skipped; any other char is
/// emitted as an Error token.
/// <para>CompilerExp.Lexical●[1 DFA States]</para>
/// </summary>
private static readonly Action<LexicalContext, char> lexicalState0 =
    static (context, c) => {
        if (false) { /* for simpler code generation purpose. */ }
        /* user-input condition code */
        /* [0-9] */
        else if (/* possible Vt : 'number' */
            /* no possible signal */
            /* [xxx] scope */
            '0'/*'\u0030'(48)*/ <= c && c <= '9'/*'\u0039'(57)*/) {
            BeginToken(context);
            context.currentState = lexicalState1;
        }
        /* user-input condition code */
        /* \) */
        else if (/* possible Vt : ')' */
            /* no possible signal */
            /* single char */
            c == ')'/*'\u0029'(41)*/) {
            BeginToken(context);
            context.currentState = lexicalState2;
        }
        /* user-input condition code */
        /* \( */
        else if (/* possible Vt : '(' */
            /* no possible signal */
            /* single char */
            c == '('/*'\u0028'(40)*/) {
            BeginToken(context);
            context.currentState = lexicalState3;
        }
        /* user-input condition code */
        /* \/ */
        else if (/* possible Vt : '/' */
            /* no possible signal */
            /* single char */
            c == '/'/*'\u002F'(47)*/) {
            BeginToken(context);
            context.currentState = lexicalState4;
        }
        /* user-input condition code */
        /* \* */
        else if (/* possible Vt : '*' */
            /* no possible signal */
            /* single char */
            c == '*'/*'\u002A'(42)*/) {
            BeginToken(context);
            context.currentState = lexicalState5;
        }
        /* user-input condition code */
        /* - */
        else if (/* possible Vt : '-' */
            /* no possible signal */
            /* single char */
            c == '-'/*'\u002D'(45)*/) {
            BeginToken(context);
            context.currentState = lexicalState6;
        }
        /* user-input condition code */
        /* \+ */
        else if (/* possible Vt : '+' */
            /* no possible signal */
            /* single char */
            c == '+'/*'\u002B'(43)*/) {
            BeginToken(context);
            context.currentState = lexicalState7;
        }
        /* deal with everything else. */
        else if (c == ' ' || c == '\r' || c == '\n' || c == '\t' || c == '\0') {
            context.currentState = lexicalState0; // skip them.
        }
        else { // unexpected char: emit a one-char Error token and recover.
            BeginToken(context);
            context.tokenEnd = context.cursor; // ExtendToken(context);
            AcceptToken(st.Error, context);
            context.currentState = lexicalState0;
        }
    };
由于这里仅仅是从DFA对象到C#源代码文件的转化,并不涉及编译原理相关算法,就不详述了。
编译原理中的语义分析器
语义分析器的基本思路是:后序优先(从叶结点到根结点)遍历语法树,逐步得到用户所需的结果。
按后序优先遍历的顺序遍历语法树Node,才是按源代码中的字符顺序遍历源代码。在遍历时,对不同的结点执行不同的函数,就可以逐步得到语义结果。
点击查看 后序优先遍历语法树Node
// Extract some data structure from syntax tree.
public T? Extract(Node rootNode, TokenList tokens, string sourceCode) {
    var context = new TContext<T>(rootNode, tokens, sourceCode);
    // Iterative post-order traversal (no recursion): nodeStack holds the
    // path to the node being expanded; indexStack holds, per node, the
    // index of the child currently being visited.
    var nodeStack = new Stack<Node>(); var indexStack = new Stack<int>();
    // Descend to the leftmost leaf, pushing every node along the way
    // with child-index 0.
    void PushLeftSpine(Node node) {
        nodeStack.Push(node); indexStack.Push(0);
        while (node.Children.Count > 0) {
            node = node.Children[0];
            nodeStack.Push(node); indexStack.Push(0);
        }
    }
    PushLeftSpine(rootNode);
    while (nodeStack.Count > 0) {
        var current = nodeStack.Pop(); var childIndex = indexStack.Pop() + 1;
        if (childIndex < current.Children.Count) {
            // more children pending: re-push current, then expand the next child
            nodeStack.Push(current); indexStack.Push(childIndex);
            PushLeftSpine(current.Children[childIndex]);
        }
        else {
            // all children finished: run this node's semantic action, if any
            if (extractorDict.TryGetValue(current.type, out Action<Node, TContext<T>>? action)) {
                action(current, context);
            }
        }
    }
    {
        var current = this.endOfTokenList; // extra '¥' token indicates end of source code.
        if (extractorDict.TryGetValue(current.type, out Action<Node, TContext<T>>? action)) {
            action(current, context);
        }
    }
    return context.result;
}
在C#中,我也将这些“不同的函数”做成了匿名函数的形式。因为确实不需要知道它们的名字。
点击查看 Calc.st文法的语义分析器【获取计算结果】
// Node.type -> semantic action; populated by InitializeFinalValueExtractorDict().
private static readonly Dictionary<int/*Node.type*/, Action<Node, TContext<FinalValue>>>
@finalValueExtractorDict = new();
// Shared action for terminal (Vt) nodes: push the node's Token onto
// context.objStack so the parent nonterminal's action can consume it.
private static readonly Action<Node, TContext<FinalValue>> VtHandler =
(node, context) => {
    var token = context.tokens[node.tokenIndex];
    context.objStack.Push(token);
};
// initialize dict for extractor: one semantic action per node type.
// Actions communicate through context.objStack in post-order: each child's
// action pushes its value; the parent's action pops and combines them.
private static void InitializeFinalValueExtractorDict() {
    var extractorDict = @finalValueExtractorDict;
    // Vt nodes whose token value is never read need no handler:
    // extractorDict.Add(EType.@Plus符, VtHandler);
    // extractorDict.Add(EType.@Dash符, VtHandler);
    // extractorDict.Add(EType.@Asterisk符, VtHandler);
    // extractorDict.Add(EType.@Slash符, VtHandler);
    // extractorDict.Add(EType.@LeftParenthesis符, VtHandler);
    // extractorDict.Add(EType.@RightParenthesis符, VtHandler);
    extractorDict.Add(EType.@number, VtHandler);
    extractorDict.Add(EType.@终,
        static (node, context) => {
            // [-1]: FinalValue : Additive ;
            var @finalValue = (double)context.objStack.Pop();
            context.result = new FinalValue(@finalValue);
        }); // end of extractorDict.Add(EType.@终, (node, context) => { ... });
    extractorDict.Add(EType.@Additive,
        static (node, context) => {
            if (false) { /* for simpler code generation process */ }
            else if (node.regulation == CompilerCalc.regulations[0]) {
                // [0]: Additive : Additive '+' Multiplicative ;
                var @multiplicative0 = (double)context.objStack.Pop();
                var @additive2 = (double)context.objStack.Pop();
                var value = additive2 + multiplicative0;
                context.objStack.Push(value);
            }
            else if (node.regulation == CompilerCalc.regulations[1]) {
                // [1]: Additive : Additive '-' Multiplicative ;
                var @multiplicative0 = (double)context.objStack.Pop();
                var @additive2 = (double)context.objStack.Pop();
                var value = additive2 - multiplicative0;
                context.objStack.Push(value);
            }
            else if (node.regulation == CompilerCalc.regulations[2]) {
                // [2]: Additive : Multiplicative ; (value already on the stack)
            }
            else { throw new NotImplementedException(); }
        }); // end of extractorDict.Add(EType.@Additive, (node, context) => { ... });
    extractorDict.Add(EType.@Multiplicative,
        static (node, context) => {
            if (false) { /* for simpler code generation process */ }
            else if (node.regulation == CompilerCalc.regulations[3]) {
                // [3]: Multiplicative : Multiplicative '*' Primary ;
                var @primary0 = (double)context.objStack.Pop();
                var @multiplicative2 = (double)context.objStack.Pop();
                var value = multiplicative2 * primary0;
                context.objStack.Push(value);
            }
            else if (node.regulation == CompilerCalc.regulations[4]) {
                // [4]: Multiplicative : Multiplicative '/' Primary ;
                var @primary0 = (double)context.objStack.Pop();
                var @multiplicative2 = (double)context.objStack.Pop();
                var value = multiplicative2 / primary0;
                context.objStack.Push(value);
            }
            else if (node.regulation == CompilerCalc.regulations[5]) {
                // [5]: Multiplicative : Primary ; (value already on the stack)
            }
            else { throw new NotImplementedException(); }
        }); // end of extractorDict.Add(EType.@Multiplicative, (node, context) => { ... });
    extractorDict.Add(EType.@Primary,
        static (node, context) => {
            if (false) { /* for simpler code generation process */ }
            else if (node.regulation == CompilerCalc.regulations[6]) {
                // [6]: Primary : '(' Additive ')' ; (value already on the stack)
            }
            else if (node.regulation == CompilerCalc.regulations[7]) {
                // [7]: Primary : 'number' ;
                // bug fix: `as Token` + unconditional dereference would throw
                // NullReferenceException on a bad cast; a direct cast fails
                // fast with the clearer InvalidCastException.
                var @number0 = (Token)context.objStack.Pop();
                // InvariantCulture: 'number' is [0-9]+, parse it culture-independently.
                var value = double.Parse(number0.value, System.Globalization.CultureInfo.InvariantCulture);
                context.objStack.Push(value);
            }
            else { throw new NotImplementedException(); }
        }); // end of extractorDict.Add(EType.@Primary, (node, context) => { ... });
}
End
![]() |
微信扫码,自愿捐赠。天涯同道,共谱新篇。
微信捐赠不显示捐赠者个人信息,如需要,请注明联系方式。 |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了