All the Algorithms from a Grammar to a Parser

I recently finished a project that automatically generates a lexer + parser, as a replacement for Lex+YACC; for now it is named bitParser. If you would like a parser of your own, send me your grammar and I will send back a parser.

This article uses the grammar Calc.st below, a four-operation calculator grammar supporting addition, subtraction, multiplication, division and parentheses, as its running example:

// input file: Calc.st
Exp    : Exp '+' Term
       | Exp '-' Term
       | Term ;
Term   : Term '*' Factor
       | Term '/' Factor
       | Factor ;
Factor : '(' Exp ')'
       | 'number' ;

%%[0-9]+%% 'number' // the example only handles non-negative integers
// there is no need to write %%[+]%% '+' by hand
The explicit (expanded) version of Calc.st:
#extractor <Calc.st.ext>

// 8 regulations:
Exp : 
    Exp '+' Term // [0] [0]
  | Exp '-' Term // [1] [1]
  | Term ; // [2] [2]
Term : 
    Term '*' Factor // [0] [3]
  | Term '/' Factor // [1] [4]
  | Factor ; // [2] [5]
Factor : 
    '(' Exp ')' // [0] [6]
  | 'number' ; // [1] [7]

// 1 token statements:
%%[0-9]+%% 'number' // [0]

// 0 precedences

// options
%grammarName Exp
%start Exp
%blockComment off
%inlineComment off
%validScopeChars [\u0001-\uFFFF]
%validGlobalChars [\u0001-\uFFFF]

The syntax parser in compiler theory

Basic building blocks

nullable

Can a given Symbol[] possibly derive the empty string ε (i.e., derive nothing at all)?

Code: nullable
// compute all nodes that can possibly derive the empty string ε.
/// get the dictionary that tells whether a symbol (Vn/Vt) can derive ε.
static Dictionary<Symbol, bool> GetNullableDict(YieldContext context) {
	var nullableDict = new Dictionary<Symbol, bool>();
	// for all Vn symbols(include S')
	foreach (var symbol in context.eVns) {
		nullableDict.Add(symbol, false);
	}
	// for all Vt symbols (including Symbol.EOF('¥')) valid for syntax parsing.
	foreach (var symbol in context.eSyntaxVts) {
		nullableDict.Add(symbol, false);
	}

	// iterate until nothing changes.
	bool changed = false;
	do {
		changed = false;
		foreach (var regulation in context.extendedRegulationDrafts) {
			// if regulation.right can derive ε, then regulation.left can derive ε.
			if (CanBeEmpty(regulation.right, nullableDict)) {
				var left = regulation.left;
				if (!nullableDict[left]) {
					nullableDict[left] = true;
					changed = true;
				}
			}
		}
	} while (changed);

	return nullableDict;
}
// can every node in the list derive ε?
static bool CanBeEmpty(VNodesKey nodeList, Dictionary<Symbol, bool> nullableDict) {
	return CanBeEmpty(nodeList, 0, nodeList.Length, nullableDict);
}
// can every node in the list derive ε?
static bool CanBeEmpty(IReadOnlyList<Symbol> nodeList, Dictionary<Symbol, bool> nullableDict) {
	return CanBeEmpty(nodeList, 0, nodeList.Count, nullableDict);
}

// can the specified segment of nodes in the list all derive ε?
static bool CanBeEmpty(VNodesKey nodeList, int checkIndex, int checkCount, Dictionary<Symbol, bool> nullableDict) {
	bool result = true;
	for (int i = 0; i < checkCount; i++) {
		var node = nodeList[i + checkIndex];
		if (!nullableDict[node]) {
			result = false;
			break;
		}
	}

	return result;
}

// can the specified segment of nodes in the list all derive ε?
static bool CanBeEmpty(IReadOnlyList<Symbol> nodeList, int checkIndex, int checkCount, Dictionary<Symbol, bool> nullableDict) {
	bool result = true;
	for (int i = 0; i < checkCount; i++) {
		var node = nodeList[i + checkIndex];
		if (!nullableDict[node]) {
			result = false;
			break;
		}
	}

	return result;
}
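
Calc.st itself has no ε production, so every entry in its nullable dictionary stays false. To see the fixed-point iteration actually flip something to true, here is a minimal standalone sketch, independent of bitParser's YieldContext/Symbol types, using a hypothetical grammar S : A 'b' ;  A : 'a' A | ε ; (all names here are illustrative only):

using System;
using System.Collections.Generic;

static class NullableDemo {
	static void Main() {
		// each rule is (left, right); an empty right-hand side means ε
		var rules = new List<(string left, string[] right)> {
			("S", new[] { "A", "b" }),
			("A", new[] { "a", "A" }),
			("A", Array.Empty<string>()), // A : ε
		};
		var nullable = new Dictionary<string, bool> {
			["S"] = false, ["A"] = false, ["a"] = false, ["b"] = false,
		};
		bool changed;
		do { // iterate to the fixed point, exactly like GetNullableDict above
			changed = false;
			foreach (var (left, right) in rules) {
				var allNullable = true;
				foreach (var s in right) { if (!nullable[s]) { allNullable = false; break; } }
				if (allNullable && !nullable[left]) { nullable[left] = true; changed = true; }
			}
		} while (changed);
		Console.WriteLine($"S: {nullable["S"]}, A: {nullable["A"]}"); // S: False, A: True
	}
}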

FIRST

Here is the textbook-style definition: if grammar G is a type-2 (context-free) grammar without left recursion, then for every candidate α of a nonterminal of G, the set of leading terminals FIRST(α) = { a | α derives, in zero or more steps, a string of the form a..., where a ∈ Vt }.

My understanding: a candidate eventually derives strings of terminals; different derivations give different strings (possibly infinitely many). The set of first characters of all those strings is the FIRST set of the candidate. With this FIRST set, we can tell whether the candidate can match the token stream that is about to be parsed.

In other words: given a Symbol array, among all the strings it can derive, what can the first Symbol be?

The basic idea of the algorithm: for every Vt, FIRST(Vt) is the Vt itself. Starting from that, sweep over the grammar again and again, collecting new FIRST elements, until no new a can be found.
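
Worked out by hand for Calc.st (every alternative of Factor, Term and Exp eventually starts with '(' or 'number', and nothing is nullable):

FIRST(Factor) = { '(', 'number' }
FIRST(Term)   = FIRST(Factor) = { '(', 'number' }
FIRST(Exp)    = FIRST(Term)   = { '(', 'number' }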

Code: computing the FIRST sets of the grammar
// returns the dictionary of FIRST.target -> FIRST
internal static Dictionary<VNodesKey, FIRST> CalcFIRSTDict(
	YieldContext context, Dictionary<Symbol, bool> nullableDict) {
	var result = new Dictionary<VNodesKey, FIRST>();

	CalcFIRSTDict4Node(result, context, nullableDict);

	CalcFIRSTDict4Right(result, context, nullableDict);

	return result;
}

// compute FIRST for the right-hand side of every regulation
private static void CalcFIRSTDict4Right(Dictionary<VNodesKey, FIRST> result,
	YieldContext context, Dictionary<Symbol, bool> nullableDict) {
	// allocate space for every regulationDraft.right
	var rightDict = new Dictionary<VNodesKey, FIRST>();
	var eVtKinds = context.eSyntaxVts.Count;// all Vt symbols (including Symbol.EOF('¥')) valid for syntax parsing.
	foreach (var regulationDraft in context.extendedRegulationDrafts) {
		var target = new VNodesKey(regulationDraft.right);
		if (!result.TryGetValue(target, out var _)) {
			var values = new Bits(length: eVtKinds); // not filled up yet.
			var first = new FIRST(target, hasEmpty: false, values);
			result.Add(first.target, first);
			rightDict.Add(first.target, first);
		}
	}

	bool changed = false;
	do {
		changed = false;
		foreach (var first in rightDict.Values) {
			var target = first.target; var count = target.Length; var allEmpty = true;
			for (int checkLength = 0; checkLength < count; checkLength++) {
				// if the first checkLength nodes can all be ε,
				// then FIRST( target ) includes FIRST( target[checkLength] ), except ε.
				const int checkIndex = 0;
				if (CanBeEmpty(target, checkIndex, checkLength, nullableDict)) {
					Symbol refKey = target[checkLength];
					if (!result.TryGetValue(new VNodesKey(refKey), out var refFirst)) { throw new Exception(Consts.algorithmError); }
					if (first.TryInsert(refFirst.values)) { changed = true; }
				}
				else { allEmpty = false; break; }
			}
			if (allEmpty) {
				if (!first.hasEmpty) {
					// if every node of target can be ε, then FIRST( target ) includes ε.
					if (CanBeEmpty(target, nullableDict)) {
						first.hasEmpty = true;
						changed = true;
					}
				}
			}
		}
	} while (changed);
}

// compute FIRST for every single node of the grammar
private static void CalcFIRSTDict4Node(Dictionary<VNodesKey, FIRST> result,
	YieldContext context, Dictionary<Symbol, bool> nullableDict) {
	IReadOnlyList<RegulationDraft> eRegulationDrafts = context.extendedRegulationDrafts;
	// allocate space for every single symbols.
	var eVtKinds = context.eSyntaxVts.Count;
	// initialize FIRST( Vn )
	foreach (var Vn in context.eVns) {
		var values = new Bits(length: eVtKinds);
		var containsEmpty = nullableDict[Vn];
		var first = new FIRST(target: Vn, containsEmpty, values);
		result.Add(first.target, first);
	}
	// initialize FIRST( Vt ) (FIRST( Vt ) is in fact already complete)
	foreach (var Vt in context.eSyntaxVts) {
		var values = new Bits(length: eVtKinds);
		var index = context.GeteSyntaxVtIndex(Vt); values.Set(index, true);
		var first = new FIRST(target: Vt, hasEmpty: false, values);
		result.Add(first.target, first);
	}

	bool changed = false;
	do {
		changed = false;
		foreach (var regulationDraft in eRegulationDrafts) {
			var left = regulationDraft.left; var right = regulationDraft.right;
			// try to collect FIRST( left )
			var allEmpty = true;
			for (int endPosition = 0; endPosition < right.Count; endPosition++) {
				// if the first endPosition nodes can all be ε,
				// then FIRST( left ) includes FIRST( right[endPosition] ), except ε.
				if (CanBeEmpty(right, 0, endPosition, nullableDict)) {
					var refKey = right[endPosition];
					if (left != refKey) {
						if (!result.TryGetValue(new VNodesKey(left), out var leftFirst)
						 || !result.TryGetValue(new VNodesKey(refKey), out var refFirst)) { throw new Exception(Consts.algorithmError); }
						if (leftFirst.TryInsert(refFirst.values)) { changed = true; }
					}
				}
				else { allEmpty = false; break; }
			}
			if (allEmpty) {
				if (result.TryGetValue(new VNodesKey(left), out var first)) {
					if (!first.hasEmpty) {
						if (CanBeEmpty(right, nullableDict)) {
							first.hasEmpty = true;
							changed = true;
						}
					}
				}
			}
		}
	} while (changed);
}

FOLLOW

Here is the textbook-style definition: let G be a context-free (type-2) grammar with start symbol S. For any nonterminal A of G, FOLLOW(A) = { a | S derives, in zero or more steps, a form ...Aa..., where a ∈ Vt or a is '¥' }.

My understanding: the FOLLOW set of a nonterminal A is the set of all symbols that may appear immediately after A in any sentential form derivable from the start symbol.

Here is the textbook-style algorithm:

'¥' ∈ FOLLOW(S)
If G has a rule of the form A -> αBβ and β ≠ ε, add every terminal Vt of FIRST(β) to FOLLOW(B)
If G has a rule of the form A -> αB, or A -> αBβ with ε ∈ FIRST(β), add every element of FOLLOW(A) to FOLLOW(B)
Apply the two rules above repeatedly until no FOLLOW set changes.
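
Applying these rules to Calc.st by hand gives the following sets (they match the lookaheads of the reduce actions in the generated LALR(1) table later in this article):

FOLLOW(Exp)    = { '+', '-', ')', '¥' }
FOLLOW(Term)   = FOLLOW(Exp) ∪ { '*', '/' } = { '+', '-', '*', '/', ')', '¥' }
FOLLOW(Factor) = FOLLOW(Term)               = { '+', '-', '*', '/', ')', '¥' }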

Here is my code:

Code: computing the FOLLOW sets of the grammar
// compute the FOLLOW sets of the grammar
// returns the dictionary of FOLLOW.Vn -> FOLLOW
internal static Dictionary<Symbol/*FOLLOW.Vn*/, FOLLOW> CalcFOLLOWDict(
	YieldContext context,
	Dictionary<Symbol, bool> emptyDict, Dictionary<VNodesKey, FIRST> firstDict) {
	var result = new Dictionary<Symbol/*FOLLOW.Vn*/, FOLLOW>();

	// initialize the FOLLOW dict
	var EOFIndex = context.GeteSyntaxVtIndex(Symbol.EOF);
	var eVtKinds = context.eSyntaxVts.Count;// all Vt symbols (including Symbol.EOF('¥')) valid for syntax parsing.
	foreach (var Vn in context.eVns) {
		var follow = new FOLLOW(Vn, eVtKinds);
		if (Vn == context.extendedStartNode) {
			// add '¥' to S' : S ; '¥'
			follow.TryInsert(EOFIndex);
		}
		result.Add(follow.Vn, follow);
	}

	// iterate to the fixed point (until nothing changes).
	bool changed = false;
	do {
		changed = false;
		foreach (var regulationDraft in context.extendedRegulationDrafts) {
			var right = regulationDraft.right; var count = right.Count;
			for (var endPosition = 0; endPosition < count; endPosition++) {
				Symbol target = right[endPosition];
				if (target.kind == Symbol.Kind.Vt) { continue; } // terminals (leaf nodes) have no FOLLOW
				// prepare to add FOLLOW elements for target
				var follow = result[target];
				var checkIndex = endPosition + 1; var allEmpty = true;
				for (var checkCount = 0; checkCount < count - checkIndex; checkCount++) {
					// if right[checkIndex -> (checkIndex+checkCount-1)] can be empty,
					// then FOLLOW( target ) includes FIRST( right[checkIndex+checkCount] ) except ε.
					if (CanBeEmpty(right, checkIndex, checkCount, emptyDict)) {
						Symbol key = right[checkIndex + checkCount];
						if (!firstDict.TryGetValue(new VNodesKey(key), out var first)) { throw new Exception(Consts.algorithmError); }
						if (follow.TryInsert(first.values)) { changed = true; }
					}
					else { allEmpty = false; break; }
				}
				if (allEmpty) {
					// if every node after target can be ε, then FOLLOW( target ) includes FOLLOW( regulationDraft.left )
					var refFollow = result[regulationDraft.left];
					if (follow != refFollow) {
						var checkCount = count - checkIndex;
						if (CanBeEmpty(right, checkIndex, checkCount, emptyDict)) {
							if (follow.TryInsert(refFollow.values)) { changed = true; }
						}
					}
				}
			}
		}
	} while (changed);

	return result;
}

From productions to the LL(1) parse table

Here is the textbook-style description:

Algorithm: constructing the LL(1) parse table
Input: grammar G
Output: the LL(1) parse table M[A, a] of G, where A is a nonterminal and a is a terminal
Algorithm:
Compute the FIRST and FOLLOW sets of G
for (each production A -> γ1 | γ2 | ... | γm of G) {
    if ( a ∈ FIRST(γi) ) set M[A, a] to "A -> γi"
    if ( ε ∈ FIRST(γi) )
        for (each a ∈ FOLLOW(A))
            set M[A, a] to "A -> γi" (in practice every such γi is ε)
}
Mark every undefined entry M[A, a] as an error.
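
Note that Calc.st as written above is left-recursive (Exp : Exp '+' Term ...), so it is not LL(1); the construction only makes sense for grammars without left recursion. As a tiny illustration using just the Factor rules (which are LL(1) on their own), the relevant table entries would be:

M[Factor, '(']      = Factor : '(' Exp ')'   // because '(' ∈ FIRST('(' Exp ')')
M[Factor, 'number'] = Factor : 'number'      // because 'number' ∈ FIRST('number')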

Here is my code:

Code: building the parse table with the LL(1) method
// build the parse table using the LL(1) method
internal static LL1SyntaxInfo CalcLL1SyntaxInfo(YieldContext context,
	Dictionary<Symbol, FOLLOW> eFOLLOWDict, Dictionary<VNodesKey, FIRST> eFIRSTDict) {
	var regulationDrafts = context.grammar.regulationDrafts;
	var regCount = regulationDrafts.Count;
	var table = new LL1ParseTableDraft();
	for (int regulationId = 0; regulationId < regCount; regulationId++) {
		var regulation = regulationDrafts[regulationId];
		var Vn = regulation.left;
		var key = new VNodesKey(regulation.right);
		var first = eFIRSTDict[key]; // FIRST( regulation.right )
		var actionDraft = new LL1ParseActionDraft(regulation);
		for (int t = 0; t < first.values.length; t++) {
			if (first.values.Get(t)) {
				var Vt = context.GeteSyntaxVt(t);
				table.SetAction(Vn, Vt, actionDraft);
			}
		}
		if (first.hasEmpty) {
			var follow = eFOLLOWDict[Vn];
			for (int t = 0; t < follow.values.length; t++) {
				if (follow.values.Get(t)) {
					var Vt = context.GeteSyntaxVt(t);
					table.SetAction(Vn, Vt, actionDraft);
				}
			}
		}
	}
	
	var result = new LL1SyntaxInfo(table);
	return result;
}

From productions to the LR(1) parse table

Code: from productions to the LR(1) parse table
// use the LR method to compute stateList, edgeList and the parse table LRTableDraft
internal static LRSyntaxInfo CalcLRSyntaxInfo(
	LRParseContext LRContext, YieldContext context) {
	var stateList = new CoupleList<LRState>(LRContext.stateComparer);
	var edgeList = new CoupleList<LREdge>(LRContext.edgeComparer);

	CalcSyntaxStates(stateList, edgeList, LRContext, context);

	LRTableDraft table = CalcLRTableDraft(stateList, edgeList,
		LRContext.vtsProvider, context);

	var result = new LRSyntaxInfo(stateList, edgeList, table);
	return result;
}
private static void CalcSyntaxStates(CoupleList<LRState> stateList, CoupleList<LREdge> edgeList, LRParseContext LRContext, YieldContext context) {
	var beta2FIRST = new Dictionary<VNodesKey, FIRST>();
	var queue = new Queue<LRState>();
	{
		var eVtKinds = context.eSyntaxVts.Count;// all Vt symbols (including Symbol.EOF('¥')) valid for syntax parsing.
		var bitsEOF = new Bits(length: eVtKinds);
		var EOFIndex = context.GeteSyntaxVtIndex(Symbol.EOF); bitsEOF.Set(EOFIndex, true);
		var eRegulationDraft = context.extendedRegulationDrafts[0];
		var firstState = new LRState(index: 0);
		firstState.AddMerge(eRegulationDraft, dotPosition: 0, lookaheads: bitsEOF, out var _, LRContext);
		Algo.Closure(firstState, beta2FIRST, LRContext, context);
		var against = stateList.TryInsert(firstState);
		queue.Enqueue(firstState);
	}
	while (queue.Count > 0) {
		var subject = queue.Dequeue();
		var dealt = new List<Symbol>(subject.ItemGroups.Count);
		foreach (var itemGroup in subject.ItemGroups) {
			Symbol? symbol = itemGroup.symbolAfterDot;
			if ((symbol is not null) // this symbol is not dealt with yet
			 && (Algo.TryBinaryInsert(dealt, symbol, out var _, Symbol.Comparer) is null)) {
				var to = Algo.Goto(subject, symbol, LRContext, context);
				Algo.Closure(to, beta2FIRST, LRContext, context);
				var against = stateList.TryInsert(to);// merge it into the state list
				LRState? whichTo = null;
				if (against is not null) { whichTo = against; }
				else {
					to.index = stateList.Count - 1; queue.Enqueue(to);
					whichTo = to;
				}
				var edge = new LREdge(subject, symbol, whichTo);
				var against2 = edgeList.TryInsert(edge);
			}
		}
	}
}
// the LR Closure operation.
// completes a state.
private static void Closure(this LRState subject,
	Dictionary<VNodesKey, FIRST> beta2FIRST,
	LRParseContext LRContext, YieldContext context) {
	var queue = new Queue<LRItemGroup>();
	foreach (var itemGroup in subject.ItemGroups) {
		Symbol? symbol = itemGroup.symbolAfterDot;
		if (symbol != null && symbol.kind == Symbol.Kind.Vn) {
			queue.Enqueue(itemGroup);
		}
	}
	while (queue.Count > 0) {
		var itemGroup = queue.Dequeue();
		Bits lookaheads;
		if (LRContext.wantLookaheads) {
			VNodesKey beta = itemGroup.beta;// A : α ⏳ Symbol β ; 'z'
			if (beta.Length > 0) {
				if (!beta2FIRST.TryGetValue(beta, out var first)) {
					first = CalcFIRST(beta, context);
					beta2FIRST.Add(beta, first);
				}
				if (first.hasEmpty) { // first | itemGroup
					lookaheads = new Bits(first.values);
					lookaheads.Or(itemGroup.lookaheads);
				}
				else { lookaheads = first.values; }
			}
			else { lookaheads = itemGroup.lookaheads; }
		}
		else {
			var bitLength = context.eSyntaxVts.Count;
			lookaheads = new Bits(bitLength); // 0 lookahead
		}
		var regulationDrafts = context.left2RegulationDrafts[itemGroup.symbolAfterDot];
		foreach (var regulationDraft in regulationDrafts) {
			const int dotPosition = 0;
			var position = subject.AddMerge(regulationDraft, dotPosition,
				lookaheads, out var updated, LRContext);
			if (updated) {
				Symbol? symbol = position.symbolAfterDot;
				if (symbol != null && symbol.kind == Symbol.Kind.Vn) {
					if (!queue.Contains(position)) { queue.Enqueue(position); }
				}
			}
		}
	}
}
// the LR Goto operation.
// move the ⏳ dot past the symbol <paramref name="symbol"/> in every LR item.
private static LRState Goto(this LRState subject, Symbol symbol,
	LRParseContext LRContext, YieldContext context) {
	var toState = new LRState(index: -1); // -1 means not ready.
	foreach (var itemGroup in subject.ItemGroups) {
		if (itemGroup.symbolAfterDot == symbol) {
			toState.AddMerge(itemGroup.regulationDraft, itemGroup.dotPosition + 1,
				itemGroup.lookaheads, out var _, LRContext);
		}
	}
	return toState;
}
// organize stateList and edgeList into a table.
private static LRTableDraft CalcLRTableDraft(
	CoupleList<LRState> stateList, CoupleList<LREdge> edgeList,
	ILRTableVtsProvider VtsProvider, YieldContext context) {
	var table = new LRTableDraft(stateCount: stateList.Count);
	// goto, shift in
	foreach (var edge in edgeList) {
		var state = edge.from; var to = edge.to;
		switch (edge.symbol.kind) {
		case Symbol.Kind.Vn: // goto action
		LRParseActionDraft Goto = new(LRParseActionDraft.Kind.Goto, to);
		table.SetAction(state, edge.symbol, Goto, context); break;
		case Symbol.Kind.Vt: // shift in action
		LRParseActionDraft Shift = new(LRParseActionDraft.Kind.Shift, to);
		table.SetAction(state, edge.symbol, Shift, context); break;
		default: throw new NotImplementedException();
		}
	}
	// accept, reduce
	var eStart = context.extendedStartNode; // the S' in many books.
	foreach (var state in stateList) {
		foreach (var itemGroup in state.ItemGroups) {
			if (itemGroup.symbolAfterDot is null) {
				if (itemGroup.regulationDraft.left == eStart) {
					// accept action
					table.SetAction(state, Symbol.EOF, LRParseActionDraft.accept, context);
				}
				else {
					// reduce action
					var Reduce = new LRParseActionDraft(itemGroup.regulationDraft);
					var bits = VtsProvider.GetVts(itemGroup);
					for (int i = 0; i < bits.length; i++) {
						if (bits.Get(i)) {
							var Vt = context.GeteSyntaxVt(i);
							table.SetAction(state, Vt, Reduce, context);
						}
					}
				}
			}
		}
	}

	return table;
}

From the LR(1) parse table to the LALR(1) parse table

Since we already have the LR(1) parse table, there is a shortcut for computing the LALR(1) table. The reason: LALR(1) differs from LR(1) only in this respect: in LR(1), two states whose items differ only in some lookAhead are different states; in LALR(1), states whose items differ only in their lookAheads still count as the same state. In other words, merging those LR(1) states whose corresponding items are identical except for the lookAheads yields exactly the states of the LALR(1) parse table.
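
For example, two LR(1) states that contain the same item with different lookaheads merge into one LALR(1) state whose lookaheads are the union (the lookaheads here are chosen only for illustration, using the item notation of the generated table below):

LR(1) state i : [7] Factor : 'number' ⏳ ;☕ '+' '¥'
LR(1) state j : [7] Factor : 'number' ⏳ ;☕ ')' '*'
LALR(1) state : [7] Factor : 'number' ⏳ ;☕ '+' ')' '*' '¥'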

Code: the LALR(1) shortcut
// quickly derive the LALR(1) edges, states and parse table from the LR(1) information
static LRSyntaxInfo GetLALR1SyntaxInfo(
    LRSyntaxInfo _LRSyntaxInfo, // LR(1) syntax info
	IReadOnlyList<RegulationDraft> eRegulations,
	LRParseContext context) {
	var stateList = new CoupleList<LRState>();
	// a LALR(1) state is 1/more LR(1) states.
	// so, let's merge LR(1) states into LALR(1) state.
	var LRState2LALR1State = new Dictionary<LRState, LRState>();
	foreach (var state in _LRSyntaxInfo.stateList) {
		var mentor = Absorb(stateList, state, context);
		// 'state'(LR(1)) is merged into 'mentor'(LALR(1))
		LRState2LALR1State.Add(state, mentor);
	}
	var edgeList = new CoupleList<LREdge>(_LRSyntaxInfo.edgeList.Count, context.edgeComparer);
	foreach (var edge in _LRSyntaxInfo.edgeList) {
		var from = LRState2LALR1State[edge.from];
		var to = LRState2LALR1State[edge.to];
		var edge2 = new LREdge(from, edge.symbol, to);
		edgeList.TryInsert(edge2);
	}

	var eEnd = VNode.endOfTokenList;
	LRTableDraft table = GetLRTableDraft(stateList, edgeList, context.vtsProvider, eRegulations, eEnd);
	var result = new LRSyntaxInfo(stateList, edgeList, table);

	return result;
}

// either some state in LALR1StateList absorbs/merges LR1State's lookAheads,
// or a new LALR(1) state which consists of LR1State is generated.
static LRState Absorb(CoupleList<LRState> LALR1StateList, LRState LR1State, LRParseContext context) {
	var addList = LALR1StateList.addList; var orderList = LALR1StateList.orderList;

	LRState? mentor = null; var index = orderList.Count;
	int left = 0, right = orderList.Count - 1;
	if (right < 0) {
		mentor = new LRState(index);
		mentor.index = index;
		foreach (var itemGroup in LR1State.ItemGroups) {
			mentor.TryExpand(itemGroup.regulationDraft, itemGroup.dotPosition, itemGroup.lookAheads);
		}
		addList.Add(mentor); orderList.Add(mentor);
	}
	else {
		while (left < right) {
			int mid = (left + right) / 2;
			var current = orderList[mid];
			var result = TryAbsorb(current, LR1State, context);
			if (result < 0) { right = mid; }
			else if (result == 0) { left = mid; right = mid; mentor = current; }
			else { left = mid + 1; }
		}
		if (mentor is null) {
			var LALR1State = orderList[left];
			var result = TryAbsorb(LALR1State, LR1State, context);
			if (result == 0) { /* already inserted into 'current' */ mentor = LALR1State; }
			else {
				mentor = new LRState(index);
				foreach (var itemGroup in LR1State.ItemGroups) {
					mentor.TryExpand(itemGroup.regulationDraft, itemGroup.dotPosition, itemGroup.lookAheads);
				}
				addList.Add(mentor); orderList.Insert(result < 0 ? left : left + 1, mentor);
			}
		}
	}

	return mentor;
}


// try to absorb LR1State if it is equal (as an item set, ignoring lookAheads) to LALR1State.
// only used in the LALR(1) syntax parser.
static int TryAbsorb(LRState LALR1State, LRState LR1State, LRParseContext context) {
	var result = context.stateComparer(LALR1State, LR1State);
	if (result == 0) { // equal states should be absorbed.
		foreach (var itemGroup in LR1State.ItemGroups) {
			DoAbsorb(LALR1State.OrderList, itemGroup, context);
		}
	}
	return result;
}

static void DoAbsorb(IReadOnlyList<LRItemGroup> list, LRItemGroup itemGroup, LRParseContext context) {
	LRItemGroup? against = null; var keyIndex = -1;
	var left = 0; var right = list.Count - 1;
	if (right >= 0) {
		var result = -1;
		while (left < right) {
			var mid = (left + right) / 2;
			var current = list[mid];
			result = context.itemGroupAbsorber(current, itemGroup.regulationDraft, itemGroup.dotPosition);
			if (result < 0) { right = mid; }
			else if (result == 0) { left = mid; right = mid; against = current; keyIndex = left; }
			else { left = mid + 1; }
		}
		if (result != 0) {
			var current = list[left];
			result = context.itemGroupAbsorber(current, itemGroup.regulationDraft, itemGroup.dotPosition);
			if (result == 0) { against = current; keyIndex = left; }
		}
	}
	foreach (var lookAhead in itemGroup.lookAheads) {
		against.lookAheads.TryInsert(lookAhead);
	}
}

From the LALR(1) parse table to the parser's C# code

The LALR(1) parse table records, for each LALR(1) state, which state to move to when a given V (a Vn or a Vt) is encountered. During these transitions, sometimes a production is used to reduce, sometimes a Vt is shifted, and when '¥' is met the parse should finish.

The generated parser's C# source code records this LALR(1) parse table; in practice it is just an array of dictionaries Dictionary<int/*V*/, LRParseAction>.

Code: the parser built from the LALR(1) parse table
const int syntaxStateCount = 16;
/// <summary>
/// LALR(1) syntax parse table
/// </summary>
private static readonly Dictionary<int/*Node.type*/, LRParseAction>[]
    syntaxStates = new Dictionary<int/*Node.type*/, LRParseAction>[syntaxStateCount];

private static void InitializeSyntaxStates() {
	var states = CompilerExp.syntaxStates;
	// 78 actions
	// conflicts(0) = not solved(0) + solved(0) (0 warnings)
	#region create objects of syntax states
	states[0] = new(capacity: 5);
	states[1] = new(capacity: 3);
	states[2] = new(capacity: 6);
	states[3] = new(capacity: 6);
	states[4] = new(capacity: 5);
	states[5] = new(capacity: 6);
	states[6] = new(capacity: 4);
	states[7] = new(capacity: 4);
	states[8] = new(capacity: 3);
	states[9] = new(capacity: 3);
	states[10] = new(capacity: 3);
	states[11] = new(capacity: 6);
	states[12] = new(capacity: 6);
	states[13] = new(capacity: 6);
	states[14] = new(capacity: 6);
	states[15] = new(capacity: 6);
	#endregion create objects of syntax states

	#region re-used actions
	LRParseAction aGoto2 = new(LRParseAction.Kind.Goto, states[2]);// referred 2 times
	LRParseAction aGoto3 = new(LRParseAction.Kind.Goto, states[3]);// referred 4 times
	LRParseAction aShift4 = new(LRParseAction.Kind.Shift, states[4]);// referred 6 times
	LRParseAction aShift5 = new(LRParseAction.Kind.Shift, states[5]);// referred 6 times
	LRParseAction aShift6 = new(LRParseAction.Kind.Shift, states[6]);// referred 2 times
	LRParseAction aShift7 = new(LRParseAction.Kind.Shift, states[7]);// referred 2 times
	LRParseAction aShift8 = new(LRParseAction.Kind.Shift, states[8]);// referred 3 times
	LRParseAction aShift9 = new(LRParseAction.Kind.Shift, states[9]);// referred 3 times
	LRParseAction aReduce2 = new(regulations[2]);// referred 4 times
	LRParseAction aReduce5 = new(regulations[5]);// referred 6 times
	LRParseAction aReduce7 = new(regulations[7]);// referred 6 times
	LRParseAction aReduce0 = new(regulations[0]);// referred 4 times
	LRParseAction aReduce1 = new(regulations[1]);// referred 4 times
	LRParseAction aReduce3 = new(regulations[3]);// referred 6 times
	LRParseAction aReduce4 = new(regulations[4]);// referred 6 times
	LRParseAction aReduce6 = new(regulations[6]);// referred 6 times
	#endregion re-used actions

	// 78 actions
	// conflicts(0) = not solved(0) + solved(0) (0 warnings)
	#region init actions of syntax states
	// syntaxStates[0]:
	// [-1] Exp' : ⏳ Exp ;☕ '¥' 
	// [0] Exp : ⏳ Exp '+' Term ;☕ '-' '+' '¥' 
	// [1] Exp : ⏳ Exp '-' Term ;☕ '-' '+' '¥' 
	// [2] Exp : ⏳ Term ;☕ '-' '+' '¥' 
	// [3] Term : ⏳ Term '*' Factor ;☕ '-' '*' '/' '+' '¥' 
	// [4] Term : ⏳ Term '/' Factor ;☕ '-' '*' '/' '+' '¥' 
	// [5] Term : ⏳ Factor ;☕ '-' '*' '/' '+' '¥' 
	// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' '*' '/' '+' '¥' 
	// [7] Factor : ⏳ 'number' ;☕ '-' '*' '/' '+' '¥' 
	states[0]/*0*/.Add(st.@vnExp, new(LRParseAction.Kind.Goto, states[1]));
	states[0]/*1*/.Add(st.@vnTerm, aGoto2);
	states[0]/*2*/.Add(st.@vnFactor, aGoto3);
	states[0]/*3*/.Add(st.@LeftParenthesis符, aShift4);
	states[0]/*4*/.Add(st.@number, aShift5);
	// syntaxStates[1]:
	// [-1] Exp' : Exp ⏳ ;☕ '¥' 
	// [0] Exp : Exp ⏳ '+' Term ;☕ '-' '+' '¥' 
	// [1] Exp : Exp ⏳ '-' Term ;☕ '-' '+' '¥' 
	states[1]/*5*/.Add(st.@Plus符, aShift6);
	states[1]/*6*/.Add(st.@Dash符, aShift7);
	states[1]/*7*/.Add(st.@终, LRParseAction.accept);
	// syntaxStates[2]:
	// [2] Exp : Term ⏳ ;☕ '-' ')' '+' '¥' 
	// [3] Term : Term ⏳ '*' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [4] Term : Term ⏳ '/' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	states[2]/*8*/.Add(st.@Asterisk符, aShift8);
	states[2]/*9*/.Add(st.@Slash符, aShift9);
	states[2]/*10*/.Add(st.@Dash符, aReduce2);
	states[2]/*11*/.Add(st.@RightParenthesis符, aReduce2);
	states[2]/*12*/.Add(st.@Plus符, aReduce2);
	states[2]/*13*/.Add(st.@终, aReduce2);
	// syntaxStates[3]:
	// [5] Term : Factor ⏳ ;☕ '-' ')' '*' '/' '+' '¥' 
	states[3]/*14*/.Add(st.@Dash符, aReduce5);
	states[3]/*15*/.Add(st.@RightParenthesis符, aReduce5);
	states[3]/*16*/.Add(st.@Asterisk符, aReduce5);
	states[3]/*17*/.Add(st.@Slash符, aReduce5);
	states[3]/*18*/.Add(st.@Plus符, aReduce5);
	states[3]/*19*/.Add(st.@终, aReduce5);
	// syntaxStates[4]:
	// [6] Factor : '(' ⏳ Exp ')' ;☕ '-' ')' '*' '/' '+' '¥' 
	// [0] Exp : ⏳ Exp '+' Term ;☕ '-' ')' '+' 
	// [1] Exp : ⏳ Exp '-' Term ;☕ '-' ')' '+' 
	// [2] Exp : ⏳ Term ;☕ '-' ')' '+' 
	// [3] Term : ⏳ Term '*' Factor ;☕ '-' ')' '*' '/' '+' 
	// [4] Term : ⏳ Term '/' Factor ;☕ '-' ')' '*' '/' '+' 
	// [5] Term : ⏳ Factor ;☕ '-' ')' '*' '/' '+' 
	// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' 
	// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' 
	states[4]/*20*/.Add(st.@vnExp, new(LRParseAction.Kind.Goto, states[10]));
	states[4]/*21*/.Add(st.@vnTerm, aGoto2);
	states[4]/*22*/.Add(st.@vnFactor, aGoto3);
	states[4]/*23*/.Add(st.@LeftParenthesis符, aShift4);
	states[4]/*24*/.Add(st.@number, aShift5);
	// syntaxStates[5]:
	// [7] Factor : 'number' ⏳ ;☕ '-' ')' '*' '/' '+' '¥' 
	states[5]/*25*/.Add(st.@Dash符, aReduce7);
	states[5]/*26*/.Add(st.@RightParenthesis符, aReduce7);
	states[5]/*27*/.Add(st.@Asterisk符, aReduce7);
	states[5]/*28*/.Add(st.@Slash符, aReduce7);
	states[5]/*29*/.Add(st.@Plus符, aReduce7);
	states[5]/*30*/.Add(st.@终, aReduce7);
	// syntaxStates[6]:
	// [0] Exp : Exp '+' ⏳ Term ;☕ '-' ')' '+' '¥' 
	// [3] Term : ⏳ Term '*' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [4] Term : ⏳ Term '/' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [5] Term : ⏳ Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' '¥' 
	// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' '¥' 
	states[6]/*31*/.Add(st.@vnTerm, new(LRParseAction.Kind.Goto, states[11]));
	states[6]/*32*/.Add(st.@vnFactor, aGoto3);
	states[6]/*33*/.Add(st.@LeftParenthesis符, aShift4);
	states[6]/*34*/.Add(st.@number, aShift5);
	// syntaxStates[7]:
	// [1] Exp : Exp '-' ⏳ Term ;☕ '-' ')' '+' '¥' 
	// [3] Term : ⏳ Term '*' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [4] Term : ⏳ Term '/' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [5] Term : ⏳ Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' '¥' 
	// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' '¥' 
	states[7]/*35*/.Add(st.@vnTerm, new(LRParseAction.Kind.Goto, states[12]));
	states[7]/*36*/.Add(st.@vnFactor, aGoto3);
	states[7]/*37*/.Add(st.@LeftParenthesis符, aShift4);
	states[7]/*38*/.Add(st.@number, aShift5);
	// syntaxStates[8]:
	// [3] Term : Term '*' ⏳ Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' '¥' 
	// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' '¥' 
	states[8]/*39*/.Add(st.@vnFactor, new(LRParseAction.Kind.Goto, states[13]));
	states[8]/*40*/.Add(st.@LeftParenthesis符, aShift4);
	states[8]/*41*/.Add(st.@number, aShift5);
	// syntaxStates[9]:
	// [4] Term : Term '/' ⏳ Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [6] Factor : ⏳ '(' Exp ')' ;☕ '-' ')' '*' '/' '+' '¥' 
	// [7] Factor : ⏳ 'number' ;☕ '-' ')' '*' '/' '+' '¥' 
	states[9]/*42*/.Add(st.@vnFactor, new(LRParseAction.Kind.Goto, states[14]));
	states[9]/*43*/.Add(st.@LeftParenthesis符, aShift4);
	states[9]/*44*/.Add(st.@number, aShift5);
	// syntaxStates[10]:
	// [6] Factor : '(' Exp ⏳ ')' ;☕ '-' ')' '*' '/' '+' '¥' 
	// [0] Exp : Exp ⏳ '+' Term ;☕ '-' ')' '+' 
	// [1] Exp : Exp ⏳ '-' Term ;☕ '-' ')' '+' 
	states[10]/*45*/.Add(st.@RightParenthesis符, new(LRParseAction.Kind.Shift, states[15]));
	states[10]/*46*/.Add(st.@Plus符, aShift6);
	states[10]/*47*/.Add(st.@Dash符, aShift7);
	// syntaxStates[11]:
	// [0] Exp : Exp '+' Term ⏳ ;☕ '-' ')' '+' '¥' 
	// [3] Term : Term ⏳ '*' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [4] Term : Term ⏳ '/' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	states[11]/*48*/.Add(st.@Asterisk符, aShift8);
	states[11]/*49*/.Add(st.@Slash符, aShift9);
	states[11]/*50*/.Add(st.@Dash符, aReduce0);
	states[11]/*51*/.Add(st.@RightParenthesis符, aReduce0);
	states[11]/*52*/.Add(st.@Plus符, aReduce0);
	states[11]/*53*/.Add(st.@终, aReduce0);
	// syntaxStates[12]:
	// [1] Exp : Exp '-' Term ⏳ ;☕ '-' ')' '+' '¥' 
	// [3] Term : Term ⏳ '*' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	// [4] Term : Term ⏳ '/' Factor ;☕ '-' ')' '*' '/' '+' '¥' 
	states[12]/*54*/.Add(st.@Asterisk符, aShift8);
	states[12]/*55*/.Add(st.@Slash符, aShift9);
	states[12]/*56*/.Add(st.@Dash符, aReduce1);
	states[12]/*57*/.Add(st.@RightParenthesis符, aReduce1);
	states[12]/*58*/.Add(st.@Plus符, aReduce1);
	states[12]/*59*/.Add(st.@终, aReduce1);
	// syntaxStates[13]:
	// [3] Term : Term '*' Factor ⏳ ;☕ '-' ')' '*' '/' '+' '¥' 
	states[13]/*60*/.Add(st.@Dash符, aReduce3);
	states[13]/*61*/.Add(st.@RightParenthesis符, aReduce3);
	states[13]/*62*/.Add(st.@Asterisk符, aReduce3);
	states[13]/*63*/.Add(st.@Slash符, aReduce3);
	states[13]/*64*/.Add(st.@Plus符, aReduce3);
	states[13]/*65*/.Add(st.@终, aReduce3);
	// syntaxStates[14]:
	// [4] Term : Term '/' Factor ⏳ ;☕ '-' ')' '*' '/' '+' '¥' 
	states[14]/*66*/.Add(st.@Dash符, aReduce4);
	states[14]/*67*/.Add(st.@RightParenthesis符, aReduce4);
	states[14]/*68*/.Add(st.@Asterisk符, aReduce4);
	states[14]/*69*/.Add(st.@Slash符, aReduce4);
	states[14]/*70*/.Add(st.@Plus符, aReduce4);
	states[14]/*71*/.Add(st.@终, aReduce4);
	// syntaxStates[15]:
	// [6] Factor : '(' Exp ')' ⏳ ;☕ '-' ')' '*' '/' '+' '¥' 
	states[15]/*72*/.Add(st.@Dash符, aReduce6);
	states[15]/*73*/.Add(st.@RightParenthesis符, aReduce6);
	states[15]/*74*/.Add(st.@Asterisk符, aReduce6);
	states[15]/*75*/.Add(st.@Slash符, aReduce6);
	states[15]/*76*/.Add(st.@Plus符, aReduce6);
	states[15]/*77*/.Add(st.@终, aReduce6);
	#endregion init actions of syntax states

}
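
For context, here is a minimal sketch of the driver loop that consumes such a table at parse time. The types below are simplified stand-ins, not bitParser's actual generated API (the real driver also builds syntax tree nodes while reducing); the table shape mirrors Dictionary<int, LRParseAction>[] above.

using System;
using System.Collections.Generic;

enum ActionKind { Shift, Reduce, Goto, Accept }
// NextState: target state index; RuleLeft: the rule's left Vn type; RuleLength: length of the rule's right-hand side
record ParseAction(ActionKind Kind, int NextState = 0, int RuleLeft = 0, int RuleLength = 0);

static class LRDriver {
	// tokenTypes must end with the type of '¥'; error handling is omitted for brevity.
	public static void Parse(Dictionary<int, ParseAction>[] states, IReadOnlyList<int> tokenTypes) {
		var stateStack = new Stack<int>(); stateStack.Push(0);
		int i = 0;
		while (true) {
			int lookahead = tokenTypes[i];
			var action = states[stateStack.Peek()][lookahead];
			switch (action.Kind) {
			case ActionKind.Shift:
				stateStack.Push(action.NextState); i++;                               // consume the token
				break;
			case ActionKind.Reduce:
				for (int k = 0; k < action.RuleLength; k++) { stateStack.Pop(); }     // pop the handle
				var gotoAction = states[stateStack.Peek()][action.RuleLeft];          // Goto on the rule's left Vn
				stateStack.Push(gotoAction.NextState);
				break;
			case ActionKind.Accept:
				return;                                                               // lookahead is '¥'
			default:
				throw new InvalidOperationException("Goto is never looked up on a terminal");
			}
		}
	}
}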

The lexical analyzer in compiler theory

The basic principle of generating a lexer from the Grammar: describe each Token of the parser with a regular expression, then turn those regular expressions into lexer code.

From regular expressions to an ε-NFA

A regular expression is essentially a list of "characters and their repeat counts", so it too can be described by a grammar.

// Pattern is xxx in %%xxx%%
// xxx is any char between ' '(32, space) and ~(126).

// VnRegulations:
Pattern    : PreRegex Regex PostRegex ;
PreRegex   : 'refVt' | empty ;
PostRegex  : '/' Regex | empty ;
Regex      : Regex '|' Bunch | Bunch ;
Bunch      : Bunch Unit | Unit ;
Unit       : 'char' Repeat | '.' Repeat | 'scope' Repeat | '(' Regex ')' Repeat ;
Repeat     : '?' | '+' | '*' | 'MinMax' | empty ;

// 'refVt' has the form <'Vt'>; the 'Vt' is the same as in Grammar.st
%%\<'([ -&]|\\'|[(-\[]|\\\\|[\]-~])+'>%% 'refVt' 

%%\t|\n|\r|[\u0020!"#$%&']%% 'char' 
%%\\[()*+]%%                 'char' 
%%[,-]%%                     'char' 
%%\\[./]%%                   'char' 
%%[0-9:;]%%                  'char' 
%%\\[<]%%                    'char' 
%%=%%                        'char' 
%%\\[>]|\\[?]%%              'char' 
%%[@A-Z]%%                   'char' 
%%\\[\[]|\\\\|\\[\]]%%       'char' 
%%^|[_`a-z]%%                'char' 
%%\\[{|}]%%                  'char' 
%%~%%                        'char' 
%%\\u[0-9A-Fa-f]{4}%%        'char' 
%%\[([^\]]|\\[\]])+]%% 'scope' 
%%\{[,0-9 \t]+}%% 'MinMax' 

When handling a single char and its repeat count:

Code: handling a single char and its repeat count
eNFAFragment Parse(char value, MinMax minmax) {
    ICharRange conditionCode = SingleChar.New(value);
    var count = minmax.max + 1; // use a chain of count eNFAStates to represent at most minmax.max occurrences
    if (count <= 0) { count = minmax.min + 1; }
    var stateList = new eNFAState[count];
	stateList[0] = new eNFAState();
    for (int i = 1; i < count; i++) {
        stateList[i] = new eNFAState();
        var edge = eNFAEdge.Connect(stateList[i - 1], stateList[i], conditionCode);
    }
    for (int i = minmax.min; i < count - 1; i++) {
        var edge = eNFAEdge.Connect(stateList[i], stateList[count - 1], null/*ε edge*/);
    }
    if (minmax.max < 0) { // if the maximum number of occurrences is unbounded
        var edge = eNFAEdge.Connect(stateList[count - 1], stateList[count - 1], conditionCode);
    }
    var unit = new eNFAFragment(start: stateList[0], end: stateList[count - 1]);
	return unit;
}
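
For example, with value = 'a' and minmax = {min: 2, max: 4}, count is 5: the fragment is a chain of five states joined by four 'a' edges, plus ε edges from states 2 and 3 to the last state, so it accepts exactly aa, aaa and aaaa. With minmax = {min: 1, max: -1} (that is, a+), count falls back to min + 1 = 2, and the final state gets an 'a' self-loop, so it accepts a, aa, aaa, and so on.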

Handling the other kinds of character sets works the same way, so I will not repeat it.

A complete lexer must handle every Token type of the Grammar, so the individual regular expressions must be stitched together into one big regular expression, as follows:

Code: stitching all the regular expressions together
// get the whole complete ε-NFA of all kinds of tokens for lexical analyze
public AutomatonInfo GetWholeAutomaton() {
	var wholeStart = new eNFAState(0, "wholeStart");
	var wholeEnd = new eNFAState(1, "wholeEnd"); wholeEnd.isEnd = true;
	var wholeRegex = new eNFAInfo(wholeStart, wholeEnd);
	// connect all eNFAInfo together to make a whole complete ε-NFA for lexical analyze.
	var id1 = 1u;
	var Vt2Pattern4Vts = this.GetVt2Pattern4Vts();
	foreach (var pair in Vt2Pattern4Vts) {
		var Vt = pair.Key; var pattern4Vts = pair.Value;
		for (int i = 0; i < pattern4Vts.Count; i++) { //bits.length == VtInfo.pattern4Vts.Count
			var pattern4Vt = pattern4Vts[i]; var preVt = pattern4Vt.pattern.preVt;
			pattern4Vt.pattern.xCopy(id1++, out var regex, out var postRegex);
			{   // connect preENFA & tokenScript
				var closeStart = eNFAEdge.Connect(wholeStart, regex.start);
				var closeEnd = eNFAEdge.Connect(postRegex.end, wholeEnd);
			}
			foreach (var signalCondition in pattern4Vt.signalConditions) {
				{
					var script = new TokenScript(TokenScript.Kind.BeginToken, signalCondition, preVt, Vt, pattern4Vt.nextSignal);
					foreach (var edge in regex.start.toEdges) { edge.TryAttach(script, wholeRegex); }
				}
				{
					var script = new TokenScript(TokenScript.Kind.ExtendToken, signalCondition, preVt, Vt, pattern4Vt.nextSignal);
					foreach (var edge in postRegex.start.toEdges) { edge.TryAttach(script, wholeRegex); }
				}
				{
					var script = new TokenScript(TokenScript.Kind.AcceptToken, signalCondition, preVt, Vt, pattern4Vt.nextSignal);
					foreach (var edge in postRegex.end.fromEdges) { edge.TryAttach(script, wholeRegex); }
				}
			}
		}
	}

	var validChars = this.GetValidScopeChars();
	var wholeAutomaton = AutomatonInfo.New(wholeRegex, true, validChars);
	return wholeAutomaton;
}

From the ε-NFA to the completed ε-NFA

The purpose of this step: make every ε edge that is implicit in the ε-NFA explicit, so that later steps become easier.

The basic idea: if A-ε->B and B-ε->C, then the edge A-ε->C should be added as well.

Code: from the ε-NFA to the completed ε-NFA
// build edges that are implied by ε edges.
private static eNFAInfo ManifesteNFA(eNFAInfo eNFA) {
	var copyed = eNFA.Copy();
	SpreadEnds(copyed);
	CompleteEdges(copyed);
	return copyed;
}

private static void SpreadEnds(eNFAInfo eNFA) {
	var initialEnds = new Queue<eNFAState>();
	{
		var queue = new Queue<eNFAState>(); queue.Enqueue(eNFA.start);
		var visited = new List<eNFAState>();
		while (queue.Count > 0) {
			var subject = queue.Dequeue();
			if (!visited.Contains(subject)) {
			    visited.Add(subject);
				if (subject.isEnd) {
					if (!initialEnds.Contains(subject)) { initialEnds.Enqueue(subject); }
				}

				foreach (var edge in subject.toEdges) {
					var to = edge.to;
					if (!visited.Contains(to)) { queue.Enqueue(to); }
				}
			}
		}
	}
	// spread the ends
	{
		var queue = initialEnds;
		var visited = new List<eNFAState>();
		while (queue.Count > 0) {
			var subject = queue.Dequeue();
			if (!visited.Contains(subject)) {
			    visited.Add(subject);
				foreach (var edge in subject.fromEdges) {
					if (edge.conditionCode is null) {
						var from = edge.from; from.isEnd = true;
						if (!visited.Contains(from)) { queue.Enqueue(from); }
					}
				}
				foreach (var edge in subject.toEdges) {
					if (edge.conditionCode is null) {
						var to = edge.to; to.isEnd = true;
						if (!visited.Contains(to)) { queue.Enqueue(to); }
					}
				}
			}
		}
	}
}

// complete edges.
private static void CompleteEdges(eNFAInfo eNFA) {
	var initialEmptyQueue = new Queue<eNFAEdge>();
	{
		var queue = new Queue<eNFAState>(); queue.Enqueue(eNFA.start);
		var visited = new List<eNFAState>();
		while (queue.Count > 0) {
			var subject = queue.Dequeue();
			if (!visited.Contains(subject)) {
			    visited.Add(subject);
				foreach (var edge in subject.toEdges) {
					if (edge.conditionCode is null) {
						if (!initialEmptyQueue.Contains(edge)) { initialEmptyQueue.Enqueue(edge); }
					}

					var to = edge.to;
					if (!visited.Contains(to)) { queue.Enqueue(to); }
				}
			}
		}

	}
	{
		var emptyQueue = initialEmptyQueue;
		var visited = new List<eNFAEdge>();
		while (emptyQueue.Count > 0) {
			var emptyEdge = emptyQueue.Dequeue();
			if (!visited.Contains(emptyEdge)) {
			    visited.Add(emptyEdge);
				var from = emptyEdge.from; var to = emptyEdge.to;
				foreach (var edge in to.toEdges) {
					var to2 = edge.to;
					// if(from -->|emptyEdge| to) { from -->|to.toEdges| to2 }
					var newEdge = eNFAEdge.Connect(from, to2, edge.conditionCode);
					if (newEdge is not null) {
						if (newEdge.conditionCode is null) { emptyQueue.Enqueue(newEdge); }
					}
				}
			}
		}
	}
}

From the completed ε-NFA to the NFA

The purpose of this step: remove the ε edges (edges that can be taken without consuming any char) from the ε-NFA, and drop the now-useless states along the way, to simplify later processing.

The basic idea: copy the original ε-NFA, but while copying, skip every ε edge and do not copy the state it points to.

Code: from the completed ε-NFA to the NFA
// remove empty edges(and thus useless states).
private static NFAInfo ToNFA(eNFAInfo eNFAManifested) {
	// Template.state -> Copyed.state
	var stateDict = new Dictionary<eNFAState, NFAState>();
	// Template.edge -> Copyed.edge
	var edgeDict = new Dictionary<eNFAEdge, NFAEdge>();
	NFAInfo NFA;
	{
		var tStart = eNFAManifested.start;
		var cStart = new NFAState(tStart);
		stateDict.Add(tStart, cStart);
		NFA = new NFAInfo(cStart);
	}
	{
		var tStart = eNFAManifested.start;
		var queue = new Queue<eNFAState>(); queue.Enqueue(tStart);
		var visited = new List<eNFAState>();
		while (queue.Count > 0) {
			var tSubject = queue.Dequeue();
			if (!visited.Contains(tSubject)) {
			    visited.Add(tSubject);
				// copy state
				if (!stateDict.TryGetValue(tSubject, out var cSubject)) {
					cSubject = new NFAState(tSubject); stateDict.Add(tSubject, cSubject);
				}

				bool allEmpty = true;
				foreach (var tEdge in tSubject.toEdges) {
					if (tEdge.conditionCode != null) {
						allEmpty = false;
						var tTo = tEdge.to;
						// copy state
						if (!stateDict.TryGetValue(tTo, out var cTo)) {
							cTo = new NFAState(tTo);
							stateDict.Add(tTo, cTo);
						}
						// copy edge
						if (!edgeDict.TryGetValue(tEdge, out var cEdge)) {
							cEdge = NFAEdge.Connect(cSubject, cTo, tEdge.conditionCode, tEdge.possibleVts);
							edgeDict.Add(tEdge, cEdge);
						}

						if (!visited.Contains(tTo)) { queue.Enqueue(tTo); }
					}
				}
			}
		}
	}

	return NFA;
}

From the NFA to the DFA

A drawback of the NFA: both A-x->B and A-x->C may exist, i.e. on character x the state A may move to two different states B and C. To eliminate this, we convert the NFA into an equivalent DFA, which has no such situation.

The basic idea: if A-x->B and A-x->C both exist, treat B and C together as one unit and make that unit a single DFA state. Then, on character x, A moves to exactly one state.

This is called the subset construction algorithm.
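
For example, if an NFA has the edges 1 -a-> 2 and 1 -a-> 3, the subset construction produces the DFA states {1} and {2,3} with a single edge {1} -a-> {2,3}; the outgoing edges of {2,3} are then the union of the outgoing edges of 2 and 3.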

Code: from the NFA to the DFA with the subset construction algorithm
// subset construction
// transform from NFA to DFA.
private static DFAInfo ToDFA(NFAInfo NFA) {
	int DFAId = 0;// id in order of DFA state creation
	DFAInfo DFA;
	var stateList = new CoupleList<DFAStateDraft>();
	var edgeList = new CoupleList<DFAEdgeDraft>();
	var queue = new Queue<DFAStateDraft>();
	{
		var DFAStart = new DFAStateDraft(DFAId++, NFA.start.name, NFA.start);
		DFA = new DFAInfo(DFAStart, NFA);
		stateList.TryInsert(DFAStart);
		queue.Enqueue(DFAStart);
	}
	while (queue.Count > 0) { // find DFA states except the DFAStart
		var from = queue.Dequeue();
		// DFAToDict: { DFAFrom go through {some chars} to { DFATo } }
		var chars = new List<char>();
		foreach (var NFAState in from.NFAStates) {
			foreach (var NFAEdge in NFAState.toEdges) {
				foreach (var c in NFAEdge.GetChars()) {
					if (!chars.Contains(c)) { chars.Add(c); }
				}
			}
		}
		// DFATo -> matching chars
		var rawDict = new ListedDict<CoupleList<NFAEdgeDraft>, char>();
		foreach (var c in chars) {
			var NFAEdges = new CoupleList<NFAEdgeDraft>(); // -->|c| { toStates }
			foreach (var NFAState in from.NFAStates) {
				foreach (var NFAEdge in NFAState.toEdges) {
					if (NFAEdge.Contains(c)) {
						NFAEdges.TryInsert(NFAEdge);
					}
				}
			}
			rawDict.TryInsert(NFAEdges, c);
		}
		foreach (var item in rawDict) {
			var NFAEdges = item.Key;
			var to = new DFAStateDraft(DFAId, from NFAEdge in NFAEdges select NFAEdge.to);
			if (stateList.TryInsert(to)) {
				DFAId++;
				var literalChars = item.Value;
				string condition = ToCharRange(literalChars);
				var edge = DFAEdgeDraft.Connect(from, to, condition);
				edgeList.TryInsert(edge);

				queue.Enqueue(to);
			}
			else {
				var t = stateList.IndexOf(to);
				var oldTo = stateList[t];
				var literalChars = item.Value;
				string condition = ToCharRange(literalChars);
				var edge = DFAEdgeDraft.Connect(from, oldTo, condition);
				edgeList.TryInsert(edge);
			}
		}
	}

	return DFA;
}

private static string/*DFAEdgeDraft.condition*/ ToCharRange(CoupleList<char> charList) {
	if (charList.Count == 1) {
		var c = charList[0];
		return c.ToString();
	}

	var rangeItems = new List<RangeItem>();
	var index = 0;
	while (index < charList.Count) {
		var min = charList[index]; var max = charList[index];
		while (index < charList.Count - 1
			&& Math.Abs(charList[index] - charList[index + 1]) == 1) {
			if (charList[index + 1] < min) { min = charList[index + 1]; }
			if (max < charList[index + 1]) { max = charList[index + 1]; }
			index++;
		}

		rangeItems.Add(new RangeItem(min, max));
		index++;
	}

	bool dashExists = false;
	for (int t = 0; t < rangeItems.Count; t++) {
		var rangeItem = rangeItems[t];
		if (rangeItem.min == '-' && rangeItem.max == '-') {
			rangeItems.Remove(rangeItem);
			t--;
			dashExists = true;
		}
	}
	var b = new StringBuilder();
	b.Append("["); if (dashExists) { b.Append("-"); }
	foreach (var item in rangeItems) {
		b.Append(item.ToCondition());
	}
	b.Append("]");
	return b.ToString();
}

This version is too slow and I have since rewritten it, but it is easier to understand, so it is the version shown here.

From the DFA to the miniDFA

The DFA may have many states, and some of them can be merged. Merging them down to the DFA with the fewest states gives the miniDFA (minimal DFA).

The basic idea:

Initialization: put each End state that represents a whole Token's regular expression into its own miniDFA state, and put all remaining states together into one miniDFA state.
Loop: split the current miniDFA states. If two DFA states inside one miniDFA state are not equivalent, split them into two new miniDFA states. When are they not equivalent? Two DFA states A and B in the same miniDFA state are not equivalent if, for some character x, they move to different miniDFA states.
Finish: when nothing can be split any further, the current states are the final miniDFA states; in the worst case there are as many miniDFA states as DFA states. An example follows.
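
For example, suppose one miniDFA group currently holds the DFA states {A, B, C}, and on character 'x' A moves into group 1 while B and C move into group 2; then A is not equivalent to B or C, and the group is split into {A} and {B, C} (the names here are purely illustrative).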
Code: from the DFA to the miniDFA
// minimize states of the specified FAInfo.
private miniDFAInfo TominiDFA(DFAInfo DFAInfo) {
	// every CoupleList<DFAState> is a/some potential miniDFA(s),
	// and it needs to be further split.
	List<CoupleList<DFAState>> chaos = InitChaos(DFAInfo);
	List<CoupleList<DFAState>> completedChaos = SplitChaos(chaos, validChars);
	// dump minimum DFA
	miniDFAState[] miniDFAStates = ConstructminiDFAStates(completedChaos);
	miniDFAInfo miniDFA;
	{
		miniDFAState? miniDFAStart = null;
		foreach (var state in miniDFAStates) {
			if (state.Contains(DFAInfo.start)) {
				miniDFAStart = state; break;
			}
		}
		miniDFA = new miniDFAInfo(miniDFAStart, DFAInfo);
	}

	// DFA state -> index of the item(which is a collection) in chaos
	var DFA2Chaos = new Dictionary<DFAState, /*Collection<DFAState>*/int>();
	{
		for (int index = 0; index < completedChaos.Count; index++) {
			var DFAStates = completedChaos[index];
			foreach (var DFAState in DFAStates) { DFA2Chaos.Add(DFAState, /*DFAStates*/index); }
		}
	}
	{
		// edges of minimum DFA
		var miniEdges = new CoupleList<miniDFAEdge>();
		var queue = new Queue<DFAState>(); queue.Enqueue(DFAInfo.start);
		var visited = new List<DFAState>();
		while (queue.Count > 0) {
			var subject = queue.Dequeue();
			if (!visited.Contains(subject)) {
				visited.Add(subject);
				var fromIndex = DFA2Chaos[subject];
				var miniDFAFrom = miniDFAStates[fromIndex];
				foreach (var edge in subject.toEdges) {
					var to = edge.to;
					var toIndex = DFA2Chaos[to];
					var newEdge = miniDFAEdge.Connect(miniDFAFrom, miniDFAStates[toIndex], edge.conditionCode);
					miniEdges.TryInsert(newEdge);

					if (!visited.Contains(to))  { queue.Enqueue(to); }
				}
			}
		}
	}

	return miniDFA;
}

private static miniDFAState[] ConstructminiDFAStates(List<CoupleList<DFAState>> completedChaos) {
	var count = completedChaos.Count;
	var miniDFAStates = new miniDFAState[count];
	for (int id = 0; id < count; id++) {
		var DFAStates = completedChaos[id];
		miniDFAStates[id] = new miniDFAState(id, DFAStates);
	}

	return miniDFAStates;
}

private List<CoupleList<DFAState>> SplitChaos(List<CoupleList<DFAState>> initialChaos, ICharRange validChars) {
	var currentChaos = initialChaos;
	bool updated = true;
	while (updated) {
		var nextChaos = new List<CoupleList<DFAState>>();
		foreach (var miniDFAEgg in currentChaos) {
			var merged = new bool[miniDFAEgg.Count];
			for (int i = 0; i < miniDFAEgg.Count; i++) {
				if (merged[i]) { continue; }
				var standard = miniDFAEgg[i];
				var newEgg = new CoupleList<DFAState>(); newEgg.TryInsert(standard);
				merged[i] = true;
				for (int j = i + 1; j < miniDFAEgg.Count; j++) {
					if (merged[j]) { continue; }
					var state = miniDFAEgg[j];
					if (EqualValue(standard, state, currentChaos)) {
						newEgg.TryInsert(state);
						merged[j] = true;
					}
				}
				nextChaos.Add(newEgg);
			}
		}

		updated = (nextChaos.Count != currentChaos.Count);
		currentChaos = nextChaos;
	}

	return currentChaos;
}

// index  -> condition code of the miniDFAEgg in the chaos.
private Dictionary<int/*which miniDFAEgg*/, ICharRange> GetHopcroft(
    DFAState key, List<CoupleList<DFAState>> chaos) {
	var HopcroftBuilder = new Dictionary<int/*which miniDFAEgg*/, RangeListBuilder>();
	foreach (var edge in key.toEdges) {
		var found = false;
		for (int sIndex = 0; sIndex < chaos.Count; sIndex++) {
			var miniDFAEgg = chaos[sIndex];
			foreach (var DFAState in miniDFAEgg) {
				if (DFAState == edge.to) {
					if (!HopcroftBuilder.TryGetValue(sIndex, out var builder)) {
						builder = new RangeListBuilder();
						HopcroftBuilder.Add(sIndex, builder);
					}
					builder.Append(edge.conditionCode);
					found = true; break;
				}
			}
			if (found) { break; }
		}
	}
	var Hopcroft = new Dictionary<int/*which miniDFAEgg*/, ICharRange>();
	foreach (var item in HopcroftBuilder) {
		var indexOfChaos = item.Key; var builder = item.Value;
		var conditionCode = builder.Build();
		Hopcroft.Add(indexOfChaos, conditionCode);
	}

	return Hopcroft;
}

// standard and current are in the same miniDFAEgg of chaos
// Are they of equal value?
private bool EqualValue(DFAState standard, DFAState current, List<CoupleList<DFAState>> chaos) {
	var standardDict = this.GetHopcroft(standard, chaos);
	var currentDict = this.GetHopcroft(current, chaos);

	if (standardDict.Count != currentDict.Count) { return false; }
	foreach (var item in standardDict) {
		var indexOfChaos = item.Key;
		if (!currentDict.TryGetValue(indexOfChaos, out var cConditionCode)) { return false; }
		var sConditionCode = item.Value;
		var sameRange = Algo.SameRange(sConditionCode, cConditionCode);
		if (!sameRange) { return false; }
	}

	return true;
}

private static List<CoupleList<DFAState>> InitChaos(DFAInfo DFAInfo) {
	var chaos = new List<CoupleList<DFAState>>();
	var nonEnds = new CoupleList<DFAState>(DFAState.Comparer);
	var queue = new Queue<DFAState>(); queue.Enqueue(DFAInfo.start);
	var visited = new List<DFAState>();
	while (queue.Count > 0) {
		var subject = queue.Dequeue();
		if (!visited.Contains(subject)) {
			visited.Add(subject);
			// just split every ends for now
			if (subject.isEnd) {
				var ends = new CoupleList<DFAState>(1); ends.TryInsert(subject);
				chaos.Add(ends);
			}
			else { nonEnds.TryInsert(subject); }

			foreach (var edge in subject.toEdges) {
				var to = edge.to;
				if (!visited.Contains(to)) { queue.Enqueue(to); }
			}
		}
	}

	if (nonEnds.Count > 0) { chaos.Insert(0, nonEnds); }

	return chaos;
}

With that, the algorithms from regular expressions to the miniDFA are complete. Building the lexer is not conceptually complex, but it demands great care and patience, and the available references are scarce and messy, so it actually cost more development time than the parser part.

From the DFA to the lexer's C# code

The DFA and the lexer's core code correspond one-to-one. Each DFA state becomes a LexicalState field (LexicalState lexicalStateA;) in the lexer, holding an anonymous function that, given the input character char c, moves to the next LexicalState field (lexicalStateB). Each outgoing edge of a DFA state becomes an else if (c == 'x') { } branch that jumps to the appropriate state. While jumping from state to state, the lexer collects information, cuts the input string at the right positions, and assigns each piece a Token type, turning it into a Token object.

Code: the first state of Calc.st (lexicalState0.cs)
/// <summary>
/// lexicalState0
/// <para>CompilerExp.Lexical●[1 DFA States]</para>
/// </summary>
private static readonly Action<LexicalContext, char> lexicalState0 =
static (context, c) => {
	if (false) { /* for simpler code generation purpose. */ }
	/* user-input condition code */
	/* [0-9] */
	else if (/* possible Vt : 'number' */
	 /* no possible signal */
	/* [xxx] scope */
	'0'/*'\u0030'(48)*/ <= c && c <= '9'/*'\u0039'(57)*/) {
		BeginToken(context);
		context.currentState = lexicalState1;
	}
	/* user-input condition code */
	/* \) */
	else if (/* possible Vt : ')' */
	 /* no possible signal */
	/* single char */
	c == ')'/*'\u0029'(41)*/) {
		BeginToken(context);
		context.currentState = lexicalState2;
	}
	/* user-input condition code */
	/* \( */
	else if (/* possible Vt : '(' */
	 /* no possible signal */
	/* single char */
	c == '('/*'\u0028'(40)*/) {
		BeginToken(context);
		context.currentState = lexicalState3;
	}
	/* user-input condition code */
	/* \/ */
	else if (/* possible Vt : '/' */
	 /* no possible signal */
	/* single char */
	c == '/'/*'\u002F'(47)*/) {
		BeginToken(context);
		context.currentState = lexicalState4;
	}
	/* user-input condition code */
	/* \* */
	else if (/* possible Vt : '*' */
	 /* no possible signal */
	/* single char */
	c == '*'/*'\u002A'(42)*/) {
		BeginToken(context);
		context.currentState = lexicalState5;
	}
	/* user-input condition code */
	/* - */
	else if (/* possible Vt : '-' */
	 /* no possible signal */
	/* single char */
	c == '-'/*'\u002D'(45)*/) {
		BeginToken(context);
		context.currentState = lexicalState6;
	}
	/* user-input condition code */
	/* \+ */
	else if (/* possible Vt : '+' */
	 /* no possible signal */
	/* single char */
	c == '+'/*'\u002B'(43)*/) {
		BeginToken(context);
		context.currentState = lexicalState7;
	}
	/* deal with everything else. */
	else if (c == ' ' || c == '\r' || c == '\n' || c == '\t' || c == '\0') {
		context.currentState = lexicalState0; // skip them.
	}
	else { // unexpected char.
		BeginToken(context);
		context.tokenEnd = context.cursor; // ExtendToken(context);
		AcceptToken(st.Error, context);
		context.currentState = lexicalState0;
	}
};
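
For context, here is a minimal sketch of the loop that drives such generated state functions. The types below are simplified stand-ins for illustration only; the generated LexicalContext has more fields (tokenStart/tokenEnd, the token list, and so on) and the real driver does more bookkeeping.

using System;
using System.Collections.Generic;

// simplified stand-in for the generated LexicalContext
class MiniLexicalContext {
	public Action<MiniLexicalContext, char> currentState = null!;
	public int cursor;
	public List<string> tokens = new();
}

static class LexerDriver {
	public static void Run(string source, Action<MiniLexicalContext, char> startState, MiniLexicalContext context) {
		context.currentState = startState;
		for (int i = 0; i < source.Length; i++) {
			context.cursor = i;
			context.currentState(context, source[i]); // each state function chooses the next state itself
		}
		context.currentState(context, '\0');          // a trailing '\0' lets the last token be accepted
	}
}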

Since this step is merely a conversion from the DFA object to C# source files and involves no compiler-theory algorithms, I will not go into the details.

The semantic analyzer in compiler theory

The basic idea of the semantic analyzer: traverse the syntax tree in post order (from the leaves up to the root), gradually producing the result the user wants.

Traversing the syntax tree Node in post order is exactly traversing the source code in the order of its characters. By running a different function for each kind of node during the traversal, the semantic result is built up step by step.

Code: post-order traversal of the syntax tree Node
// Extract some data structure from syntax tree.
public T? Extract(Node rootNode, TokenList tokens, string sourceCode) {
	var context = new TContext<T>(rootNode, tokens, sourceCode);

	// post-order traverse rootNode with stack(without recursion).
	var nodeStack = new Stack<Node>(); var indexStack = new Stack<int>();
	// init stack.
	{
		// push nextLeft and its next pending children.
		var nextLeft = rootNode; var index = 0;
		nodeStack.Push(nextLeft); indexStack.Push(index);
		while (nextLeft.Children.Count > 0) {
			nextLeft = nextLeft.Children[0];
			nodeStack.Push(nextLeft); indexStack.Push(0);
		}
	}

	while (nodeStack.Count > 0) {
		var current = nodeStack.Pop(); var index = indexStack.Pop() + 1;
		if (index < current.Children.Count) {
			// push this node back again.
			nodeStack.Push(current); indexStack.Push(index);

			// push nextLeft and its next pending children.
			var nextLeft = current.Children[index];
			nodeStack.Push(nextLeft); indexStack.Push(0);
			while (nextLeft.Children.Count > 0) {
				nextLeft = nextLeft.Children[0];
				nodeStack.Push(nextLeft); indexStack.Push(0);
			}
		}
		else {
			if (extractorDict.TryGetValue(current.type, out Action<Node, TContext<T>>? action)) {
				action(current, context);
			}
		}
	}

	{
		var current = this.endOfTokenList; // extra '¥' token indicates end of source code.
		if (extractorDict.TryGetValue(current.type, out Action<Node, TContext<T>>? action)) {
			action(current, context);
		}
	}

	return context.result;
}

In C#, I also made these "different functions" anonymous functions, because there is really no need to know their names.

Code: the semantic analyzer for the Calc.st grammar (computing the final result)
private static readonly Dictionary<int/*Node.type*/, Action<Node, TContext<FinalValue>>>
	@finalValueExtractorDict = new();

private static readonly Action<Node, TContext<FinalValue>> VtHandler =
(node, context) => {
	var token = context.tokens[node.tokenIndex];
	context.objStack.Push(token);
};

// initialize dict for extractor.
private static void InitializeFinalValueExtractorDict() {
	var extractorDict = @finalValueExtractorDict;
	// extractorDict.Add(EType.@Plus符, VtHandler);
	// extractorDict.Add(EType.@Dash符, VtHandler);
	// extractorDict.Add(EType.@Asterisk符, VtHandler);
	// extractorDict.Add(EType.@Slash符, VtHandler);
	// extractorDict.Add(EType.@LeftParenthesis符, VtHandler);
	// extractorDict.Add(EType.@RightParenthesis符, VtHandler);
	extractorDict.Add(EType.@number, VtHandler);
	extractorDict.Add(EType.@终,
	static (node, context) => {
		// [-1]: FinalValue : Additive ;
		var @finalValue = (double)context.objStack.Pop();
		context.result = new FinalValue(@finalValue);
	}); // end of extractorDict.Add(EType.@终, (node, context) => { ... });
	extractorDict.Add(EType.@Additive,
	static (node, context) => {
		if (false) { /* for simpler code generation process */ }
		else if (node.regulation == CompilerCalc.regulations[0]) {
			// [0]: Additive : Additive '+' Multiplicative ;
			var @multiplicative0 = (double)context.objStack.Pop();
			var @additive2 = (double)context.objStack.Pop();
			var value = additive2 + multiplicative0;
			context.objStack.Push(value);
		}
		else if (node.regulation == CompilerCalc.regulations[1]) {
			// [1]: Additive : Additive '-' Multiplicative ;
			var @multiplicative0 = (double)context.objStack.Pop();
			var @additive2 = (double)context.objStack.Pop();
			var value = additive2 - multiplicative0;
			context.objStack.Push(value);
		}
		else if (node.regulation == CompilerCalc.regulations[2]) {
			// [2]: Additive : Multiplicative ;
		}
		else { throw new NotImplementedException(); }
	}); // end of extractorDict.Add(EType.@Additive, (node, context) => { ... });
	extractorDict.Add(EType.@Multiplicative,
	static (node, context) => {
		if (false) { /* for simpler code generation process */ }
		else if (node.regulation == CompilerCalc.regulations[3]) {
			// [3]: Multiplicative : Multiplicative '*' Primary ;
			var @primary0 = (double)context.objStack.Pop();
			var @multiplicative2 = (double)context.objStack.Pop();
			var value = multiplicative2 * primary0;
			context.objStack.Push(value);
		}
		else if (node.regulation == CompilerCalc.regulations[4]) {
			// [4]: Multiplicative : Multiplicative '/' Primary ;
			var @primary0 = (double)context.objStack.Pop();
			var @multiplicative2 = (double)context.objStack.Pop();
			var value = multiplicative2 / primary0;
			context.objStack.Push(value);
		}
		else if (node.regulation == CompilerCalc.regulations[5]) {
			// [5]: Multiplicative : Primary ;
		}
		else { throw new NotImplementedException(); }
	}); // end of extractorDict.Add(EType.@Multiplicative, (node, context) => { ... });
	extractorDict.Add(EType.@Primary,
	static (node, context) => {
		if (false) { /* for simpler code generation process */ }
		else if (node.regulation == CompilerCalc.regulations[6]) {
			// [6]: Primary : '(' Additive ')' ;
		}
		else if (node.regulation == CompilerCalc.regulations[7]) {
			// [7]: Primary : 'number' ;
			var @number0 = context.objStack.Pop() as Token;
			var value = double.Parse(number0.value);
			context.objStack.Push(value);
		}
		else { throw new NotImplementedException(); }
	}); // end of extractorDict.Add(EType.@Primary, (node, context) => { ... });
}
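
As a concrete trace, parsing "1+2*3" with this extractor drives objStack as follows during the post-order traversal (the generated extractor above names the nonterminals Additive/Multiplicative/Primary; the operator and parenthesis tokens have no handler registered, so they contribute nothing to the stack):

'number'(1)        -> push Token("1")
Primary[7]         -> pop Token("1"), push 1.0
Multiplicative[5]  -> no-op
Additive[2]        -> no-op
'number'(2)        -> push Token("2")
Primary[7]         -> pop Token("2"), push 2.0
Multiplicative[5]  -> no-op
'number'(3)        -> push Token("3")
Primary[7]         -> pop Token("3"), push 3.0
Multiplicative[3]  -> pop 3.0, pop 2.0, push 2.0 * 3.0 = 6.0
Additive[0]        -> pop 6.0, pop 1.0, push 1.0 + 6.0 = 7.0
'¥'                -> pop 7.0, context.result = FinalValue(7.0)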

End
