考试答案匹配算法
这里的答案匹配主要指填空题、问答题类的答案的匹配,也就是字符串的相似度。
网上有很多做法,例如求Levenshtein距离(字符串编辑距离)、汉明距离、莱文斯坦比、Jaro距离和Jaro-Winkler距离等,现在介绍一种可能更适合改卷评分场景的相似度计算方法
一、原理篇
1.1、核心:根据两字符串的匹配关系阵,得出最大匹配串
假设要匹配两个字符串"NEWBOOKS"和"BOOKNEWS",那他们的匹配关系阵如下图所示
其中,标注为1的表示两个字符相同
先说匹配串:从其中一个字符串的某个位置起,依次到另一个字符串中向后找出相同的字符,按顺序组装起来,直到其中一个字符串到了结尾,这个组装起来的字符串就是这个位置的匹配串
把所有位置的匹配串都比较一下,最长的叫做最大匹配串
如上图 从B(位置0)的位置找,匹配串是BOOKS(长度5),从N(位置4)的位置找,匹配串是NEWS(长度4),于是乎,最大匹配串是BOOKS
找出这个最大匹配串后,用这个最大匹配串的长度,比上标准串的长度,就是本篇重点--考试答案匹配度
1.2、运用到答案匹配系统中,许多老师考试改卷时,应该都是找到得分点,回答到了,就有分;全部点都答到了,应该就能拿满这道题的全部分数
所以逻辑业务应该是拿得分点去匹配答案
二、实现篇
C#版:
准备2个类,StringMatch.cs(对应上面1.1的逻辑)和AnswerMatch.cs(对应上面1.2的逻辑)
/// <summary>
/// Measures how much of a reference string ("main") is covered by a candidate
/// string ("pattern") using a greedy, order-preserving character match.
/// </summary>
public class StringMatch
{
    /// <summary>
    /// Pattern string: its characters are walked in order, one matrix row per character.
    /// </summary>
    private string _pattern;

    /// <summary>
    /// Main string: scanned left to right, one matrix column per character.
    /// Its length is the denominator of the returned ratio.
    /// </summary>
    private string _main;

    /// <summary>
    /// Match matrix: <c>_matrix[i, j]</c> is true when <c>_pattern[i]</c> equals <c>_main[j]</c>.
    /// </summary>
    private bool[,] _matrix;

    /// <summary>
    /// Creates a matcher, optionally pre-loaded with both strings.
    /// </summary>
    /// <param name="main">Main (reference) string.</param>
    /// <param name="pattern">Pattern (candidate) string.</param>
    public StringMatch(string main = null, string pattern = null)
    {
        _pattern = pattern;
        _main = main;
    }

    /// <summary>
    /// Computes the best greedy match over every start position of the pattern
    /// and returns its length divided by the length of the main string.
    /// </summary>
    /// <param name="main">Main string; a null or empty value keeps the previously stored one.</param>
    /// <param name="pattern">Pattern string; a null or empty value keeps the previously stored one.</param>
    /// <returns>
    /// Matched character count divided by the main string's length,
    /// or -1 when either string is still null or empty.
    /// </returns>
    public decimal GetMatchPersen(string main = null, string pattern = null)
    {
        if (!string.IsNullOrEmpty(pattern))
        {
            _pattern = pattern;
        }

        if (!string.IsNullOrEmpty(main))
        {
            _main = main;
        }

        if (string.IsNullOrEmpty(_pattern) || string.IsNullOrEmpty(_main))
        {
            return -1;
        }

        var rows = _pattern.Length;
        var columns = _main.Length;

        _matrix = new bool[rows, columns];
        for (var i = 0; i < rows; i++)
        {
            for (var j = 0; j < columns; j++)
            {
                _matrix[i, j] = _pattern[i] == _main[j];
            }
        }

        var best = 0;
        for (var start = 0; start < rows; start++)
        {
            // A match starting at `start` can use at most (rows - start) characters,
            // so once that bound cannot beat the current best no later start can either.
            if (rows - start <= best)
            {
                break;
            }

            var count = CountMatchesStartingAt(start);
            if (count > best)
            {
                best = count;
            }
        }

        return (decimal)best / columns;
    }

    /// <summary>
    /// Greedily counts matched characters when the walk over the pattern begins at
    /// <paramref name="start"/>: each pattern character takes the first equal main
    /// character located after the previously matched column; pattern characters
    /// with no such column are skipped.
    /// </summary>
    /// <param name="start">Index of the first pattern character to consider.</param>
    /// <returns>Number of pattern characters that were matched.</returns>
    private int CountMatchesStartingAt(int start)
    {
        var count = 0;
        var columns = _main.Length;
        var nextColumn = 0; // first column still available to later pattern characters

        for (var row = start; row < _pattern.Length && nextColumn < columns; row++)
        {
            for (var column = nextColumn; column < columns; column++)
            {
                if (_matrix[row, column])
                {
                    count++;
                    nextColumn = column + 1;
                    break;
                }
            }
        }

        return count;
    }
}
/// <summary>
/// Scores a free-text answer against a list of scoring points by measuring how
/// well each point's content is covered by the answer (via <c>StringMatch</c>).
/// </summary>
public class AnswerMatch
{
    /// <summary>
    /// Creates a scorer.
    /// </summary>
    /// <param name="format">Rounding mode, as the int value of a <see cref="ScoreFormat"/> member.</param>
    /// <param name="digits">Number of decimal places used by the rounding mode.</param>
    /// <param name="points">Scoring points; may be supplied later through the scoring methods.</param>
    public AnswerMatch(int format = 0, int digits = 0, List<Point> points = null)
    {
        Format = format;
        Digits = digits;
        Points = points;
    }

    /// <summary>
    /// Scoring points. A fill-in-the-blank item typically has one element and an essay
    /// question several; for multi-blank items the list can be replaced between calls.
    /// </summary>
    public List<Point> Points { get; set; }

    /// <summary>
    /// Rounding mode applied to scores, stored as the int value of a <see cref="ScoreFormat"/> member.
    /// </summary>
    public int Format { get; set; }

    /// <summary>
    /// Number of decimal places to keep.
    /// </summary>
    public int Digits { get; set; }

    /// <summary>
    /// Returns the formatted sum of the scores of all points.
    /// </summary>
    /// <param name="answer">Answer text to score.</param>
    /// <param name="format">When set, replaces <see cref="Format"/>.</param>
    /// <param name="digits">When set, replaces <see cref="Digits"/>.</param>
    /// <param name="points">When non-empty, replaces <see cref="Points"/>.</param>
    /// <returns>Total score after formatting.</returns>
    /// <exception cref="ArgumentNullException">No scoring points are available.</exception>
    public decimal GetScore(string answer, int? format = null, int? digits = null, List<Point> points = null)
    {
        var pointScores = GetPointScoresNoFormat(answer, format, digits, points);
        return GetFormat(pointScores.Sum(c => c.Item2));
    }

    /// <summary>
    /// Returns the formatted score of every point as (point Id, score), in the order of <see cref="Points"/>.
    /// </summary>
    /// <param name="answer">Answer text to score.</param>
    /// <param name="format">When set, replaces <see cref="Format"/>.</param>
    /// <param name="digits">When set, replaces <see cref="Digits"/>.</param>
    /// <param name="points">When non-empty, replaces <see cref="Points"/>.</param>
    /// <returns>One (Id, formatted score) tuple per point.</returns>
    /// <exception cref="ArgumentNullException">No scoring points are available.</exception>
    public List<(long, decimal)> GetPointScores(string answer, int? format = null, int? digits = null, List<Point> points = null)
    {
        return GetPointScoresNoFormat(answer, format, digits, points)
            .Select(c => (c.Item1, GetFormat(c.Item2)))
            .ToList();
    }

    /// <summary>
    /// Computes the score of every point before the final formatting step.
    /// Points in percentage mode are formatted here already, as they may be fractional.
    /// </summary>
    private List<(long, decimal)> GetPointScoresNoFormat(string answer, int? format = null, int? digits = null, List<Point> points = null)
    {
        var pointMatches = GetPointMatches(answer, format, digits, points);
        var result = new List<(long, decimal)>(Points.Count);

        for (var i = 0; i < Points.Count; i++)
        {
            var point = Points[i];

            // GetPointMatches emits one entry per point in list order; pairing by index
            // (instead of searching by Id) keeps points with duplicate Ids independent.
            // StringMatch reports -1 when the answer or the content is empty: score that as 0.
            var ratio = Math.Max(0m, pointMatches[i].Item2);

            if (point.IsPercenScore)
            {
                result.Add((point.Id, GetFormat(ratio * point.Score)));
                continue;
            }

            result.Add((point.Id, GetThresholdScore(point, ratio)));
        }

        return result;
    }

    /// <summary>
    /// Picks the score of the highest threshold that <paramref name="ratio"/> reaches.
    /// The defaults (0, 0) and (1, Score) are added to a private copy, so the caller's
    /// list (possibly shared between points) is never modified and may be null.
    /// </summary>
    private static decimal GetThresholdScore(Point point, decimal ratio)
    {
        var thresholds = point.MatchingScores == null
            ? new List<(decimal, decimal)>()
            : new List<(decimal, decimal)>(point.MatchingScores);

        if (thresholds.All(c => c.Item1 != 0))
        {
            thresholds.Add((0, 0));
        }

        if (thresholds.All(c => c.Item1 != 1))
        {
            thresholds.Add((1, point.Score));
        }

        foreach (var threshold in thresholds.OrderByDescending(c => c.Item1))
        {
            if (ratio >= threshold.Item1)
            {
                return threshold.Item2;
            }
        }

        return 0m;
    }

    /// <summary>
    /// Returns the match ratio of every point as (point Id, ratio), in the order of <see cref="Points"/>.
    /// A ratio of -1 means the answer or the point content was null or empty.
    /// </summary>
    /// <param name="answer">Answer text to match.</param>
    /// <param name="format">When set, replaces <see cref="Format"/>.</param>
    /// <param name="digits">When set, replaces <see cref="Digits"/>.</param>
    /// <param name="points">When non-empty, replaces <see cref="Points"/>.</param>
    /// <returns>One (Id, ratio) tuple per point.</returns>
    /// <exception cref="ArgumentNullException">No scoring points are available.</exception>
    public List<(long, decimal)> GetPointMatches(string answer, int? format = null, int? digits = null, List<Point> points = null)
    {
        if (format.HasValue)
        {
            Format = format.Value;
        }

        if (digits.HasValue)
        {
            Digits = digits.Value;
        }

        if (points != null && points.Any())
        {
            Points = points;
        }

        if (Points == null || !Points.Any())
        {
            throw new ArgumentNullException(nameof(points), "At least one scoring point is required.");
        }

        var result = new List<(long, decimal)>(Points.Count);
        foreach (var point in Points)
        {
            // The point content is the reference ("main") string, the answer is the pattern.
            var ratio = new StringMatch(point.Content, answer).GetMatchPersen();
            result.Add((point.Id, ratio));
        }

        return result;
    }

    /// <summary>
    /// Rounds a score according to <see cref="Format"/> and <see cref="Digits"/>.
    /// All arithmetic stays in decimal so no precision is lost through double.
    /// </summary>
    /// <param name="score">Raw score.</param>
    /// <param name="format">When set, replaces <see cref="Format"/>.</param>
    /// <param name="digits">When set, replaces <see cref="Digits"/>.</param>
    /// <returns>The formatted score.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <see cref="Format"/> is not a <see cref="ScoreFormat"/> value, or <see cref="Digits"/> is outside 0..28.
    /// </exception>
    private decimal GetFormat(decimal score, int? format = null, int? digits = null)
    {
        if (format.HasValue)
        {
            Format = format.Value;
        }

        if (digits.HasValue)
        {
            Digits = digits.Value;
        }

        switch (Format)
        {
            case (int)ScoreFormat.RoundHalf:
                // Rounding the doubled score and halving it lands on multiples of 0.5 (at Digits = 0).
                return Math.Round(score * 2, Digits, MidpointRounding.AwayFromZero) / 2m;
            case (int)ScoreFormat.Round:
                return Math.Round(score, Digits, MidpointRounding.AwayFromZero);
            case (int)ScoreFormat.Up:
            {
                // Scale by 10^Digits so the ceiling lands exactly on the requested precision.
                var scale = PowerOfTen(Digits);
                return Math.Ceiling(score * scale) / scale;
            }
            case (int)ScoreFormat.Down:
            {
                var scale = PowerOfTen(Digits);
                return Math.Floor(score * scale) / scale;
            }
            case (int)ScoreFormat.Original:
                return score;
            default:
                throw new ArgumentOutOfRangeException(nameof(format), Format, "Unknown score format.");
        }
    }

    /// <summary>
    /// Returns 10 raised to <paramref name="exponent"/> as an exact decimal.
    /// </summary>
    private static decimal PowerOfTen(int exponent)
    {
        if (exponent < 0 || exponent > 28)
        {
            throw new ArgumentOutOfRangeException(nameof(exponent), exponent, "Digits must be between 0 and 28.");
        }

        var value = 1m;
        for (var i = 0; i < exponent; i++)
        {
            value *= 10m;
        }

        return value;
    }
}

/// <summary>
/// A scoring point: a piece of expected content and the score it is worth.
/// </summary>
public class Point
{
    /// <summary>
    /// Business identifier of the point.
    /// </summary>
    public long Id { get; set; }

    /// <summary>
    /// Expected content of the point.
    /// </summary>
    public string Content { get; set; }

    /// <summary>
    /// Full score of this point.
    /// </summary>
    public decimal Score { get; set; }

    /// <summary>
    /// Thresholds as (minimum match ratio, score awarded), e.g. (0.5, 1), (0.8, 2).
    /// Entries for ratio 0 (score 0) and ratio 1 (score <see cref="Score"/>) are
    /// assumed by the scorer when they are not listed.
    /// </summary>
    public List<(decimal, decimal)> MatchingScores { get; set; }

    /// <summary>
    /// When true, <see cref="MatchingScores"/> is ignored and the score is the
    /// match ratio multiplied by <see cref="Score"/>, which may be fractional.
    /// </summary>
    public bool IsPercenScore { get; set; }
}

/// <summary>
/// Rounding modes for scores.
/// </summary>
public enum ScoreFormat
{
    /// <summary>
    /// Round half away from zero to one place beyond the configured digits,
    /// so that the last digit is always 5 or 0.
    /// </summary>
    RoundHalf = 0,

    /// <summary>
    /// Round half away from zero to the configured digits.
    /// </summary>
    Round = 1,

    /// <summary>
    /// Round up to the configured digits.
    /// </summary>
    Up = 2,

    /// <summary>
    /// Round down to the configured digits.
    /// </summary>
    Down = 3,

    /// <summary>
    /// Keep the score as is (the configured digits are ignored).
    /// </summary>
    Original = 4
}
下面是使用的代码
适配场景:若干空的填空题、问答题
// Example 1: dictation of a poem with seven characters per line.
// One mark is lost per wrong character; the question is worth 5 marks.
var am = new AnswerMatch();
var matchingScores = new List<(decimal, decimal)>
{
    (0.85m, 6m), // 0.85 is slightly below 6/7
    (0.71m, 5m),
    (0.57m, 4m),
    (0.42m, 3m),
    (0.28m, 2m),
    (0.14m, 1m)
};
var points = new List<Point>
{
    new Point { Id = 0, Content = "八珍一箸千金價", MatchingScores = matchingScores, Score = 7m },
    new Point { Id = 1, Content = "往往精庖賤惠文", MatchingScores = matchingScores, Score = 7m },
    new Point { Id = 2, Content = "莫道形模大剛拙", MatchingScores = matchingScores, Score = 7m },
    new Point { Id = 3, Content = "剖珠也解獻殷勤", MatchingScores = matchingScores, Score = 7m }
};
var answer = @"八珍一箸千金價,往往精包賤惠问。 莫道形模大剛拙,剖珠也解獻殷勤。";

// Each line is scored out of 7 (one per character, 28 in total); subtracting
// (4 * 7 - 5) maps the total onto the 5-mark question, floored at 0 below.
var result = am.GetScore(answer, (int)ScoreFormat.RoundHalf, 0, points) - (4 * 7 - 5);
Console.WriteLine("默写古诗题得分:" + Math.Max(result, 0m)); // 3

// Example 2: a real high-school politics question.
// The model answer has six scoring points (the Content values below). Any four
// correct points earn full marks: 4 marks each, 16 in total.
matchingScores = new List<(decimal, decimal)>
{
    (0.8m, 4m), // an 80% match already earns the full 4 marks
    (0.6m, 3m),
    (0.4m, 2m),
    (0.2m, 1m)
};
points = new List<Point>
{
    new Point { Id = 0, Content = "企业要制定正确的经营战略,不断通过深化改革提质增效", MatchingScores = matchingScores, Score = 4m },
    new Point { Id = 1, Content = "深化公司制股份制改革,大力发展混合所有制增强企业活力", MatchingScores = matchingScores, Score = 4m },
    new Point { Id = 2, Content = "加强企业的强强联合或兼并,提高企业市场竞争力", MatchingScores = matchingScores, Score = 4m },
    new Point { Id = 3, Content = "转变经济发展方式,优化产品结构,满足有效市场需求", MatchingScores = matchingScores, Score = 4m },
    // Ids must be unique: a repeated Id makes an Id-based lookup reuse another point's match ratio.
    new Point { Id = 4, Content = "提高自主创新能力,依靠技术进步、科学管理等手段,形成自己的竞争优势", MatchingScores = matchingScores, Score = 4m },
    new Point { Id = 5, Content = "转变对外经济发展方式,开展国际化经营,形成以技术、品牌、质量、服务为核心的出口竞争新优势等", MatchingScores = matchingScores, Score = 4m }
};

// A student who touched on all six points but completed none of them.
answer = @"①企业要制定正确的经营战略; ②深化公司制股份制改革; ③加强企业的强强联合或兼并; ④转变经济发展方式; ⑤提高自主创新能力; ⑥转变对外经济发展方式。";
var temp = am.GetPointScores(answer, (int)ScoreFormat.RoundHalf, 0, points);
result = temp.Sum(c => c.Item2);

// The question is capped at 16 marks even though six points can add up to more.
Console.WriteLine("政治题得分:" + Math.Min(result, 16m)); // 11
Console.ReadKey();
三、评测篇
这种算法有以下特点
1、字符匹配可以跳跃(回答答案不分先后,有点即可拿分,可避免标点符号不一致造成匹配度下降问题)
2、同义词,同义句不能正确匹配(其他算法好像也做不到,可能要去自然语言那块领域才能处理)
3、额外开辟空间存储矩阵,空间复杂度是O(m*n);时间上,构建矩阵是O(m*n),但还要对每个起始位置各做一次搜索,最坏情况(两个字符串几乎没有相同字符)下总的时间复杂度是O(m²*n),其中m是答案的长度,n是得分点的长度
最后,还有什么评分场景适配不了的欢迎留言交流,算法的改进也可以