动态规划处理diff算法 Myers Diff (正向)
Eugene W. Myers 在他1986年发表于"Algorithmica"的论文"An O(ND) Difference Algorithm and Its Variations"中描述了一种用于处理diff的基础贪婪算法. 在他的论文中, 还对这种算法进行了扩展"Linear Space Refinement".
定义文件A和文件B, 算法会读取两个文件的输入, 假设B为新版本, 算法会生成一段Shortest Edit Script (SES, 最短编辑脚本) 用于将A转换为B. SES只包含两种命令: 从A中删除字符, 以及插入B中的字符.
寻找SES 等价于寻找 Longest Common Subsequence ( LCS 最长公共子序列 ), LCS是从两个文件中各去掉一些字符后, 所能得到的最长的共有字符序列. 注意, 这与 Longest Common Substring (最长公共子串)不同, 后者必须是连续的.
两个文件中, 可能存在多个LCS, 例如ABC和ACB, 存在两个LCS "AB"和"AC", 在这里分别对应了一个SES. 这个算法在存在多个SES时, 仅返回第一个找到的SES.
算法的运作依赖于由A和B文件构成的有向编辑图, 图中A为X轴, B为Y轴, 假定A和B的长度分别为m, n, 每个坐标代表了各自字符串中的一个字符. 在图中沿X轴前进代表删除A中的字符, 沿Y轴前进代表插入B中的字符. 在横坐标与纵坐标字符相同的地方, 会有一条对角线连接左上与右下两点, 表示不需任何编辑, 等价于路径长度为0. 算法的目标, 就是寻找到一条从坐标(0, 0)到(m, n)的最短路径.
算法在比较中, 定义了以下变量
k: 左上至右下的对角线, 满足 k = x - y. 以(0,0)对应的对角线k=0, 左侧为-1, -2, ... 右侧为1, 2, ...
d: 路径长度
x, y: 坐标
snake: 代表了一步操作及其后面跟随的对角线移动
Source: https://www.codeproject.com/Articles/42279/Investigating-Myers-diff-algorithm-Part-2-of-2
Java代码
public static void main(String[] args) { String a = "ABCABBACDAB"; String b = "CBABACDAA"; char[] aa = a.toCharArray(); char[] bb = b.toCharArray(); int max = aa.length + bb.length; int[] v = new int[max * 2]; List<Snake> snakes = new ArrayList<>(); for (int d = 0; d <= aa.length + bb.length; d++) { System.out.println("D:" + d); for (int k = -d; k <= d; k += 2) { System.out.print("k:" + k); // down or right? boolean down = (k == -d || (k != d && v[k - 1 + max] < v[k + 1 + max])); int kPrev = down ? k + 1 : k - 1; // start point int xStart = v[kPrev + max]; int yStart = xStart - kPrev; // mid point int xMid = down ? xStart : xStart + 1; int yMid = xMid - k; // end point int xEnd = xMid; int yEnd = yMid; // follow diagonal int snake = 0; while (xEnd < aa.length && yEnd < bb.length && aa[xEnd] == bb[yEnd]) { xEnd++; yEnd++; snake++; } // save end point v[k + max] = xEnd; // record a snake snakes.add(0, new Snake(xStart, yStart, xEnd, yEnd)); System.out.print(", start:("+xStart+","+yStart+"), mid:("+xMid+","+yMid+"), end:("+xEnd+","+yEnd + ")\n"); // check for solution if (xEnd >= aa.length && yEnd >= bb.length) { /* solution has been found */ System.out.println("found"); /* print the snakes */ Snake current = snakes.get(0); System.out.println(String.format("(%2d, %2d)<-(%2d, %2d)", current.getxEnd(), current.getyEnd(), current.getxStart(), current.getyStart())); for (int i = 1; i < snakes.size(); i++) { Snake tmp = snakes.get(i); if (tmp.getxEnd() == current.getxStart() && tmp.getyEnd() == current.getyStart()) { current = tmp; System.out.println(String.format("(%2d, %2d)<-(%2d, %2d)", current.getxEnd(), current.getyEnd(), current.getxStart(), current.getyStart())); if (current.getxStart() == 0 && current.getyStart() == 0) { break; } } } return; } } } } public static class Snake { private int xStart; private int yStart; private int xEnd; private int yEnd; public Snake(int xStart, int yStart, int xEnd, int yEnd) { this.xStart = xStart; this.yStart = yStart; this.xEnd = 
xEnd; this.yEnd = yEnd; } public int getxStart() { return xStart; } public void setxStart(int xStart) { this.xStart = xStart; } public int getyStart() { return yStart; } public void setyStart(int yStart) { this.yStart = yStart; } public int getxEnd() { return xEnd; } public void setxEnd(int xEnd) { this.xEnd = xEnd; } public int getyEnd() { return yEnd; } public void setyEnd(int yEnd) { this.yEnd = yEnd; } }
运行结果
D:0 k:0, start:(0,-1), mid:(0,0), end:(0,0) D:1 k:-1, start:(0,0), mid:(0,1), end:(0,1) k:1, start:(0,0), mid:(1,0), end:(1,0) D:2 k:-2, start:(0,1), mid:(0,2), end:(2,4) k:0, start:(1,0), mid:(1,1), end:(2,2) k:2, start:(1,0), mid:(2,0), end:(3,1) D:3 k:-3, start:(2,4), mid:(2,5), end:(3,6) k:-1, start:(2,4), mid:(3,4), end:(4,5) k:1, start:(3,1), mid:(3,2), end:(5,4) k:3, start:(3,1), mid:(4,1), end:(5,2) D:4 k:-4, start:(3,6), mid:(3,7), end:(4,8) k:-2, start:(4,5), mid:(4,6), end:(4,6) k:0, start:(5,4), mid:(5,5), end:(5,5) k:2, start:(5,4), mid:(6,4), end:(10,8) k:4, start:(5,2), mid:(6,2), end:(7,3) D:5 k:-5, start:(4,8), mid:(4,9), end:(4,9) k:-3, start:(4,8), mid:(5,8), end:(5,8) k:-1, start:(5,5), mid:(5,6), end:(5,6) k:1, start:(10,8), mid:(10,9), end:(10,9) k:3, start:(10,8), mid:(11,8), end:(11,8) k:5, start:(7,3), mid:(8,3), end:(8,3) D:6 k:-6, start:(4,9), mid:(4,10), end:(4,10) k:-4, start:(5,8), mid:(5,9), end:(5,9) k:-2, start:(5,8), mid:(6,8), end:(7,9) k:0, start:(10,9), mid:(10,10), end:(10,10) k:2, start:(11,8), mid:(11,9), end:(11,9) found (11, 9)<-(11, 8) (11, 8)<-(10, 8) (10, 8)<-( 5, 4) ( 5, 4)<-( 3, 1) ( 3, 1)<-( 1, 0) ( 1, 0)<-( 0, 0)