一个简单字符串差异对比暴力算法实现
如题:请求出两个字符串的差异部分,并以不同的颜色区分显示到浏览器上。
1. 解题思路
1. 找出两字符串中相同的部分,标记;
2. 找出两字符串中不同的部分,标记;
3. 尽可能长的匹配相同部分;
4. 尽可能少的使用复杂度(所有算法的重要目标);
2. 算法实现
算法实现如下:(js实现)
<!DOCTYPE html> <!DOCTYPE html> <html> <head> <title>diff function test</title> <script type="text/javascript"> // 思路1: 使用双指针暴力解法 // 1. 先扫描a, 直到一个与b[j]相同的元素为止, 保存为aQueue // 2. 再扫描b, 每次的查找范围为aQueue // 3. 如果找到, 则进行接下来的最长匹配 // 4. 如果没有找到, 则让a进行最长匹配 // 5. 进入下一轮循环,直到a循环为止 // 时间复杂度: a:m, b:n, a1:m, b1:m(m+1)/2*n, ∴ O(m²n) var logSwitch = 1; function diffWithWhitespace(a, b) { var aValues = a.split(""); var bValues = b.split(""); var alen = aValues.length; var blen = bValues.length; var i = 0, j = 0; var equalLongest = []; var aDiffLongest = []; var aResult = []; var bResult = []; // 上一次比较结果, EQUAL:相等, ADIFF:A, BDIFF:B var lastDiffResultType = "EQUAL"; while(i < alen || j < blen) { if(aValues[i] == bValues[j]) { // todo: 记录结果, 到a中,b中 if(lastDiffResultType != "EQUAL" || equalLongest.length == 0) { equalLongest = []; aDiffLongest = []; lastDiffResultType = "EQUAL"; aResult.push({"item": equalLongest, "type":"equal"}) bResult.push({"item": equalLongest, "type":"equal"}) } equalLongest.push(aValues[i]); printLog("equal:<span style=\"background:#ffea00\">" + aValues[i] + "</span><br />"); i++; j++; continue; } var i2 = i, j2 = j; aDiffLongest = []; while((i2) < alen && aValues[i2] != bValues[j2]) { aDiffLongest.push(aValues[i2]); ++i2; } var aDiffTmp = []; var bDiffTmp = []; if(i2 > alen) { // no equal find // the last one } else if(i2 != alen && aValues[i2] == bValues[j]) { aDiffLongest.push(aValues[i2]); } else{ if(j >= blen) { aDiffTmp.push(aValues[i]); // the last one aResult.push({"item": aDiffTmp, "type":"diff"}); i++; continue; } // a中未找到,全部退回到b中进行查找 bDiffTmp.push(bValues[j]); bResult.push({"item": bDiffTmp, "type":"diff"}) // 去重相同项,也同时跳过上一相同的项 while(++j2 < blen && bValues[j2] == bValues[j]) { bDiffTmp.push(bValues[j2]); j = j2; } printLog("bdiff:" + bDiffTmp + "<br />"); j++; lastDiffResultType = "BDIFF"; continue; } var curMaxStep = aDiffLongest.length; var foundCloser = 0; while(++j2 < blen && curMaxStep-- > 0) { var i3 = 0; for (;i3 < aDiffLongest.length; i3++) { if(bValues[j2] == aDiffLongest[i3]) { // 相同段 foundCloser = 1; break; } } if(foundCloser == 1) { for (var c = i; c < i + i3; c++) { aDiffTmp.push(aValues[c]); } for (var c = j ; c < j2; c++) { bDiffTmp.push(bValues[c]); } if(aDiffTmp.length > 0) { aResult.push({"item": aDiffTmp, "type":"diff"}); printLog("adiff:" + aDiffTmp + "<p>"); } if(bDiffTmp.length > 0) { bResult.push({"item": bDiffTmp, "type":"diff"}); printLog("bdiff:" + bDiffTmp + "<p>"); } var eqItem = bValues[j2]; if(lastDiffResultType != "EQUAL" || equalLongest.length == 0) { equalLongest = []; aDiffLongest = []; lastDiffResultType = "EQUAL"; aResult.push({"item": equalLongest, "type":"equal"}) bResult.push({"item": equalLongest, "type":"equal"}) } equalLongest.push(eqItem); printLog("equal:<span style=\"background:#ffea00\">" + eqItem +"</span><br />"); aDiffLongest.splice(0, 1); i = i + i3; j = j2; i++; j++; break; } else { if(aDiffLongest.length == 0) { lastDiffResultType = "BDIFF"; } else{ lastDiffResultType = "ADIFF"; } lastDiffResultType = "DIFF"; } } if(!foundCloser) { for (var c = aDiffLongest.length - 1; c > 0; c--) { aDiffTmp.push(aDiffLongest[0]); aDiffLongest.splice(0, 1); } for (var c = j ; c < j2; c++) { bDiffTmp.push(bValues[c]); } if(aDiffTmp.length > 0) { aResult.push({"item": aDiffTmp, "type":"diff"}) printLog("adiff:" + aDiffTmp + "<p>"); } if(bDiffTmp.length > 0) { bResult.push({"item": bDiffTmp, "type":"diff"}); printLog("bdiff:" + bDiffTmp + "<p>"); } var eqItem = aDiffLongest[0]; if(lastDiffResultType != "EQUAL" || equalLongest.length == 0) { equalLongest = []; aDiffLongest = []; bDiffLongest = []; lastDiffResultType = "EQUAL"; aResult.push({"item": equalLongest, "type":"equal"}) bResult.push({"item": equalLongest, "type":"equal"}) } equalLongest.push(eqItem); printLog("equal:<span style=\"background:#ffea00\">" + eqItem +"</span><br />"); aDiffLongest.splice(0, 1); i = i2 - i3; j = j2; i++; j++; lastDiffResultType = "ADIFF"; continue; } } return {"a": aResult, "b":bResult}; } turnOffLogSwitch(); var aText = "今天 是个 好天气"; var bText = "今天, 真是 一个 好阴天啊"; diffWithWhitespaceAndAppendBody(aText, bText); aText = "ParseException"; bText = "RuntimeException"; diffWithWhitespaceAndAppendBody(aText, bText); function diffWithWhitespaceAndAppendBody(a, b) { printLog(aText); printLog("<p>"); printLog(bText); printLog("<p>compare result: <p>"); var diffResult = diffWithWhitespace(aText, bText); document.write("<p>A SIDE: <p>") diffResult.a.forEach(function (r) { writeDiffResult(r); }); document.write("<p>B SIDE: <p>") diffResult.b.forEach(function (r) { writeDiffResult(r); }); } function writeDiffResult(structText) { var item = structText.item.join(""); if(structText.type != "equal") { item = "<span style=\"background:#ffea00\">" + item + "</span>"; } document.write(item); } function printLog(msg) { if(logSwitch == 1) { document.write(msg); } } function turnOnLogSwitch() { logSwitch = 1; } function turnOffLogSwitch() { logSwitch = 0; } </script> </head> <body> </body> </html>
<!DOCTYPE html> <!DOCTYPE html> <html> <head> <title>diff function test</title> <script type="text/javascript"> // 思路1: 使用双指针暴力解法 // 1. 先扫描a, 直到一个与b[j]相同的元素为止, 保存为aQueue // 2. 再扫描b, 每次的查找范围为aQueue // 3. 如果找到, 则进行接下来的最长匹配 // 4. 如果没有找到, 则让a进行最长匹配 // 5. 进入下一轮循环,直到a循环为止 // 时间复杂度: a:m, b:n, a1:m, b1:m(m+1)/2*n, ∴ O(m²n) var logSwitch = 1; function diffWithWhitespace(a, b) { var aValues = a.split(""); var bValues = b.split(""); var alen = aValues.length; var blen = bValues.length; var i = 0, j = 0; var equalLongest = []; var aDiffLongest = []; var aResult = []; var bResult = []; // 上一次比较结果, EQUAL:相等, ADIFF:A, BDIFF:B var lastDiffResultType = "EQUAL"; while(i < alen || j < blen) { if(aValues[i] == bValues[j]) { // todo: 记录结果, 到a中,b中 if(lastDiffResultType != "EQUAL" || equalLongest.length == 0) { equalLongest = []; aDiffLongest = []; lastDiffResultType = "EQUAL"; aResult.push({"item": equalLongest, "type":"equal"}) bResult.push({"item": equalLongest, "type":"equal"}) } equalLongest.push(aValues[i]); printLog("equal:<span style=\"background:#ffea00\">" + aValues[i] + "</span><br />"); i++; j++; continue; } var i2 = i, j2 = j; aDiffLongest = []; while((i2) < alen && aValues[i2] != bValues[j2]) { aDiffLongest.push(aValues[i2]); ++i2; } var aDiffTmp = []; var bDiffTmp = []; if(i2 > alen) { // no equal find // the last one } else if(i2 != alen && aValues[i2] == bValues[j]) { aDiffLongest.push(aValues[i2]); } else{ if(j >= blen) { aDiffTmp.push(aValues[i]); // the last one aResult.push({"item": aDiffTmp, "type":"diff"}); i++; continue; } // a中未找到,全部退回到b中进行查找 bDiffTmp.push(bValues[j]); bResult.push({"item": bDiffTmp, "type":"diff"}) // 去重相同项,也同时跳过上一相同的项 while(++j2 < blen && bValues[j2] == bValues[j]) { bDiffTmp.push(bValues[j2]); j = j2; } printLog("bdiff:" + bDiffTmp + "<br />"); j++; lastDiffResultType = "BDIFF"; continue; } var curMaxStep = aDiffLongest.length; var foundCloser = 0; while(++j2 < blen && curMaxStep-- > 0) { var i3 = 0; for (;i3 < aDiffLongest.length; i3++) { if(bValues[j2] == aDiffLongest[i3]) { // 相同段 foundCloser = 1; break; } } if(foundCloser == 1) { for (var c = i; c < i + i3; c++) { aDiffTmp.push(aValues[c]); } for (var c = j ; c < j2; c++) { bDiffTmp.push(bValues[c]); } if(aDiffTmp.length > 0) { aResult.push({"item": aDiffTmp, "type":"diff"}); printLog("adiff:" + aDiffTmp + "<p>"); } if(bDiffTmp.length > 0) { bResult.push({"item": bDiffTmp, "type":"diff"}); printLog("bdiff:" + bDiffTmp + "<p>"); } var eqItem = bValues[j2]; if(lastDiffResultType != "EQUAL" || equalLongest.length == 0) { equalLongest = []; aDiffLongest = []; lastDiffResultType = "EQUAL"; aResult.push({"item": equalLongest, "type":"equal"}) bResult.push({"item": equalLongest, "type":"equal"}) } equalLongest.push(eqItem); printLog("equal:<span style=\"background:#ffea00\">" + eqItem +"</span><br />"); aDiffLongest.splice(0, 1); i = i + i3; j = j2; i++; j++; break; } else { if(aDiffLongest.length == 0) { lastDiffResultType = "BDIFF"; } else{ lastDiffResultType = "ADIFF"; } lastDiffResultType = "DIFF"; } } if(!foundCloser) { for (var c = aDiffLongest.length - 1; c > 0; c--) { aDiffTmp.push(aDiffLongest[0]); aDiffLongest.splice(0, 1); } for (var c = j ; c < j2; c++) { bDiffTmp.push(bValues[c]); } if(aDiffTmp.length > 0) { aResult.push({"item": aDiffTmp, "type":"diff"}) printLog("adiff:" + aDiffTmp + "<p>"); } if(bDiffTmp.length > 0) { bResult.push({"item": bDiffTmp, "type":"diff"}); printLog("bdiff:" + bDiffTmp + "<p>"); } var eqItem = aDiffLongest[0]; if(lastDiffResultType != "EQUAL" || equalLongest.length == 0) { equalLongest = []; aDiffLongest = []; bDiffLongest = []; lastDiffResultType = "EQUAL"; aResult.push({"item": equalLongest, "type":"equal"}) bResult.push({"item": equalLongest, "type":"equal"}) } equalLongest.push(eqItem); printLog("equal:<span style=\"background:#ffea00\">" + eqItem +"</span><br />"); aDiffLongest.splice(0, 1); i = i2 - i3; j = j2; i++; j++; lastDiffResultType = "ADIFF"; continue; } } return {"a": aResult, "b":bResult}; } turnOffLogSwitch(); var aText = "今天 是个 好天气"; var bText = "今天, 真是 一个 好阴天啊"; diffWithWhitespaceAndAppendBody(aText, bText); aText = "ParseException"; bText = "RuntimeException"; diffWithWhitespaceAndAppendBody(aText, bText); function diffWithWhitespaceAndAppendBody(a, b) { printLog(aText); printLog("<p>"); printLog(bText); printLog("<p>compare result: <p>"); var diffResult = diffWithWhitespace(aText, bText); document.write("<p>A SIDE: <p>") diffResult.a.forEach(function (r) { writeDiffResult(r); }); document.write("<p>B SIDE: <p>") diffResult.b.forEach(function (r) { writeDiffResult(r); }); } function writeDiffResult(structText) { var item = structText.item.join(""); if(structText.type != "equal") { item = "<span style=\"background:#ffea00\">" + item + "</span>"; } document.write(item); } function printLog(msg) { if(logSwitch == 1) { document.write(msg); } } function turnOnLogSwitch() { logSwitch = 1; } function turnOffLogSwitch() { logSwitch = 0; } </script> </head> <body> </body> </html>
算法属于暴力解法,简单使用了双指针法,没有太多技巧,需要进一步优化。
3. 一点闲话
需要注意的量,虽然样子很像最长公共子序列的命题,但却并不是一回事。供参考。
与beyond compare软件结果相比,还是不太准确,最长匹配这个原则还没有体现好。另外,对于多行型 的字符串比较,并没有给出参考,但一般的,多行会被当作整体处理,行与行之间都有单字符类的比较。
不要害怕今日的苦,你要相信明天,更苦!