字符串编辑距离的计算,及相应操作过程的打印
/Files/rocketfan/editdistance_readme.pdf
先给一个例子:两个字符串eeba和abca的相似度是多少呢?edit distance(编辑距离)是一个很好的度量,它定义为把字符串a变到字符串b所需要的最少操作步骤数,允许的操作有插入、删除、更改三种。
对于eeba,abca它们之间的编辑距离为3,可以按照上面的操作步骤(不是唯一的)将eeba变到abca,1.将e变为a 2.删除e 3.添加c 共3个步骤。
典型的动态规划问题。
EDIT[i,j]表示对于字符串a从1到i的子串和字符串b从1到j的子串的编辑距离。(字符串下标从1开始)
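递推关系为 EDIT[i,j] = min(EDIT[i - 1,j] + 1, EDIT[i,j - 1] + 1, EDIT[i - 1,j - 1] + f(a[i],b[j])),边界条件为 EDIT[i,0] = i,EDIT[0,j] = j,其中:
EDIT[i - 1,j] + 1 表示在a的i位置执行删除(delete)操作;
EDIT[i,j - 1] + 1 表示插入(insert)操作;
EDIT[i - 1,j - 1] + f(a[i],b[j]) 表示不变或者修改(modify)操作,这里如果a[i] == b[j]则f(a[i],b[j]) = 0,否则f(a[i],b[j]) = 1。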
如果需要记录编辑过程如第一幅图所示,需要用二维数组记录下动态规划过程的路径信息,即记录下前一步骤的位置索引信息。
如下图
//edit_distance.h
1 /**
2 * \file edit_distance.h
3 * \author pku_goldenlock
4 * \date 2009-8-10
5 */
6
7
8 #ifndef _EDIT_DISTANCE_H
9 #define _EDIT_DISTANCE_H
10 #include <string>
11 using std::string;
12
/**
 * Collection of static helpers that compute the edit distance between two
 * strings and, on request, one optimal alignment of them.
 */
class EditDistanceHelp {
public:
    /**
     * Compute the minimum edit distance and record one best path.
     * @param s1,s2   Input strings to compare.
     * @param rs1,rs2 Output strings that receive the aligned path.
     * @return The edit distance between s1 and s2.
     */
    static int CalcPath(const std::string &s1, const std::string &s2,
                        std::string &rs1, std::string &rs2);

    /**
     * Compute the minimum edit distance only; no path information is kept.
     * @param s1,s2 Input strings to compare.
     * @return The edit distance between s1 and s2.
     */
    static int EditDistance(const std::string &s1, const std::string &s2);

private:
    /**
     * One cell of the dynamic-programming table.
     */
    struct ArrayData {
        int dist;   /**< Minimum edit distance up to this position. */
        int pre_x;  /**< Previous position, x (horizontal) part. */
        int pre_y;  /**< Previous position, y (vertical) part. */
    };

    /**
     * Assign all three members of one table cell.
     */
    static void SetArrayData(ArrayData &a, int dist, int pre_x, int pre_y);

    /**
     * Follow the stored path back from (index_x, index_y) and write the
     * result into rs1 and rs2.
     * @param array   Table holding the information of every position (x, y).
     * @param index_x Current position, x part.
     * @param index_y Current position, y part.
     * @param s1,s2   The two input strings whose best path is reconstructed.
     * @param rs1,rs2 Receive the result.
     */
    static void StoreResult(ArrayData **array, int index_x, int index_y,
                            const std::string &s1, const std::string &s2,
                            std::string &rs1, std::string &rs2);
};
55
56
57 #endif //end of define _EDIT_DISTANCE_H
2 * \file edit_distance.h
3 * \author pku_goldenlock
4 * \date 2009-8-10
5 */
6
7
8 #ifndef _EDIT_DISTANCE_H
9 #define _EDIT_DISTANCE_H
10 #include <string>
11 using std::string;
12
/**
 * Collection of static helpers that compute the edit distance between two
 * strings and, on request, one optimal alignment of them.
 */
class EditDistanceHelp {
public:
    /**
     * Compute the minimum edit distance and record one best path.
     * @param s1,s2   Input strings to compare.
     * @param rs1,rs2 Output strings that receive the aligned path.
     * @return The edit distance between s1 and s2.
     */
    static int CalcPath(const std::string &s1, const std::string &s2,
                        std::string &rs1, std::string &rs2);

    /**
     * Compute the minimum edit distance only; no path information is kept.
     * @param s1,s2 Input strings to compare.
     * @return The edit distance between s1 and s2.
     */
    static int EditDistance(const std::string &s1, const std::string &s2);

private:
    /**
     * One cell of the dynamic-programming table.
     */
    struct ArrayData {
        int dist;   /**< Minimum edit distance up to this position. */
        int pre_x;  /**< Previous position, x (horizontal) part. */
        int pre_y;  /**< Previous position, y (vertical) part. */
    };

    /**
     * Assign all three members of one table cell.
     */
    static void SetArrayData(ArrayData &a, int dist, int pre_x, int pre_y);

    /**
     * Follow the stored path back from (index_x, index_y) and write the
     * result into rs1 and rs2.
     * @param array   Table holding the information of every position (x, y).
     * @param index_x Current position, x part.
     * @param index_y Current position, y part.
     * @param s1,s2   The two input strings whose best path is reconstructed.
     * @param rs1,rs2 Receive the result.
     */
    static void StoreResult(ArrayData **array, int index_x, int index_y,
                            const std::string &s1, const std::string &s2,
                            std::string &rs1, std::string &rs2);
};
55
56
57 #endif //end of define _EDIT_DISTANCE_H
//edit_distance.cc
#include "edit_distance.h"

#include <algorithm>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <vector>
using namespace std;
5 /**
6 * find the min edit distance and return the edit distance
7 * will sotre the best path info in string rs1, rs2
8 * s1, s2 is the user given string for caculating the edit distance
9 */
10 int EditDistanceHelp::CalcPath(const string &s1, const string &s2, string &rs1, string &rs2)
11 {
12 //first find min dist and store path info
13 int len1 = s1.length();
14 int len2 = s2.length();
15
16 //allocate space for array
17 ArrayData **array;
18 array = new ArrayData*[len1 + 1];
19 for (int i = 0; i <= len1; i++)
20 array[i] = new ArrayData[len2 + 1];
21
22 //kernal for finding the best path and store path info to array
23 for (int i = 0; i <= len1; i++)
24 SetArrayData(array[i][0], i, i - 1, 0);
25 for (int j = 0; j <= len2; j++)
26 SetArrayData(array[0][j], j, 0, j - 1);
27 int min_dist;
28 for (int i = 1; i <= len1; i++)
29 for (int j = 1; j <= len2; j++) {
30 if (array[i -1][j].dist < array[i][j - 1].dist) //can also be <=
31 SetArrayData(array[i][j], array[i - 1][j].dist + 1, i - 1, j);
32 else
33 SetArrayData(array[i][j], array[i][j - 1].dist + 1, i, j - 1);
34 min_dist = array[i - 1][j - 1].dist + (s1[i - 1] != s2[j - 1]);
35 if (min_dist < array[i][j].dist) // < is OK but <= make modify high priority
36 SetArrayData(array[i][j], min_dist, i - 1, j - 1);
37 }
38
39 //store the best path result to two result string rs1 and rs2
40 StoreResult(array, len1, len2, s1, s2, rs1, rs2);
41 min_dist = array[len1][len2].dist;
42
43 //print array
44 for (int i = 0; i <= len1; i++) {
45 for (int j = 0; j <= len2; j++) {
46 cout << "(" << array[i][j].pre_x << "," << setw(2) << array[i][j].pre_y << ") ";
47 }
48 cout << endl;
49 }
50 //free resources of array
51 for (int i = 0; i <= len1; i++)
52 delete array[i];
53 delete array;
54
55 //return min edit distance
56 return min_dist;
57 }
58
59 /**
60 * find the min edit distance only do not need path info
61 */
62 int EditDistanceHelp::EditDistance(const string& s1, const string& s2)
63 {
64 using std::min;
65 int len1 = s1.length();
66 int len2 = s2.length();
67 int array[len1 + 1][len2 + 1];
68 for (int i = 0; i <= len1; i++)
69 array[i][0] = i;
70 for (int j = 1; j <= len2; j++)
71 array[0][j] = j;
72 for (int i = 1; i <= len1; i++)
73 for (int j = 1; j <= len2; j++)
74 array[i][j] = min(min(array[i - 1][j] + 1, array[i][j - 1] + 1),
75 array[i - 1][j - 1] + (s1[i - 1] != s2[j - 1]));
76 return array[len1][len2];
77 }
78
79 /**
80 * Set all data members value for one array element
81 */
82 void EditDistanceHelp::SetArrayData(ArrayData &a, int dist, int pre_x, int pre_y)
83 {
84 a.dist = dist;
85 a.pre_x = pre_x;
86 a.pre_y = pre_y;
87 }
88
89 /**
90 * Based on the path info stored in array ,find the best path and store result to string rs1 and rs2
91 */
92 void EditDistanceHelp::StoreResult(ArrayData **array, int index_x, int index_y,
93 const string &s1, const string &s2,
94 string &rs1, string &rs2)
95 {
96 if (index_x == 0 && index_y== 0)
97 return;
98
99 if ((array[index_x][index_y].pre_x < index_x) && (array[index_x][index_y].pre_y < index_y)) {
100 StoreResult(array, index_x - 1, index_y - 1, s1, s2, rs1, rs2);
101 rs1 += s1[index_x - 1];
102 rs2 += s2[index_y - 1];
103 } else if (array[index_x][index_y].pre_x < index_x) {
104 StoreResult(array, index_x - 1, index_y, s1, s2, rs1, rs2);
105 rs1 += s1[index_x - 1];
106 rs2 += '-';
107 } else {
108 StoreResult(array, index_x, index_y - 1, s1, s2, rs1, rs2);
109 rs1 += '-';
110 rs2 += s2[index_y - 1];
111 }
112 }
113
2 #include <iostream>
3 #include <iomanip>
4 using namespace std;
5 /**
6 * find the min edit distance and return the edit distance
7 * will sotre the best path info in string rs1, rs2
8 * s1, s2 is the user given string for caculating the edit distance
9 */
10 int EditDistanceHelp::CalcPath(const string &s1, const string &s2, string &rs1, string &rs2)
11 {
12 //first find min dist and store path info
13 int len1 = s1.length();
14 int len2 = s2.length();
15
16 //allocate space for array
17 ArrayData **array;
18 array = new ArrayData*[len1 + 1];
19 for (int i = 0; i <= len1; i++)
20 array[i] = new ArrayData[len2 + 1];
21
22 //kernal for finding the best path and store path info to array
23 for (int i = 0; i <= len1; i++)
24 SetArrayData(array[i][0], i, i - 1, 0);
25 for (int j = 0; j <= len2; j++)
26 SetArrayData(array[0][j], j, 0, j - 1);
27 int min_dist;
28 for (int i = 1; i <= len1; i++)
29 for (int j = 1; j <= len2; j++) {
30 if (array[i -1][j].dist < array[i][j - 1].dist) //can also be <=
31 SetArrayData(array[i][j], array[i - 1][j].dist + 1, i - 1, j);
32 else
33 SetArrayData(array[i][j], array[i][j - 1].dist + 1, i, j - 1);
34 min_dist = array[i - 1][j - 1].dist + (s1[i - 1] != s2[j - 1]);
35 if (min_dist < array[i][j].dist) // < is OK but <= make modify high priority
36 SetArrayData(array[i][j], min_dist, i - 1, j - 1);
37 }
38
39 //store the best path result to two result string rs1 and rs2
40 StoreResult(array, len1, len2, s1, s2, rs1, rs2);
41 min_dist = array[len1][len2].dist;
42
43 //print array
44 for (int i = 0; i <= len1; i++) {
45 for (int j = 0; j <= len2; j++) {
46 cout << "(" << array[i][j].pre_x << "," << setw(2) << array[i][j].pre_y << ") ";
47 }
48 cout << endl;
49 }
50 //free resources of array
51 for (int i = 0; i <= len1; i++)
52 delete array[i];
53 delete array;
54
55 //return min edit distance
56 return min_dist;
57 }
58
59 /**
60 * find the min edit distance only do not need path info
61 */
62 int EditDistanceHelp::EditDistance(const string& s1, const string& s2)
63 {
64 using std::min;
65 int len1 = s1.length();
66 int len2 = s2.length();
67 int array[len1 + 1][len2 + 1];
68 for (int i = 0; i <= len1; i++)
69 array[i][0] = i;
70 for (int j = 1; j <= len2; j++)
71 array[0][j] = j;
72 for (int i = 1; i <= len1; i++)
73 for (int j = 1; j <= len2; j++)
74 array[i][j] = min(min(array[i - 1][j] + 1, array[i][j - 1] + 1),
75 array[i - 1][j - 1] + (s1[i - 1] != s2[j - 1]));
76 return array[len1][len2];
77 }
78
79 /**
80 * Set all data members value for one array element
81 */
82 void EditDistanceHelp::SetArrayData(ArrayData &a, int dist, int pre_x, int pre_y)
83 {
84 a.dist = dist;
85 a.pre_x = pre_x;
86 a.pre_y = pre_y;
87 }
88
89 /**
90 * Based on the path info stored in array ,find the best path and store result to string rs1 and rs2
91 */
92 void EditDistanceHelp::StoreResult(ArrayData **array, int index_x, int index_y,
93 const string &s1, const string &s2,
94 string &rs1, string &rs2)
95 {
96 if (index_x == 0 && index_y== 0)
97 return;
98
99 if ((array[index_x][index_y].pre_x < index_x) && (array[index_x][index_y].pre_y < index_y)) {
100 StoreResult(array, index_x - 1, index_y - 1, s1, s2, rs1, rs2);
101 rs1 += s1[index_x - 1];
102 rs2 += s2[index_y - 1];
103 } else if (array[index_x][index_y].pre_x < index_x) {
104 StoreResult(array, index_x - 1, index_y, s1, s2, rs1, rs2);
105 rs1 += s1[index_x - 1];
106 rs2 += '-';
107 } else {
108 StoreResult(array, index_x, index_y - 1, s1, s2, rs1, rs2);
109 rs1 += '-';
110 rs2 += s2[index_y - 1];
111 }
112 }
113