Soundex算法学习

什么是Soundex算法?

Soundex是一种语音算法,利用英文字的读音计算近似值,值由4个字符构成,第一个字符为英文字母,后三个为数字。在拼音文字中有时会有会念但不能拼出正确字的情形,可用Soundex做类似模糊匹配的功能。

例如Knuth和Kant两个字符串,它们的Soundex值都是“K530”。该算法在计算机科学家高德纳的名著《计算机程序设计艺术》中有详细的介绍。

算法实现

/**
 * Soundex digit assigned to each letter:
 *   a e h i o u w y -> 0 (not encoded)
 *   b f p v         -> 1
 *   c g j k q s x z -> 2
 *   d t             -> 3
 *   l               -> 4
 *   m n             -> 5
 *   r               -> 6
 */
#include <boost/algorithm/string.hpp>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

/**
 * Computes Soundex codes: a four-character phonetic key made of the first
 * letter of a word followed by three digits.
 */
class Soundex {
 private:
  /**
   * Returns the Soundex digit ('0'..'6') for an upper-case ASCII letter.
   * '0' marks letters that are never emitted (A E H I O U W Y).
   *
   * @param upper a character in the range 'A'..'Z'
   */
  static char codeOf(char upper) {
    // Indexed by (letter - 'A'):      ABCDEFGHIJKLMNOPQRSTUVWXYZ
    static const char kCodes[] = "01230120022455012623010202";
    return kCodes[upper - 'A'];
  }

 public:
  /**
   * Builds the Soundex code of `s`.
   *
   * Rules applied:
   *  - matching is case-insensitive; every non-letter character is ignored;
   *  - the first letter is kept as-is (upper-cased);
   *  - later letters are replaced by their digit, letters with digit 0 are
   *    dropped;
   *  - letters with the same digit that are adjacent, or separated only by
   *    H or W, produce a single digit (this includes the first letter);
   *  - the result is truncated or right-padded with '0' to 4 characters.
   *
   * @param s the word to encode
   * @return the 4-character code, or an empty string if `s` has no letters
   */
  std::string soundDex(std::string s) const {
    std::string result;
    char prevCode = '0';
    for (char c : s) {
      // ASCII-only upper-casing: independent of the global locale and safe
      // for bytes outside the 7-bit range.
      if (c >= 'a' && c <= 'z') {
        c = static_cast<char>(c - 'a' + 'A');
      }
      if (c < 'A' || c > 'Z') {
        continue;
      }
      const char code = codeOf(c);
      if (result.empty()) {
        // First letter is kept verbatim, but its digit still suppresses an
        // immediately following letter of the same group.
        result += c;
        prevCode = code;
        continue;
      }
      if (c == 'H' || c == 'W') {
        // H and W do not separate two letters of the same group.
        continue;
      }
      if (code != '0' && code != prevCode) {
        result += code;
        if (result.size() == 4) {
          break;
        }
      }
      // A vowel resets prevCode to '0', so the same digit may repeat after it.
      prevCode = code;
    }
    if (result.empty()) {
      return result;
    }
    // Pad with as many zeros as needed to reach exactly four characters.
    result.append(4 - result.size(), '0');
    return result;
  }
};

// Demo entry point: encodes two sample names and prints one code per line.
int main() {
  Soundex soundex;
  // soundDex only returns the code, so it has to be written to stdout here;
  // otherwise the program produces no output at all.
  std::cout << soundex.soundDex("johnnyzhao") << '\n';
  std::cout << soundex.soundDex("Kunth") << '\n';
  return 0;
}

 运行结果:

J520
K530

 

posted @ 2023-01-27 10:14  johnny_zhao  阅读(176)  评论(0编辑  收藏  举报