基于字的文本相似度算法——Jaccard算法

一、算法原理

基于字的文本相似度Jaccard算法的原理是:
(1)计算两个文本中字的交集
(2)计算两个文本中字的并集
(3)交集内的字的个数除以并集内的字的个数即为文本相似度值,即 $J(A,B)=\dfrac{|A\cap B|}{|A\cup B|}$
(4)根据设置的阈值判断是否相似

二、算法的C++实现

这里引用的StringUtil.hpp文件引自:

https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp

 

[cpp] view plain copy
 
  1. /* 
  2.  * JaccardSimilarity.hpp 
  3.  * 
  4.  *  Created: 2016年10月2日 
  5.  *   Author: tang 
  6.  */  
  7.   
  8. #ifndef SRC_JACCARD_SIMILARITY_HPP_  
  9. #define SRC_JACCARD_SIMILARITY_HPP_  
  10. #include <algorithm>  
  11. #include <iostream>  
  12. #include <vector>  
  13. #include <set>  
  14. #include "StringUtil.hpp"  
  15.   
  16. using namespace std;  
  17.   
  18. class JaccardSimilarity  
  19. {  
  20. public:  
  21.       
  22.     JaccardSimilarity()  
  23.     {  
  24.     }  
  25.   
  26.     double CalculateTextSimilarity(string &str1,string &str2)  
  27.     {  
  28.         vector<uint16_t> words_for_str1;  
  29.         vector<uint16_t> words_for_str2;  
  30.         vector<uint16_t>::iterator it;  
  31.   
  32.         if(!utf8ToUnicode< vector<uint16_t> >(str1,words_for_str1) ||   
  33.             !utf8ToUnicode< vector<uint16_t> >(str2,words_for_str2 ) )  
  34.         {  
  35.             cout<<"TransCode Error"<<endl;  
  36.             return 0.;  
  37.         }  
  38.   
  39.         for(it=words_for_str1.begin();it!=words_for_str1.end();)  
  40.         {  
  41.             if(codeFilter(*it))  
  42.             {  
  43.                 ++it;  
  44.             }  
  45.             else  
  46.             {  
  47.                 it=words_for_str1.erase(it);  
  48.             }  
  49.         }  
  50.   
  51.         for(it=words_for_str2.begin();it!=words_for_str2.end();)  
  52.                 {  
  53.                         if(codeFilter(*it))  
  54.                         {  
  55.                 ++it;  
  56.                         }  
  57.             else  
  58.             {  
  59.                 it=words_for_str2.erase(it);  
  60.             }  
  61.                 }  
  62.   
  63.         if(words_for_str1.size()+words_for_str2.size()<1)  
  64.             return 1.;  
  65.   
  66.         vector<uint16_t> words_intersection;  
  67.         vector<uint16_t> words_union;  
  68.         std::sort(words_for_str1.begin(),words_for_str1.end());  
  69.         std::sort(words_for_str2.begin(),words_for_str2.end());  
  70.         std::set_intersection(words_for_str1.begin(),words_for_str1.end(),  
  71.                     words_for_str2.begin(),words_for_str2.end(),  
  72.                     std::inserter(words_intersection,words_intersection.begin()));  
  73.   
  74.         std::set_union(words_for_str1.begin(),words_for_str1.end(),  
  75.                                         words_for_str2.begin(),words_for_str2.end(),  
  76.                     std::inserter(words_union,words_union.begin()));  
  77.   
  78.         double inter=words_intersection.size();  
  79.         double wunion=words_union.size();  
  80.   
  81.         return inter/wunion;  
  82.     }  
  83.   
  84.     bool codeFilter(int code)   
  85.     {  
  86.             if ((code < 0x4e00 || code > 0x9fa5) &&   
  87.             !(code >= '0' && code <= '9') &&   
  88.             !(code >= 'a' && code <= 'z') &&   
  89.             !(code >= 'A' && code <= 'Z'))  
  90.                  return false;  
  91.           
  92.             return true;  
  93.     }  
  94.   
  95. };  
  96.   
  97. #endif /* SRC_JACCARD_SIMILARITY_HPP_ */  


三、算法的java实现

 

 

[java] view plain copy
 
    1. import java.util.HashMap;  
    2. import java.util.HashSet;  
    3. import java.util.Map;  
    4. import java.util.Set;  
    5.   
    6.   
    /**
     * Character-level Jaccard similarity between two strings.
     *
     * Each accepted character is turned into a token made of the character
     * followed by its running occurrence number ("a1", "a2", ...), so repeated
     * characters are compared as a multiset rather than collapsed into one.
     */
    public class JaccardSimilarity {

        public JaccardSimilarity() {
        }

        /**
         * Tells whether a character code takes part in the comparison.
         *
         * @param code character code value
         * @return true for codes in 19968..40869 (0x4e00..0x9fa5, CJK ideographs)
         *         and for the ASCII digits and letters; false otherwise
         */
        public boolean codeFilter(int code) {
            boolean isCjk = code >= 19968 && code <= 40869;
            boolean isDigit = code >= '0' && code <= '9';
            boolean isLower = code >= 'a' && code <= 'z';
            boolean isUpper = code >= 'A' && code <= 'Z';
            return isCjk || isDigit || isLower || isUpper;
        }

        /**
         * Computes |intersection| / |union| over the occurrence-numbered
         * character tokens of the two strings.
         *
         * @param content        first text
         * @param compareContent second text
         * @return a value in [0, 1]; 0 when either argument is null or when
         *         neither text contains any accepted character
         */
        public double CalculateTextSim(String content, String compareContent) {
            if (null == content || null == compareContent)
                return 0.0;

            Set<String> contentTokens = numberedTokens(content);
            Set<String> compareTokens = numberedTokens(compareContent);

            Set<String> union = new HashSet<String>(contentTokens);
            union.addAll(compareTokens);
            if (union.isEmpty())
                return 0;

            Set<String> intersection = new HashSet<String>(contentTokens);
            intersection.retainAll(compareTokens);

            double intCount = intersection.size();
            double uniCount = union.size();
            return intCount / uniCount;
        }

        /**
         * Builds the token set for one text: for every char accepted by
         * codeFilter, the char followed by how many times it has been seen so
         * far (1-based).
         *
         * The text is walked char by char. A surrogate half is never inside the
         * ranges codeFilter accepts, so characters outside the BMP are skipped.
         */
        private Set<String> numberedTokens(String text) {
            Map<Character, Integer> seen = new HashMap<Character, Integer>();
            Set<String> tokens = new HashSet<String>();

            for (int i = 0; i < text.length(); i++) {
                char ch = text.charAt(i);
                if (!codeFilter(ch)) {
                    continue;
                }
                Integer previous = seen.get(ch);
                int occurrence = (previous == null) ? 1 : previous + 1;
                seen.put(ch, occurrence);
                // The "" in the middle forces string concatenation, not char arithmetic.
                tokens.add(ch + "" + occurrence);
            }
            return tokens;
        }

    }
posted @ 2017-11-28 13:45  Histring  阅读(357)  评论(0)  编辑  收藏  举报