基于字的文本相似度算法——Jaccard算法
一、算法原理
基于字的文本相似度Jaccard算法的原理是:
(1)计算两个文本中字的交集
(2)计算两个文本中字的并集
(3)交集内的字的个数除以并集内的字的个数即为文本相似度值,即 $J(A,B)=\dfrac{|A\cap B|}{|A\cup B|}$(下面的两个实现中,重复出现的字按出现次数分别计数)
(4)根据设置的阈值判断是否相似
二、算法的C++实现
这里引用的StringUtil.hpp文件引自:
https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp
- /*
- * JaccardSimilarity.hpp
- *
- * Created: 2016年10月2日
- * Author: tang
- */
- #ifndef SRC_JACCARD_SIMILARITY_HPP_
- #define SRC_JACCARD_SIMILARITY_HPP_
- #include <algorithm>
- #include <iostream>
- #include <vector>
- #include <set>
- #include "StringUtil.hpp"
- using namespace std;
- class JaccardSimilarity
- {
- public:
- JaccardSimilarity()
- {
- }
- double CalculateTextSimilarity(string &str1,string &str2)
- {
- vector<uint16_t> words_for_str1;
- vector<uint16_t> words_for_str2;
- vector<uint16_t>::iterator it;
- if(!utf8ToUnicode< vector<uint16_t> >(str1,words_for_str1) ||
- !utf8ToUnicode< vector<uint16_t> >(str2,words_for_str2 ) )
- {
- cout<<"TransCode Error"<<endl;
- return 0.;
- }
- for(it=words_for_str1.begin();it!=words_for_str1.end();)
- {
- if(codeFilter(*it))
- {
- ++it;
- }
- else
- {
- it=words_for_str1.erase(it);
- }
- }
- for(it=words_for_str2.begin();it!=words_for_str2.end();)
- {
- if(codeFilter(*it))
- {
- ++it;
- }
- else
- {
- it=words_for_str2.erase(it);
- }
- }
- if(words_for_str1.size()+words_for_str2.size()<1)
- return 1.;
- vector<uint16_t> words_intersection;
- vector<uint16_t> words_union;
- std::sort(words_for_str1.begin(),words_for_str1.end());
- std::sort(words_for_str2.begin(),words_for_str2.end());
- std::set_intersection(words_for_str1.begin(),words_for_str1.end(),
- words_for_str2.begin(),words_for_str2.end(),
- std::inserter(words_intersection,words_intersection.begin()));
- std::set_union(words_for_str1.begin(),words_for_str1.end(),
- words_for_str2.begin(),words_for_str2.end(),
- std::inserter(words_union,words_union.begin()));
- double inter=words_intersection.size();
- double wunion=words_union.size();
- return inter/wunion;
- }
- bool codeFilter(int code)
- {
- if ((code < 0x4e00 || code > 0x9fa5) &&
- !(code >= '0' && code <= '9') &&
- !(code >= 'a' && code <= 'z') &&
- !(code >= 'A' && code <= 'Z'))
- return false;
- return true;
- }
- };
- #endif /* SRC_JACCARD_SIMILARITY_HPP_ */
三、算法的Java实现
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.Map;
- import java.util.Set;
/**
 * Character-level Jaccard similarity between two strings.
 *
 * Only characters accepted by {@link #codeFilter(int)} take part. Each
 * accepted character is tagged with its running occurrence number inside its
 * own text, so the n-th occurrence of a character only matches the n-th
 * occurrence of the same character in the other text.
 */
public class JaccardSimilarity {

    public JaccardSimilarity() {
    }

    /**
     * Tells whether a code point takes part in the comparison.
     *
     * @param code code point to test
     * @return true for values in 0x4E00..0x9FA5, '0'..'9', 'a'..'z' and
     *         'A'..'Z'; false for everything else
     */
    public boolean codeFilter(int code) {
        if (code >= 0x4E00 && code <= 0x9FA5) {
            return true;
        }
        if (code >= '0' && code <= '9') {
            return true;
        }
        if (code >= 'a' && code <= 'z') {
            return true;
        }
        return code >= 'A' && code <= 'Z';
    }

    /**
     * Computes the similarity of two strings.
     *
     * @param content        first text
     * @param compareContent second text
     * @return intersection size divided by union size of the two token sets;
     *         0.0 when either argument is null or when neither text contains
     *         an accepted character
     */
    public double CalculateTextSim(String content, String compareContent) {
        if (content == null || compareContent == null) {
            return 0.0;
        }

        Set<String> contentTokens = occurrenceTokens(content);
        Set<String> compareTokens = occurrenceTokens(compareContent);

        Set<String> union = new HashSet<String>(contentTokens);
        union.addAll(compareTokens);
        if (union.isEmpty()) {
            return 0.0;
        }

        // contentTokens is no longer needed as-is, so it is narrowed in place.
        contentTokens.retainAll(compareTokens);

        double intersectionSize = contentTokens.size();
        double unionSize = union.size();
        return intersectionSize / unionSize;
    }

    /**
     * Builds the token set of one text: for every accepted character, the
     * character followed by its running occurrence number ("a1", "a2", ...).
     */
    private Set<String> occurrenceTokens(String text) {
        Map<Character, Integer> seen = new HashMap<Character, Integer>();
        Set<String> tokens = new HashSet<String>();
        for (int i = 0; i < text.length(); i++) {
            if (!codeFilter(text.codePointAt(i))) {
                continue;
            }
            Character ch = Character.valueOf(text.charAt(i));
            Integer previous = seen.get(ch);
            int occurrence = (previous == null) ? 1 : previous.intValue() + 1;
            seen.put(ch, Integer.valueOf(occurrence));
            tokens.add(ch.toString() + occurrence);
        }
        return tokens;
    }
}