基于字的文本相似度算法——余弦定理
一、算法原理
基于字的文本相似度余弦定理算法的原理是:
(1)分别统计两个待比较文本中每个汉字出现的次数,得到两个文本各自对应的字频向量;
(2)利用向量夹角余弦公式 cos θ = (A·B) / (|A|·|B|) 计算这两个字频向量的夹角余弦值;
(3)根据预先设定的阈值判断两个文本是否相似:余弦值越接近1,两个文本越相似。
二、算法的C++实现
下面的代码包含了StringUtil.hpp头文件(用于提供utf8ToUnicode函数),该文件来自:
https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp
- /*
- * CosineSimilarity.hpp
- *
- * Created: 2016年10月2日
- * Author: tang
- */
- #ifndef SRC_COSINE_SIMILARITY_HPP_
- #define SRC_COSINE_SIMILARITY_HPP_
- #include <iostream>
- #include <vector>
- #include <map>
- #include <math.h>
- #include "StringUtil.hpp"
- using namespace std;
- class CosineSimilarity
- {
- public:
- CosineSimilarity()
- {
- }
- double CalculateTextSimilarity(string &str1,string &str2)
- {
- vector<uint16_t> words_for_str1;
- vector<uint16_t> words_for_str2;
- vector<uint16_t>::iterator it;
- if(!utf8ToUnicode< vector<uint16_t> >(str1,words_for_str1) ||
- !utf8ToUnicode< vector<uint16_t> >(str2,words_for_str2 ) )
- {
- cout<<"TransCode Error"<<endl;
- return 0.;
- }
- map< uint16_t,pair<int,int> >seq_map;
- map< uint16_t,pair<int,int> >::iterator map_it;
- for(it=words_for_str1.begin();it!=words_for_str1.end();++it)
- {
- if(isHanzi(*it))
- {
- map_it=seq_map.find(*it);
- if(map_it!=seq_map.end())
- {
- map_it->second.first++;
- }
- else
- {
- pair<int,int> seq;
- seq.first=1;
- seq.second=0;
- seq_map[*it]=seq;
- }
- }
- }
- for(it=words_for_str2.begin();it!=words_for_str2.end();++it)
- {
- if(isHanzi(*it))
- {
- map_it=seq_map.find(*it);
- if(map_it!=seq_map.end())
- {
- map_it->second.second++;
- }
- else
- {
- pair<int,int> seq;
- seq.first=0;
- seq.second=1;
- seq_map[*it]=seq;
- }
- }
- }
- double sqdoc1 = 0.;
- double sqdoc2 = 0.;
- double denominator = 0.;
- for(map_it=seq_map.begin();map_it!=seq_map.end();++map_it)
- {
- pair<int,int> c=map_it->second;
- denominator +=(c.first * c.second);
- sqdoc1+=(c.first * c.first);
- sqdoc2+=(c.second * c.second);
- }
- if(0==sqdoc1 * sqdoc2)
- return -1.0;
- return denominator/sqrt(sqdoc1 * sqdoc2);
- }
- bool codeFilter(int code)
- {
- if ((code < 0x4e00 || code > 0x9fa5) &&
- !(code >= '0' && code <= '9') &&
- !(code >= 'a' && code <= 'z') &&
- !(code >= 'A' && code <= 'Z'))
- return false;
- return true;
- }
- bool isHanzi(uint16_t ch)
- {
- return (ch >= 0x4E00 && ch <= 0x9FA5);
- }
- };
三、算法的Java实现
- import java.io.UnsupportedEncodingException;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.Map;
/**
 * Character-based cosine similarity between two texts.
 *
 * <p>The occurrences of every Chinese character (U+4E00..U+9FA5) in each
 * text are counted, and the cosine of the angle between the two resulting
 * frequency vectors is returned.
 */
public class CosineSimilarity {

    /** First code unit treated as a Chinese character. */
    private static final char HANZI_FIRST = 0x4E00;

    /** Last code unit treated as a Chinese character. */
    private static final char HANZI_LAST = 0x9FA5;

    /**
     * Computes the cosine similarity of the Chinese-character frequency
     * vectors of two texts.
     *
     * <p>Characters are keyed by their UTF-16 code unit, so every character
     * accepted by {@link #isHanZi(char)} is counted, including those that
     * have no GB2312 encoding.
     *
     * @param doc1 first text
     * @param doc2 second text
     * @return the cosine value (0 to 1 for frequency vectors), or
     *         {@code -1.0} when either text contains no Chinese character
     *         and the cosine is therefore undefined
     * @throws NullPointerException if either text is {@code null} or blank
     */
    public double CalculateTextSim(String doc1, String doc2) {
        if (doc1 == null || doc1.trim().length() == 0
                || doc2 == null || doc2.trim().length() == 0) {
            throw new NullPointerException("the Document is null or have not cahrs!!");
        }

        // character -> {occurrences in doc1, occurrences in doc2}
        Map<Character, int[]> frequencies = new HashMap<Character, int[]>();
        countHanZi(doc1, frequencies, 0);
        countHanZi(doc2, frequencies, 1);

        double sqdoc1 = 0;      // squared norm of the doc1 vector
        double sqdoc2 = 0;      // squared norm of the doc2 vector
        double dotProduct = 0;  // numerator of the cosine formula
        for (int[] counts : frequencies.values()) {
            // Widen to double BEFORE multiplying: an int * int product
            // overflows once a character occurs more than ~46k times.
            double n1 = counts[0];
            double n2 = counts[1];
            dotProduct += n1 * n2;
            sqdoc1 += n1 * n1;
            sqdoc2 += n2 * n2;
        }

        // A zero vector has no direction; return the -1.0 sentinel rather
        // than the NaN produced by 0/0.
        if (sqdoc1 == 0 || sqdoc2 == 0) {
            return -1.0;
        }
        return dotProduct / Math.sqrt(sqdoc1 * sqdoc2);
    }

    /**
     * Adds the Chinese characters of {@code doc} to {@code frequencies},
     * incrementing element {@code slot} of each character's counter pair.
     */
    private void countHanZi(String doc, Map<Character, int[]> frequencies, int slot) {
        for (int i = 0; i < doc.length(); i++) {
            char ch = doc.charAt(i);
            if (!isHanZi(ch)) {
                continue;
            }
            Character key = Character.valueOf(ch);
            int[] counts = frequencies.get(key);
            if (counts == null) {
                counts = new int[2];
                frequencies.put(key, counts);
            }
            counts[slot]++;
        }
    }

    /**
     * Tells whether a character is a Chinese character.
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch} lies in U+4E00..U+9FA5,
     *         {@code false} otherwise
     */
    public boolean isHanZi(char ch) {
        return ch >= HANZI_FIRST && ch <= HANZI_LAST;
    }

    /**
     * Maps a character to its linear position in the GB2312 table.
     *
     * <p>The character is encoded with the {@code "GB2312"} charset. Only a
     * two-byte encoding is accepted; the position is
     * {@code (byte0 - 0xA1) * 94 + (byte1 - 0xA1)}.
     *
     * @param ch the character to look up
     * @return the position of {@code ch}, or {@code -1} when the encoding is
     *         not exactly two bytes long (this includes ASCII characters
     *         and, in practice, characters the encoder cannot map) or when
     *         the {@code "GB2312"} charset is not available
     */
    public static short getGB2312Id(char ch) {
        try {
            byte[] buffer = Character.toString(ch).getBytes("GB2312");
            if (buffer.length != 2) {
                // Anything other than a two-byte encoding is treated as
                // "character not recognised".
                return -1;
            }
            // Both bytes start at 0xA1 (161), so subtract it to get
            // zero-based row and column indexes.
            int b0 = (buffer[0] & 0xFF) - 161;
            // Each row holds 16 * 6 - 2 = 94 positions.
            int b1 = (buffer[1] & 0xFF) - 161;
            return (short) (b0 * 94 + b1);
        } catch (UnsupportedEncodingException e) {
            // Best effort: report the missing charset and treat the
            // character as unknown.
            e.printStackTrace();
            return -1;
        }
    }
}