基于字的文本相似度算法——余弦定理

一、算法原理

基于字的文本相似度余弦定理算法的原理是：

（1）分别统计两个比较文本中所有字出现的频率，从而得出两个文本对应的向量
（2）利用余弦定理计算这两个向量的夹角余弦值

（3）根据自行设定的阈值，判断两个文本是否相似

二、算法的C++实现

这里引用的StringUtil.hpp文件引自：

https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp

[cpp] view plain copy

/*
* CosineSimilarity.hpp
*
* Created: 2016年10月2日
* Author: tang
*/
#ifndef SRC_COSINE_SIMILARITY_HPP_
#define SRC_COSINE_SIMILARITY_HPP_
#include <iostream>
#include <vector>
#include <map>
#include <math.h>
#include "StringUtil.hpp"
using namespace std;
class CosineSimilarity
{
public:
CosineSimilarity()
{
}
double CalculateTextSimilarity(string &str1,string &str2)
{
vector<uint16_t> words_for_str1;
vector<uint16_t> words_for_str2;
vector<uint16_t>::iterator it;
if(!utf8ToUnicode< vector<uint16_t> >(str1,words_for_str1) ||
!utf8ToUnicode< vector<uint16_t> >(str2,words_for_str2 ) )
{
cout<<"TransCode Error"<<endl;
return 0.;
}
map< uint16_t,pair<int,int> >seq_map;
map< uint16_t,pair<int,int> >::iterator map_it;
for(it=words_for_str1.begin();it!=words_for_str1.end();++it)
{
if(isHanzi(*it))
{
map_it=seq_map.find(*it);
if(map_it!=seq_map.end())
{
map_it->second.first++;
}
else
{
pair<int,int> seq;
seq.first=1;
seq.second=0;
seq_map[*it]=seq;
}
}
}
for(it=words_for_str2.begin();it!=words_for_str2.end();++it)
{
if(isHanzi(*it))
{
map_it=seq_map.find(*it);
if(map_it!=seq_map.end())
{
map_it->second.second++;
}
else
{
pair<int,int> seq;
seq.first=0;
seq.second=1;
seq_map[*it]=seq;
}
}
}
double sqdoc1 = 0.;
double sqdoc2 = 0.;
double denominator = 0.;
for(map_it=seq_map.begin();map_it!=seq_map.end();++map_it)
{
pair<int,int> c=map_it->second;
denominator +=(c.first * c.second);
sqdoc1+=(c.first * c.first);
sqdoc2+=(c.second * c.second);
}
if(0==sqdoc1 * sqdoc2)
return -1.0;
return denominator/sqrt(sqdoc1 * sqdoc2);
}
bool codeFilter(int code)
{
if ((code < 0x4e00 || code > 0x9fa5) &&
!(code >= '0' && code <= '9') &&
!(code >= 'a' && code <= 'z') &&
!(code >= 'A' && code <= 'Z'))
return false;
return true;
}
bool isHanzi(uint16_t ch)
{
return (ch >= 0x4E00 && ch <= 0x9FA5);
}
};

三、算法的Java实现

[java] view plain copy

import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
public class CosineSimilarity{
    /**
     * Value returned when the cosine is undefined because at least one text
     * contributes no Hanzi. Real results for frequency vectors are never
     * negative, so -1.0 cannot be confused with a genuine similarity.
     */
    private static final double UNDEFINED_SIMILARITY = -1.0;

    /**
     * Computes the cosine similarity of two texts from the frequencies of
     * the Chinese characters they contain.
     *
     * Only characters accepted by {@link #isHanZi(char)} are counted; every
     * other character is ignored. Characters are keyed by their UTF-16 code
     * unit, so Hanzi that have no GB2312 encoding are counted as well.
     *
     * @param doc1 first text
     * @param doc2 second text
     * @return the cosine of the angle between the two frequency vectors, or
     *         -1.0 when either text contains no Hanzi (cosine undefined)
     * @throws NullPointerException if either text is null or contains only
     *         whitespace
     */
    public double CalculateTextSim(String doc1, String doc2) {
        if (doc1 == null || doc1.trim().length() == 0
                || doc2 == null || doc2.trim().length() == 0) {
            throw new NullPointerException("the Document is null or have not cahrs!!");
        }

        // character -> {occurrences in doc1, occurrences in doc2}
        Map<Character, int[]> frequencies = new HashMap<Character, int[]>();
        countHanZi(doc1, 0, frequencies);
        countHanZi(doc2, 1, frequencies);

        double sqdoc1 = 0;
        double sqdoc2 = 0;
        double numerator = 0;
        for (int[] c : frequencies.values()) {
            // Widen before multiplying: int*int silently wraps around once a
            // single character occurs more than ~46340 times.
            double f1 = c[0];
            double f2 = c[1];
            numerator += f1 * f2;
            sqdoc1 += f1 * f1;
            sqdoc2 += f2 * f2;
        }

        // Without this guard a zero-length vector yields 0/0 = NaN.
        if (sqdoc1 == 0 || sqdoc2 == 0) {
            return UNDEFINED_SIMILARITY;
        }
        return numerator / Math.sqrt(sqdoc1 * sqdoc2);
    }

    /**
     * Adds the Hanzi occurrences of doc to the given slot (0 or 1) of the
     * per-character counters.
     */
    private void countHanZi(String doc, int slot, Map<Character, int[]> frequencies) {
        for (int i = 0; i < doc.length(); i++) {
            char ch = doc.charAt(i);
            if (!isHanZi(ch)) {
                continue;
            }
            Character key = Character.valueOf(ch);
            int[] fq = frequencies.get(key);
            if (fq == null) {
                fq = new int[2];
                frequencies.put(key, fq);
            }
            fq[slot]++;
        }
    }

    /**
     * Tells whether a character is a Chinese character.
     *
     * @param ch the character to test
     * @return true if ch lies in U+4E00..U+9FA5, false otherwise
     */
    public boolean isHanZi(char ch) {
        return (ch >= 0x4E00 && ch <= 0x9FA5);
    }

    /**
     * Maps a character to an index derived from its two-byte GB2312
     * encoding.
     *
     * Not used by {@link #CalculateTextSim(String, String)}; kept for
     * existing callers.
     *
     * @param ch the character to look up
     * @return (lead byte - 0xA1) * 94 + (trail byte - 0xA1) when ch encodes
     *         to exactly two bytes in GB2312; -1 when it does not (e.g.
     *         ASCII characters), or when the JVM does not provide the
     *         GB2312 charset
     */
    public static short getGB2312Id(char ch) {
        try {
            byte[] buffer = Character.toString(ch).getBytes("GB2312");
            if (buffer.length != 2) {
                // Anything that is not a two-byte sequence is treated as an
                // unknown character.
                return -1;
            }
            // Both bytes are offset from 0xA1 (= 161); the lead-byte offset
            // is scaled by 94 to form the index.
            int b0 = (buffer[0] & 0xFF) - 161;
            int b1 = (buffer[1] & 0xFF) - 161;
            return (short) (b0 * 94 + b1);
        } catch (UnsupportedEncodingException e) {
            // Existing behaviour: report the missing charset on stderr and
            // fall through to the "unknown character" result.
            e.printStackTrace();
        }
        return -1;
    }
}

posted @ 2017-11-28 13:46 Histring 阅读(1377) 评论(0) 编辑收藏举报

刷新页面返回顶部

Histring

基于字的文本相似度算法——余弦定理

公告