# Text-similarity recommendation
# Content-based recommendation: texts are segmented with jieba and compared with a similarity measure.
import jieba
import re
import numpy as np
# Sample texts. `s` is the reference text; the script later compares it
# against `m`, `n` and `b`.
s = "人要成长年轻,必有原因,背后的努力999与积累一定数倍于普通人。所以,关键还在于自己。"
m = "人年轻时我们只因多愁善感就会大肆渲染,而今却是变得越难过越沉默。命运要你成长的时候"
n = "我们要做的,和岁月较量,和时间奔跑,在卑微中活出精彩,在短暂中寻求永恒。"
b = "时光真疯狂,我们一路执迷与匆忙,依稀悲伤,来不及遗忘,只有待风将她埋葬。"

# Token lists, one per sample text (filled in once the helpers are defined).
wordl = []
# Build the vocabulary
def creatVocabList(wordList):
    """Return the list of distinct words found across all documents.

    Args:
        wordList: iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing every distinct word exactly once, in order of
        first occurrence. An empty ``wordList`` yields ``[]``.
    """
    # dict keys keep insertion order, so the result is reproducible from run
    # to run; iterating a set of strings is not (hash randomization).
    return list(dict.fromkeys(word for document in wordList for word in document))
# Tokenization
def textParse1(line):
    """Clean ``line`` and split it into tokens of more than one character.

    ASCII letters, digits, newlines and the punctuation listed in the
    pattern are removed first. The remainder is segmented with
    ``jieba.lcut(..., cut_all=True)``, and tokens of length 0 or 1 are
    discarded.
    """
    cleaned = re.sub(r'[a-zA-Z.【】0-9、。,/!…~\*\n]', '', line)
    tokens = jieba.lcut(cleaned, cut_all=True)
    return list(filter(lambda token: len(token) > 1, tokens))
# Build the vector
def setOfWords2Vec(vocabList, words):
    """Return a binary presence vector for ``words`` over ``vocabList``.

    Args:
        vocabList: sequence of vocabulary words; position ``i`` of the
            result corresponds to ``vocabList[i]``.
        words: iterable of words from one document.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word
        occurs in ``words``, 0 elsewhere. Words not in the vocabulary are
        ignored. If a word appears several times in ``vocabList``, only
        its first position is set.
    """
    # Build the word -> position table once instead of scanning the
    # vocabulary twice (`in` + `.index`) for every word. setdefault keeps
    # the first position, matching list.index.
    position = {}
    for idx, term in enumerate(vocabList):
        position.setdefault(term, idx)

    wordVec = [0] * len(vocabList)
    for word in words:
        idx = position.get(word)
        if idx is not None:
            wordVec[idx] = 1
    return wordVec
# Tokenize every sample text, keeping the order s, m, n, b.
wordl.extend(map(textParse1, (s, m, n, b)))
print(wordl)

# Vocabulary shared by all documents
vocabList = creatVocabList(wordl)
print(vocabList)

# Vectorize: one presence vector per document, in the same order as wordl
trainMat = []
for words in wordl:
    trainMat.append(setOfWords2Vec(vocabList, words))
def cos(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: 1-D sequence of numbers.
        b: 1-D sequence of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has
        zero norm.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # An all-zero vector has no direction; report "no similarity"
        # instead of dividing by zero and returning nan.
        return 0.0
    return np.dot(a, b) / norm_product
# Compare the first document with each of the other three.
for idx in (1, 2, 3):
    print(cos(trainMat[0], trainMat[idx]))