#coding=utf-8
import numpy as np
import jieba
class TfIdf:
    """Compute TF, IDF and TF-IDF matrices for a small corpus.

    After construction, ``self.doc`` holds one token list per document and
    ``self.dic`` holds the sorted vocabulary (stop words excluded). Calling
    ``cal_tfidf`` populates ``self.tf``, ``self.idf`` and ``self.tfidf``.
    """

    def __init__(self, doc, stop_path='stop_word.txt'):
        """Store the corpus and build the vocabulary.

        Args:
            doc: Iterable of documents. Each document is either a raw string
                (segmented with jieba) or an already-tokenized sequence.
            stop_path: Path to a UTF-8 stop-word file, one word per line.
        """
        self.doc = doc
        self.get_dic(stop_path)

    def get_dic(self, stop_path='stop_word.txt'):
        """Tokenize the documents and build the sorted vocabulary.

        Replaces ``self.doc`` with a list of token lists and sets
        ``self.dic`` to the sorted unique tokens that are not stop words.

        Args:
            stop_path: Path to a UTF-8 stop-word file, one word per line.

        Raises:
            OSError: If the stop-word file cannot be opened.
        """
        with open(stop_path, encoding="utf-8") as f:
            stop_dic = set(f.read().split("\n"))
        # Only raw strings are segmented. Entries that are already token
        # sequences are copied as-is, so calling this method a second time
        # (when self.doc is already tokenized) does not fail.
        self.doc = [
            list(jieba.cut(sent)) if isinstance(sent, str) else list(sent)
            for sent in self.doc
        ]
        self.dic = sorted(
            {word for sent in self.doc for word in sent if word not in stop_dic}
        )

    def cal_tf(self):
        """Compute the term-frequency matrix into ``self.tf``.

        Entry ``[i, j]`` is the count of vocabulary word ``j`` in document
        ``i`` divided by the document's total token count (stop words are
        included in that total), rounded to 4 decimals. A document with no
        tokens gets an all-zero row instead of raising ZeroDivisionError.
        """
        self.tf = np.array([
            [
                round(sent.count(word) / len(sent), 4) if sent else 0.0
                for word in self.dic
            ]
            for sent in self.doc
        ])

    def cal_idf(self):
        """Compute the inverse-document-frequency vector into ``self.idf``.

        Entry ``j`` is ``log(N / df_j)`` rounded to 4 decimals, where ``N``
        is the number of documents and ``df_j`` the number of documents
        containing vocabulary word ``j``.
        """
        n_docs = len(self.doc)
        # Build each document's token set once so membership tests are O(1)
        # instead of scanning the token list for every vocabulary word.
        doc_sets = [set(sent) for sent in self.doc]
        self.idf = np.array([
            round(np.log(n_docs / sum(1 for tokens in doc_sets if word in tokens)), 4)
            for word in self.dic
        ])

    def cal_tfidf(self):
        """Compute ``self.tf`` and ``self.idf``, then their product ``self.tfidf``.

        The product broadcasts the IDF vector across every row of the TF
        matrix.
        """
        self.cal_tf()
        self.cal_idf()
        self.tfidf = self.tf * self.idf
if __name__ == "__main__":
    # Demo: build the TF-IDF matrix for three short Chinese headlines
    # and print it (one row per headline, one column per vocabulary word).
    corpus = [
        '女排北京奥运会夺冠',
        '北京奥运会的羽毛球男单决赛',
        '中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排',
    ]
    model = TfIdf(corpus)
    model.cal_tfidf()
    print(model.tfidf)