TFIDF手写

#coding=utf-8
from collections import Counter

import jieba
import numpy as np

class TfIdf:
    """Hand-written TF-IDF over a small corpus of raw sentences.

    On construction every sentence is tokenized with ``jieba.cut`` and a
    sorted vocabulary is built from the tokens that are not stop words.
    Call :meth:`cal_tfidf` to populate ``tf``, ``idf`` and ``tfidf``.

    Attributes set by the class:
        doc:   list of token lists (one per input sentence), replacing the
               raw strings passed to the constructor.
        dic:   sorted list of unique non-stop-word tokens (the vocabulary).
        tf:    array with one row per sentence and one column per word in
               ``dic`` (set by :meth:`cal_tf`).
        idf:   array with one entry per word in ``dic`` (set by
               :meth:`cal_idf`).
        tfidf: element-wise product ``tf * idf`` (set by :meth:`cal_tfidf`).
    """

    def __init__(self, doc):
        """Store the corpus and immediately tokenize it / build the vocabulary.

        Args:
            doc: iterable of raw sentence strings.

        Raises:
            OSError: if ``stop_word.txt`` cannot be opened (see get_dic).
        """
        self.doc = doc
        self.get_dic()

    def get_dic(self):
        """Tokenize ``self.doc`` in place and build the vocabulary ``self.dic``.

        Stop words are read from ``stop_word.txt`` (UTF-8, one word per line),
        resolved relative to the current working directory. Stop words are
        excluded from the vocabulary only; they stay in the token lists, so
        they still count towards each sentence's length in cal_tf.
        """
        stop_path = 'stop_word.txt'
        with open(stop_path, encoding="utf-8") as f:
            stop_dic = set(f.read().split("\n"))
        self.doc = [list(jieba.cut(sent)) for sent in self.doc]
        self.dic = sorted(
            {word for sent in self.doc for word in sent if word not in stop_dic}
        )

    def cal_tf(self):
        """Compute term frequencies into ``self.tf``.

        Each entry is ``count(word in sentence) / len(sentence)`` rounded to
        4 decimals. A sentence with no tokens gets an all-zero row instead of
        raising ZeroDivisionError.
        """
        rows = []
        for sent in self.doc:
            total = len(sent)
            if total == 0:
                # Nothing to count; avoid dividing by zero.
                rows.append([0.0] * len(self.dic))
                continue
            # Count every token once instead of rescanning per vocabulary word.
            counts = Counter(sent)
            rows.append([round(counts[word] / total, 4) for word in self.dic])
        self.tf = np.array(rows)

    def cal_idf(self):
        """Compute inverse document frequencies into ``self.idf``.

        Each entry is ``log(N / df)`` rounded to 4 decimals, where ``N`` is
        the number of sentences and ``df`` the number of sentences containing
        the word. No smoothing is applied.
        """
        n_docs = len(self.doc)
        # set(sent) so that a word is counted at most once per sentence.
        doc_freq = Counter(word for sent in self.doc for word in set(sent))
        self.idf = np.array(
            [round(np.log(n_docs / doc_freq[word]), 4) for word in self.dic]
        )

    def cal_tfidf(self):
        """Compute ``tf``, ``idf`` and their element-wise product ``tfidf``."""
        self.cal_tf()
        self.cal_idf()
        # idf (one value per word) broadcasts across the rows of tf.
        self.tfidf = self.tf * self.idf

if __name__ == "__main__":
    # Demo corpus: three short Chinese sentences.
    corpus = [
        '女排北京奥运会夺冠',
        '北京奥运会的羽毛球男单决赛',
        '中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排',
    ]
    model = TfIdf(corpus)
    model.cal_tfidf()
    # One row per sentence (corpus order), one column per word in model.dic.
    print(model.tfidf)

posted on 2021-11-30 17:07 哦哟这个怎么搞 阅读(45) 评论(0) 收藏 举报

刷新页面 返回顶部

ruijiege

公告

TFIDF手写