文本的简单表示：Boolean Representation、Count-based Representation 与 TF-IDF 的 Python 实现

1. Boolean representation

 

# Vocabulary: fixes both the set of keys and their order in the representation.
word_dict = ['我们', '又', '去', '爬山', '今天', '你们', '昨天', '跑步']


def booleanRepresent(user_input):
    """Return the boolean (presence/absence) representation of a sentence.

    Args:
        user_input: Iterable of tokens (an already-segmented sentence).

    Returns:
        A dict mapping every word of the module-level ``word_dict`` to 1 if
        the word occurs in ``user_input`` and to 0 otherwise.  Tokens that are
        not part of the vocabulary are ignored, so the result always has
        exactly the vocabulary's keys, in vocabulary order.
    """
    # A set collapses repeated tokens and gives O(1) membership tests.
    seen = set(user_input)
    return {word: int(word in seen) for word in word_dict}


user_input1 = ['我们', '今天', '去', '爬山']
print(booleanRepresent(user_input1))
user_input2 = ['你们', '又', '去', '爬山', '又', '去', '跑步']
print(booleanRepresent(user_input2))

 

  输出结果:

{'我们': 1, '又': 0, '去': 1, '爬山': 1, '今天': 1, '你们': 0, '昨天': 0, '跑步': 0}
{'我们': 0, '又': 1, '去': 1, '爬山': 1, '今天': 0, '你们': 1, '昨天': 0, '跑步': 1}


2. Count-based Representation
# Vocabulary: fixes both the set of keys and their order in the representation.
word_dict = ['我们', '又', '去', '爬山', '今天', '你们', '昨天', '跑步']
user_input2 = ['你们', '又', '去', '爬山', '又', '去', '跑步']


def countRepresent(user_input):
    """Return the count-based (term-frequency) representation of a sentence.

    Args:
        user_input: Iterable of tokens (an already-segmented sentence).

    Returns:
        A dict mapping every word of the module-level ``word_dict`` to the
        number of times it occurs in ``user_input``.  Tokens that are not
        part of the vocabulary are ignored, so the result always has exactly
        the vocabulary's keys, in vocabulary order.
    """
    count = dict.fromkeys(word_dict, 0)
    # Iterate the argument itself, not a module-level sample sentence.
    for word in user_input:
        if word in count:
            count[word] += 1
    return count


print(countRepresent(user_input2))

  输出结果:

{'我们': 0, '又': 2, '去': 2, '爬山': 1, '今天': 0, '你们': 1, '昨天': 0, '跑步': 1}

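3. TF-IDF 表示

其中 $\mathrm{tfidf}(w, d) = \mathrm{tf}(w, d) \cdot \log\dfrac{N}{\mathrm{df}(w)}$，$\mathrm{tf}(w, d)$ 为词 $w$ 在文档 $d$ 中出现的次数，$N$ 为文档总数，$\mathrm{df}(w)$ 为包含词 $w$ 的文档数。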

import math 

# Vocabulary and a toy corpus of three already-segmented documents.
word_dict = ['今天', '上', 'NLP', '课程', '的', '有', '意思', '数据', '也']
text1 = ['今天', '上', 'NLP', '课程']
text2 = ['今天', '的', '课程', '也', '有', '意思']
text3 = ['数据', '课程', '也', '有', '意思']
document = [text1, text2, text3]


def getIDF(word_dict, document):
    """Compute the inverse document frequency of every vocabulary word.

    The IDF of a word is ``log(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents containing the word.

    Args:
        word_dict: Iterable of vocabulary words.
        document: Sequence of documents, each a collection of tokens.

    Returns:
        A dict mapping each vocabulary word to its IDF as a float.  A word
        that occurs in no document (including the case of an empty corpus)
        maps to 0.0 instead of raising ``ZeroDivisionError``.
    """
    total_docs = len(document)
    idf_of_word = {}
    for word in word_dict:
        doc_freq = sum(1 for text in document if word in text)
        # log(N / 0) is undefined; such a word has term frequency 0 in every
        # document of this corpus, so 0.0 keeps its TF-IDF weight at zero.
        idf_of_word[word] = math.log(total_docs / doc_freq) if doc_freq else 0.0
    return idf_of_word


print(getIDF(word_dict, document))
 

   IDF输出结果:

{'今天': 0.4054651081081644, '上': 1.0986122886681098, 'NLP': 1.0986122886681098, '课程': 0.0, '的': 1.0986122886681098, '有': 0.4054651081081644, '意思': 0.4054651081081644, '数据': 1.0986122886681098, '也': 0.4054651081081644}

  

def getTfIdf(word_dict, text, idf_of_word=None):
    """Compute the TF-IDF weight of every vocabulary word for one document.

    The term frequency is the raw number of occurrences of the word in
    ``text``; it is multiplied by the word's inverse document frequency.

    Args:
        word_dict: Iterable of vocabulary words.
        text: The document to represent, as a list of tokens.
        idf_of_word: Optional dict mapping each vocabulary word to its IDF.
            When omitted, the IDF values are computed with ``getIDF`` over
            the module-level ``document`` corpus.

    Returns:
        A dict mapping each vocabulary word to ``tf * idf`` for ``text``.

    Raises:
        KeyError: If ``idf_of_word`` lacks an entry for a vocabulary word.
    """
    if idf_of_word is None:
        idf_of_word = getIDF(word_dict, document)
    tf_idf_of_file = {}
    for w in word_dict:
        # list.count() yields 0 for absent words, so no membership test is needed.
        tf_idf_of_file[w] = text.count(w) * idf_of_word[w]
    return tf_idf_of_file


# Guarded so that importing this code does not run the demo.
if __name__ == "__main__":
    print(getTfIdf(word_dict, text1))

  Tf-Idf输出结果:

{'今天': 0.4054651081081644, '上': 1.0986122886681098, 'NLP': 1.0986122886681098, '课程': 0.0, '的': 0.0, '有': 0.0, '意思': 0.0, '数据': 0.0, '也': 0.0}

  

 

 

 
posted @ 2020-01-05 19:50  RamboBai  阅读(410)  评论(0)  编辑  收藏  举报