文本的简单表示：Boolean Representation、Count-based Representation 与 TF-IDF 的 Python 实现
1. Boolean representation
# Fixed vocabulary: each word is one dimension of the representation.
word_dict = ['我们', '又', '去', '爬山', '今天', '你们', '昨天', '跑步']


def booleanRepresent(user_input):
    """Return the boolean (presence/absence) representation of a sentence.

    Args:
        user_input: Iterable of already-segmented tokens.

    Returns:
        A dict with exactly one key per word of the module-level
        ``word_dict`` (in vocabulary order); the value is 1 if the word
        occurs in ``user_input`` and 0 otherwise. Tokens that are not in
        the vocabulary are ignored, so the result always has the same
        keys regardless of the input.
    """
    # A set makes repeated tokens irrelevant and membership tests O(1).
    present = set(user_input)
    return {word: 1 if word in present else 0 for word in word_dict}


user_input1 = ['我们', '今天', '去', '爬山']
print(booleanRepresent(user_input1))

user_input2 = ['你们', '又', '去', '爬山', '又', '去', '跑步']
print(booleanRepresent(user_input2))
输出结果:
{'我们': 1, '又': 0, '去': 1, '爬山': 1, '今天': 1, '你们': 0, '昨天': 0, '跑步': 0} {'我们': 0, '又': 1, '去': 1, '爬山': 1, '今天': 0, '你们': 1, '昨天': 0, '跑步': 1}
2. Count-based Representation
# Fixed vocabulary: each word is one dimension of the representation.
word_dict = ['我们', '又', '去', '爬山', '今天', '你们', '昨天', '跑步']
user_input2 = ['你们', '又', '去', '爬山', '又', '去', '跑步']


def countRepresent(user_input):
    """Return the count-based representation of a sentence.

    Args:
        user_input: Iterable of already-segmented tokens.

    Returns:
        A dict with exactly one key per word of the module-level
        ``word_dict`` (in vocabulary order); the value is the number of
        times that word occurs in ``user_input``. Tokens that are not in
        the vocabulary are ignored.
    """
    count = dict.fromkeys(word_dict, 0)
    # Iterate the argument, not a module-level sample sentence.
    for word in user_input:
        if word in count:
            count[word] += 1
    return count


print(countRepresent(user_input2))
输出结果:
{'我们': 0, '又': 2, '去': 2, '爬山': 1, '今天': 0, '你们': 1, '昨天': 0, '跑步': 1}
3. Tf-Idf表示
import math

# Vocabulary and a three-document toy corpus (each document is a token list).
word_dict = ['今天', '上', 'NLP', '课程', '的', '有', '意思', '数据', '也']
text1 = ['今天', '上', 'NLP', '课程']
text2 = ['今天', '的', '课程', '也', '有', '意思']
text3 = ['数据', '课程', '也', '有', '意思']
document = [text1, text2, text3]


def getIDF(word_dict, document):
    """Compute the inverse document frequency of every vocabulary word.

    Uses the unsmoothed formula ``idf(w) = ln(N / df(w))`` where ``N`` is
    the number of documents and ``df(w)`` is the number of documents that
    contain ``w``.

    Args:
        word_dict: Iterable of vocabulary words.
        document: List of documents, each a collection of tokens.

    Returns:
        A dict mapping each vocabulary word to its IDF. A word that
        occurs in no document gets 0.0 instead of triggering a division
        by zero; its term frequency is 0 in every corpus document, so it
        contributes nothing to TF-IDF either way.
    """
    total_docs = len(document)
    idf_of_word = {}
    for word in word_dict:
        doc_freq = sum(1 for text in document if word in text)
        if doc_freq == 0:
            # Unseen word: ln(N / 0) is undefined, fall back to 0.0.
            idf_of_word[word] = 0.0
        else:
            idf_of_word[word] = math.log(total_docs / float(doc_freq))
    return idf_of_word


print(getIDF(word_dict, document))
IDF输出结果:
{'今天': 0.4054651081081644, '上': 1.0986122886681098, 'NLP': 1.0986122886681098, '课程': 0.0, '的': 1.0986122886681098, '有': 0.4054651081081644, '意思': 0.4054651081081644, '数据': 1.0986122886681098, '也': 0.4054651081081644}
def getTfIdf(word_dict, text, idf_of_word=None):
    """Compute the TF-IDF representation of one tokenised text.

    Term frequency is the raw occurrence count of the word in ``text``
    (not length-normalised).

    Args:
        word_dict: Iterable of vocabulary words.
        text: List of tokens of the text to represent.
        idf_of_word: Optional mapping from vocabulary word to IDF. When
            omitted, it is computed with ``getIDF(word_dict, document)``
            from the module-level corpus ``document``.

    Returns:
        A dict mapping each vocabulary word to ``tf * idf``.

    Raises:
        KeyError: If a supplied ``idf_of_word`` lacks a vocabulary word.
    """
    if idf_of_word is None:
        # Fall back to the corpus defined alongside getIDF.
        idf_of_word = getIDF(word_dict, document)
    tf_idf_of_file = {}
    for w in word_dict:
        # list.count returns 0 for absent words, so no membership test needed.
        tf_idf_of_file[w] = text.count(w) * idf_of_word[w]
    return tf_idf_of_file


tf_idf_of_file = getTfIdf(word_dict, text1)
print(tf_idf_of_file)
Tf-Idf输出结果:
{'今天': 0.4054651081081644, '上': 1.0986122886681098, 'NLP': 1.0986122886681098, '课程': 0.0, '的': 0.0, '有': 0.0, '意思': 0.0, '数据': 0.0, '也': 0.0}