实验一:语言模型
实验思路:
对test中的每个句子,分别用一元文法和二元文法建模,加入平滑,计算困惑度
一元文法——统计当前词在训练语料中的频率,一个句子的概率为其中每个词的概率相乘,测试集的概率为各句子的概率相乘
所谓平滑:
原因,训练集里没有测试集的词
做法,就是训练集变成(训练集+测试集(去重之后),然后再正常去做
details:
- 把句子用open读入,去掉eou,前后加上bos,eos。
- 统计训练、测试集的一元词汇、二元词汇个数;训练集中以某个单词 w 开头的二元组 ('bos'/w, *) 的去重后继词个数用字典+列表记录
- 一元的就是用纯频率,二元的用条件概率
- 用+1平滑,注意是相当于把测试集去重后加入训练集
导入库
import nltk,math
from textblob import TextBlob
from textblob import Word
from collections import defaultdict
数据预处理
def Pretreatment(row):
    """Tokenize one training line and accumulate unigram/bigram statistics.

    A line may contain several utterances separated by the 'eou' marker;
    each complete utterance is wrapped as 'bos' ... 'eos' before counting.
    Updates the module-level tables train_uni, train_bi and train_bi_num.
    """
    tokens = TextBlob(row.lower()).words.singularize()
    sentence = ['bos']
    for token in tokens:
        lemma = Word(token).lemmatize("v")
        if lemma == 'eou':
            # Utterance boundary: close the sentence and count it.
            sentence.append('eos')
            for word in sentence:
                train_uni[word] += 1
            for pair in nltk.bigrams(sentence):
                train_bi_num[pair] += 1
                # Record each distinct successor of pair[0] exactly once.
                if pair[1] not in train_bi[pair[0]]:
                    train_bi[pair[0]].append(pair[1])
            sentence = ['bos']
        elif lemma != '’':
            # Keep the token; drop the stray right-quote artifact.
            sentence.append(lemma)
def Pre_test(row):
    """Tokenize one test line and accumulate test-set n-gram counts.

    Mirrors Pretreatment but writes into test_uni and test_bi instead
    (no per-word successor list is needed for the test side).
    """
    words = TextBlob(row.lower()).words.singularize()
    buf = ['bos']
    for w in words:
        v = Word(w).lemmatize("v")
        if v == 'eou':
            # End of utterance: append the closing marker and count.
            buf.append('eos')
            for tok in buf:
                test_uni[tok] += 1
            for pair in nltk.bigrams(buf):
                test_bi[pair] += 1
            buf = ['bos']
        elif v != '’':
            # Skip the stray right-quote token, keep everything else.
            buf.append(v)
# --- Build the language-model statistics from the train/test corpora ---
# (a `global` statement at module level is a no-op, so it was removed)
train_uni = defaultdict(int)      # unigram counts over the training set
train_bi = defaultdict(list)      # word -> list of distinct successor words
train_bi_num = defaultdict(int)   # bigram counts over the training set
test_uni = defaultdict(int)       # unigram counts over the test set
test_bi = defaultdict(int)        # bigram counts over the test set

# Read both corpora; `with` guarantees the file handles are closed.
with open("train_LM.txt") as train:
    train_rows = train.readlines()
for row in train_rows:
    Pretreatment(row)

with open("test_LM.txt") as test:
    test_rows = test.readlines()
for row in test_rows:
    Pre_test(row)

# Vocabulary sizes used by the add-1 smoothing in deal().
# NOTE(review): these count only *test-set* types, following the
# notebook's "merge the deduplicated test set into the training set"
# smoothing scheme described above — confirm this is intentional.
len_uni = len(test_uni)
len_bi = len(test_bi)

# Token totals: sum() over the dict values replaces the manual loops.
train_tot = sum(train_uni.values())
test_tot_uni = sum(test_uni.values())
test_tot_bi = sum(test_bi.values())
计算困惑度
def deal(row):
    """Accumulate the log2-probability of one test line under both models.

    Splits the line into utterances at 'eou' (same preprocessing as the
    counting functions) and adds each token's / bigram's smoothed log2
    probability into the module-level accumulators ans_uni and ans_bi.
    """
    global ans_uni
    global ans_bi
    row = row.lower()
    terms=TextBlob(row).words.singularize()
    temp = ['bos']
    for term in terms:
        k = Word(term)
        k = k.lemmatize("v")
        if k == 'eou':
            temp.append('eos')
            # Unigram model: add-1 smoothed relative frequency.
            for i in temp:
                ans_uni += math.log((train_uni[i]+1.0)/(train_tot+len_uni),2)
            # Bigram model: add-1 smoothed conditional probability.
            # NOTE(review): the denominator is the number of *distinct*
            # successors of i[0] (len(train_bi[i[0]])), not the unigram
            # count of i[0]; conventional add-1 smoothing would be
            # (C(w1 w2)+1)/(C(w1)+V) — confirm this is intentional.
            for i in nltk.bigrams(temp):
                ans_bi += math.log((train_bi_num[i]+1.0)/(len(train_bi[i[0]])+len_bi),2)
            temp = ['bos']
        elif k != '’':
            temp.append(k)
# --- Perplexity: PP = 2^(-(1/N) * sum(log2 P)) over the test set ---
# (the original `global` statements here were module-level no-ops and
# have been removed; deal() declares its own globals where it needs them)
ans_uni = 0  # running sum of log2 probabilities, unigram model
ans_bi = 0   # running sum of log2 probabilities, bigram model
for row in test_rows:
    deal(row)
# Normalize by the number of test tokens, then exponentiate.
ans_uni *= -1.0 / test_tot_uni
ans_bi *= -1.0 / test_tot_bi
ans_uni = pow(2, ans_uni)
ans_bi = pow(2, ans_bi)
print(ans_uni)
print(ans_bi)
280.4766308737408
96.29252770568773