实验一:语言模型

实验思路:
对test中的每个句子,分别用一元文法(unigram)和二元文法(bigram)计算概率,加入平滑,最后计算困惑度
一元文法——统计当前词在训练语料中的频率;一个句子的概率为其中每个词的概率相乘,测试集的概率为所有句子的概率相乘
所谓平滑:
原因,训练集里没有测试集的词
做法,就是把训练集变成(训练集+去重之后的测试集),然后再正常去做
details:

  1. 把句子用open读入,去掉eou,前后加上bos,eos。
  2. 统计训练、测试集的一元词汇、二元词汇个数,训练集中('start',*)的个数也就是以某个单词为开头的二元组的个数,用的是字典+列表
  3. 一元的就是用纯频率,二元的用条件概率
  4. 用+1平滑,注意是相当于把测试集去重后加入训练集

导入库

import nltk,math
from textblob import TextBlob
from textblob import Word
from collections import defaultdict

数据预处理

def Pretreatment(row):
    """Accumulate unigram and bigram counts from one training row.

    The row is lower-cased, tokenised with TextBlob, singularised and
    verb-lemmatised. Tokens are collected into a sentence that starts with
    'bos'; each time the token 'eou' is met the sentence is closed with
    'eos', counted, and a new sentence is started. Tokens left over after
    the last 'eou' of the row are not counted.

    Args:
        row: One raw line of the training file.

    Side effects:
        Updates the module-level tables ``train_uni`` (token -> count),
        ``train_bi_num`` (bigram -> count) and ``train_bi``
        (first word -> list of distinct second words).
    """
    row = row.lower()
    terms = TextBlob(row).words.singularize()
    temp = ['bos']
    for term in terms:
        k = Word(term).lemmatize("v")
        if k == 'eou':
            temp.append('eos')
            # Unigram counts.
            for i in temp:
                train_uni[i] += 1
            # Bigram counts.
            for i in nltk.bigrams(temp):
                # A count of zero means this bigram has not been seen yet,
                # so its second word is a new successor of the first word.
                # This replaces a linear scan of the successor list with an
                # O(1) lookup while building exactly the same lists.
                if train_bi_num[i] == 0:
                    train_bi[i[0]].append(i[1])
                train_bi_num[i] += 1
            temp = ['bos']
        elif k != '’':
            # Stray right single quotation marks are dropped.
            temp.append(k)
def Pre_test(row):
    """Accumulate unigram and bigram counts from one test row.

    The row is lower-cased, tokenised with TextBlob, singularised and
    verb-lemmatised. Tokens are gathered into a sentence that begins with
    'bos'; on every 'eou' token the sentence is closed with 'eos', its
    tokens are added to ``test_uni`` and its adjacent pairs to ``test_bi``,
    and a fresh sentence is started. Tokens after the last 'eou' of the row
    are not counted.

    Args:
        row: One raw line of the test file.
    """
    sentence = ['bos']
    for raw in TextBlob(row.lower()).words.singularize():
        token = Word(raw).lemmatize("v")

        if token != 'eou':
            # Ordinary token: keep it unless it is a stray '’' mark.
            if token != '’':
                sentence.append(token)
            continue

        # End of utterance: close the sentence and count it.
        sentence.append('eos')
        for word in sentence:
            test_uni[word] += 1
        for pair in nltk.bigrams(sentence):
            test_bi[pair] += 1
        sentence = ['bos']
# Read both corpora up front. The context managers close the file handles,
# which the original script left open for the lifetime of the process.
with open("train_LM.txt") as train:
    train_rows = train.readlines()
with open("test_LM.txt") as test:
    test_rows = test.readlines()

# Training tables, filled by Pretreatment().
train_uni = defaultdict(int)      # token -> count
train_bi = defaultdict(list)      # first word -> distinct second words
train_bi_num = defaultdict(int)   # (first, second) -> count

# Test tables, filled by Pre_test().
test_uni = defaultdict(int)       # token -> count
test_bi = defaultdict(int)        # (first, second) -> count

for row in train_rows:
    Pretreatment(row)

for row in test_rows:
    Pre_test(row)

# Numbers of distinct test tokens / test bigrams, used as the add-one
# smoothing terms in the denominators.
len_uni = len(test_uni)
len_bi = len(test_bi)

# Total token count of the training set and total token / bigram counts of
# the test set.
train_tot = sum(train_uni.values())
test_tot_uni = sum(test_uni.values())
test_tot_bi = sum(test_bi.values())

计算困惑度

def deal(row):
    """Add one test row's smoothed log2-probabilities to the global totals.

    The row is lower-cased, tokenised with TextBlob, singularised and
    verb-lemmatised. Tokens are collected into a sentence starting with
    'bos'; on every 'eou' token the sentence is closed with 'eos' and
    scored, then a new sentence is started. Tokens after the last 'eou' of
    the row are not scored.

    Args:
        row: One raw line of the test file.

    Side effects:
        Adds to the module-level accumulators ``ans_uni`` and ``ans_bi``.
        The training tables are only read, never modified.
    """
    global ans_uni
    global ans_bi
    row = row.lower()
    terms = TextBlob(row).words.singularize()
    temp = ['bos']
    for term in terms:
        k = Word(term).lemmatize("v")
        if k == 'eou':
            temp.append('eos')
            # Unigram model: add-one smoothed relative frequency.
            # .get() is used so that unseen tokens do not insert empty
            # entries into the training table.
            for i in temp:
                ans_uni += math.log2(
                    (train_uni.get(i, 0) + 1.0) / (train_tot + len_uni)
                )
            # Bigram model: add-one smoothed conditional probability.
            for i in nltk.bigrams(temp):
                # The denominator needs the total number of training bigrams
                # that start with i[0], not the number of distinct
                # successors (which can be smaller than the numerator and
                # yield a "probability" above 1). Pretreatment() counts
                # every token once and every token except the closing 'eos'
                # starts exactly one bigram, so the unigram count of i[0]
                # is that total.
                history_count = train_uni.get(i[0], 0)
                ans_bi += math.log2(
                    (train_bi_num.get(i, 0) + 1.0) / (history_count + len_bi)
                )
            temp = ['bos']
        elif k != '’':
            # Stray right single quotation marks are dropped.
            temp.append(k)
# Running sums of log2-probabilities over the test set; deal() adds to them.
ans_uni = 0
ans_bi = 0

for row in test_rows:
    deal(row)

# Perplexity = 2 ** (-(sum of log2 p) / N), with N the total number of test
# tokens (unigram model) or test bigrams (bigram model).
ans_uni = 2 ** (ans_uni * (-1.0 / test_tot_uni))
ans_bi = 2 ** (ans_bi * (-1.0 / test_tot_bi))

print(ans_uni, ans_bi, sep="\n")
280.4766308737408
96.29252770568773
posted @ 2021-03-25 18:55  WeiAR  阅读(245)  评论(0编辑  收藏  举报