一些好用的代码
按标点切分语料
# Split a corpus on punctuation: append a newline after every punctuation mark.
src = ''        # input corpus path
tgt = ''        # output corpus path
temp = ",.!?;"  # punctuation characters that end a segment


def fun(file1, file2, temp):
    """Copy *file1* to *file2*, inserting a newline after every character
    that appears in *temp*.

    file1: input path (utf-8)
    file2: output path (utf-8)
    temp:  string (or other container) of segment-ending punctuation chars
    """
    with open(file1, 'r', encoding='utf-8') as fl1, \
         open(file2, 'w', encoding='utf-8') as fl2:
        for line in fl1:            # stream line by line; no readlines()
            for ch in line:
                if ch not in temp:
                    fl2.write(ch)
                elif ch != '\n':    # punctuation: break the segment here
                    fl2.write(ch + '\n')
                else:               # '\n' listed in temp: keep it as-is
                    fl2.write(ch)


if __name__ == "__main__":
    # Guarded so importing this module does not try to open the
    # placeholder '' paths.
    fun(src, tgt, temp)
查找语料中的外语
### 本代码处理的是带有目标语言句子的源语言句子,将其定位之后再交换句子并生成新的文件
### 本代码通过英文文档来定位中文文档
import langid
import tensorflow as tf
import codecs
from langdetect import detect ## detect()输出探测出的语言类型
from langdetect import detect_langs ## detect()输出探测出的所有语言类型及其所占的比例
# Placeholder paths for the parallel-corpus files.
src = ''    # English-side corpus
tgt = ''    # Chinese-side corpus
file1 = ''  # output: lines kept in place
file2 = ''  # output: lines detected as the other language
def fun1(seq):
    """Return the language code that langid predicts for *seq*."""
    lang_code, _score = langid.classify(seq)
    return lang_code
def fun(seq):
    """Return the language code that langdetect's detect() predicts for *seq*."""
    return detect(seq)
### Locate the foreign-language sentences: return their 1-based line numbers.
def fun2(src):
    """Scan the file at *src* and return the 1-based indices of lines
    whose detected language is 'zh'.

    Uses langdetect (via fun) first; if it raises (e.g. a line with no
    detectable features), falls back to langid (via fun1).

    NOTE(review): langdetect normally reports Chinese as 'zh-cn'/'zh-tw',
    not 'zh', so only the langid fallback can match this check — confirm
    the intended behavior.
    """
    line_numbers = []
    with codecs.getreader('utf-8')(tf.gfile.GFile(src, 'rb')) as fl:
        for k, line in enumerate(fl.readlines(), start=1):
            try:
                lang = fun(line)    # normal case: langdetect
            except Exception:       # fixed: original had 'expect:' (SyntaxError)
                lang = fun1(line)   # fallback: langid
            if lang == 'zh':
                line_numbers.append(k)
    return line_numbers
### Swap the located sentences into a separate file.
### src (English): file1 = split English sentences, file2 = split Chinese sentences
### tgt (Chinese): file1 = split Chinese sentences, file2 = split English sentences
def fun3(temp, src, tgt, file1, file2):
    """Split the lines of *tgt* into two output files.

    temp:  collection of 1-based line numbers to divert to *file2*
    src:   unused; kept for signature compatibility with existing callers
    tgt:   input file read line by line (utf-8)
    file1: output for lines whose number is NOT in temp
    file2: output for lines whose number IS in temp

    Fixed: the original opened all three files without context managers,
    leaking handles if any write raised.
    """
    with open(tgt, 'r', encoding='utf-8') as s_file, \
         open(file1, 'w', encoding='utf-8') as fl1, \
         open(file2, 'w', encoding='utf-8') as fl2:
        for num, line in enumerate(s_file, start=1):
            (fl2 if num in temp else fl1).write(line)
if __name__ == "__main__":
    # Locate the Chinese lines in the English-side file, then split the
    # Chinese-side file accordingly.
    temp = fun2(src)
    fun3(temp, src, tgt, file1, file2)
分词
import jieba
src = ''  # input file: raw Chinese sentences
tgt = ''  # output file: space-separated tokens


def cut(file1, file2):
    """Tokenise every line of *file1* with jieba (accurate mode) and write
    the space-joined tokens to *file2*."""
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                ## seq = jieba.cut(line,cut_all=True) ## full mode
                ## seq = jieba.cut_for_search(line ) ## search-engine mode
                tokens = jieba.cut(line, cut_all=False)  ## accurate mode
                fl2.write(' '.join(tokens))
还原句子
### 将分好词的结果文件还原成句子
### Restore a tokenised file back into sentences (remove the token spaces).
file = ''      # input: space-separated tokens
tgt_file = ''  # output: restored sentences


def fun(file, file2):
    """Strip every space from each line of *file* and write the result
    to *file2* (utf-8 in and out).

    Fixed: the original iterated an undefined name 'fl' (NameError) and
    never closed fl2 on the error path; both files now use 'with'.
    """
    with open(file, 'r', encoding='utf-8') as fl1, \
         open(file2, 'w', encoding='utf-8') as fl2:
        for line in fl1:
            fl2.write(line.replace(" ", ''))


if __name__ == "__main__":
    # Guarded so importing the module does not open the placeholder paths.
    fun(file, tgt_file)
随机生成测试集
### 本代码的功能是随机抽取测试集,并将文本剔除抽取出的测试集之后余下的部分生成训练集
import numpy as np
import random
src_en = ''        # source corpus, English side
src_ch = ''        # source corpus, Chinese side
cut_num = 3000     # number of sentences to extract for the dev set
tgt_train_en = ''  # output: English training set
tgt_train_ch = ''  # output: Chinese training set
tgt_dev_en = ''    # output: English dev set
tgt_dev_ch = ''    # output: Chinese dev set
## Generate the sorted line numbers of the sentences to extract.
def random_num(count=None, upper=25000):
    """Return *count* distinct random line numbers in [1, upper], ascending.

    count: how many numbers to draw; defaults to the module-level cut_num.
    upper: highest candidate line number — assumes the corpus has at least
           this many lines; TODO confirm against the actual corpus size.

    Fixed: the original drew with randint and silently skipped duplicates,
    so it usually returned fewer than cut_num values; random.sample
    guarantees exactly *count* distinct values.
    """
    if count is None:
        count = cut_num
    temp = sorted(random.sample(range(1, upper + 1), count))
    print(len(temp))  # actual number extracted (now always equals count)
    return temp
## src-en(1)/src-zh(2)/train-en(3)/dev-en(4)/train-zh(5)/dev-zh(6)
def new_file(file1, file2, file3, file4, file5, file6, temp=None):
    """Split two parallel corpora into train/dev files on the same line numbers.

    file1/file2: parallel inputs (English / Chinese), read utf-8
    file3/file5: training outputs; file4/file6: dev outputs
    temp: ascending 1-based line numbers to divert to the dev files;
          defaults to a fresh random_num() draw (backward compatible).

    Fixes vs the original:
    - the four output files were opened 'r' instead of 'w', so every
      write would have failed;
    - the inner loop iterated the undefined name 'fl' instead of its
      file argument (NameError);
    - lines after the last sampled index were silently dropped from the
      training set (the i < len(temp) guard skipped both writes);
    - all handles now use 'with' so they are closed on error.
    """
    if temp is None:
        temp = random_num()

    def _split(src_f, train_f, dev_f):
        # Divert line *num* to dev when it is the next sampled index.
        i = 0
        for num, line in enumerate(src_f, start=1):
            if i < len(temp) and num == temp[i]:
                dev_f.write(line)
                i += 1
            else:
                train_f.write(line)

    with open(file1, 'r', encoding='utf-8') as fl1, \
         open(file2, 'r', encoding='utf-8') as fl2, \
         open(file3, 'w', encoding='utf-8') as fl3, \
         open(file4, 'w', encoding='utf-8') as fl4, \
         open(file5, 'w', encoding='utf-8') as fl5, \
         open(file6, 'w', encoding='utf-8') as fl6:
        _split(fl1, fl3, fl4)
        _split(fl2, fl5, fl6)


if __name__ == "__main__":
    # Guarded so importing this module does not open the placeholder paths.
    new_file(src_en, src_ch, tgt_train_en, tgt_dev_en, tgt_train_ch, tgt_dev_ch)