一些好用的代码
按标点切分语料
# Split a corpus on punctuation: append a newline after every punctuation mark.
src = ''        # input corpus path
tgt = ''        # output corpus path
temp = ",.!?;"  # punctuation characters that end a segment


def fun(file1, file2, temp):
    """Copy *file1* to *file2*, inserting a newline after every character
    that appears in *temp*.

    file1: input path (utf-8)
    file2: output path (utf-8)
    temp:  string (or other container) of segment-ending punctuation chars
    """
    with open(file1, 'r', encoding='utf-8') as fl1, \
         open(file2, 'w', encoding='utf-8') as fl2:
        for line in fl1:            # stream line by line; no readlines()
            for ch in line:
                if ch not in temp:
                    fl2.write(ch)
                elif ch != '\n':    # punctuation: break the segment here
                    fl2.write(ch + '\n')
                else:               # '\n' listed in temp: keep it as-is
                    fl2.write(ch)


if __name__ == "__main__":
    # Guarded so importing this module does not try to open the
    # placeholder '' paths.
    fun(src, tgt, temp)
查找语料中的外语
### 本代码处理的是带有目标语言句子的源语言句子,将其定位之后再交换句子并生成新的文件
### 本代码通过英文文档来定位中文文档
import langid
import tensorflow as tf
import codecs
from langdetect import detect ## detect()输出探测出的语言类型
from langdetect import detect_langs ## detect()输出探测出的所有语言类型及其所占的比例
# Placeholder paths for the parallel-corpus files.
src = ''    # English-side corpus
tgt = ''    # Chinese-side corpus
file1 = ''  # output: lines kept in place
file2 = ''  # output: lines detected as the other language
def fun1(seq):
    """Return the language code that langid predicts for *seq*."""
    lang_code, _score = langid.classify(seq)
    return lang_code
def fun(seq):
    """Return the language code that langdetect's detect() predicts for *seq*."""
    return detect(seq)
### Locate the foreign-language sentences: return their 1-based line numbers.
def fun2(src):
    """Scan the file at *src* and return the 1-based indices of lines
    whose detected language is 'zh'.

    Uses langdetect (via fun) first; if it raises (e.g. a line with no
    detectable features), falls back to langid (via fun1).

    NOTE(review): langdetect normally reports Chinese as 'zh-cn'/'zh-tw',
    not 'zh', so only the langid fallback can match this check — confirm
    the intended behavior.
    """
    line_numbers = []
    with codecs.getreader('utf-8')(tf.gfile.GFile(src, 'rb')) as fl:
        for k, line in enumerate(fl.readlines(), start=1):
            try:
                lang = fun(line)    # normal case: langdetect
            except Exception:       # fixed: original had 'expect:' (SyntaxError)
                lang = fun1(line)   # fallback: langid
            if lang == 'zh':
                line_numbers.append(k)
    return line_numbers
### Swap the located sentences into a separate file.
### src (English): file1 = split English sentences, file2 = split Chinese sentences
### tgt (Chinese): file1 = split Chinese sentences, file2 = split English sentences
def fun3(temp, src, tgt, file1, file2):
    """Split the lines of *tgt* into two output files.

    temp:  collection of 1-based line numbers to divert to *file2*
    src:   unused; kept for signature compatibility with existing callers
    tgt:   input file read line by line (utf-8)
    file1: output for lines whose number is NOT in temp
    file2: output for lines whose number IS in temp

    Fixed: the original opened all three files without context managers,
    leaking handles if any write raised.
    """
    with open(tgt, 'r', encoding='utf-8') as s_file, \
         open(file1, 'w', encoding='utf-8') as fl1, \
         open(file2, 'w', encoding='utf-8') as fl2:
        for num, line in enumerate(s_file, start=1):
            (fl2 if num in temp else fl1).write(line)
if __name__ == "__main__":
    # Locate the Chinese lines in the English-side file, then split the
    # Chinese-side file accordingly.
    temp = fun2(src)
    fun3(temp, src, tgt, file1, file2)
分词
import jieba
src = ''  # input file: raw Chinese sentences
tgt = ''  # output file: space-separated tokens


def cut(file1, file2):
    """Tokenise every line of *file1* with jieba (accurate mode) and write
    the space-joined tokens to *file2*."""
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                ## seq = jieba.cut(line,cut_all=True) ## full mode
                ## seq = jieba.cut_for_search(line ) ## search-engine mode
                tokens = jieba.cut(line, cut_all=False)  ## accurate mode
                fl2.write(' '.join(tokens))
还原句子
### 将分好词的结果文件还原成句子
### Restore a tokenised file back into sentences (remove the token spaces).
file = ''      # input: space-separated tokens
tgt_file = ''  # output: restored sentences


def fun(file, file2):
    """Strip every space from each line of *file* and write the result
    to *file2* (utf-8 in and out).

    Fixed: the original iterated an undefined name 'fl' (NameError) and
    never closed fl2 on the error path; both files now use 'with'.
    """
    with open(file, 'r', encoding='utf-8') as fl1, \
         open(file2, 'w', encoding='utf-8') as fl2:
        for line in fl1:
            fl2.write(line.replace(" ", ''))


if __name__ == "__main__":
    # Guarded so importing the module does not open the placeholder paths.
    fun(file, tgt_file)
随机生成测试集
### 本代码的功能是随机抽取测试集,并将文本剔除抽取出的测试集之后余下的部分生成训练集
import numpy as np
import random
src_en = ''        # source corpus, English side
src_ch = ''        # source corpus, Chinese side
cut_num = 3000     # number of sentences to extract for the dev set
tgt_train_en = ''  # output: English training set
tgt_train_ch = ''  # output: Chinese training set
tgt_dev_en = ''    # output: English dev set
tgt_dev_ch = ''    # output: Chinese dev set
## Generate the sorted line numbers of the sentences to extract.
def random_num(count=None, upper=25000):
    """Return *count* distinct random line numbers in [1, upper], ascending.

    count: how many numbers to draw; defaults to the module-level cut_num.
    upper: highest candidate line number — assumes the corpus has at least
           this many lines; TODO confirm against the actual corpus size.

    Fixed: the original drew with randint and silently skipped duplicates,
    so it usually returned fewer than cut_num values; random.sample
    guarantees exactly *count* distinct values.
    """
    if count is None:
        count = cut_num
    temp = sorted(random.sample(range(1, upper + 1), count))
    print(len(temp))  # actual number extracted (now always equals count)
    return temp
## src-en(1)/src-zh(2)/train-en(3)/dev-en(4)/train-zh(5)/dev-zh(6)
def new_file(file1, file2, file3, file4, file5, file6, temp=None):
    """Split two parallel corpora into train/dev files on the same line numbers.

    file1/file2: parallel inputs (English / Chinese), read utf-8
    file3/file5: training outputs; file4/file6: dev outputs
    temp: ascending 1-based line numbers to divert to the dev files;
          defaults to a fresh random_num() draw (backward compatible).

    Fixes vs the original:
    - the four output files were opened 'r' instead of 'w', so every
      write would have failed;
    - the inner loop iterated the undefined name 'fl' instead of its
      file argument (NameError);
    - lines after the last sampled index were silently dropped from the
      training set (the i < len(temp) guard skipped both writes);
    - all handles now use 'with' so they are closed on error.
    """
    if temp is None:
        temp = random_num()

    def _split(src_f, train_f, dev_f):
        # Divert line *num* to dev when it is the next sampled index.
        i = 0
        for num, line in enumerate(src_f, start=1):
            if i < len(temp) and num == temp[i]:
                dev_f.write(line)
                i += 1
            else:
                train_f.write(line)

    with open(file1, 'r', encoding='utf-8') as fl1, \
         open(file2, 'r', encoding='utf-8') as fl2, \
         open(file3, 'w', encoding='utf-8') as fl3, \
         open(file4, 'w', encoding='utf-8') as fl4, \
         open(file5, 'w', encoding='utf-8') as fl5, \
         open(file6, 'w', encoding='utf-8') as fl6:
        _split(fl1, fl3, fl4)
        _split(fl2, fl5, fl6)


if __name__ == "__main__":
    # Guarded so importing this module does not open the placeholder paths.
    new_file(src_en, src_ch, tgt_train_en, tgt_dev_en, tgt_train_ch, tgt_dev_ch)