Some handy code snippets

Splitting a corpus on punctuation

src = ''  ## input corpus path
tgt = ''  ## output path
temp = ",.!?;"  ## punctuation marks to split on

def fun(file1,file2,temp):
  with open(file1,'r',encoding='utf-8') as fl1:
    with open(file2,'w',encoding='utf-8') as fl2:
      for line in fl1.readlines():
        for word in line:
          if word not in temp:
            fl2.write(word)
          else:
            ## punctuation mark: keep it, then break the line
            fl2.write(word+'\n')

fun(src,tgt,temp)
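
A quick sanity check of the splitting logic on an in-memory string (my own sketch, no files involved):

line = "Hello, world! How are you?\n"
out = ''.join(ch + '\n' if ch in ",.!?;" else ch for ch in line)
print(out)  ## each punctuation mark now ends its own line

Note that the space following each punctuation mark lands at the start of the next line; a real cleanup pass might want to strip it.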
        

Finding foreign-language sentences in a corpus

### This code handles source-language files that contain target-language sentences:
### it locates those sentences, then swaps them out and generates new files.
### Here the Chinese lines are located by scanning the English file.
import langid
import tensorflow as tf
import codecs
from langdetect import detect ## detect() returns the single most likely language code
from langdetect import detect_langs ## detect_langs() returns all candidate languages with their probabilities

src = '' ## English file
tgt = '' ## Chinese file
file1 = ''
file2 = ''

def fun1(seq):
  temp = langid.classify(seq) ## classify() returns a (language, score) pair
  return temp[0]

def fun(seq):
  lemp = detect(seq)
  return lemp
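
For reference, the two detectors label Chinese differently, which matters for the check in fun2 below. A quick illustration, reusing the imports above (exact scores and probabilities vary by library version):

sample = '这是一个中文句子'
print(langid.classify(sample)) ## ('zh', <score>) -- langid returns a plain ISO code plus a score
print(detect(sample))          ## 'zh-cn' -- langdetect adds a region tag to Chinese
print(detect_langs(sample))    ## e.g. [zh-cn:0.99...] -- candidates with probabilities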

### Locate the foreign-language sentences and return their line numbers
def fun2(src):
  k = 0
  temp = []
  with codecs.getreader('utf-8')(tf.gfile.GFile(src,'rb')) as fl: ## TF 1.x file API
    for line in fl.readlines():
      k += 1
      try:
        temp1 = fun(line) ### use langdetect in the normal case
      except Exception:
        temp1 = fun1(line) ### fall back to langid when langdetect raises
      if temp1.startswith('zh'): ### langdetect says 'zh-cn'/'zh-tw', langid says 'zh'
        temp.append(k)
  return temp

### Swap the located sentences into a separate file
### For src (English): file1 holds the split English sentences, file2 the split Chinese sentences
### For tgt (Chinese): file1 holds the split Chinese sentences, file2 the split English sentences
def fun3(temp,src,tgt,file1,file2):
  num = 0
  #s_file = open(src,'r',encoding='utf-8') ## open src instead to split the English side
  s_file = open(tgt,'r',encoding='utf-8') ## the same line numbers apply to both sides of a parallel corpus
  fl1 = open(file1,'w',encoding = 'utf-8')
  fl2 = open(file2,'w',encoding = 'utf-8')
  for line in s_file.readlines():
    num += 1
    if num in temp:
      fl2.write(line) ## located (wrong-language) line
    else:
      fl1.write(line) ## normal line

  s_file.close()
  fl1.close()
  fl2.close()


if __name__ == "__main__":
  temp = fun2(src)
  fun3(temp,src,tgt,file1,file2)
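
Since the same splitting has to be run once per language side (switching s_file between src and tgt by hand), a small generalization may be handier. This is my own sketch, not part of the original: the input path becomes a parameter and a set speeds up the membership test.

def split_by_linenums(linenums, in_path, keep_path, extract_path):
  nums = set(linenums) ## set membership is O(1) per line, vs O(n) for a list
  with open(in_path, 'r', encoding='utf-8') as s_file, \
       open(keep_path, 'w', encoding='utf-8') as keep, \
       open(extract_path, 'w', encoding='utf-8') as extract:
    for num, line in enumerate(s_file, start=1):
      (extract if num in nums else keep).write(line)

Calling it once with src and once with tgt then produces all four output files.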

Word segmentation

import jieba

src = ''
tgt = ''

def cut(file1,file2):
  with open(file1,'r',encoding='utf-8') as fl1:
    with open(file2,'w',encoding='utf-8') as fl2:
      for line in fl1.readlines():
        ## seq = jieba.cut(line,cut_all=True) ## full mode
        ## seq = jieba.cut_for_search(line) ## search-engine mode
        seq = jieba.cut(line,cut_all=False) ## accurate mode
        seq = ' '.join(seq)
        fl2.write(seq)

cut(src,tgt)
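
The three modes trade recall against precision. A quick comparison on the example sentence from the jieba README (my own sketch):

s = '我来到北京清华大学'
print('/'.join(jieba.cut(s, cut_all=True)))  ## full mode: every word it can find, overlaps included
print('/'.join(jieba.cut(s, cut_all=False))) ## accurate mode: one best segmentation
print('/'.join(jieba.cut_for_search(s)))     ## search-engine mode: accurate mode plus re-splits of long words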

Restoring sentences

### Turn a file of segmented results back into sentences
file = ''
tgt_file = ''

def fun(file,file2):
  with open(file,'r',encoding='utf-8') as fl1:
    with open(file2,'w',encoding='utf-8') as fl2:
      for line in fl1.readlines():
        line = line.replace(' ','') ## strip the spaces that segmentation inserted
        fl2.write(line)

fun(file,tgt_file)
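
A round-trip check (my own sketch): segmenting and then stripping the spaces should reproduce the original, as long as the text had no spaces of its own.

import jieba

s = '今天天气很好'
seg = ' '.join(jieba.cut(s))
assert seg.replace(' ', '') == s ## holds because the original sentence contained no spaces

The caveat is real: on mixed-language corpora, replace(' ','') also deletes the spaces between English words.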

Randomly generating a test set

### This code randomly samples a test set, then writes whatever remains after
### removing the sampled lines out as the training set
import random

src_en = ''
src_ch = ''
cut_num = 3000 ## number of sentences to sample

tgt_train_en = ''
tgt_train_ch = ''
tgt_dev_en = ''
tgt_dev_ch = ''

## Generate the random line numbers to sample
def random_num():
  temp = []
  for i in range(cut_num):
    a = random.randint(1,25000) ## range must cover the corpus line count
    if a not in temp:
      temp.append(a)
  print(len(temp)) ## actual number sampled (duplicates are skipped, so this can be < cut_num)
  temp = sorted(temp,reverse=False) ## ascending order
  return temp
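
Because duplicate draws are skipped rather than retried, random_num can return fewer than cut_num indices. If exactly cut_num distinct lines are wanted, random.sample does this in one call (a sketch; 25000 again stands in for the real corpus line count):

def random_num_exact(total_lines=25000):
  ## sample cut_num distinct line numbers, already deduplicated
  return sorted(random.sample(range(1, total_lines + 1), cut_num))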

## src-en(1)/src-zh(2)/train-en(3)/dev-en(4)/train-zh(5)/dev-zh(6)
def new_file(file1,file2,file3,file4,file5,file6):
  temp = random_num()
  fl1 = open(file1,'r',encoding='utf-8')
  fl2 = open(file2,'r',encoding='utf-8')
  fl3 = open(file3,'w',encoding='utf-8') ## output files must be opened for writing
  fl4 = open(file4,'w',encoding='utf-8')
  fl5 = open(file5,'w',encoding='utf-8')
  fl6 = open(file6,'w',encoding='utf-8')

  def fun(f1,f2,f3):
    num = 0
    i = 0
    for line1 in f1.readlines():
      num += 1
      if i < len(temp) and num == temp[i]:
        f3.write(line1) ## sampled line goes to the dev set
        i += 1
      else:
        f2.write(line1) ## every other line goes to the training set

  fun(fl1,fl3,fl4)
  fun(fl2,fl5,fl6)

  fl1.close()
  fl2.close()
  fl3.close()
  fl4.close()
  fl5.close()
  fl6.close()

new_file(src_en,src_ch,tgt_train_en,tgt_dev_en,tgt_train_ch,tgt_dev_ch)
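
A quick consistency check after the split (my own sketch, using only the paths defined above): the train and dev files should add back up to the source.

def count_lines(path):
  with open(path, encoding='utf-8') as f:
    return sum(1 for _ in f)

assert count_lines(tgt_train_en) + count_lines(tgt_dev_en) == count_lines(src_en)
assert count_lines(tgt_train_ch) + count_lines(tgt_dev_ch) == count_lines(src_ch)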
  

 
