Python corpus processing (read every file under a folder tree, segment with jieba, remove stopwords, drop single characters)
# -*- coding: utf-8 -*-
import os
import jieba

def splitSentence(inputFile, fout, stop):
    # Open the input file for reading; ignore undecodable bytes in the corpus
    with open(inputFile, 'r', encoding='utf-8', errors='ignore') as fin:
        for eachLine in fin:
            line = eachLine.strip()                 # strip leading/trailing whitespace and the newline
            wordList = list(jieba.cut(line))        # segment the line with jieba
            #wordList = list(jieba.cut_for_search(line))  # alternative: search-engine mode
            outStr = ''
            for word in wordList:
                # Drop single characters and stopwords
                if len(word) > 1 and word not in stop:
                    outStr += word + ' '
            fout.write(outStr.strip())              # write the segmented result to the output file
            fout.write('\n')

#path = r'/media/软件/zhuomian/VARandLDAr/train'   # r'D:/zhuomian/VARandLDA/train'
path = '/home/xdj/train'
# Collect every file under the folder tree
fns = [os.path.join(root, fn) for root, dirs, files in os.walk(path) for fn in files]
# Load the stopword list, one word per line, into a set for fast lookup
stop = set(line.strip() for line in open('/home/xdj/chstop.txt', encoding='utf-8', errors='ignore'))
fout = open('myOutput.txt', 'w', encoding='utf-8')
fout.write('%d\n' % len(fns))                      # first line of the output: the file count
for f in fns:
    splitSentence(f, fout, stop)
#splitSentence('/home/xdj/train/C3-Art/C3-Art1459.txt', fout, stop)
print(len(fns))
fout.close()
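As a quick sanity check of the segment-then-filter step, the short sketch below runs the same logic on a single sentence. The sample sentence and the two-word stopword set are made up for illustration; the actual stopwords come from chstop.txt above, and jieba's segmentation can vary slightly between versions.

# -*- coding: utf-8 -*-
import jieba

stop = {'的', '了'}            # hypothetical stopword set, for illustration only
line = '我爱北京天安门'         # sample sentence, not from the corpus
words = [w for w in jieba.cut(line) if len(w) > 1 and w not in stop]
print(' '.join(words))         # typically prints: 北京 天安门

Here '我' and '爱' are dropped by the len(word) > 1 check, which is what the title means by removing single characters; only multi-character words that are not stopwords reach the output file.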