Text Preprocessing Steps for Neural Networks
In machine-learning text sentiment classification, the corpus usually has to be turned into a matrix form that a neural network can consume.
A downloaded corpus may come with labels, in a form like this:
<Product type>手机产品列表</Product type> 19489 <Rev_body> <Sentiment>pos</Sentiment> <Rev_text>功能比一般小灵通多许多,不可许多人不懂得用。电话本容量,短信信箱容量,通话记录等比一般小灵通手机多许多。还有很多小功能,比如群发短信,外键拒接,计算器,记事本。。。关键是信号好又稳定,这只小灵通是我用的最长时间的一只,用得很顺手了,现在还不舍得换。我还喜欢那外屏的形状,比较特别。呵呵!</Rev_text> </Rev_body> 19492 <Rev_body> <Sentiment>neg</Sentiment> <Rev_text>有时候爱死机,外屏有问题,很烦</Rev_text> </Rev_body>
To write the pos and neg classes out separately, the Linux grep command can extract them automatically.
Extract the line that follows each line of u.txt containing the keyword "pos", writing the output to poscorpus.txt (grep -A 1 also prints the matching line itself):
grep -A 1 "pos" u.txt > poscorpus.txt
Then keep only the lines of poscorpus.txt that contain the <Rev_text> tag, writing them to pos.txt (this also drops the matched <Sentiment> lines and the -- separators grep -A inserts between groups):
grep "<Rev_text>" poscorpus.txt > pos.txt
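The same two-step extraction can be done in a few lines of Python if grep is not available. A minimal sketch, assuming the same file names as above (it inherits the same caveat that "pos" is matched anywhere in a line):

# emit the line following each line that contains "pos",
# keeping only the <Rev_text> lines
with open("u.txt") as f:
    lines = f.readlines()

with open("pos.txt", "w") as out:
    for prev, cur in zip(lines, lines[1:]):
        if "pos" in prev and "<Rev_text>" in cur:
            out.write(cur)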
Then strip the <Rev_text> tags:
# -*- coding: utf-8 -*-
import sys
import time

t1 = time.time()
reload(sys)                       # Python 2 only
sys.setdefaultencoding("utf-8")


def qubiaoqian(argv):
    """Strip the <Rev_text>...</Rev_text> tags, keeping only the review body."""
    filename = argv[1]
    f = open(filename, 'r')
    file_list = f.readlines()
    f.close()
    f1 = open("final.txt", 'w+')
    f1.write('\xEF\xBB\xBF')      # UTF-8 BOM
    for line in file_list:
        if "</" in line:
            slope = line.index("</")
            line2 = line[10:slope]    # 10 == len("<Rev_text>")
            f1.write(line2 + '\n')
        else:
            f1.write(line)
    f1.close()

if __name__ == "__main__":
    qubiaoqian(sys.argv)
    t2 = time.time()
    print("done! Elapsed: " + str(t2 - t1) + " s")
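Assuming the script is saved as, say, strip_tags.py (the name is hypothetical), it runs as python strip_tags.py pos.txt and writes the cleaned reviews to final.txt in the current directory.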
The corpus consists of many short texts; to build the dictionary, they are first merged into one large txt file with the following code:
# -*- coding: utf-8 -*-
import os
import sys
import glob
import time

t1 = time.time()


def change(argv):
    dirname = argv[1]
    if not os.path.isdir(dirname):
        print("fail & This is not a dir: " + dirname)
        return

    f1 = open("LargeTxt.txt", "w+")
    for txtFile in glob.glob(os.path.join(dirname, "*.txt")):
        print(txtFile)
        f = open(txtFile, "r")
        filelist = f.readlines()
        f.close()
        for line in filelist:
            f1.write(line)
    f1.close()

if __name__ == "__main__":
    change(sys.argv)
    t2 = time.time()
    print("done! Elapsed: " + str(t2 - t1) + " s")
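Run it with the corpus directory as the argument, e.g. python merge.py /home/liumingyu/fenci/pos/ (the script name is hypothetical); the merged LargeTxt.txt lands in the current working directory.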
Next, jieba performs word segmentation and POS tagging, followed by POS filtering and stopword removal:
# -*- coding: utf-8 -*-
import os
import glob
import re
import sys
import time
import jieba.posseg as pseg

t1 = time.time()
reload(sys)                       # Python 2 only
sys.setdefaultencoding('utf8')


def fc(argv):
    path = '/home/liumingyu/fenci/pos/'
    os.chdir(path)
    for filename in glob.glob('*.txt'):
        f = open(filename, 'r+')
        file_list = f.read()
        f.close()
        # segmentation and POS tagging
        seg_list = pseg.cut(file_list)
        fname, fextension = os.path.splitext(filename)
        path1 = '/home/liumingyu/fenci/result1/'
        os.chdir(path1)
        with open(fname + 'result1.txt', "w+") as f:
            for word, flag in seg_list:
                f.write(word + "/" + flag + " ")

        # POS filtering: drop every token whose tag is on the blacklist
        f1 = open(fname + 'result1.txt', "r")
        txt = f1.readlines()
        f1.close()
        path2 = '/home/liumingyu/fenci/result2/'
        os.chdir(path2)
        f2 = open(fname + 'result2.txt', "w+")
        txtlist = []
        cixing = ["/Ag", "/b", "/dg", "/e", "/f", "/g", "/h", "/i", "/j",
                  "/k", "/l", "/m", "/Ng", "/o", "/p", "/q", "/r", "/s",
                  "/tg", "/t", "/uj", "/vg", "/v", "/vd", "/vn", "/w",
                  "/x", "/y", "/z", "/un"]
        for line in txt:
            line_list2 = re.split('[ ]', line)
            line_list = line_list2[:]
            for segs in line_list2:
                for k in cixing:
                    if k in segs:
                        line_list.remove(segs)
                        break
            txtlist.extend(line_list)
        # strip the "/tag" suffix from the surviving tokens
        for v in txtlist:
            if "/" in v:
                slope = v.index("/")
                f2.write(v[0:slope] + " ")
            else:
                f2.write(v)
        f2.close()

        # stopword removal
        stopword = []
        f5 = open("stop.txt", "r")
        stopwords = f5.readlines()
        f5.close()
        for li in stopwords:
            stopword.append(li.strip())
        f3 = open(fname + 'result2.txt', "r")
        path3 = '/home/liumingyu/fenci/result3/'
        os.chdir(path3)
        test_content = f3.readlines()
        f3.close()
        wordlist = []
        f4 = open(fname + 'result3.txt', "w+")
        for le in test_content:
            line_list2 = re.split('[ ]', le)
            line_list = line_list2[:]
            for i in line_list2:
                for st in stopword:
                    if st == i:
                        line_list.remove(i)
                        break
            wordlist.extend(line_list)
        for wd in wordlist:
            if wd == '\n':
                f4.write(wd)
            else:
                f4.write(wd + " ")
        f4.close()
        # return to the source directory so the next file can be opened
        os.chdir(path)

if __name__ == '__main__':
    fc(sys.argv)
    t2 = time.time()
    print("Segmentation, POS tagging, POS filtering and stopword removal done. Elapsed: " + str(t2 - t1) + " s")
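Note that cixing works as a blacklist rather than a whitelist: any token whose jieba POS tag matches an entry is removed, so roughly speaking only nouns (n, nr, ns, nt, nz), adjectives (a, ad, an) and a few other tags survive. The match is a substring test, so a short entry like /v also catches /vd and /vn.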
For the segmented texts, a Java HashSet can be used to generate the dictionary:
package GenerateDictionary;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.HashSet;

import org.apache.commons.lang3.StringUtils;

public class GenerateDictionary {
    private static HashSet<String> generateDictionary(String originFile) {
        // initialize the HashSet (duplicates are dropped automatically)
        HashSet<String> words = new HashSet<String>();
        String encoding = "utf-8";
        LineNumberReader reader;
        File file = new File(originFile);
        int count = 0;
        try {
            reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), encoding));
            String pin = reader.readLine();
            // stop once the line is null, empty, or whitespace-only
            while (!StringUtils.isBlank(pin)) {
                // split the line on spaces
                String[] tmp = pin.split(" ");
                int size = tmp.length;
                for (int i = 0; i < size; i++) {
                    words.add(tmp[i]);
                }

                count++;
                if (count % 1000 == 0) {
                    System.out.println("System has handled " + count + " lines");
                }

                pin = reader.readLine();
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

        return words;
    }

    private static void writeHashSet(HashSet<String> target, String targetFile) {
        System.out.println("System starts writing dictionary to file...");
        try {
            FileWriter fileWriter = new FileWriter(targetFile);

            for (String word : target) {
                fileWriter.write(word + "\n");
            }

            fileWriter.flush();
            fileWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

        System.out.println("System writes file success!");
    }

    public static void main(String[] args) {
        String originFile = "C:\\Users\\leo\\Desktop\\hbbresult2.txt";
        String targetFile = "C:\\Users\\leo\\Desktop\\hbbnewguolv.txt";

        HashSet<String> wordHashSet = generateDictionary(originFile);
        writeHashSet(wordHashSet, targetFile);
    }
}
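For comparison, the dictionary build is only a few lines in Python. A minimal sketch, assuming the merged corpus from the earlier step is in LargeTxt.txt (the output name dictionary.txt is arbitrary):

# build the dictionary with a set, which de-duplicates the
# space-separated tokens automatically
words = set()
with open("LargeTxt.txt") as f:
    for line in f:
        words.update(line.split())

with open("dictionary.txt", "w") as out:
    out.write("\n".join(words))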
Then the 0/1 word vectors are generated:
package WordEmbedding;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;

public class WordEmbedding {
    // global, so generateWordVector can update it directly
    private static LinkedHashMap<Integer, ArrayList<Integer>> vectorHashMap = new LinkedHashMap<Integer, ArrayList<Integer>>();

    public static void main(String[] args) {
        String dictionaryFile = "C:\\Users\\leo\\Desktop\\txt\\test\\posdic.txt";
        String wordDirectory = "C:\\Users\\leo\\Desktop\\txt\\test\\";
        String saveVectorFileName = "C:\\Users\\leo\\Desktop\\txt\\test\\vector.txt";
        int maxNumber = 230;

        LinkedHashSet<String> dictionaryHashSet = readHashSet(dictionaryFile);

        for (int i = 1; i <= maxNumber; i++) {
            String tmpWordFile = wordDirectory + i + "result2.txt";
            LinkedHashSet<String> wordHashSet = readHashSet(tmpWordFile);

            generateWordVector(dictionaryHashSet, wordHashSet, i);
        }

        writeHashSet(vectorHashMap, saveVectorFileName);
    }

    /**
     * Walk through dictionaryVector; append 1 to the vector if wordHashSet
     * contains the dictionary word, otherwise 0.
     */
    private static boolean generateWordVector(LinkedHashSet<String> dictionaryVector,
            LinkedHashSet<String> wordHashSet, int id) {
        ArrayList<Integer> wordVector = new ArrayList<Integer>();

        for (String tmpWord : dictionaryVector) {
            if (wordHashSet.contains(tmpWord)) {
                wordVector.add(1);
            } else {
                wordVector.add(0);
            }
        }
        vectorHashMap.put(id, wordVector);
        return true;
    }

    /**
     * Read a file into a LinkedHashSet (insertion order preserved).
     */
    private static LinkedHashSet<String> readHashSet(String originFile) {
        LinkedHashSet<String> words = new LinkedHashSet<String>();
        String encoding = "utf-8";
        LineNumberReader reader;
        File file = new File(originFile);
        try {
            int count = 0;
            reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), encoding));
            String pin = reader.readLine();
            // stop once the line is null, empty, or whitespace-only
            while (!StringUtils.isBlank(pin)) {
                // split the line on spaces
                String[] tmp = pin.split(" ");
                int size = tmp.length;
                for (int i = 0; i < size; i++) {
                    words.add(tmp[i]);
                }

                count++;
                if (count % 1000 == 0) {
                    System.out.println("System has handled " + count + " lines");
                }

                pin = reader.readLine();
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return words;
    }

    /**
     * Write the LinkedHashMap of vectors to a file, one vector per line.
     */
    private static void writeHashSet(LinkedHashMap<Integer, ArrayList<Integer>> target, String targetFile) {
        System.out.println("System starts writing dictionary to file...");
        try {
            FileWriter fileWriter = new FileWriter(targetFile);
            int count = 0;
            for (Map.Entry<Integer, ArrayList<Integer>> entry : target.entrySet()) {
                ArrayList<Integer> vectorList = entry.getValue();
                int size = vectorList.size();

                for (int i = 0; i < size - 1; i++) {
                    fileWriter.write(vectorList.get(i) + "");
                }
                fileWriter.write(vectorList.get(size - 1) + "\n");

                count++;
                if (count % 200 == 0) {
                    System.out.println("System has written " + count + " lines");
                }
            }

            fileWriter.flush();
            fileWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

        System.out.println("System writes file success!");
    }
}
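The encoding itself is easy to restate in Python. A sketch, assuming dictionary is a list of words in a fixed order and each document is given as a set of tokens:

# 0/1 (presence/absence) encoding against a fixed-order dictionary
def encode(doc_tokens, dictionary):
    return [1 if word in doc_tokens else 0 for word in dictionary]

# e.g. encode({"信号", "稳定"}, ["信号", "死机", "稳定"]) returns [1, 0, 1]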
The pos and neg data are then trimmed to a uniform 5000×8081 shape, i.e. 5000 reviews each against a dictionary of 8081 words. Each review vector is labelled at its start, separated from the vector by a single space: pos vectors get "1" and neg vectors get "-1".
# coding=utf-8
import os
import numpy as np
import time

t1 = time.time()


# strip the trailing newline from each vector, shuffle, and keep only 5000 rows
def process_data(lines):
    dataset = []
    for li in lines:
        if "\n" in li:
            slope = li.index("\n")
            dataset.append(list(li[0:slope]))
    data = np.array(dataset)
    # use np.random.shuffle: random.shuffle swaps numpy rows through
    # views and can silently duplicate rows
    np.random.shuffle(data)
    vector = data[:5000, :]
    return vector


# strip the leading label, so the matrix can be used directly as RBM input
def qulabel(lines):
    dataset = []
    for li in lines:
        slope1 = li.index(" ")
        slope2 = li.index("\n")
        dataset.append(list(li[slope1 + 1:slope2]))
    data = np.array(dataset)
    return data

f = open("C:/Users/leo/Desktop/vector.txt", 'r')
line = f.readlines()
f.close()
path = 'C:/Users/leo/Desktop/'
os.chdir(path)

f2 = open("posvector.txt", "w+")
f3 = open("negvector.txt", "w+")
n = 1
for i in line:
    if n <= 5152:                  # the first 5152 vectors are pos
        f2.write(str(1) + " " + i)
        if n % 1000 == 0:
            print("System has written " + str(n) + " lines!")
        if n == 5152:
            print("Pos, done!")
    if n >= 5153:                  # the rest are neg
        f3.write(str(-1) + " " + i)
    n = n + 1
f2.close()
f3.close()
print("*********************************************")
print("Starting to shuffle and siphon 5000 data...")
s1 = open("posvector.txt", "r")
posline = s1.readlines()
p = process_data(posline)
np.savetxt("posvt.txt", p, fmt='%s', delimiter="")
s1.close()

s2 = open("negvector.txt", "r")
negline = s2.readlines()
g = process_data(negline)
np.savetxt("negvt.txt", g, fmt='%s', delimiter="")
s2.close()
'''
# save the vector matrices without labels
h1 = open("posvt.txt", "r")
posvlines = h1.readlines()
h = qulabel(posvlines)
np.savetxt("posv.txt", h, fmt='%s', delimiter="")

h1 = open("negvt.txt", "r")
negvlines = h1.readlines()
h = qulabel(negvlines)
np.savetxt("negv.txt", h, fmt='%s', delimiter="")
print("done!")
'''
t2 = time.time()
print("All done! Time: " + str(t2 - t1) + " s")
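When the matrices are fed to the network later, the label and the bit string have to be split apart again. A minimal loading sketch (the function name is hypothetical), assuming each line of posvt.txt / negvt.txt looks like "1 0100..." or "-1 0100...":

import numpy as np


def load_labeled_vectors(path):
    # each line: "<label> <bits>", e.g. "1 010..." or "-1 010..."
    labels, vectors = [], []
    with open(path) as f:
        for line in f:
            label, bits = line.strip().split(" ", 1)
            labels.append(int(label))
            vectors.append([int(b) for b in bits])
    return np.array(labels), np.array(vectors)

y_pos, X_pos = load_labeled_vectors("posvt.txt")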