Text Preprocessing Steps for Neural Networks

For text sentiment classification in machine learning, the corpus usually has to be converted into a matrix form that a neural network can consume.
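
As a toy illustration of the end product (a made-up three-word dictionary; the names are placeholders, not the real 8081-word dictionary built below):

dictionary = ["信号", "稳定", "死机"]   # hypothetical 3-word dictionary
review_words = {"信号", "稳定"}         # words of one segmented review
vector = [1 if w in review_words else 0 for w in dictionary]
print(vector)  # [1, 1, 0]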

A downloaded corpus may contain tags, for example in the following form:

<Product type>手机产品列表</Product type>
19489
<Rev_body>
<Sentiment>pos</Sentiment>
<Rev_text>功能比一般小灵通多许多,不可许多人不懂得用。电话本容量,短信信箱容量,通话记录等比一般小灵通手机多许多。还有很多小功能,比如群发短信,外键拒接,计算器,记事本。。。关键是信号好又稳定,这只小灵通是我用的最长时间的一只,用得很顺手了,现在还不舍得换。我还喜欢那外屏的形状,比较特别。呵呵!</Rev_text>
</Rev_body>
19492
<Rev_body>
<Sentiment>neg</Sentiment>
<Rev_text>有时候爱死机,外屏有问题,很烦</Rev_text>
</Rev_body>

To write the pos and neg classes to separate outputs, the extraction can be automated with the Linux grep command.

Extract each line of u.txt containing the keyword "pos", together with the line that follows it (the review text), into poscorpus.txt:

grep -A 1 "pos" u.txt > poscorpus.txt

Then keep only the lines containing the <Rev_text> tag (this also drops the matching <Sentiment> lines and the "--" group separators that grep -A inserts), writing them to pos.txt:

grep "<Rev_text>" poscorpus.txt > pos.txt

After that, strip the <Rev_text> tags:

# -*- coding: utf-8 -*-
import sys
import time

t1 = time.time()


# qubiaoqian ("strip tags"): keep only the text between <Rev_text> and </Rev_text>
def qubiaoqian(argv):
    filename = argv[1]
    with open(filename, 'r', encoding='utf-8') as f:
        file_list = f.readlines()
    # utf-8-sig writes the UTF-8 BOM that the original code prepended by hand
    with open("final.txt", 'w', encoding='utf-8-sig') as f1:
        for line in file_list:
            if "</" in line:
                slope = line.index("</")
                line2 = line[10:slope]  # "<Rev_text>" is exactly 10 characters
                f1.write(line2 + '\n')
            else:
                f1.write(line)


if __name__ == "__main__":
    qubiaoqian(sys.argv)
    t2 = time.time()
    print("done! elapsed: " + str(t2 - t1) + " s")
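
Assuming the script above is saved as qubiaoqian.py, it would be run as:

python qubiaoqian.py pos.txt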

The corpus consists of many short texts. To build the dictionary, they can be merged into one large txt file with the following code:

# -*- coding: utf-8 -*-
import glob
import os
import sys
import time

t1 = time.time()


def change(argv):
    """Concatenate every *.txt file in the given directory into LargeTxt.txt."""
    directory = argv[1]
    if not os.path.isdir(directory):
        print("fail: this is not a directory:", directory)
        return
    with open("LargeTxt.txt", "w", encoding="utf-8") as f1:
        for txtFile in glob.glob(os.path.join(directory, "*.txt")):
            print(txtFile)
            with open(txtFile, "r", encoding="utf-8") as f:
                f1.write(f.read())


if __name__ == "__main__":
    change(sys.argv)
    t2 = time.time()
    print("done! elapsed: " + str(t2 - t1) + " s")
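
Assuming the script is saved as merge.py (any name works), pass the corpus directory on the command line:

python merge.py /home/liumingyu/fenci/pos/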

Next, jieba is used for word segmentation and POS tagging, followed by POS filtering and stop-word removal.
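
The core call is jieba.posseg.cut, which yields (word, flag) pairs. A minimal sketch of its output (the sample sentence and the exact tags shown are illustrative):

import jieba.posseg as pseg

for word, flag in pseg.cut("信号好又稳定"):
    print(word + "/" + flag)   # e.g. 信号/n 好/a 又/d 稳定/a

The full script, which writes intermediate results to result1/ (tagged), result2/ (POS-filtered), and result3/ (stop words removed):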

# -*- coding: utf-8 -*-
import glob
import os
import sys
import time

import jieba.posseg as pseg

t1 = time.time()

path = '/home/liumingyu/fenci/pos/'       # raw corpus
path1 = '/home/liumingyu/fenci/result1/'  # segmented + POS-tagged
path2 = '/home/liumingyu/fenci/result2/'  # POS-filtered
path3 = '/home/liumingyu/fenci/result3/'  # stop words removed

# POS tags to discard; anything else (nouns /n, adjectives /a, adverbs /d, ...) is kept
cixing = ["/Ag", "/b", "/dg", "/e", "/f", "/g", "/h", "/i", "/j", "/k", "/l",
          "/m", "/Ng", "/o", "/p", "/q", "/r", "/s", "/tg", "/t", "/uj",
          "/vg", "/v", "/vd", "/vn", "/w", "/x", "/y", "/z", "/un"]


def fc(argv):
    # load the stop-word list once (one stop word per line)
    with open(os.path.join(path2, "stop.txt"), 'r', encoding='utf-8') as f5:
        stopword = [li.strip() for li in f5]

    for filepath in glob.glob(os.path.join(path, '*.txt')):
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        fname = os.path.splitext(os.path.basename(filepath))[0]

        # segment and POS-tag, writing space-separated "word/flag" pairs
        with open(os.path.join(path1, fname + 'result1.txt'), 'w', encoding='utf-8') as f1:
            for word, flag in pseg.cut(text):
                f1.write(word + "/" + flag + " ")

        # POS filtering: drop every "word/flag" token whose tag is in cixing
        with open(os.path.join(path1, fname + 'result1.txt'), 'r', encoding='utf-8') as f1:
            txt = f1.readlines()
        txtlist = []
        for line in txt:
            for segs in line.split(" "):
                if not any(k in segs for k in cixing):
                    txtlist.append(segs)
        with open(os.path.join(path2, fname + 'result2.txt'), 'w', encoding='utf-8') as f2:
            for v in txtlist:
                if "/" in v:
                    f2.write(v[:v.index("/")] + " ")  # keep the word, drop its tag
                else:
                    f2.write(v)

        # stop-word removal
        with open(os.path.join(path2, fname + 'result2.txt'), 'r', encoding='utf-8') as f3:
            test_content = f3.readlines()
        wordlist = []
        for le in test_content:
            wordlist.extend(i for i in le.split(" ") if i not in stopword)
        with open(os.path.join(path3, fname + 'result3.txt'), 'w', encoding='utf-8') as f4:
            for wd in wordlist:
                f4.write(wd if wd == '\n' else wd + " ")


if __name__ == '__main__':
    fc(sys.argv)  # argv is unused; the paths above are hard-coded
    t2 = time.time()
    print("Segmentation, POS tagging, POS filtering and stop-word removal done, elapsed: " + str(t2 - t1) + " s")

For the segmented texts, a dictionary can be generated with Java's HashSet, which deduplicates the words automatically:

package GenerateDictionary;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.HashSet;

import org.apache.commons.lang3.StringUtils;

public class GenerateDictionary {
    private static HashSet<String> generateDictionary(String originFile) {
        // the HashSet deduplicates words as they are added
        HashSet<String> words = new HashSet<String>();
        String encoding = "utf-8";
        LineNumberReader reader;
        File file = new File(originFile);
        int count = 0;
        try {
            reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), encoding));
            String pin = reader.readLine();
            // stop at the first line that is null, empty, or whitespace-only
            while (!StringUtils.isBlank(pin)) {
                // split the line on spaces
                String[] tmp = pin.split(" ");
                for (int i = 0; i < tmp.length; i++) {
                    words.add(tmp[i]);
                }

                count++;
                if (count % 1000 == 0) {
                    System.out.println("System has handled " + count + " lines");
                }

                pin = reader.readLine();
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

        return words;
    }

    private static void writeHashSet(HashSet<String> target, String targetFile) {
        System.out.println("System starts writing dictionary to file...");
        try {
            FileWriter fileWriter = new FileWriter(targetFile);

            for (String word : target) {
                fileWriter.write(word + "\n");
            }

            fileWriter.flush();
            fileWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

        System.out.println("System writes file success!");
    }

    public static void main(String[] args) {
        String originFile = "C:\\Users\\leo\\Desktop\\hbbresult2.txt";
        String targetFile = "C:\\Users\\leo\\Desktop\\hbbnewguolv.txt";

        HashSet<String> wordHashSet = generateDictionary(originFile);
        writeHashSet(wordHashSet, targetFile);
    }
}
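
For reference, the same deduplication can be done in a few lines of Python (a sketch; the file names are placeholders):

# Sketch: build a deduplicated dictionary from the merged, space-separated corpus.
words = set()
with open("LargeTxt.txt", encoding="utf-8") as f:
    for line in f:
        words.update(w for w in line.split(" ") if w.strip())
with open("dic.txt", "w", encoding="utf-8") as out:
    out.write("\n".join(sorted(words)))  # sorted only to make the output deterministic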

Next, generate the 0/1 word vectors: position j of a review's vector is 1 if the review contains the j-th dictionary word, else 0. LinkedHashSet and LinkedHashMap are used because they preserve insertion order, so vector positions stay aligned with the dictionary file:

package WordEmbedding;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;

public class WordEmbedding {
    // global so that generateWordVector can accumulate results across calls
    private static LinkedHashMap<Integer, ArrayList<Integer>> vectorHashMap = new LinkedHashMap<Integer, ArrayList<Integer>>();

    public static void main(String[] args) {
        String dictionaryFile = "C:\\Users\\leo\\Desktop\\txt\\test\\posdic.txt";
        String wordDirectory = "C:\\Users\\leo\\Desktop\\txt\\test\\";
        String saveVectorFileName = "C:\\Users\\leo\\Desktop\\txt\\test\\vector.txt";
        int maxNumber = 230;

        LinkedHashSet<String> dictionaryHashSet = readHashSet(dictionaryFile);

        for (int i = 1; i <= maxNumber; i++) {
            String tmpWordFile = wordDirectory + i + "result2.txt";
            LinkedHashSet<String> wordHashSet = readHashSet(tmpWordFile);

            generateWordVector(dictionaryHashSet, wordHashSet, i);
        }

        writeHashSet(vectorHashMap, saveVectorFileName);
    }

    /**
     * Walk the dictionary in order; append 1 if wordHashSet contains the word, else 0.
     * @param dictionaryVector the dictionary, in insertion order
     * @param wordHashSet the words of one review
     * @param id the review's id (its line number in the output file)
     * @return true on success
     */
    private static boolean generateWordVector(LinkedHashSet<String> dictionaryVector,
                                              LinkedHashSet<String> wordHashSet, int id) {
        ArrayList<Integer> wordVector = new ArrayList<Integer>();

        for (String tmpWord : dictionaryVector) {
            if (wordHashSet.contains(tmpWord)) {
                wordVector.add(1);
            } else {
                wordVector.add(0);
            }
        }
        vectorHashMap.put(id, wordVector);
        return true;
    }

    /**
     * Read a file's space-separated tokens into a LinkedHashSet.
     * @param originFile path of the file to read
     * @return the deduplicated tokens, in first-seen order
     */
    private static LinkedHashSet<String> readHashSet(String originFile) {
        LinkedHashSet<String> words = new LinkedHashSet<String>();
        String encoding = "utf-8";
        LineNumberReader reader;
        File file = new File(originFile);
        try {
            int count = 0;
            reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), encoding));
            String pin = reader.readLine();
            // stop at the first line that is null, empty, or whitespace-only
            while (!StringUtils.isBlank(pin)) {
                // split the line on spaces
                String[] tmp = pin.split(" ");
                for (int i = 0; i < tmp.length; i++) {
                    words.add(tmp[i]);
                }

                count++;
                if (count % 1000 == 0) {
                    System.out.println("System has handled " + count + " lines");
                }

                pin = reader.readLine();
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return words;
    }

    /**
     * Write the LinkedHashMap of vectors to a file, one vector per line.
     * @param target the id-to-vector map
     * @param targetFile output path
     */
    private static void writeHashSet(LinkedHashMap<Integer, ArrayList<Integer>> target, String targetFile) {
        System.out.println("System starts writing dictionary to file...");
        try {
            FileWriter fileWriter = new FileWriter(targetFile);
            int count = 0;
            for (Map.Entry<Integer, ArrayList<Integer>> entry : target.entrySet()) {
                ArrayList<Integer> vectorList = entry.getValue();
                int size = vectorList.size();

                for (int i = 0; i < size - 1; i++) {
                    fileWriter.write(vectorList.get(i) + "");
                }
                fileWriter.write(vectorList.get(size - 1) + "\n");

                count++;
                if (count % 200 == 0) {
                    System.out.println("System has written " + count + " lines");
                }
            }

            fileWriter.flush();
            fileWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

        System.out.println("System writes file success!");
    }
}

Finally, restrict both the pos and the neg data to a uniform 5000×8081 shape, i.e. 5000 reviews each over a dictionary of 8081 words, and label every review vector at the start of its line, separated from the vector by one space: "1" for pos, "-1" for neg (so a pos line reads like "1 0100...").

# coding=utf-8
import os
import time

import numpy as np

t1 = time.time()


# strip the trailing newline from each vector line, shuffle, and keep 5000 rows
def process_data(lines):
    dataset = []
    for li in lines:
        dataset.append(list(li.rstrip("\n")))
    data = np.array(dataset)
    # np.random.shuffle permutes rows in place (random.shuffle corrupts 2-D arrays)
    np.random.shuffle(data)
    vector = data[:5000, :]
    return vector


# strip the leading label so the matrix can be used as RBM input
def qulabel(lines):
    dataset = []
    for li in lines:
        slope1 = li.index(" ")
        line2 = li[slope1 + 1:].rstrip("\n")
        dataset.append(list(line2))
    return np.array(dataset)


f = open("C:/Users/leo/Desktop/vector.txt", 'r')
line = f.readlines()
f.close()
path = 'C:/Users/leo/Desktop/'
os.chdir(path)

# lines 1-5152 of vector.txt are pos reviews, the remainder are neg
f2 = open("posvector.txt", "w+")
f3 = open("negvector.txt", "w+")
n = 1
for i in line:
    if n <= 5152:
        f2.write(str(1) + " " + i)
    if n % 1000 == 0:
        print("System has written " + str(n) + " lines!")
    if n == 5152:
        print("Pos, done!")
    if n >= 5153:
        f3.write(str(-1) + " " + i)
    n = n + 1
f2.close()
f3.close()
print("*********************************************")
print("Starting to shuffle and keep 5000 rows...")
s1 = open("posvector.txt", "r")
posline = s1.readlines()
s1.close()
p = process_data(posline)
np.savetxt("posvt.txt", p, fmt='%s', delimiter="")

s2 = open("negvector.txt", "r")
negline = s2.readlines()
s2.close()
g = process_data(negline)
np.savetxt("negvt.txt", g, fmt='%s', delimiter="")
'''
# save the label-free vector matrices
h1 = open("posvt.txt", "r")
posvlines = h1.readlines()
h1.close()
h = qulabel(posvlines)
np.savetxt("posv.txt", h, fmt='%s', delimiter="")

h2 = open("negvt.txt", "r")
negvlines = h2.readlines()
h2.close()
h = qulabel(negvlines)
np.savetxt("negv.txt", h, fmt='%s', delimiter="")
print("done!")
'''
t2 = time.time()
print("All done! Time: " + str(t2 - t1) + " s")
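
For later training, the saved matrices can be read back into NumPy arrays; a sketch (file names as produced above):

import numpy as np


def load_vectors(path):
    # each line is "<label> <8081 digits>", e.g. "1 0100..." or "-1 0010..."
    labels, rows = [], []
    with open(path, "r") as f:
        for li in f:
            label, bits = li.split(" ", 1)
            labels.append(int(label))
            rows.append([int(c) for c in bits.strip()])
    return np.array(labels), np.array(rows)


y, X = load_vectors("posvt.txt")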

 
