6.1 Working with text data

6.1.1 One-hot encoding of words and characters

(1) Word-level one-hot encoding:

# Word-level one-hot encoding
import numpy as np

# Initial data: each sample is one element of the list
# (here a sample is a sentence, but it could just as well be an entire document)
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build an index of all tokens in the data
token_index = {}
for sample in samples:
    # Tokenize each sample with split(); in a real application you would
    # also strip punctuation and special characters from the samples.
    for word in sample.split():
        if word not in token_index:
            # Assign a unique index to each unique word
            token_index[word] = len(token_index) + 1
            # Note that index 0 is not assigned to any word

# Vectorize the samples; only consider the first max_length words of each sample
max_length = 10

# Store the result in results.
# results is a 3D tensor: the first axis is the sample, the second axis is the
# position of a word within the sample, and the third axis is the one-hot
# vector representation of that word.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.  # mark the word that appears at this position with 1.
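For a quick look at what the listing above produces (an illustrative addition, not part of the original code), you can print the token index and the shape of the resulting tensor:

# Illustrative check, assuming the listing above has just been run
print(token_index)    # e.g. {'The': 1, 'cat': 2, 'sat': 3, ...}
print(results.shape)  # (2, 10, 11): 2 samples, 10 positions, 10 unique tokens plus the unused index 0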

 

(2) Character-level one-hot encoding:

import string
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable  # All printable ASCII characters
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        results[i, j, index] = 1.
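Again as an illustrative check (not in the original listing): string.printable contains 100 characters, so the tensor has 101 columns, with index 0 left unused:

print(len(string.printable))  # 100 printable ASCII characters
print(results.shape)          # (2, 50, 101)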

 

(3) Word-level one-hot encoding with Keras:

from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer, configured to only take into account the 1,000 most common words
tokenizer = Tokenizer(num_words=1000)
# Build the word index
tokenizer.fit_on_texts(samples)

# Turn the strings into lists of integer indices
sequences = tokenizer.texts_to_sequences(samples)

# You can also get the one-hot binary representation directly.
# This tokenizer supports several vectorization modes besides one-hot encoding.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

# Recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
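As an illustrative follow-up (values shown are indicative, since the Tokenizer lowercases and strips punctuation by default): sequences are lists of integer indices, and texts_to_matrix returns one multi-hot row of length num_words per sample:

print(sequences)              # e.g. [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
print(one_hot_results.shape)  # (2, 1000)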

 

(4) Word-level one-hot encoding with the hashing trick:

import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Store the words as vectors of length 1,000. If you have close to 1,000 words (or more),
# you will run into many hash collisions, which will decrease the accuracy of this encoding method.
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        # Hash the word into a "random" integer index in [0, dimensionality)
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
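To make the collision caveat concrete, here is a small illustrative check (not in the original listing): with far fewer slots than distinct words, some words are forced to share an index.

# Illustrative: count distinct hashed indices for the vocabulary above.
# Note that Python's built-in hash() for strings is salted per process,
# so the exact indices vary between runs.
words = {w for s in samples for w in s.split()}
for dim in (1000, 5):
    hashed = {abs(hash(w)) % dim for w in words}
    print(dim, len(words), len(hashed))  # with dim=5, len(hashed) <= 5 < len(words)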

 

6.1.2 Using word embeddings

(1) Learning word embeddings with the Embedding layer:

from keras.layers import Embedding

# The Embedding layer takes at least two arguments: the number of possible tokens
# (here 1,000, i.e. maximum word index + 1) and the dimensionality of the embeddings (here 64)
embedding_layer = Embedding(1000, 64)

from keras.datasets import imdb
from keras import preprocessing

# Number of words to consider as features
max_features = 10000
# Cut off the texts after this number of words
# (among the max_features most common words)
maxlen = 20

# Load the data as lists of integers
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Turn the lists of integers into a 2D integer tensor of shape (samples, maxlen)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)


from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()
# Specify the maximum input length of the Embedding layer
# so that the embedded inputs can be flattened later on.
model.add(Embedding(max_features, 8, input_length=maxlen))
# After the Embedding layer, the activations have shape (samples, maxlen, 8)

# Flatten the 3D tensor of embeddings into a 2D tensor of shape (samples, maxlen * 8)
model.add(Flatten())

# Add the classifier on top
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# model.summary()

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)
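As a quick, illustrative follow-up (not part of the original listing), the trained model can also be scored on the padded test split prepared above:

# Illustrative: evaluate() returns [loss, accuracy] for this compiled model
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test accuracy:', test_acc)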

 

(2) Using pre-trained word embeddings:

import os

imdb_dir = '/home/ubuntu/data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

# Collect the raw review texts and their labels (0 = negative, 1 = positive)
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100  # We will cut reviews after 100 words
training_samples = 200  # We will be training on 200 samples
validation_samples = 10000  # We will be validating on 10000 samples
max_words = 10000  # We will only consider the top 10,000 words in the dataset

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set.
# But first, shuffle the data, since we started from data
# where samples are ordered (all negative first, then all positive).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

glove_dir = '/home/ubuntu/data/'

# Parse the GloVe embeddings file: map each word to its vector of coefficients
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_dim = 100

# Build an embedding matrix that can be loaded into the Embedding layer:
# row i holds the GloVe vector of the word with index i in word_index.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in the embedding index will be all zeros.
            embedding_matrix[i] = embedding_vector
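
# Illustrative sanity check (not in the original listing): a frequent word that is
# covered by GloVe should now have a non-zero row in embedding_matrix.
idx = word_index.get('movie')
if idx is not None and idx < max_words:
    print(embedding_matrix[idx][:5])  # first few GloVe coefficients for 'movie'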

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Load the pre-trained GloVe matrix into the Embedding layer and freeze it,
# so that the pre-trained word vectors are not modified during training.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

# For comparison: train the same model without pre-trained word embeddings
# (the Embedding layer is learned from scratch on the 200 training samples)
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

# Tokenize the data of the test set
test_dir = os.path.join(imdb_dir, 'test')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

# Load the weights of the first model (with pre-trained GloVe embeddings)
# and evaluate it on the test set
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)
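If you want to capture the returned values rather than rely on notebook output, you can assign them explicitly (an illustrative addition; this re-runs the evaluation):

# Illustrative: evaluate() returns the loss and the metrics declared at compile time
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test loss: %.4f, test accuracy: %.4f' % (test_loss, test_acc))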

 

posted on 2021-01-25 14:58 by Sempron2800+