PyTorch: Using torchtext
General workflow for using torchtext: https://www.cnblogs.com/cxq1126/p/13466998.html#_label9
1. Using the pretrained word vectors that torchtext supports by default
By default, the corresponding pretrained word-vector file is downloaded automatically into the .vector_cache directory under the current folder; .vector_cache is the default directory for both the word-vector files and the cache files.
from torchtext.vocab import GloVe
from torchtext import data

TEXT = data.Field(sequential=True)

# The two calls below are equivalent ways of attaching the GloVe 6B 300d vectors to the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
TEXT.build_vocab(train, vectors="glove.6B.300d")
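After build_vocab the vectors are attached to the Field's vocabulary; a small sketch of what can then be inspected (the example word and sizes are only illustrative):

# Vocabulary-to-index mapping and the attached vector matrix
print(len(TEXT.vocab))             # vocabulary size
print(TEXT.vocab.vectors.shape)    # e.g. torch.Size([vocab_size, 300])
print(TEXT.vocab.stoi['the'])      # word -> index
print(TEXT.vocab.itos[:10])        # index -> word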
2. Using externally pretrained word vectors
Download the Chinese word vectors sgns.wiki.word from https://github.com/Embedding/Chinese-Word-Vectors.
The name parameter specifies where the pretrained word-vector file lives.
By default, both the pretrained word-vector file and the cache files are looked for in the .vector_cache directory under the current directory. Even though the name parameter points at the pretrained word-vector file elsewhere, the cache directory has not been set explicitly, so a .vector_cache directory still has to exist in the current directory.
import os
from torchtext.vocab import Vectors

# The cache directory defaults to ./.vector_cache, so make sure it exists
if not os.path.exists('.vector_cache'):
    os.mkdir('.vector_cache')
vectors = Vectors(name='sgns.wiki.word')
TEXT.build_vocab(train_data, max_size=10000, vectors=vectors)
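In the torchtext versions I have seen, Vectors also takes a cache argument, so the cache location can be redirected instead of creating .vector_cache by hand; a minimal sketch, with the directory name chosen arbitrarily:

from torchtext.vocab import Vectors

# Keep the cached files under ./my_cache instead of the default ./.vector_cache
vectors = Vectors(name='sgns.wiki.word', cache='my_cache')
TEXT.build_vocab(train_data, max_size=10000, vectors=vectors)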
Initializing the embedding layer works the same way as before:
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)   # torch.Size([1727, 300])
model.src_embed[0].lut.weight.data.copy_(pretrained_embedding)
print('Embedding initialized')
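The model.src_embed[0].lut path above is specific to that particular model. For a plain nn.Embedding layer, the same initialization can be sketched as follows (the variable names are illustrative, not from the original code):

import torch.nn as nn

vocab_size, embedding_dim = TEXT.vocab.vectors.shape
embedding = nn.Embedding(vocab_size, embedding_dim)
embedding.weight.data.copy_(TEXT.vocab.vectors)

# Equivalent one-liner; set freeze=False if the vectors should be fine-tuned
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)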
Reference: https://blog.csdn.net/leo_95/article/details/87708267
3. Document-level text classification: each document is split into three segments by length, and all segments share one vocabulary
A single text Field can be bound to several attributes: text1, text2 and text3.
import csv

from nltk.tokenize import word_tokenize
from torchtext import data as tdata
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors

def read_data(data_path, text_field, label_field, split=3, overlap=0):
    fields = []
    for i in range(1, split + 1):
        fields.append(('text' + str(i), text_field))
    fields.append(('label', label_field))

    examples = []

    with open(data_path) as csv_file:
        reader = csv.reader(csv_file, quotechar='"')
        for idx, line in enumerate(reader):
            text = ""
            for tx in line[1:]:
                text += tx          # tx is one document (one CSV column)
                text += " "
            word_tokens = word_tokenize(text)
            len_text = len(word_tokens)
            document_encode = []
            for i in range(split):
                len_true = int((len_text + overlap * (split - 1)) / split)   # true length of each segment
                len_rel = len_true - overlap
                doc = word_tokens[i * len_rel: (i + 1) * len_rel + overlap]
                document_encode.append(doc)

            label = int(line[0])
            document_encode.append(label)
            examples.append(tdata.Example.fromlist(document_encode, fields))
    return examples, fields

def data_doc_iter(train_path, test_path, text_field, label_field, batch_size, embedding_dim=50):
    train_examples, train_fields = read_data(train_path, text_field, label_field)
    test_examples, test_fields = read_data(test_path, text_field, label_field)

    train_dataset = tdata.Dataset(train_examples, train_fields)
    test_dataset = tdata.Dataset(test_examples, test_fields)

    # Build the vocabulary; text1/text2/text3 share it because they use the same Field
    text_field.build_vocab(train_dataset, vectors=GloVe(name='6B', dim=embedding_dim))
    label_field.build_vocab(train_dataset)

    train_iter = tdata.Iterator(train_dataset, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    test_iter = tdata.Iterator(test_dataset, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    vocabulary = text_field.vocab
    return train_iter, test_iter, vocabulary
It is called as follows:
import torch

text_field = tdata.Field(tokenize=lambda x: word_tokenize(x), lower=True, fix_length=512, batch_first=True)
label_field = tdata.LabelField(dtype=torch.int)
train_iter, test_iter, vocabulary = data_doc_iter("./data/IMDB_new/train_shuffle.csv", "./data/IMDB_new/test_new.csv",
                                                  text_field, label_field, batch_size=8)

for batch in train_iter:
    print(batch.text1.shape)
    print(batch.text2.shape)
    print(batch.text3.shape)
    print(batch.label)
If you want a different fix_length for each text attribute, the attributes can use different Fields:
fields = [('text_en', text_field), ('text_ch', text_field), ('text', text_field2), ('label', label_field)]
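For instance, a second Field with its own fix_length might be defined like this (a sketch; the value 256 is arbitrary). Each distinct Field keeps its own vocabulary, so every Field needs its own build_vocab call:

text_field  = tdata.Field(tokenize=word_tokenize, lower=True, fix_length=512, batch_first=True)
text_field2 = tdata.Field(tokenize=word_tokenize, lower=True, fix_length=256, batch_first=True)

# text_en and text_ch share text_field; text uses text_field2 with a shorter fix_length
fields = [('text_en', text_field), ('text_ch', text_field), ('text', text_field2), ('label', label_field)]

text_field.build_vocab(train_dataset, vectors=GloVe(name='6B', dim=50))
text_field2.build_vocab(train_dataset, vectors=GloVe(name='6B', dim=50))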
4. Document-level text classification: each document is split into three segments by length, with a separate vocabulary per segment
That is, the first segment of every document uses the first vocabulary, the second segment uses the second vocabulary, and the third segment uses the third vocabulary.
Each vocabulary is obtained by randomly sampling half of the words in glove.6B.50d.txt (under .vector_cache) as a new set of word vectors, saved as cibiao1.txt, cibiao2.txt and cibiao3.txt respectively; this builds random feature subspaces.
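One possible way to produce these subsampled vector files (a sketch: the input and output file names follow the description above; the sampling code itself and the fixed seed are assumptions):

import random

# Read the full 50-dimensional GloVe file (one word plus its vector per line)
with open('.vector_cache/glove.6B.50d.txt', encoding='utf-8') as f:
    lines = f.readlines()

random.seed(0)
for i in range(1, 4):
    subset = random.sample(lines, len(lines) // 2)   # keep a random half of the words
    with open('cibiao%d.txt' % i, 'w', encoding='utf-8') as out:
        out.writelines(subset)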
import csv

from nltk.tokenize import word_tokenize
from torchtext import data as tdata
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors

def read_split_data(data_path, text_fields, label_fields, split=3, overlap=0):
    field1, field2, field3 = [], [], []
    field1.append(('text', text_fields[0]))
    field1.append(('label', label_fields[0]))
    field2.append(('text', text_fields[1]))
    field2.append(('label', label_fields[1]))
    field3.append(('text', text_fields[2]))
    field3.append(('label', label_fields[2]))

    examp1, examp2, examp3 = [], [], []

    with open(data_path) as csv_file:
        reader = csv.reader(csv_file, quotechar='"')
        for idx, line in enumerate(reader):
            text = ""
            for tx in line[1:]:
                text += tx          # tx is one document (one CSV column)
                text += " "
            word_tokens = word_tokenize(text)
            len_text = len(word_tokens)
            document_encode = []
            for i in range(split):
                len_true = int((len_text + overlap * (split - 1)) / split)   # true length of each segment
                len_rel = len_true - overlap
                doc = word_tokens[i * len_rel: (i + 1) * len_rel + overlap]
                document_encode.append(doc)

            label = int(line[0])

            doc1, doc2, doc3 = [], [], []
            doc1.append(document_encode[0])
            doc1.append(label)
            examp1.append(tdata.Example.fromlist(doc1, field1))

            doc2.append(document_encode[1])
            doc2.append(label)
            examp2.append(tdata.Example.fromlist(doc2, field2))

            doc3.append(document_encode[2])
            doc3.append(label)
            examp3.append(tdata.Example.fromlist(doc3, field3))

    return examp1, examp2, examp3, field1, field2, field3


def data_docsplit_iter(train_path, test_path, text_fields, label_fields, batch_size):
    train1_examp, train2_examp, train3_examp, field1, field2, field3 = read_split_data(train_path, text_fields, label_fields)
    test1_examp, test2_examp, test3_examp, tfield1, tfield2, tfield3 = read_split_data(test_path, text_fields, label_fields)

    # Build one vocabulary per segment
    train1_data = tdata.Dataset(train1_examp, field1)
    train2_data = tdata.Dataset(train2_examp, field2)
    train3_data = tdata.Dataset(train3_examp, field3)

    vectors1 = Vectors(name='cibiao1.txt')
    vectors2 = Vectors(name='cibiao2.txt')
    vectors3 = Vectors(name='cibiao3.txt')

    text_fields[0].build_vocab(train1_data, vectors=vectors1)
    text_fields[1].build_vocab(train2_data, vectors=vectors2)
    text_fields[2].build_vocab(train3_data, vectors=vectors3)

    label_fields[0].build_vocab(train1_data)
    label_fields[1].build_vocab(train2_data)
    label_fields[2].build_vocab(train3_data)

    test1_data = tdata.Dataset(test1_examp, tfield1)
    test2_data = tdata.Dataset(test2_examp, tfield2)
    test3_data = tdata.Dataset(test3_examp, tfield3)

    # shuffle/sort are disabled so the three iterators stay aligned document by document
    train_iter1 = tdata.Iterator(train1_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    train_iter2 = tdata.Iterator(train2_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    train_iter3 = tdata.Iterator(train3_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)

    test_iter1 = tdata.Iterator(test1_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    test_iter2 = tdata.Iterator(test2_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    test_iter3 = tdata.Iterator(test3_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)

    vocabulary1, vocabulary2, vocabulary3 = text_fields[0].vocab, text_fields[1].vocab, text_fields[2].vocab
    return train_iter1, train_iter2, train_iter3, test_iter1, test_iter2, test_iter3, vocabulary1, vocabulary2, vocabulary3
It is called as follows:
SENTENCE_LIMIT_SIZE = 512
DATAPATH = './data/IMDB_new/'

text_fields, label_fields = [], []
for i in range(3):
    text_fields.append(tdata.Field(tokenize=lambda x: word_tokenize(x), lower=True, fix_length=SENTENCE_LIMIT_SIZE, batch_first=True))
    label_fields.append(tdata.LabelField(dtype=torch.int))

train_iter1, train_iter2, train_iter3, test_iter1, test_iter2, test_iter3, vocabulary1, vocabulary2, vocabulary3 = \
    data_docsplit_iter(DATAPATH + "train_shuffle.csv", DATAPATH + "test_new.csv",
                       text_fields, label_fields, batch_size=4)
print('vocabulary1.vectors.shape = ', vocabulary1.vectors.shape)
print('vocabulary2.vectors.shape = ', vocabulary2.vectors.shape)
print('vocabulary3.vectors.shape = ', vocabulary3.vectors.shape)
for i, batch in enumerate(zip(train_iter1, train_iter2, train_iter3)):
    print(batch[0].text)
    print(batch[0].label)
    print(batch[1].text)
    print(batch[1].label)
    break