python(十):列表转换成字典
一、列表转换成字典
# Load the category names (one per line of cat.txt, surrounding whitespace
# stripped) and map each name to its zero-based line index.
self.cat_list = []
cat_path = os.path.join(self.raw_data, "cat.txt")
with open(cat_path) as cat_file:
    stripped_names = [entry.strip() for entry in cat_file]
self.cat_list.extend(stripped_names)
# If a name occurs more than once, the index of its last occurrence wins.
self.label_dict = {name: idx for idx, name in enumerate(self.cat_list)}
二、NLP生成字典
def remove_1a(content):
    """Return *content* reduced to Chinese characters, ASCII letters and digits.

    Every other character (punctuation, whitespace, newlines, symbols) is
    dropped and the surviving runs are concatenated in their original order.
    """
    # Character class: CJK ideographs U+4E00-U+9FA5, a-z, A-Z, 0-9.
    keep_pattern = '[\u4e00-\u9fa5a-zA-Z0-9]+'
    return ''.join(re.findall(keep_pattern, content))


def read_file(filename):
    """Read a labelled text file and return ``(contents, labels)``.

    Each line is expected to hold exactly two fields separated by a single
    space: ``label content``. Lines that do not split into exactly two
    fields are skipped. The content is cleaned with ``remove_1a`` and stored
    as a list of single characters; the label is stored unchanged.
    """
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            fields = line.split(" ")
            # Malformed line (no separator, or more than one): skip it
            # explicitly instead of relying on a bare ``except``.
            if len(fields) != 2:
                continue
            label, content = fields
            if content:
                contents.append(list(remove_1a(content)))
                labels.append(label)
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training set and save it.

    Characters are counted over all samples returned by ``read_file``. At
    most ``vocab_size - 1`` of the most frequent characters are considered,
    and only those occurring more than twice are kept. The file written to
    *vocab_dir* has one token per line, starting with ``<PAD>``.

    If no character is frequent enough, the file contains only ``<PAD>``.
    """
    data_train, _ = read_file(train_dir)

    counter = Counter()
    for content in data_train:
        counter.update(content)

    # most_common() is ordered by descending count; one slot of the budget
    # is reserved for the <PAD> token added below.
    frequent = [
        word
        for word, count in counter.most_common(vocab_size - 1)
        if count > 2
    ]

    # <PAD> is used to pad all texts to the same length.
    words = ['<PAD>'] + frequent
    with open(vocab_dir, mode='w', encoding='utf-8', errors='ignore') as f:
        f.write('\n'.join(words) + '\n')