深度学习模型-快速构建词典和id的映射

直接上代码

from collections import Counter
import numpy as np

# Toy corpus. Tokenisation is a plain whitespace split, so punctuation stays
# attached to its word (e.g. 'china.').
text = 'I love china. the dog on the ground'
text = text.split()

# Keep the 5 most frequent tokens together with their counts.
vocab = dict(Counter(text).most_common(5))
# Every token outside the top 5 is folded into a single '<unk>' bucket whose
# count is whatever remains of the corpus. The builtin sum keeps this count a
# plain int, like the Counter counts stored next to it.
vocab['<unk>'] = len(text) - sum(vocab.values())

# A token's id is its position in the vocabulary; dicts preserve insertion
# order, so ids follow descending frequency with '<unk>' last.
id_to_word = list(vocab)
word_to_id = {word: i for i, word in enumerate(id_to_word)}

print(word_to_id)

 

V2

from collections import Counter
import numpy as np
import pandas as pd
import csv

vocab_file = r"resources/vocab.txt"
cut_num = 7


def load_vocab(path):
    """Return the first tab-separated column of the vocabulary file as a list.

    The position of a token in the returned list is its id.

    Args:
        path: Path to a UTF-8 text file without a header row.

    Returns:
        The tokens of column 0 as strings, in file order.
    """
    vocab_df = pd.read_csv(
        path,
        encoding='utf-8',
        sep='\t',
        header=None,
        # Quote characters are ordinary tokens in a vocabulary, not quoting.
        quoting=csv.QUOTE_NONE,
        # Keep every token a string and stop pandas from turning tokens such
        # as "null", "NA" or "nan" into float NaN.
        dtype=str,
        keep_default_na=False,
    )
    return vocab_df[0].tolist()


def pad_or_truncate(raw_text, max_len):
    """Split text into characters and force the result to exactly max_len items.

    Surrounding whitespace and all spaces are removed first. Longer inputs are
    cut to their first max_len characters; shorter ones are right-padded with
    the '<pad>' token.

    Args:
        raw_text: The string to tokenise character by character.
        max_len: Number of tokens in the returned list.

    Returns:
        A list of max_len single-character tokens and/or '<pad>' markers.
    """
    chars = list(raw_text.strip().replace(" ", ""))
    if len(chars) > max_len:
        return chars[:max_len]
    return chars + ['<pad>'] * (max_len - len(chars))


def encode_tokens(tokens, word_to_id):
    """Map each token to its id, using the id of '<unk>' for unknown tokens.

    Args:
        tokens: Iterable of tokens to encode.
        word_to_id: Mapping from token to integer id.

    Returns:
        A list with one id per token. Unknown tokens get None when the
        mapping has no '<unk>' entry.
    """
    unk_id = word_to_id.get('<unk>')
    # Use the default argument of dict.get rather than a truthiness test on
    # the result: id 0 is a valid id and must not be replaced by '<unk>'.
    return [word_to_id.get(token, unk_id) for token in tokens]


if __name__ == "__main__":
    id_to_word = load_vocab(vocab_file)
    word_to_id = {word: i for i, word in enumerate(id_to_word)}

    text = '我是中 国人'
    # data preprocess
    text = pad_or_truncate(text, cut_num)
    text_encoded = encode_tokens(text, word_to_id)

    print("text:{}\n{}".format(text, text_encoded))

 

posted @ 2020-02-26 16:02  今夜无风  阅读(637)  评论(0)  编辑  收藏  举报