python(十)：列表转换成字典

一、列表转换成字典

        # Collect one category name per line of cat.txt (surrounding whitespace stripped).
        self.cat_list = []
        cat_path = os.path.join(self.raw_data, "cat.txt")
        with open(cat_path) as cat_file:
            self.cat_list.extend(row.strip() for row in cat_file)
        # Map each category name to its index in cat_list; if a name repeats,
        # the last index wins.
        self.label_dict = {name: idx for idx, name in enumerate(self.cat_list)}

二、NLP生成字典

def remove_1a(content):
    """Return ``content`` reduced to CJK ideographs, ASCII letters and digits.

    Every character outside U+4E00-U+9FA5, ``a-z``, ``A-Z`` and ``0-9``
    (punctuation, whitespace, newlines, ...) is dropped; the surviving
    characters are concatenated in their original order.

    NOTE(review): despite the function name, ASCII letters and digits are
    kept, not removed - confirm this is the intended behavior.
    """
    kept_runs = re.compile('[\u4e00-\u9fa5a-zA-Z0-9]+')
    return ''.join(match.group() for match in kept_runs.finditer(content))


def read_file(filename):
    """Read a labelled text file into parallel content/label lists.

    Each line is expected to hold exactly two fields, ``label`` and
    ``content``, joined by the separator used below. Lines with any other
    number of fields are skipped. The content is cleaned with ``remove_1a``
    and split into a list of single characters.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A ``(contents, labels)`` tuple where ``contents[i]`` is the list of
        characters for the line whose label is ``labels[i]``.
    """
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                label, content = line.split("  ")
            except ValueError:
                # Wrong number of fields: skip the malformed line. Only the
                # unpacking is guarded so that real errors (e.g. inside
                # remove_1a, or KeyboardInterrupt) are no longer swallowed.
                continue
            if content:
                contents.append(list(remove_1a(content)))
                labels.append(label)
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training set and save it.

    The characters returned by ``read_file`` are counted and the
    ``vocab_size - 1`` most frequent ones are considered; of those, only
    characters that occur more than twice are kept. The vocabulary is
    written to ``vocab_dir`` one token per line, with ``<PAD>`` first.

    Args:
        train_dir: Path of the training data file, passed to ``read_file``.
        vocab_dir: Path of the vocabulary file to write (UTF-8).
        vocab_size: Upper bound on the vocabulary size, ``<PAD>`` included.
    """
    data_train, _ = read_file(train_dir)

    counter = Counter()
    for content in data_train:
        counter.update(content)

    # Keep only characters seen more than twice. A comprehension (instead of
    # unpacking zip(*pairs)) also works when nothing passes the filter, in
    # which case the vocabulary holds just <PAD>.
    words = [
        token
        for token, freq in counter.most_common(vocab_size - 1)
        if freq > 2
    ]
    # Prepend <PAD>, the token used to pad all texts to the same length.
    words = ['<PAD>'] + words
    # Use a context manager so the file is flushed and closed deterministically.
    with open(vocab_dir, mode='w', encoding='utf-8', errors='ignore') as f:
        f.write('\n'.join(words) + '\n')

posted @ 2021-09-22 17:37 jasonzhangxianrong 阅读(1138) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

公告

昵称： jasonzhangxianrong
园龄： 5年6个月
粉丝： 107
关注： 1

+加关注

2025年3月

日

一

二

三

四

五

六

python(十)：列表转换成字典

一、列表转换成字典

二、NLP生成字典

公告

搜索

常用链接

随笔分类

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论