# Convert txt files to Excel files
import pandas as pd
import os
# Convert every .txt file found in corpus/ into an .xlsx workbook that is
# written to the current working directory under the same base name.
file_list = os.listdir('corpus')
for file in file_list:
    # Split off the extension so only the suffix is swapped; a plain
    # str.replace('txt', 'xlsx') would also rewrite "txt" inside the stem.
    stem, ext = os.path.splitext(file)
    if ext != '.txt':
        # Anything else would yield an invalid Excel file name, so skip it.
        continue
    file_path = os.path.join('corpus', file)
    # Parsed with read_csv defaults (comma separator); no header row in the input.
    data = pd.read_csv(file_path, encoding='utf-8', header=None)
    new_file = stem + '.xlsx'
    # header=False omits the auto-generated integer column labels; the row
    # index is still written because to_excel's index option is left at its default.
    data.to_excel(new_file, header=False)
# Count the class distribution
from collections import Counter
# Count how many times each category value occurs in the column.
tag_num_dict = Counter(df["某列数据"])
# most_common() with no argument turns the counter into a list of
# (value, count) pairs ordered from the highest count to the lowest.
# NOTE(review): the result is not bound to a name; assign or print it if it
# is needed outside an interactive session.
tag_num_dict.most_common()
# Show the share of each label using the built-in pandas API.
# Per-label row counts, ordered by label value rather than by frequency.
label_counts = df["label"].value_counts()
label = label_counts.sort_index()
print("\n原始数据占比\n")
# Convert counts into percentages of the total number of rows in df.
label_percent = label / len(df) * 100
print(label_percent)
# Stratified split check: compare the label distribution of both splits.
# Per-label row counts in each split, ordered by label value.
train_set_income_count = train_set['label'].value_counts().sort_index()
test_set_income_count = test_set['label'].value_counts().sort_index()
# Each count is reported as a percentage of its own split's row count, so
# the two printouts can be compared directly.
print('\nStratify split train dataset, distribution: (%)')
print(train_set_income_count / len(train_set) * 100)
print('\nStratify split test dataset, distribution: (%)')
print(test_set_income_count / len(test_set) * 100)