import csv
import os
import numpy as np
# Script: split one CSV file into a training CSV and a validation CSV by
# drawing disjoint random row indices with a fixed seed.

# Header row of the source CSV; stays empty unless the header handling noted
# in the reading section below is enabled.
labels = []
# Every row read from the source CSV, in file order.
data = []
a_train_file = r'xxx\train.csv'
a_test_file = r'xxx\valid.csv'
a_file = r'\all.csv'
all_data_num = 20000  # row indices are drawn from range(all_data_num)
train_num = 18000     # number of rows written to the training file
test_num = 2000       # number of rows written to the validation file
seed = 3


def _write_subset(path, rows, indices):
    """Write ``rows[i]`` for each ``i`` in ``indices`` to ``path`` as CSV.

    Nothing is written when ``path`` already exists, so an earlier split is
    never overwritten.

    Raises:
        IndexError: if an index is outside ``rows``.
    """
    if os.path.exists(path):
        return
    # Select the rows before opening the file: a bad index must not leave an
    # empty file behind that later runs would then refuse to regenerate.
    subset = [rows[i] for i in indices]
    with open(path, "w", newline='') as out_file:
        writer = csv.writer(out_file)
        # If a header row was extracted into `labels`, write it first with
        # `writer.writerow(labels)`.
        writer.writerows(subset)


np.random.seed(seed)
# Pick train_num distinct row indices out of range(all_data_num).
train_indices = np.random.choice(all_data_num, train_num, replace=False)
# Row indices NOT used for training; setdiff1d returns them sorted, which
# keeps the split reproducible for a given seed.
residue = np.setdiff1d(np.arange(all_data_num), train_indices)
# Sample the validation rows from the leftover indices themselves (not from
# positions 0..len(residue)-1), so the two splits can never share a row.
# When train_num + test_num == all_data_num this is simply a shuffle of
# `residue`.
test_indices = np.random.choice(residue, test_num, replace=False)

# Load every row of the source data set.
with open(a_file, newline='') as afile:
    a_reader = csv.reader(afile)
    # If the file starts with a header row, extract it here with
    # `labels = next(a_reader)` so that it is not sampled as data.
    data.extend(a_reader)

# Create the training file.
_write_subset(a_train_file, data, train_indices)
# Create the validation file.
_write_subset(a_test_file, data, test_indices)