# Chinese text classification (中文文本分类)
import os

import numpy as np

# Root directory of the corpus. The name of the directory that directly
# contains a file is used as that file's category (see genInfo).
path = r'C:\Users\Administrator\Desktop\0369data'


def readfile(path):
    """Walk ``path`` recursively and load every file found via ``genInfo``.

    Args:
        path: Root directory to scan.

    Returns:
        A list of ``(category, content)`` tuples, one per file, in
        ``os.walk`` order. The list is empty when no files are found.
    """
    samples = []
    for root, _dirs, files in os.walk(path):
        for name in files:
            samples.append(genInfo(os.path.join(root, name)))
    return samples


def genInfo(path):
    """Return the category label and the text of a single corpus file.

    The category is the name of the directory that directly contains the
    file.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A ``(category, content)`` tuple of strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Derive the parent directory name with os.path instead of splitting on
    # '\\', so the lookup works with whatever separator the platform uses.
    category = os.path.basename(os.path.dirname(path))
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()
    return category, content


# Draft of the next processing steps, kept from the original (not active yet).
# It appears to load a stop-word list and filter tokens against it.
# NOTE(review): `tokens` is never defined in this draft, and
# delimiter=r'\t' is a literal backslash + 't', not a tab -- confirm intent.
# import jieba
# import jieba.posseg as psg
# file_path = r'C:\Users\Administrator\Desktop\stopsCN.txt'
# fo = open(file_path,'r',encoding='utf-8').read()
# stops = np.loadtxt(file_path,dtype=str,delimiter=r'\t',encoding='utf-8')
# stops.shape
# tokens=[token for token in tokens if token not in stops]
# tokens