Machine Learning: A Text Classification Example with Naive Bayes
1. Prepare the training samples: the Fudan University text classification corpus is used (the expected directory layout is sketched below).
2. Train the model.
3. Prepare the test data.
4. Classify the test data.
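The segmentation step walks one sub-directory per category and expects each of them to contain GBK-encoded plain-text documents of that class. A hypothetical layout (the directory and file names below are illustrative, not taken from the corpus itself):

train/
    C3-Art/
        C3-Art0001.txt
        C3-Art0002.txt
    C7-History/
        C7-History0001.txt
    C39-Sports/
        C39-Sports0001.txt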
Training the model

The training script segments the corpus with jieba, packs the segmented texts into a Bunch object, and persists a TF-IDF vector space built with TfidfVectorizer.
import os
import jieba
import pickle
# Bunch class (in newer scikit-learn versions it lives in sklearn.utils)
from sklearn.datasets.base import Bunch
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer  # TF-IDF transformer class
from sklearn.feature_extraction.text import TfidfVectorizer


# Two helper functions for reading and saving files
# Save content to a file
def savefile(savepath, content):
    fp = open(savepath, "w", encoding="GBK")
    fp.write(content)
    fp.close()


# Read a file
def readfile(path, encode):
    content = None
    try:
        fp = open(path, "r", encoding=encode)
        content = fp.read()
        fp.close()
    except UnicodeDecodeError:
        print("Error: failed to read file")
    else:
        return content


# 1. Functions for reading and writing Bunch objects
def readbunchobj(path):
    file_obj = open(path, "rb")
    bunch = pickle.load(file_obj)
    file_obj.close()
    return bunch


# Write a Bunch object
def writebunchobj(path, bunchobj):
    file_obj = open(path, "wb")
    pickle.dump(bunchobj, file_obj)
    file_obj.close()


# Main routine for segmenting the whole corpus
# Segment the training texts and store the result
def segment(corpus_path, seg_path):
    # Get all sub-directories (categories) under corpus_path
    cateList = os.listdir(corpus_path)
    for myDir in cateList:
        if not myDir.startswith("."):
            # Path of the category sub-directory
            class_path = corpus_path + myDir + "/"
            # Path of the segmented category directory
            seg_dir = seg_path + myDir + "/"
            # Create the segmented directory if it does not exist
            if not os.path.exists(seg_dir):
                os.makedirs(seg_dir)
            # All files under this category directory
            file_list = os.listdir(class_path)
            # Iterate over the files of this category
            for file_path in file_list:
                # Full path of the file
                fullname = class_path + file_path
                print("path:" + fullname)
                # Read the file content
                content = readfile(fullname, "GBK")
                if content is not None:
                    content = content.strip()
                    # Remove line breaks and extra whitespace
                    content = content.replace("\r\n", "").strip()
                    # Segment the content with jieba
                    content_seg = jieba.cut(content)
                    # Save the segmented text, joining the tokens with spaces
                    # so that TfidfVectorizer can tokenize it later
                    savefile(seg_dir + file_path, " ".join(content_seg))
    print("Finished segmenting the Chinese corpus!!!")


# Convert the segmented text files into a Bunch object and persist it
def bunchObj(wordbag_path, seg_path):
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    catelist = os.listdir(seg_path)
    # Save the category names into the Bunch object
    bunch.target_name.extend(catelist)
    for myDir in catelist:
        if not myDir.startswith("."):
            class_path = seg_path + myDir + "/"
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print(fullname)
                # Category label of the current file
                bunch.label.append(myDir)
                # Path of the current file
                bunch.filename.append(fullname)
                # Segmented content of the file
                bunch.contents.append(readfile(fullname, "GBK").strip())
    # Persist the Bunch object
    file_obj = open(wordbag_path, "wb")
    pickle.dump(bunch, file_obj)
    file_obj.close()
    print("Finished building the text Bunch object!!!")


# Train the model
def startTrain(stopword_path, wordbag_path, space_path):
    stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
    # Build the TF-IDF bag of words from the training set
    # 2. Load the segmented Bunch object
    bunch = readbunchobj(wordbag_path)
    # 3. Build the TF-IDF vector space model
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filename=bunch.filename, tdm=[], vocabulary={})
    # Initialize the vector space model with TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
    transform = TfidfTransformer()  # computes the TF-IDF weight of every term
    # 4. Convert the texts into a TF-IDF matrix and keep the vocabulary separately
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # 5. Persist the bag of words
    writebunchobj(space_path, tfidfspace)
    print("Finished training the text classification model")


# Path of the raw (unsegmented) classified corpus
corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/文本集/train/"
# Path of the segmented classified corpus
segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_segment/"
# Path of the persisted Bunch object built from the segmented corpus
wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/train_set.dat"
# Path of the stop word list
stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
# Path where the TF-IDF bag of words is persisted
space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

# Segment the training texts and store the result (run once, then comment out)
# segment(corpus_path, segment_path)

# Convert the segmented text files into a Bunch object and persist it (run once, then comment out)
# bunchObj(wordbag_path, segment_path)

# Start training
startTrain(stop_words_path, wordbag_path, space_path)
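To see what startTrain actually persisted, the tfidfspace.dat file can be loaded back and inspected. A minimal sketch, assuming the training script above has been run and wrote the file to the path used there:

import pickle

# Load the persisted TF-IDF space and inspect its size
with open("/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat", "rb") as f:
    space = pickle.load(f)

print("categories:", space.target_name)      # one entry per corpus sub-directory
print("documents :", space.tdm.shape[0])     # rows of the sparse TF-IDF matrix
print("terms     :", len(space.vocabulary))  # columns / vocabulary size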
Preparing the test data

The test data goes through the same segmentation and Bunch-building steps. The only difference is that its TF-IDF space is built with the vocabulary saved during training, so the test matrix lines up with the training matrix column by column.
import os
import jieba
import pickle
# Bunch class (in newer scikit-learn versions it lives in sklearn.utils)
from sklearn.datasets.base import Bunch
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer  # TF-IDF transformer class
from sklearn.feature_extraction.text import TfidfVectorizer


# Two helper functions for reading and saving files
# Save content to a file
def savefile(savepath, content):
    fp = open(savepath, "w", encoding="GBK")
    fp.write(content)
    fp.close()


# Read a file
def readfile(path, encode):
    content = None
    try:
        fp = open(path, "r", encoding=encode)
        content = fp.read()
        fp.close()
    except UnicodeDecodeError:
        print("Error: failed to read file")
    else:
        return content


# 1. Functions for reading and writing Bunch objects
def readbunchobj(path):
    file_obj = open(path, "rb")
    bunch = pickle.load(file_obj)
    file_obj.close()
    return bunch


# Write a Bunch object
def writebunchobj(path, bunchobj):
    file_obj = open(path, "wb")
    pickle.dump(bunchobj, file_obj)
    file_obj.close()


# Main routine for segmenting the whole corpus
# Segment the test texts and store the result
def segment(corpus_path, seg_path):
    # Get all sub-directories (categories) under corpus_path
    cateList = os.listdir(corpus_path)
    for myDir in cateList:
        if not myDir.startswith("."):
            # Path of the category sub-directory
            class_path = corpus_path + myDir + "/"
            # Path of the segmented category directory
            seg_dir = seg_path + myDir + "/"
            # Create the segmented directory if it does not exist
            if not os.path.exists(seg_dir):
                os.makedirs(seg_dir)
            # All files under this category directory
            file_list = os.listdir(class_path)
            # Iterate over the files of this category
            for file_path in file_list:
                # Full path of the file
                fullname = class_path + file_path
                print("path:" + fullname)
                # Read the file content
                content = readfile(fullname, "GBK")
                if content is not None:
                    content = content.strip()
                    # Remove line breaks and extra whitespace
                    content = content.replace("\r\n", "").strip()
                    # Segment the content with jieba
                    content_seg = jieba.cut(content)
                    # Save the segmented text, joining the tokens with spaces
                    # so that TfidfVectorizer can tokenize it later
                    savefile(seg_dir + file_path, " ".join(content_seg))
    print("Finished segmenting the Chinese corpus!!!")


# Convert the segmented text files into a Bunch object and persist it
def bunchObj(wordbag_path, seg_path):
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    catelist = os.listdir(seg_path)
    # Save the category names into the Bunch object
    bunch.target_name.extend(catelist)
    for myDir in catelist:
        if not myDir.startswith("."):
            class_path = seg_path + myDir + "/"
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print(fullname)
                # Category label of the current file
                bunch.label.append(myDir)
                # Path of the current file
                bunch.filename.append(fullname)
                # Segmented content of the file
                bunch.contents.append(readfile(fullname, "GBK").strip())
    # Persist the Bunch object
    file_obj = open(wordbag_path, "wb")
    pickle.dump(bunch, file_obj)
    file_obj.close()
    print("Finished building the text Bunch object!!!")


# Build the TF-IDF vector space of the test set (reusing the training vocabulary)
def startTrain(stopword_path, wordbag_path, space_path, train_space_path):
    stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
    # 2. Load the segmented Bunch object of the test set
    bunch = readbunchobj(wordbag_path)
    # 3. Build the TF-IDF vector space of the test set
    testspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                      filenames=bunch.filename, tdm=[], vocabulary={})
    # 4. Load the training bag of words
    trainbunch = readbunchobj(train_space_path)
    # 5. Initialize the vector space model with the vocabulary of the training set
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                 max_df=0.5, vocabulary=trainbunch.vocabulary)
    transformer = TfidfTransformer()
    testspace.tdm = vectorizer.fit_transform(bunch.contents)
    testspace.vocabulary = trainbunch.vocabulary
    writebunchobj(space_path, testspace)
    print("Finished building the test-set vector space")


# Path of the raw (unsegmented) test corpus
corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/测试文本集/"
# Path of the segmented test corpus
segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_segment/"
# Path of the persisted Bunch object built from the segmented test corpus
wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/test_set.dat"
# Path of the stop word list
stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
# Path where the test-set vector space is persisted
space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
# Path of the persisted training bag of words
train_space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

# Segment the test texts and store the result (run once, then comment out)
# segment(corpus_path, segment_path)

# Convert the segmented text files into a Bunch object and persist it (run once, then comment out)
# bunchObj(wordbag_path, segment_path)

# Build the test-set vector space
startTrain(stop_words_path, wordbag_path, space_path, train_space_path)
Testing

The classification script loads both persisted vector spaces, fits a multinomial Naive Bayes model on the training TF-IDF matrix, and reports the error rate on the test set.
import pickle
from sklearn.naive_bayes import MultinomialNB  # multinomial Naive Bayes


def readbunchobj(path):
    file_obj = open(path, "rb")
    bunch = pickle.load(file_obj)
    file_obj.close()
    return bunch


# Load the training vector space
trainpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"
train_set = readbunchobj(trainpath)

# Load the test vector space
testpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
test_set = readbunchobj(testpath)

# Apply multinomial Naive Bayes
# alpha = 0.001: additive (Laplace/Lidstone) smoothing parameter; smaller values
# follow the training counts more closely, larger values smooth the estimates more
clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)

# Predict the class of every test document
predicted = clf.predict(test_set.tdm)
total = len(predicted)
rate = 0
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    print(file_name, ": actual class:", flabel, "--> predicted class:", expct_cate)
    if flabel != expct_cate:
        rate += 1

# Error rate
print("error rate:", float(rate) * 100 / float(total), "%")
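The manual loop above only reports the overall error rate. scikit-learn's metrics module can give a fuller per-category picture; a minimal sketch, assuming test_set and predicted are the objects produced by the script above:

from sklearn import metrics

# Precision, recall and F1 per category, plus overall accuracy,
# computed from the true labels and the predicted labels.
print(metrics.classification_report(test_set.label, predicted))
print("accuracy:", metrics.accuracy_score(test_set.label, predicted))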