liblinear是libsvm的线性核的改进版本,专门适用于百万数据量的分类。正好适用于我这次数据挖掘的实验。
liblinear的用法和libsvm很相似,我使用的是.exe文件,利用python的subprocess向控制台发送命令即可完成本次实验。
其中核心两句即
train train.txt
predict test.txt train.txt.model output.txt
由于是线性核,没有设置参数c、g
对于50W篇文章模型训练仅需340秒,50W篇文章的预测仅需6秒
"""Train a LIBLINEAR model, predict the test set and report the metrics.

The script shells out to the ``train`` and ``predict`` executables, then
compares the labels in ``output.txt`` with the labels in ``test.txt`` and
reports accuracy, the confusion matrix and per-class precision, recall and
F-measure. The report is printed and also saved to ``finalresult.txt``.
"""
import subprocess
import time

TRAIN_COMMAND = "train train.txt"
PREDICT_COMMAND = "predict test.txt train.txt.model output.txt"
TEST_FILENAME = "test.txt"
PREDICT_FILENAME = "output.txt"
FINAL_RESULT_FILENAME = "./finalresult.txt"

# Maps the numeric label used in the data files to its category name.
num_to_cate = {0: "it", 1: "体育", 2: "军事", 3: "金融", 4: "健康",
               5: "汽车", 6: "房产", 7: "文化", 8: "教育", 9: "娱乐"}

# Category names in label order; this fixes the row/column order of the report.
class_label = ["it", "体育", "军事", "金融", "健康",
               "汽车", "房产", "文化", "教育", "娱乐"]

# Alternative category mapping kept from the original script (unused):
# cate_to_num = {"it":0,"体育":1,"军事":2,"华人":3,"国内":4,"国际":5,"房产":6,"文娱":7,"社会":8,"财经":9}
# num_to_cate = {0:"it",1:"体育",2:"军事",3:"华人",4:"国内",5:"国际",6:"房产",7:"文娱",8:"社会",9:"财经"}


def run_command(cmd, discard_stdout=False):
    """Run *cmd* through the shell and wait for it to finish.

    Args:
        cmd: Command line to execute. It is a fixed string defined in this
            file, so running it through the shell is safe here.
        discard_stdout: When true, the command's standard output is thrown
            away instead of being shown on the console.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. Continuing would evaluate a missing or stale output file.
    """
    stdout = subprocess.DEVNULL if discard_stdout else None
    process = subprocess.Popen(cmd, shell=True, stdout=stdout)
    process.communicate()
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, cmd)


def read_real_labels(filename):
    """Return the label (first whitespace-separated field) of every line.

    Blank lines are skipped. Taking the whole first field rather than the
    first character also supports labels longer than one character.
    """
    labels = []
    with open(filename, "r", encoding="utf-8") as label_file:
        for line in label_file:
            fields = line.split(None, 1)
            if fields:
                labels.append(fields[0])
    return labels


def read_predicted_labels(filename):
    """Return every whitespace-separated token of the prediction file."""
    with open(filename, "r", encoding="utf-8") as predict_file:
        return predict_file.read().split()


def evaluate(real_labels, predicted_labels):
    """Compute accuracy, confusion matrix, precision, recall and F-measure.

    Args:
        real_labels: True labels, one per sample (anything ``int()`` accepts).
        predicted_labels: Predicted labels, aligned with *real_labels*.

    Returns:
        A dict with the keys ``"accuracy"`` (percentage), ``"confusion"``
        (``confusion[real_name][predicted_name]`` -> count), ``"precision"``,
        ``"recall"`` and ``"f_measure"`` (category name -> value in [0, 1]).
        A metric whose denominator is zero is reported as ``0.0``.

    Raises:
        ValueError: If the two label sequences differ in length, or a label
            is not an integer.
        KeyError: If a label is not a key of ``num_to_cate``.
    """
    real_ids = [int(label) for label in real_labels]
    predicted_ids = [int(label) for label in predicted_labels]
    if len(real_ids) != len(predicted_ids):
        # zip() would silently drop the surplus and distort every metric.
        raise ValueError(
            "label count mismatch: " + str(len(real_ids)) + " real vs "
            + str(len(predicted_ids)) + " predicted"
        )

    # Each row needs its own dict; rows are true classes, columns predictions.
    confusion = {name: dict.fromkeys(class_label, 0) for name in class_label}
    for real_id, predicted_id in zip(real_ids, predicted_ids):
        confusion[num_to_cate[real_id]][num_to_cate[predicted_id]] += 1

    total_sample = len(real_ids)
    correct = sum(confusion[name][name] for name in class_label)
    accuracy = correct / total_sample * 100 if total_sample else 0.0

    precision = {}
    recall = {}
    f_measure = {}
    for name in class_label:
        true_positive = confusion[name][name]
        # Column sum: how many samples were predicted as this class.
        predicted_total = sum(confusion[row][name] for row in class_label)
        # Row sum: how many samples really belong to this class.
        real_total = sum(confusion[name].values())

        p = true_positive / predicted_total if predicted_total else 0.0
        r = true_positive / real_total if real_total else 0.0
        precision[name] = p
        recall[name] = r
        f_measure[name] = 2 * r * p / (p + r) if p + r else 0.0

    return {
        "accuracy": accuracy,
        "confusion": confusion,
        "precision": precision,
        "recall": recall,
        "f_measure": f_measure,
    }


def save_report(filename, result):
    """Write the evaluation *result* (as returned by ``evaluate``) to a file."""
    with open(filename, "w", encoding="utf-8") as report:
        for name, row in result["confusion"].items():
            report.write(name + ":" + str(row) + "\n")
        report.write("\n")
        report.write("正确率为" + str(result["accuracy"]) + "%" + "\n\n")
        report.write("精确率为" + str(result["precision"]) + "\n\n")
        report.write("召回率为" + str(result["recall"]) + "\n\n")
        report.write("F测度为" + str(result["f_measure"]) + "\n\n")


def main():
    """Run training, prediction, evaluation and saving, timing each stage."""
    start_time = time.time()
    print("训练")
    run_command(TRAIN_COMMAND, discard_stdout=True)
    print("训练结束", str(time.time() - start_time))

    start_time = time.time()
    print("预测")
    run_command(PREDICT_COMMAND)
    print("预测结束", str(time.time() - start_time))

    # Compare the true test labels with the predicted ones.
    start_time = time.time()
    print("统计")
    real_class = read_real_labels(TEST_FILENAME)
    predict_class = read_predicted_labels(PREDICT_FILENAME)
    result = evaluate(real_class, predict_class)
    print("正确率 为", str(result["accuracy"]) + "%")

    # Print the confusion matrix, one true class per line.
    for name, row in result["confusion"].items():
        print(name + ":" + str(row))

    print("统计结束", str(time.time() - start_time))
    print("精确率为", str(result["precision"]))
    print("召回率为", str(result["recall"]))
    print("F测度为", str(result["f_measure"]))

    print("保存结果")
    save_report(FINAL_RESULT_FILENAME, result)
    print("保存结果结束")


if __name__ == "__main__":
    main()