fasttext模型 训练THUCNews
# -*- coding: utf-8 -*-
"""Train and evaluate a fastText text classifier on the cnews (THUCNews) data.

The raw files hold one sample per line as ``<label>\t<content>``.  Each sample
is tokenized with jieba, converted to fastText's ``__label__<label>\t<tokens>``
format, and written to a separate file for training and for testing.
"""
import random

import fasttext
import jieba
from sklearn import metrics

# fastText recognises a token as a label only when it carries this prefix.
LABEL_PREFIX = "__label__"

RAW_TRAIN_PATH = 'data/cnews/cnews.train.txt'
RAW_TEST_PATH = 'data/cnews/cnews.test.txt'
FASTTEXT_TRAIN_PATH = 'data/cnews/fast_train.txt'
FASTTEXT_TEST_PATH = 'data/cnews/fast_test.txt'


def _segment(content):
    """Return *content* tokenized by jieba, keeping only tokens longer than one
    character, joined with single spaces."""
    return " ".join(seg for seg in jieba.cut(content) if len(seg) > 1)


def _iter_samples(filename):
    """Yield ``(label, segmented_text)`` for every non-blank line of *filename*.

    Raises:
        ValueError: if a non-blank line has no tab separating label and content.
    """
    with open(filename, encoding="utf-8") as src:
        for line in src:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the first tab only so tabs inside the content survive.
            label, content = line.split("\t", 1)
            yield label, _segment(content)


def read_file(filename, out_path=FASTTEXT_TEST_PATH):
    """Convert a raw ``<label>\\t<content>`` file into fastText training format.

    Samples are shuffled before being written, one per line, as
    ``__label__<label>\\t<space separated tokens>``.

    Args:
        filename: path of the raw input file.
        out_path: path of the fastText-format file to write.  Defaults to the
            test file for backward compatibility with ``read_file(filename)``.
            The file is overwritten rather than appended to, so that re-runs
            and successive calls do not mix samples from different sources.
    """
    sentences = [
        LABEL_PREFIX + label + "\t" + text
        for label, text in _iter_samples(filename)
    ]
    random.shuffle(sentences)
    with open(out_path, "w", encoding="utf-8") as out:
        out.writelines(sentence + "\n" for sentence in sentences)


def main():
    """Build the fastText input files, train the model and print its metrics."""
    # The training samples must land in the file that supervised() reads.
    read_file(RAW_TRAIN_PATH, FASTTEXT_TRAIN_PATH)
    fasttext.supervised(FASTTEXT_TRAIN_PATH, 'new_fasttext.model')
    # NOTE(review): the model is reloaded from disk exactly as the original
    # script did; the label comparison below expects predictions that still
    # carry the "__label__" prefix -- confirm before dropping the reload.
    classifier = fasttext.load_model('new_fasttext.model.bin')

    categories = ['体育', '财经', '房产', '家居', '教育',
                  '科技', '时尚', '时政', '游戏', '娱乐']

    read_file(RAW_TEST_PATH, FASTTEXT_TEST_PATH)
    result = classifier.test(FASTTEXT_TEST_PATH)
    print("准确率为:%f" % result.precision)
    print("召回率为: %f" % result.recall)

    contents, labels = [], []
    for label, text in _iter_samples(RAW_TEST_PATH):
        contents.append(text)
        labels.append(LABEL_PREFIX + label)

    # predict() yields a list of candidate labels per text; keep the top one.
    label_predict = [e[0] for e in classifier.predict(contents)]
    print("Precision,Recall and F1-Score....")
    # target_names are matched positionally, so pass the label order explicitly;
    # otherwise sklearn sorts the labels and the row names no longer line up.
    print(metrics.classification_report(
        labels,
        label_predict,
        labels=[LABEL_PREFIX + name for name in categories],
        target_names=categories,
    ))


if __name__ == "__main__":
    main()
关于fasttext的使用一些疑问:fasttext.supervised的参数label_prefix 一直提示我这个参数使用有问题... 然而,搜索了半天,我也没搞明白这个参数哪里有问题
还有一点需要注意的地方:fasttext的识别标签统一需要在标签前面加上"__label__"
后续会更新fasttext的原理