doc2vec Usage Notes
#!/usr/bin/env python
# coding:utf-8
# Import dependencies
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
import chardet
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import numpy
from random import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics


# Doc2Vec takes LabeledSentence objects as input, so build a helper class that
# turns each line of the source files into a LabeledSentence
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        flipped = {}
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(),
                                          [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    print chardet.detect(line)
                    # Re-encode GB2312 input as UTF-8 before tokenising
                    line = line.decode("GB2312", 'ignore').encode("utf-8")
                    print chardet.detect(line)
                    self.sentences.append(
                        LabeledSentence(utils.to_unicode(line).split(),
                                        [prefix + '_%s' % item_no]))
                    # self.sentences.append(LabeledSentence(utils.to_utf8(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences


# Feed the text data into Doc2Vec as follows
# sources = {u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/neg_train.txt': 'TRAIN_NEG',
#            u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/pos_train.txt': 'TRAIN_POS',
#            u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/uns_train.txt': 'TRAIN_UNS',
#            u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/uns_test.txt': 'TEST_UNS'}
sources = {
    './yuliao/fYuliao0.txt': 'TRAIN_0',
    './yuliao/fYuliao1.txt': 'TRAIN_1',
    './yuliao/fYuliao2.txt': 'TRAIN_2',
    './yuliao/fYuliao3.txt': 'TRAIN_3',
    './yuliao/fYuliao4.txt': 'TRAIN_4',
    './yuliao/fYuliao5.txt': 'TRAIN_5',
}

sentences = LabeledLineSentence(sources)

# Build the Doc2Vec model
model = Doc2Vec(min_count=1, window=15, size=100, sample=1e-4, negative=5, workers=8)
model.build_vocab(sentences.to_array())

# Train the Doc2Vec model (only 2 passes here; iterate more if time allows)
for epoch in range(2):
    model.train(sentences.sentences_perm())   # reshuffle the sentences on every pass

model.save("model.txt")
# model = Doc2Vec.load("model.txt")

# Pack the trained sentence vectors into arrays; they are the classifier input below
train_arrays = numpy.zeros((5000, 100))
train_labels = numpy.zeros(5000)
test_arrays = []
true_labels = []
train_data = []
train_lb = []

# The 5000 training documents are laid out by class:
# 0-645 -> class 0, 646-4249 -> class 1, 4250-4800 -> class 2,
# 4801-4965 -> class 3, 4966-4994 -> class 4, 4995-4999 -> class 5
for i in range(5000):
    if i <= 645:
        prefix_train_0 = 'TRAIN_0_' + str(i)
        train_arrays[i] = model.docvecs[prefix_train_0]
        train_labels[i] = 0
    elif i > 645 and i <= 4249:
        j = i - 646
        prefix_train_1 = 'TRAIN_1_' + str(j)
        train_arrays[i] = model.docvecs[prefix_train_1]
        train_labels[i] = 1
    elif i > 4249 and i <= 4800:
        j = i - 4250
        prefix_train_2 = 'TRAIN_2_' + str(j)
        train_arrays[i] = model.docvecs[prefix_train_2]
        train_labels[i] = 2
    elif i > 4800 and i <= 4965:
        j = i - 4801
        prefix_train_3 = 'TRAIN_3_' + str(j)
        train_arrays[i] = model.docvecs[prefix_train_3]
        train_labels[i] = 3
    elif i > 4965 and i <= 4994:
        j = i - 4966
        prefix_train_4 = 'TRAIN_4_' + str(j)
        train_arrays[i] = model.docvecs[prefix_train_4]
        train_labels[i] = 4
    else:
        j = i - 4995
        prefix_train_5 = 'TRAIN_5_' + str(j)
        train_arrays[i] = model.docvecs[prefix_train_5]
        train_labels[i] = 5
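The listing above uses the old gensim API: LabeledSentence, and train() called with no arguments inside a manual epoch loop. As a rough sketch only, assuming gensim 3.x or newer and the same ./yuliao/fYuliaoN.txt corpus layout, the corpus-building and training step looks roughly like this with the current API (TaggedDocument replaces LabeledSentence, size becomes vector_size, and train() wants explicit total_examples/epochs):

# Sketch of the same step with the newer gensim API (not the code used above).
# Assumes gensim >= 3.x and the same ./yuliao/fYuliaoN.txt files.
import io
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

new_sources = {'./yuliao/fYuliao%d.txt' % n: 'TRAIN_%d' % n for n in range(6)}

def read_corpus(srcs):
    # One TaggedDocument per line, tagged 'TRAIN_k_<line number>' as above
    for path, prefix in srcs.items():
        with io.open(path, encoding='utf-8', errors='ignore') as fin:
            for item_no, line in enumerate(fin):
                yield TaggedDocument(line.split(), ['%s_%s' % (prefix, item_no)])

corpus = list(read_corpus(new_sources))
new_model = Doc2Vec(vector_size=100, window=15, min_count=1,
                    sample=1e-4, negative=5, workers=8, epochs=10)
new_model.build_vocab(corpus)
new_model.train(corpus, total_examples=new_model.corpus_count, epochs=new_model.epochs)
# Vectors are then looked up by tag, e.g. new_model.docvecs['TRAIN_0_0']
# (new_model.dv['TRAIN_0_0'] in gensim 4.x)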
# Load the test set data
a = open("./yuliao/fYuliao0_test.txt")
b = open("./yuliao/fYuliao1_test.txt")
c = open("./yuliao/fYuliao2_test.txt")
d = open("./yuliao/fYuliao3_test.txt")
e = open("./yuliao/fYuliao4_test.txt")
f = open("./yuliao/fYuliao5_test.txt")
test_content1 = a.readlines()
test_content2 = b.readlines()
test_content3 = c.readlines()
test_content4 = d.readlines()
test_content5 = e.readlines()
test_content6 = f.readlines()

# Sanity check: infer a vector for one test line and look at its nearest training documents
g = open("./yuliao/fYuliao0_test.txt")
test_content7 = g.readline()
inferred_docvec = model.infer_vector(test_content7.split())   # infer_vector expects a token list, not a raw string
print model.docvecs.most_similar([inferred_docvec], topn=3)

# Infer a vector for every test line and record its true class
for i in test_content1:
    test_arrays.append(model.infer_vector(i.split()))
    true_labels.append(0)
for i in test_content2:
    test_arrays.append(model.infer_vector(i.split()))
    true_labels.append(1)
for i in test_content3:
    test_arrays.append(model.infer_vector(i.split()))
    true_labels.append(2)
for i in test_content4:
    test_arrays.append(model.infer_vector(i.split()))
    true_labels.append(3)
for i in test_content5:
    test_arrays.append(model.infer_vector(i.split()))
    true_labels.append(4)
for i in test_content6:
    test_arrays.append(model.infer_vector(i.split()))
    true_labels.append(5)

# Build a logistic regression classifier
classifier = LogisticRegression(class_weight={0: 0.38, 1: 0.62})
classifier.fit(train_arrays, train_labels)

# Build a random forest classifier
'''
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=1200, max_depth=14, class_weight={0: 0.3, 1: 0.7})
RF.fit(train_arrays, train_labels)
'''

# Build a GBDT classifier
'''
from sklearn.ensemble import GradientBoostingClassifier
GBDT = GradientBoostingClassifier(n_estimators=1000, max_depth=14)
GBDT.fit(train_arrays, train_labels)
'''

# Predict on the test data
test_labels_LR = []
# test_labels_RF = []
# test_labels_GBDT = []
for i in range(len(test_arrays)):
    test_labels_LR.append(classifier.predict([test_arrays[i]]))   # predict expects a 2D array
    '''
    test_labels_RF.append(RF.predict([test_arrays[i]]))
    test_labels_GBDT.append(GBDT.predict([test_arrays[i]]))
    '''

# Print each model's accuracy and recall
print("LR:")
test_labels_LR1 = []
count = 0
for i in range(len(test_labels_LR)):
    if test_labels_LR[i][0] == true_labels[i]:
        count += 1
print count   # number of test documents classified correctly by LR
'''
print("RF:")
print(metrics.accuracy_score(test_labels_RF, true_labels))
print(confusion_matrix(test_labels_RF, true_labels))
print("GBDT:")
print(metrics.accuracy_score(test_labels_GBDT, true_labels))
print(confusion_matrix(test_labels_GBDT, true_labels))
'''
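The LR block above only prints a raw count of correct predictions, while the commented-out RF/GBDT blocks already report accuracy and a confusion matrix. A small sketch, assuming test_labels_LR and true_labels are filled exactly as in the listing, of reporting the same (plus per-class precision/recall) for the LR model with the sklearn.metrics module already imported at the top:

# Sketch: summarise the LR predictions with sklearn.metrics
# (assumes the test_labels_LR / true_labels lists built above)
pred_LR = [p[0] for p in test_labels_LR]                     # unwrap the one-element arrays returned by predict()
print(metrics.accuracy_score(true_labels, pred_LR))          # overall accuracy
print(confusion_matrix(true_labels, pred_LR))                # rows = true class, columns = predicted class
print(metrics.classification_report(true_labels, pred_LR))   # per-class precision / recall / F1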