gensim自然语言处理
# Reference code
#
# NOTE: this listing concatenates several independent scripts. Every
# "===== <name>.py =====" banner below starts a separate file.


# ===== ChineseClean_demo1.py =====
# -*- coding:utf-8 -*-
import xlrd
import xlwt

# Written for Python 3.4.
# `file` is the source workbook; change it here to process another file.
file = "./data/answer_detail_5_15307860968687.xls"
dirs = "./result"

HEADER = [u'UserNo', u'Name', u'Question', u'Answer',
          u'Layer', u'Mark', u'Score', u'AbilityID']


def _write_sheet(sheet, source, row_indices):
    """Write HEADER into row 0 of `sheet`, then copy the given `source` rows."""
    for col, title in enumerate(HEADER):
        sheet.write(0, col, title)
    for out_row, src_row in enumerate(row_indices, start=1):
        for col, value in enumerate(source.row_values(src_row)):
            sheet.write(out_row, col, value)


def read_excel(rows_numb, cols_numb):
    """Split the first sheet of `file` into zero-score and other rows.

    Args:
        rows_numb: Unused; kept for backward compatibility.
        cols_numb: Index of the column that is compared against '0'.

    Rows whose cell equals the string '0' go to sheet 'sheet1_1', all other
    rows go to 'sheet1_2'. The result is saved as './data/demo1.xls'.
    """
    book = xlwt.Workbook()
    zero_sheet = book.add_sheet(u'sheet1_1', cell_overwrite_ok=True)
    other_sheet = book.add_sheet(u'sheet1_2', cell_overwrite_ok=True)

    source = xlrd.open_workbook(file).sheet_by_index(0)  # sheet indices start at 0
    values = source.col_values(cols_numb)

    zero_rows, other_rows = [], []
    for i in range(1, len(values)):  # row 0 is the header
        if values[i] == '0':
            zero_rows.append(i)
        else:
            other_rows.append(i)

    _write_sheet(zero_sheet, source, zero_rows)
    _write_sheet(other_sheet, source, other_rows)
    book.save('./data/demo1.xls')


if __name__ == '__main__':
    rows_numb = 0
    cols_numb = 6
    read_excel(rows_numb, cols_numb)


# ===== ChineseClean_demo2.py =====
# -*- coding:utf-8 -*-
import xlrd
import xlwt

# Written for Python 3.4.
# `file` is the source workbook; change it here to process another file.
file = "./data/demo1.xls"

HEADER = [u'UserNo', u'Name', u'Question', u'Answer',
          u'Layer', u'Mark', u'Score', u'AbilityID']


def _write_sheet(sheet, source, row_indices):
    """Write HEADER into row 0 of `sheet`, then copy the given `source` rows."""
    for col, title in enumerate(HEADER):
        sheet.write(0, col, title)
    for out_row, src_row in enumerate(row_indices, start=1):
        for col, value in enumerate(source.row_values(src_row)):
            sheet.write(out_row, col, value)


def read_excel(rows_numb, cols_numb):
    """Split the first sheet of `file` into four score bands.

    Args:
        rows_numb: Unused; kept for backward compatibility.
        cols_numb: Index of the column whose cells are parsed with float().

    Bands and target sheets:
        sheet2_1: score < 12
        sheet2_2: 12 <= score < 16
        sheet2_3: 16 <= score < 18
        sheet2_4: score >= 18
    The result is saved as './data/demo2.xls'.

    Raises:
        ValueError: If a cell in the column cannot be converted to float.
    """
    book = xlwt.Workbook()
    sheets = [book.add_sheet(name, cell_overwrite_ok=True)
              for name in (u'sheet2_1', u'sheet2_2', u'sheet2_3', u'sheet2_4')]

    source = xlrd.open_workbook(file).sheet_by_index(0)  # sheet indices start at 0
    values = source.col_values(cols_numb)

    bands = [[], [], [], []]
    for i in range(1, len(values)):  # row 0 is the header
        score = float(values[i])
        if score < 12.0:
            bands[0].append(i)
        elif score < 16.0:
            bands[1].append(i)
        elif score < 18.0:
            bands[2].append(i)
        elif score >= 18.0:
            # A leftover debug print/exit() used to abort the script here,
            # so the top band was never written and nothing was saved.
            bands[3].append(i)

    for sheet, rows in zip(sheets, bands):
        _write_sheet(sheet, source, rows)
    book.save('./data/demo2.xls')


if __name__ == '__main__':
    rows_numb = 0
    cols_numb = 6
    read_excel(rows_numb, cols_numb)


# ===== ChineseClean_demo3.py =====
# -*- coding:utf-8 -*-
import xlrd
import xlwt

# Written for Python 3.4.
file = "./data/answer_detail_5_15307860968687.xls"

HEADER = [u'UserNo', u'Name', u'Question', u'Answer',
          u'Layer', u'Mark', u'Score', u'AbilityID']

# (sheet name, cell value selecting the rows for that sheet), in output order.
SHEET_FOR_VALUE = [
    (u'sheet1', '100012'),
    (u'sheet2', '100014'),
    (u'sheet3', '100007'),
    (u'sheet4', '100016'),
    (u'sheet5', '100017'),
]


def _write_sheet(sheet, source, row_indices):
    """Write HEADER into row 0 of `sheet`, then copy the given `source` rows."""
    for col, title in enumerate(HEADER):
        sheet.write(0, col, title)
    for out_row, src_row in enumerate(row_indices, start=1):
        for col, value in enumerate(source.row_values(src_row)):
            sheet.write(out_row, col, value)


def read_excel(rows_numb, cols_numb):
    """Split the first sheet of `file` into one sheet per selected cell value.

    Args:
        rows_numb: Unused; kept for backward compatibility.
        cols_numb: Index of the column compared against SHEET_FOR_VALUE.

    Rows whose cell matches none of the listed values are dropped.
    The result is saved as './data/demo3.xls'.
    """
    book = xlwt.Workbook()
    source = xlrd.open_workbook(file).sheet_by_index(0)  # sheet indices start at 0
    values = source.col_values(cols_numb)

    for sheet_name, wanted in SHEET_FOR_VALUE:
        sheet = book.add_sheet(sheet_name, cell_overwrite_ok=True)
        # Row 0 is the header, so matching starts at row 1.
        rows = [i for i in range(1, len(values)) if values[i] == wanted]
        _write_sheet(sheet, source, rows)

    book.save('./data/demo3.xls')


if __name__ == '__main__':
    rows_numb = 0
    cols_numb = 7
    read_excel(rows_numb, cols_numb)


# ===== ChineseClean_demo4or5.py =====
# Same as ChineseClean_demo3.py.


# ===== ChineseClean_answer_QA.py =====
# -*- coding:utf-8 -*-
import re
import xlrd

file = "./data/demo5.xls"
dirs = "./result"


def read_excel(rows_numb, cols1_numb):
    """Extract the regex-matched text of one column into a CSV file.

    Args:
        rows_numb: Unused; kept for backward compatibility.
        cols1_numb: List of column indices; element 3 selects the column read.

    For every data row of sheet `number` the runs matched by the pattern
    (code points U+2E80-U+FFFD, the quotes U+201C/U+201D, '-' and ':') are
    joined with single spaces and written as one line; rows without a match
    produce an empty line, so line k always corresponds to data row k.
    """
    number = '1'
    # The original concatenated dirs + './demo5...' which yields the
    # non-existent directory './result.' on POSIX systems.
    out_path = dirs + '/demo5_sheet1_%s.csv' % number

    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(int(number) - 1)  # sheet indices start at 0
    cells = sheet0.col_values(cols1_numb[3])[1:]  # skip the header row

    pattern1 = re.compile(
        r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+")

    # 'w' instead of 'a': re-running must not duplicate rows in the output.
    with open(out_path, 'w', encoding='utf-8') as f2:
        for cell in cells:
            f2.write(' '.join(pattern1.findall(cell)))
            f2.write('\n')


if __name__ == '__main__':
    rows_numb = 0
    cols1_numb = [0, 1, 2, 3, 4, 5, 6, 7]
    read_excel(rows_numb, cols1_numb)


# ===== qa_test_clean_word.py =====
# -*- coding: utf-8 -*-
import jieba

_STOPWORDS = None  # lazily loaded by seg_sentence()


def stopwordslist(filepath):
    """Return the stripped lines of the UTF-8 file `filepath` as a list."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]


def seg_sentence(sentence):
    """Segment `sentence` with jieba and drop stop words.

    Returns the kept words, each followed by a single space (so a non-empty
    result ends with a trailing space). Tab tokens are dropped as well.
    """
    global _STOPWORDS
    if _STOPWORDS is None:
        # Load once; the original re-read the file for every sentence.
        _STOPWORDS = set(stopwordslist('./test/stopwords.txt'))

    words = jieba.lcut_for_search(sentence.strip(), HMM=True)
    return ''.join(word + ' ' for word in words
                   if word not in _STOPWORDS and word != '\t')


with open('./data/demo5_answer_csv/demo5_sheet5_5.csv', 'r',
          encoding='utf-8') as inputs, \
        open('./test/demo5_sheet5_5_5.csv', 'w') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)
        if not line_seg:
            continue
        try:
            outputs.write(line_seg + '\n')
        except UnicodeEncodeError:
            # The output uses the platform default encoding; lines it cannot
            # represent are skipped (the original swallowed every exception).
            pass


# ===== word_fre.py =====
# -*- coding: utf-8 -*-
from collections import Counter

import matplotlib.pyplot as plt
from matplotlib.font_manager import *
import numpy as np


def drawStatBarh():
    """Draw a horizontal bar chart of the most frequent words.

    Reads './data/word_fre.txt' ("<word> <count>" per line, as written by
    wordCount) and plots at most the first N entries with a colour gradient.
    """
    fig, ax = plt.subplots()
    myfont = FontProperties(fname='./data/simfang.ttf')
    N = 30
    words = []
    counts = []
    with open('./data/word_fre.txt') as handle:
        for line in handle:
            if line == '\n':
                continue
            fields = line.split(' ')
            words.append(fields[0])
            print(fields[0])
            counts.append(int(fields[1].strip('\n')))

    # Plot only as many bars as there are words; a fixed N raised a shape
    # mismatch when the file held fewer than N entries.
    shown = min(N, len(words))
    y_pos = np.arange(shown)
    # Gradient: start from the base colour and decrement its numeric tail
    # once per additional bar.
    colors = ['#FA8072']
    for _ in range(shown - 1):
        colors.append('#FA' + str(int(colors[-1][3:]) - 1))

    rects = ax.barh(y_pos, counts[:shown], align='center', color=colors)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(words[:shown], fontproperties=myfont)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_title('报告中的高频词汇', fontproperties=myfont, fontsize=17)
    ax.set_xlabel(u"出现次数", fontproperties=myfont)
    autolabel(rects, ax)
    plt.show()


def autolabel(rects, ax):
    """Annotate every bar in `rects` with its integer width."""
    for rect in rects:
        width = rect.get_width()
        ax.text(1.03 * width, rect.get_y() + rect.get_height() / 2.,
                '%d' % int(width), ha='center', va='center')


def wordCount(segment_list):
    """Count word frequencies and store them in './data/word_fre.txt'.

    Args:
        segment_list: Whitespace-separated, already segmented text.

    Writes one "<word> <count>" line per distinct word, most frequent first.
    The file is consumed by drawStatBarh; it is not needed for word clouds.
    """
    # split() without an argument also splits on newlines. The original
    # split(' ') left '\n' glued to the first word of each line, so the same
    # word was counted under several keys.
    word_dict_sorted = Counter(segment_list.split()).most_common()
    print(word_dict_sorted)
    with open('./data/word_fre.txt', 'w') as wf2:
        for word, freq in word_dict_sorted:
            wf2.write(word + ' ' + str(freq) + '\n')


if __name__ == "__main__":
    with open('./data/demo5_sheet5_1_1.csv') as handle:
        segment_list_remove_stopwords = handle.read()
    wordCount(segment_list_remove_stopwords)
    drawStatBarh()


# ===== wordcloud_test2.py =====
# - * - coding: utf - 8 -*-
from os import path
import matplotlib.pyplot as plt
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread was removed in SciPy 1.2; matplotlib's reader
    # returns the image as an array as well.
    imread = plt.imread
import jieba
# jieba.load_userdict("txt\userdict.txt")  # would make a user dictionary the primary one
from wordcloud import WordCloud, ImageColorGenerator

# Directory of this file. __file__ is undefined in some interactive
# environments; use d = path.dirname('.') there instead.
d = path.dirname(__file__)

stopwords = {}
isCN = 1  # Chinese segmentation flag (the segmentation step itself is disabled)
back_coloring_path = "data/lz1.jpg"  # mask / background image
text_path = 'data/demo5_sheet5_1_1.csv'  # text to analyse; read with the platform default encoding
font_path = 'data/simfang.ttf'  # font able to render Chinese
stopwords_path = 'data/stopwords.txt'  # stop-word list
imgname1 = "data/WordCloudDefautColors.png"  # output 1: shaped by the mask only
imgname2 = "data/WordCloudColorsByImg.png"  # output 2: coloured after the mask image

back_coloring = imread(path.join(d, back_coloring_path))

wc = WordCloud(font_path=font_path,
               background_color="white",
               max_words=2000,  # maximum number of words shown
               mask=back_coloring,
               max_font_size=100,
               random_state=42,
               # Default size; with a mask the saved image follows the mask
               # size instead. margin is the spacing around words.
               width=1000, height=860, margin=2,
               )

with open(path.join(d, text_path)) as handle:
    text = handle.read()

# NOTE: an earlier jieba-based cleaning step (custom words via jieba.add_word
# and stop-word removal in jiebaclearText) was disabled in the original; the
# input text is expected to be segmented already. WordCloud.generate takes
# raw text; pre-computed frequencies would go through
# generate_from_frequencies instead.
wc.generate(text)

# Show and save the cloud with default colours.
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file(path.join(d, imgname1))

# Recolour the cloud from the mask image, show it next to the mask, save it.
image_colors = ImageColorGenerator(back_coloring)
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
plt.imshow(back_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
wc.to_file(path.join(d, imgname2))


# ===== lda_test_ok.py =====
# coding=utf-8
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import lda


def doc_topic_word():
    """Print the first three columns of the module-level result matrices."""
    print(doc_topic[:, :3])  # document-topic distribution
    print(topic_word[:, :3])  # topic-word distribution


def plot_1():
    """Stem-plot the word weights of topics 0 and 9 (from `topic_word`)."""
    f, ax = plt.subplots(2, 1, figsize=(6, 6), sharex=True)
    for i, k in enumerate([0, 9]):  # two arbitrarily chosen topics
        ax[i].stem(topic_word[k, :], linefmt='b-', markerfmt='bo', basefmt='w-')
        ax[i].set_xlim(-2, 2000)
        ax[i].set_ylim(0, 1)
        ax[i].set_ylabel("Prob")
        ax[i].set_title("topic {}".format(k))
    ax[1].set_xlabel("word")
    plt.tight_layout()
    plt.show()


def plot_2():
    """Stem-plot the topic weights of documents 0 and 9 (from `doc_topic`)."""
    f, ax = plt.subplots(2, 1, figsize=(8, 8), sharex=True)
    for i, k in enumerate([0, 9]):  # two arbitrarily chosen documents
        ax[i].stem(doc_topic[k, :], linefmt='r-', markerfmt='ro', basefmt='w-')
        ax[i].set_xlim(-1, 20)  # x range covers the topic indices
        ax[i].set_ylim(0, 1.2)
        ax[i].set_ylabel("Prob")
        ax[i].set_title("Document {}".format(k))
    ax[1].set_xlabel("Topic")
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # One line of the input file is one document.
    with open('./data/demo5_sheet5_1_1.csv', 'r') as handle:
        corpus = [line.strip() for line in handle]

    # Term-count matrix: element [i][j] is the count of word j in document i.
    vectorizer = CountVectorizer()
    print(vectorizer)
    X = vectorizer.fit_transform(corpus)
    weight = X.toarray()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))
    print(len(weight))
    print(weight[:5, :5])

    print('LDA:')
    model = lda.LDA(n_topics=20, n_iter=50, random_state=1)
    model.fit(np.asarray(weight))
    topic_word = model.topic_word_
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))

    # Most probable topic of the first (at most) 10 documents; the original
    # fixed range(10) raised IndexError on smaller corpora.
    for n in range(min(10, len(doc_topic))):
        print("doc: {} topic: {}".format(n, doc_topic[n].argmax()))

    # Top keywords per topic. get_feature_names() was removed in
    # scikit-learn 1.2; prefer its replacement when it exists.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    word = np.array(get_names())
    top_n = 6
    for i, topic_dist in enumerate(topic_word):
        topic_words = word[np.argsort(topic_dist)][:-(top_n + 1):-1]
        print(u'*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

    # doc_topic_word() and plot_1() are alternative outputs.
    plot_2()


# ===== gensimTopicTest0803.py =====
# coding=utf-8
import re
import xlrd
import codecs
import jieba
from gensim import corpora, models, similarities

FILE = "demo5"  # workbook to train on ('./data/<FILE>.xls')
ID = '1'  # 1-based sheet number to train on

# Stop-word list.
with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as _handle:
    stopwords = [line.strip() for line in _handle]
_STOPWORD_SET = frozenset(stopwords)  # O(1) membership tests

# Matches runs of code points U+2E80-U+FFFD, the quotes U+201C/U+201D,
# '-' and ':'; everything else is treated as noise and dropped.
_CLEAN_PATTERN = re.compile(
    r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+")


def _clean_text(text):
    """Return the concatenation of all _CLEAN_PATTERN matches in `text`."""
    return ''.join(_CLEAN_PATTERN.findall(text))


def _tokenize(text):
    """Segment `text` with jieba (accurate mode) and drop stop words and tabs.

    An input without any kept word yields [''], matching the original
    string-building implementation.
    """
    kept = [word for word in jieba.cut(text, cut_all=False)
            if word not in _STOPWORD_SET and word != '\t']
    return ' '.join(kept).strip().split(" ")


def cleanAnswer(cols_numb):
    """Write the cleaned text of one column to './result/<FILE>_sheet<ID>.csv'.

    Args:
        cols_numb: List of column indices; element 3 selects the column read.

    One output line is written per data row (empty when nothing matches).
    """
    workbook = xlrd.open_workbook('./data/%s.xls' % FILE)
    sheet0 = workbook.sheet_by_index(int(ID) - 1)  # sheet indices start at 0
    cells = sheet0.col_values(cols_numb[3])[1:]  # skip the header row

    # 'w' instead of 'a': a second run must not append duplicate rows, which
    # would break the row alignment questionAnswer relies on.
    with open('./result/%s_sheet%s.csv' % (FILE, ID), 'w',
              encoding='utf-8') as f1:
        for cell in cells:
            f1.write(_clean_text(cell).strip())
            f1.write('\n')


def ldaAnaly():
    """Train dictionary, corpus, TF-IDF and LDA models and save them.

    Reads './result/<FILE>_sheet<ID>.csv' (one document per line) and writes
    dict_v1.dict, dic.csv, corpus.mm, corpus.tfidf_model and ldaModel.pkl
    into './result'.
    """
    print("构造分词库-----train-----")
    train = []
    with codecs.open('./result/%s_sheet%s.csv' % (FILE, ID), 'r',
                     encoding='utf8') as fp:
        for line in fp:
            line = line.strip()
            # NOTE(review): skipping empty lines shifts corpus indices
            # relative to spreadsheet rows; questionAnswer maps indices back
            # to rows - confirm the input has no empty lines.
            if not line:
                continue
            train.append(_tokenize(line))

    print("构造分词库,并保存----“dict_v1.dict”----")
    dic = corpora.Dictionary(train)
    dic.save('./result/dict_v1.dict')

    print("保存可读取的分词库----“dic.csv”----")
    # 'w' instead of 'a': the dictionary is rebuilt from scratch on each run.
    with codecs.open('./result/dic.csv', 'w', encoding='utf-8') as fd:
        for word, index in dic.token2id.items():
            fd.write(word + ':' + str(index) + '\n')

    print("生成语料库,并保存-----“corpus.mm”-----")
    corpus = [dic.doc2bow(text) for text in train]
    corpora.MmCorpus.serialize('./result/corpus.mm', corpus)

    print("保存tfidf模型-----“corpus.tfidf_model”-----")
    tfidf = models.TfidfModel(corpus)
    tfidf.save('./result/corpus.tfidf_model')

    print("进行LDA主题分析,并保存-----“ldaModel.pkl”-----")
    corpus_tfidf = tfidf[corpus]  # train LDA on the TF-IDF weighted corpus
    lda = models.LdaModel(corpus_tfidf, id2word=dic,
                          num_topics=100, iterations=500)
    lda.save('./result/ldaModel.pkl')

    print("评估文章属于不同主题的概率,一个词对文章的重要性-----“Demo:评估文章1”-----")
    # Topics of the first document, most probable first.
    for index, score in sorted(lda[corpus_tfidf[0]],
                               key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda.print_topic(index, 10)))


def questionAnswer(cols_numb, questionNumber):
    """Predict the score of one spreadsheet row from its nearest neighbours.

    Args:
        cols_numb: List of column indices; element 3 selects the text column
            and element 6 the score column.
        questionNumber: Spreadsheet row of the entry to predict (row 0 is
            the header).

    Returns:
        Tuple (predicted score, actual score cell, absolute error). The
        prediction is the similarity-weighted mean of the scores of the six
        most similar corpus documents.
    """
    # NOTE(review): models, corpus and workbook are reloaded and the
    # similarity index is rebuilt on every call; hoist them if this is
    # called in a loop.
    lda = models.LdaModel.load('./result/ldaModel.pkl')
    dic = corpora.Dictionary.load('./result/dict_v1.dict')
    corpus = corpora.MmCorpus('./result/corpus.mm')
    tfidf = models.TfidfModel.load('./result/corpus.tfidf_model')

    workbook = xlrd.open_workbook('./data/%s.xls' % FILE)
    sheet0 = workbook.sheet_by_index(int(ID) - 1)  # sheet indices start at 0
    raw_text = sheet0.col_values(cols_numb[3])[questionNumber]

    # Clean, segment and vectorise the query exactly like the training data.
    query_bow = dic.doc2bow(_tokenize(_clean_text(raw_text)))
    lda_vec_tfidf = lda[tfidf[query_bow]]

    # NOTE(review): the index is built over the bag-of-words corpus but is
    # queried with an LDA topic vector, so topic ids are interpreted as word
    # ids. Indexing lda[tfidf[corpus]] looks like the intent - confirm
    # before relying on the predictions.
    similarity = similarities.MatrixSimilarity(corpus)
    sims = similarity[lda_vec_tfidf]
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    nearest = sort_sims[0:6]

    scores = sheet0.col_values(cols_numb[6])[1:]  # skip the header row

    weighted_sum = 0
    weight_total = 0
    for doc_index, sim in nearest:
        # NOTE(review): corpus document k appears to correspond to scores[k],
        # so "- 1" looks off by one (and wraps to the last row for k == 0);
        # kept unchanged pending confirmation.
        weighted_sum += float(scores[doc_index - 1]) * sim
        weight_total += sim
    preCore1 = weighted_sum / weight_total

    print("保存预测结果----“pre.csv”----")
    actual = scores[questionNumber - 1]
    return preCore1, actual, abs(preCore1 - float(actual))


if __name__ == '__main__':
    cols_numb = [0, 1, 2, 3, 4, 5, 6, 7]  # column indices of the workbook

    # One-off preparation (run once before the loop below):
    #   cleanAnswer(cols_numb)  - extract and clean the text column
    #   ldaAnaly()              - train and save the models
    # Single prediction, for testing:
    #   questionAnswer(cols_numb, 124)

    # Demo: predict every 8th row and report the mean absolute error.
    total_error = 0
    count = 0
    with codecs.open('./result/pre_v1.csv', 'a', encoding='utf-8') as fp:
        for questionNumber in range(1, 8717, 8):
            a = questionAnswer(cols_numb, questionNumber)
            total_error += a[2]
            count += 1
            # Label the line with the row that was predicted; the original
            # wrote the counter after incrementing it (off by 8).
            fp.write(str(questionNumber) + ":" + str(a) + '\n')
    ave = total_error / count
    print(ave)