Different Keyword Extraction Algorithms
WeChat Official Account: 数据运营人
This series consists of the author's reading and study notes; please credit the source when reposting.
Requirement: extract each project's core keywords from its project overview (项目概况).
Approach: try several different keyword extraction algorithms and compare the results each one produces.
Note: this article does not derive the algorithms; it only describes the implementation path.
Contents: module imports, data import, 1. TF-IDF, 2. TF-IDF with a stopword list, 3. TextRank, 4. word2vec, 5. rank (RAKE), 6. nltk_rank, results
Module imports
import pandas as pd
from jieba.analyse import *
from collections import Counter
from jieba import analyse
import numpy as np
import gensim
import jieba
from rake_nltk import Rake
Data import
df = pd.read_excel('E:/pywork/dataprocess/yangxm-20190320.xls')
print(df['项目概况'][0])
print(df['项目概况'][1])
print(len(df))
print(df['项目概况'][5725])
n = 5725  # number of rows (project overviews) to process
1. TF-IDF
Extract keywords directly with TF-IDF, without removing stopwords first.
tfidf_all = []
for i in range(n):
    tfidf_all.append(str([keyword[0] for keyword in extract_tags(str(df['项目概况'][i]), withWeight=True)]))
print(tfidf_all)
df['tfidf'] = pd.DataFrame(tfidf_all)
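A quick way to inspect what the weights look like is to pass topK and withWeight to extract_tags; a minimal sketch (the parameters here are chosen only for illustration) that prints the ten highest-weighted keywords of the first overview:
# inspect the TF-IDF weights of the first project overview
for word, weight in extract_tags(str(df['项目概况'][0]), topK=10, withWeight=True):
    print(word, round(weight, 4))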
2. TF-IDF with a stopword list
First remove words that appear in a custom stopword list, then apply the TF-IDF algorithm.
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
# Remove stopwords from a sentence (or list of words)
def movestopwords(sentence):
    stopwords = stopwordslist('C:/Users/Admin/Desktop/types/stopwords.txt')  # path of the stopword file
    outstr = ''
    for word in sentence:
        if word not in stopwords:
            if word != '\t' and word != '\n':
                outstr += word
                # outstr += " "
    return outstr
tfidf = analyse.extract_tags
sptfidf_all = []
for i in range(n):
    # extract candidate keywords, strip stopwords from them, then run TF-IDF again on what remains
    keywords = tfidf(movestopwords([keyword[0] for keyword in extract_tags(str(df['项目概况'][i]), withWeight=True)]))
    sptfidf_all.append(str([keyword for keyword in keywords]))
df['sptfidf'] = pd.DataFrame(sptfidf_all)
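As an alternative to filtering stopwords by hand, jieba.analyse can load a stopword file itself via set_stop_words; a minimal sketch using the same custom stopword file as above (note that set_stop_words changes jieba's global state for all later extract_tags/textrank calls):
# let jieba apply the custom stopword list directly
analyse.set_stop_words('C:/Users/Admin/Desktop/types/stopwords.txt')
print(analyse.extract_tags(str(df['项目概况'][0]), topK=10))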
3. TextRank
Implemented with the TextRank algorithm built into jieba.
textrank_all = []
for i in range(n):
    textrank_all.append(str([keyword[0] for keyword in textrank(str(df['项目概况'][i]), withWeight=True)]))
# print(textrank_all)
df['textrank'] = pd.DataFrame(textrank_all)
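jieba's textrank also takes an allowPOS filter (by default it keeps only words tagged 'ns', 'n', 'vn' or 'v'); a small sketch that widens the filter to include adjectives, purely for comparison:
# TextRank with a widened part-of-speech filter
for word, weight in textrank(str(df['项目概况'][0]), topK=10, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v', 'a')):
    print(word, round(weight, 4))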
4. word2vec
Uses a word2vec model pre-trained on a WeChat corpus.
# Use a model pre-trained by someone else
# (note: model[word], model.wv.vocab and model.syn1 below are the older, pre-gensim-4.0 API)
model = gensim.models.word2vec.Word2Vec.load('E:/pywork/dataprocess/yang/word2vec/word2vec_wx')
# This function computes the transition probability p(wk|wi) of each word in the model given the input word
def predict_proba(oword, iword):
    # get the word vector of the input word
    iword_vec = model[iword]
    # get the vocabulary entry of the output word (it stores the Huffman path and code)
    oword = model.wv.vocab[oword]
    oword_l = model.syn1[oword.point].T
    dot = np.dot(iword_vec, oword_l)
    lprob = -sum(np.logaddexp(0, -dot) + oword.code*dot)
    return lprob
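# For reference, predict_proba above returns the log-probability of oword along its Huffman
# path, assuming the model was trained with hierarchical softmax (hs=1), which is what
# syn1, point and code refer to:
#     log p(wo|wi) = -sum_j [ log(1 + exp(-v_i.theta_j)) + d_j * (v_i.theta_j) ]
# where v_i is the input word vector, theta_j the weight vector of the j-th inner node on
# wo's Huffman path, and d_j the corresponding Huffman code bit (0 or 1).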
# The product of p(w|wi) over all the words w of the content gives p(content|wi).
# The larger p(content|wi) is, the more likely the content is given that wi occurs,
# so sorting all words by p(content|wi) in descending order puts the most important words,
# i.e. the keywords of the text, at the front.
def keywords(s):
    # keep only the words of s that are in the trained model's vocabulary
    word2vec_one = []
    s = [w for w in s if w in model]
    ws = {w: sum([predict_proba(u, w) for u in s]) for w in s}
    # keep the top half of the words, ranked by score
    w_len = int(len(sorted(ws.items(), key=lambda item: item[1], reverse=True)) / 2)
    w = sorted(ws.items(), key=lambda item: item[1], reverse=True)[:w_len]
    for i in w:
        word2vec_one.append(i[0])
    return word2vec_one
word2vec_all = []
for i in range(n):
    word2vec_all.append(str(keywords(list(jieba.cut(str(df['项目概况'][i]))))))
df['word2vec'] = pd.DataFrame(word2vec_all)
5. rank (RAKE)
Keyword extraction implemented with the RAKE algorithm and the custom stopword list.
# RAKE-style keyword extraction with a custom stopword list
import jieba
import jieba.posseg as pseg
import operator
import json
from collections import Counter
# Data structure for holding data
class Word():
    def __init__(self, char, freq=0, deg=0):
        self.freq = freq
        self.deg = deg
        self.char = char
    def returnScore(self):
        return self.deg / self.freq
    def updateOccur(self, phraseLength):
        self.freq += 1
        self.deg += phraseLength
    def getChar(self):
        return self.char
    def updateFreq(self):
        self.freq += 1
    def getFreq(self):
        return self.freq
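# For reference, returnScore above is the standard RAKE word score deg(w)/freq(w);
# a toy check (the word and the phrase lengths are made up purely for illustration):
# w = Word('数据')
# w.updateOccur(3)        # seen in a 3-word phrase -> freq=1, deg=3
# w.updateOccur(2)        # seen in a 2-word phrase -> freq=2, deg=5
# w.returnScore()         # 2.5: words that co-occur in longer phrases score higher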
# Return True only if the string contains no ASCII letters or digits
def notNumStr(instr):
    for item in instr:
        if '\u0041' <= item <= '\u005a' or ('\u0061' <= item <= '\u007a') or item.isdigit():
            return False
    return True
# Read Target Case if Json
def readSingleTestCases(testFile):
    with open(testFile) as json_data:
        try:
            testData = json.load(json_data)
        except:
            # This try block deals with incorrect json format that has ' instead of "
            data = json_data.read().replace("'", '"')
            try:
                testData = json.loads(data)
            # This try block deals with empty transcript file
            except:
                return ""
    returnString = ""
    for item in testData:
        try:
            returnString += item['text']
        except:
            returnString += item['statement']
    return returnString
def run(rawText):
    # Construct Stopword Lib
    swLibList = [line.rstrip('\n') for line in open(r"C:/Users/Admin/Desktop/types/stopwords.txt", 'r', encoding='utf-8')]
    # Construct Phrase Delimiter Lib
    conjLibList = [line.rstrip('\n') for line in open(r"C:/Users/Admin/Desktop/types/stopwords.txt", 'r', encoding='utf-8')]
    # Cut Text
    rawtextList = pseg.cut(rawText)
    # Construct List of Phrases and Preliminary textList
    textList = []
    listofSingleWord = dict()
    lastWord = ''
    poSPrty = ['m', 'x', 'uj', 'ul', 'mq', 'u', 'v', 'f']
    meaningfulCount = 0
    checklist = []
    for eachWord, flag in rawtextList:
        checklist.append([eachWord, flag])
        if eachWord in conjLibList or not notNumStr(eachWord) or eachWord in swLibList or flag in poSPrty or eachWord == '\n':
            if lastWord != '|':
                textList.append("|")
                lastWord = "|"
        elif eachWord not in swLibList and eachWord != '\n':
            textList.append(eachWord)
            meaningfulCount += 1
            if eachWord not in listofSingleWord:
                listofSingleWord[eachWord] = Word(eachWord)
            lastWord = ''
    # Construct List of lists that has phrases as words
    newList = []
    tempList = []
    for everyWord in textList:
        if everyWord != '|':
            tempList.append(everyWord)
        else:
            newList.append(tempList)
            tempList = []
    tempStr = ''
    for everyWord in textList:
        if everyWord != '|':
            tempStr += everyWord + '|'
        else:
            if tempStr[:-1] not in listofSingleWord:
                listofSingleWord[tempStr[:-1]] = Word(tempStr[:-1])
            tempStr = ''
    # Update the entire List
    for everyPhrase in newList:
        res = ''
        for everyWord in everyPhrase:
            listofSingleWord[everyWord].updateOccur(len(everyPhrase))
            res += everyWord + '|'
        phraseKey = res[:-1]
        if phraseKey not in listofSingleWord:
            listofSingleWord[phraseKey] = Word(phraseKey)
        else:
            listofSingleWord[phraseKey].updateFreq()
    # Get score for entire Set
    outputList = dict()
    for everyPhrase in newList:
        if len(everyPhrase) > 5:
            continue
        score = 0
        phraseString = ''
        outStr = ''
        for everyWord in everyPhrase:
            score += listofSingleWord[everyWord].returnScore()
            phraseString += everyWord + '|'
            outStr += everyWord
        phraseKey = phraseString[:-1]
        freq = listofSingleWord[phraseKey].getFreq()
        if freq != 0 and meaningfulCount != 0:
            if freq / meaningfulCount < 0.01 and freq < 3:
                continue
        outputList[outStr] = score
    sorted_list = sorted(outputList.items(), key=operator.itemgetter(1), reverse=True)
    # print(sorted_list)
    ranks = []
    for i in sorted_list:
        if i[0] != '':
            ranks.append(i[0])
    return ranks
rank_all = []
for i in range(n):
    rank_all.append(str(run(str(df['项目概况'][i]))))
df['rank'] = pd.DataFrame(rank_all)
6. nltk_rank
Implemented with the RAKE algorithm provided by the rake_nltk package.
nltk_rank_all = []
for i in range(n):
    r = Rake()
    r.extract_keywords_from_text(str(df['项目概况'][i]))
    nltk_rank_all.append(str(r.ranked_phrases))
print(nltk_rank_all)
df['nltk_rank'] = pd.DataFrame(nltk_rank_all)
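Note that rake_nltk's Rake is designed for space-delimited languages and uses an English stopword list by default, so on raw Chinese text it mostly splits phrases at punctuation. A hedged sketch of pre-segmenting with jieba and passing the custom Chinese stopword list instead (only the first overview is used here for illustration):
# pre-segment with jieba so Rake sees space-delimited tokens, and supply the Chinese stopwords
r = Rake(stopwords=stopwordslist('C:/Users/Admin/Desktop/types/stopwords.txt'))
r.extract_keywords_from_text(' '.join(jieba.cut(str(df['项目概况'][0]))))
print(r.ranked_phrases[:10])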
df.to_excel('C:/Users/Admin/Desktop/keyword_all.xlsx')
Results
Overall: the RAKE-style rank algorithms tend to extract longer keyword phrases, while the keywords produced by TF-IDF, word2vec and TextRank do not differ much from one another; the quality of the results depends mainly on the stopword list and on the corpus the model was originally trained on.