NLP(十) 主题识别
原文链接:http://www.one2know.cn/nlp10/
- 主题识别
是发现输入文本集合中存在的主题的过程
LDA算法,即狄利克雷分布算法
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora,models
import feedparser
class IdentifyingTopicExample:
def getDocuments(self): # 获取文档 放到documents中
url = 'https://sports.yahoo.com/mlb/rss.xml'
feed = feedparser.parse(url)
self.documents = []
for entry in feed['entries'][:5]:
text = entry['summary']
if 'ex' in text:
continue
self.documents.append(text)
print('-- {}'.format(text))
print('INFO: Fetching documents from {} completed'.format(url))
def cleanDocuments(self):
tokenizer = RegexpTokenizer(r'[a-zA-Z]+') # 想要只处理字母9
en_stop = set(stopwords.words('english')) # 英文停用词放到en_stop中
self.cleaned = [] # 用于存储所有被清洗且分词后的文档
for doc in self.documents:
lowercase_doc = doc.lower() # 字母都变小写
words = tokenizer.tokenize(lowercase_doc) # 分词
non_stopped_words = [i for i in words if not i in en_stop] # 过滤掉停用词
self.cleaned.append(non_stopped_words) # cleaned 二维列表
print('INFO: Clearning {} documents completed'.format(len(self.documents)))
def doLDA(self):
dictionary = corpora.Dictionary(self.cleaned) # 创建字典
corpus = [dictionary.doc2bow(cleandoc) for cleandoc in self.cleaned]
# 由每个清洗后的句子,以词袋形式定义corpus变量
ldamodel = models.ldamodel.LdaModel(corpus,num_topics=2,id2word=dictionary)
# 在corpus上创建一个模型,主题数量设为2,id2word设置词典的大小/映射情况
print(ldamodel.print_topics(num_topics=2,num_words=4)) # 打印主题 每个主题含4个单词
def run(self):
self.getDocuments()
self.cleanDocuments()
self.doLDA()
if __name__ == "__main__":
topicExample = IdentifyingTopicExample()
topicExample.run()
输出:
-- MLB Network documentary shines spotlight on 1995 Mariners team that saved baseball in Seattle.
-- Marcus Semien's second big swing of the day finally gave the Oakland Athletics some breathing room in an oh-so-tight series with the AL Central-leading Twins. Semien hit a grand slam in the eighth inning after his tying homer leading off the fifth, Chris Herrmann had a career-high four hits, and
-- It wasn't long until Cleveland took advantage of it. Francisco Lindor drove in the go-ahead runs during a six-run seventh inning, Jose Ramirez homered twice and Carlos Santana pushed his on-base streak to 27 games as the Indians rallied to beat bumbling Kansas City 8-4 on Thursday and complete a
-- A look at what's happening around the majors Friday:
INFO: Fetching documents from https://sports.yahoo.com/mlb/rss.xml completed
INFO: Clearning 4 documents completed
[(0, '0.022*"look" + 0.022*"friday" + 0.022*"around" + 0.022*"majors"'), (1, '0.023*"leading" + 0.023*"semien" + 0.022*"inning" + 0.014*"homer"')]