自然语言处理 NLTK

from nltk.tokenize import MWETokenizer


# Multi-word expressions that must come out of tokenization as a single token.
# MWETokenizer merges each listed word sequence into one token, joined with its
# default separator "_" (e.g. "risk", "factor" -> "risk_factor").
# Singular and plural variants are listed separately because the tokenizer
# matches the listed words exactly.
tokenizer = MWETokenizer(
    [
        ("molecular", "pathogenesis"),
        ("molecular", "basis"),
        ("cognitive", "assessment"),
        ("clinical", "intervention"),
        ("clinical", "interventions"),
        ("risk", "factor"),
        ("risk", "factors"),
        ("assisted", "care"),
    ]
)

 

all_the_text = titleandabstractList[i].lower()
all_the_text = re.sub("\"|,|\.", "", all_the_text)
for word in tokenizer.tokenize(all_the_text.split()):

 

posted @ 2016-12-08 14:50  zdmlcmepl  阅读(172)  评论(0)  编辑  收藏  举报