# 自然语言处理 NLTK (Natural language processing with NLTK)

from nltk.tokenize import MWETokenizer

# Word sequences registered with MWETokenizer as multi-word expressions.
# Singular and plural variants are listed explicitly as separate entries.
tokenizer = MWETokenizer(
    [
        ("molecular", "pathogenesis"),
        ("molecular", "basis"),
        ("cognitive", "assessment"),
        ("clinical", "intervention"),
        ("clinical", "interventions"),
        ("risk", "factor"),
        ("risk", "factors"),
        ("assisted", "care"),
    ]
)

# Lower-case the current entry (presumably a combined title/abstract string;
# confirm against where titleandabstractList is built), then remove double
# quotes, commas and periods before the text is split on whitespace.
all_the_text = titleandabstractList[i].lower()
# Raw string: in a normal literal "\." is an invalid escape sequence, which
# raises a SyntaxWarning on recent Python versions. The pattern is unchanged.
all_the_text = re.sub(r'"|,|\.', "", all_the_text)
for word in tokenizer.tokenize(all_the_text.split()):