# NLP整体流程的代码 (Code for the overall NLP pipeline)
import re  # not used below; kept in case other parts of the file rely on it

import nltk
import numpy as np
from nltk.corpus import stopwords

# Walkthrough of a basic NLP pipeline on one sample paragraph:
#   1. tokenize, 2. drop punctuation and stop words,
#   3. count word frequencies, 4. part-of-speech tag the remaining tokens.
# NOTE: word_tokenize, stopwords.words and pos_tag need NLTK data packages to
# be installed beforehand (see nltk.download); the exact resource names
# depend on the NLTK version.

# 1. Tokenization.
# The paragraph is built from adjacent string literals (joined by single
# spaces) instead of backslash continuations inside one literal, so that
# re-wrapping the source can never inject stray backslashes or swallow the
# whitespace between sentences. It is lower-cased so that the stop-word
# comparison in step 2 is effectively case-insensitive.
text = (
    "Sentiment analysis is a challenging subject in machine learning. "
    "People express their emotions in language that is often obscured by sarcasm, "
    "ambiguity, and plays on words, all of which could be very misleading for "
    "both humans and computers. There's another Kaggle competition for movie review "
    "sentiment analysis. In this tutorial we explore how Word2Vec can be applied to "
    "a similar problem."
).lower()
text_list = nltk.word_tokenize(text)

# 2. Remove punctuation and stop words.
# Drop tokens that are a single punctuation mark from this list.
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
text_list = [word for word in text_list if word not in english_punctuations]

# Drop English stop words; a set makes each membership test O(1).
stops = set(stopwords.words("english"))
text_list = [word for word in text_list if word not in stops]

# 3. Word-frequency statistics.
freq_dist = nltk.FreqDist(text_list)
num_words = len(freq_dist)  # number of distinct tokens
# One pass over the (word, count) pairs; the previous version rebuilt the
# full key and value lists on every loop iteration (accidentally quadratic).
freq_list = [[word, count] for word, count in freq_dist.items()]
# Mixing str and int in one array makes NumPy store every cell as a string.
freqArr = np.array(freq_list)
print(freqArr)

# 4. Part-of-speech tagging of the filtered tokens.
print(nltk.pos_tag(text_list))