Text Vectorization - Bag-of-Words Model and N-gram Features
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords  # requires a one-time nltk.download('stopwords')
stop_list = list(set(stopwords.words('english')))  # set() removes duplicate entries
corpus = ['This is the first document.',  # the corpus
          'This is the second second document.',
          'And the third one.',
          'Is this the first document?']
# -----------------------------------
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)  # vectorize: fit the bag-of-words model
print(X.toarray())
print(vectorizer.get_feature_names())  # renamed to get_feature_names_out() in scikit-learn >= 1.0, removed in 1.2
print()
# -----------------------------------
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3),   # unigram through trigram features
                                   stop_words=stop_list)  # stop-word list
X = ngram_vectorizer.fit_transform(corpus)
print(X.toarray())
print(ngram_vectorizer.get_feature_names())
print()
# ------------------------------------
analyze = vectorizer.build_analyzer()  # the preprocessing + tokenization callable used internally
print(analyze('This is a text document to analyze.'))
print(vectorizer.transform(['something completely new.',
                            'and this has something old.']).toarray())  # reuses the fitted vocabulary
Output of the bag-of-words vectorizer: the document-term count matrix, then the vocabulary:
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
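The columns above follow the alphabetically sorted vocabulary. The fitted term-to-column mapping can be inspected directly; a minimal sketch, reusing the vectorizer fitted above:

# Inspect the fitted term -> column-index mapping behind the matrix above.
print(sorted(vectorizer.vocabulary_.items(), key=lambda kv: kv[1]))
# Expected to start with [('and', 0), ('document', 1), ('first', 2), ...]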
Output of the n-gram vectorizer: counts, then the vocabulary of 1- to 3-grams after stop-word removal:
[[1 1 1 0 0 0 0 0 0 0]
 [1 0 0 0 2 1 1 1 0 0]
 [0 0 0 1 0 0 0 0 1 1]
 [1 1 1 0 0 0 0 0 0 0]]
['document', 'first', 'first document', 'one', 'second', 'second document', 'second second', 'second second document', 'third', 'third one']
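N-grams can also be taken over characters instead of words, which makes the features robust to spelling variation; a minimal sketch on the same corpus (char_vectorizer and X_char are illustrative names):

# Character bigrams, padded at word boundaries ('char_wb' analyzer).
char_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
X_char = char_vectorizer.fit_transform(corpus)
print(X_char.shape)  # (4 documents, number of distinct character bigrams)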
Output of the analyzer (note that the single-character token 'a' is dropped by the default token pattern):
['this', 'is', 'text', 'document', 'to', 'analyze']
Output of transform on the two unseen documents: words outside the fitted vocabulary are simply ignored, so the first document maps to all zeros:
[[0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 1]]
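The first link below also covers TfidfVectorizer, which reweights these raw counts by inverse document frequency so that terms common to every document (like 'the') count for less; a minimal sketch on the same corpus:

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()  # same default tokenization as CountVectorizer
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
print(X_tfidf.toarray().round(2))  # each row is an L2-normalized TF-IDF vector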
Parameters of CountVectorizer and TfidfVectorizer: https://blog.csdn.net/du_qi/article/details/51564303
stopwords: https://www.cnblogs.com/webRobot/p/6079919.html