Distance between document 0 and document 1: [[ 2.]]
Distance between document 0 and document 2: [[ 2.44948974]]
Distance between document 1 and document 2: [[ 2.44948974]]
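The pairwise distances above can be reproduced with scikit-learn's euclidean_distances applied to bag-of-words count vectors. The following is a minimal sketch, assuming stop-word filtering (which matches the values shown), not necessarily the exact code that produced them:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]
counts = CountVectorizer(stop_words='english').fit_transform(corpus).todense()
for i, j in [(0, 1), (0, 2), (1, 2)]:
    # Euclidean distance between the count vectors of documents i and j.
    print('Distance between document %d and document %d:' % (i, j),
          euclidean_distances(counts[i], counts[j]))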
## Stop-word filtering
The CountVectorizer class can filter stop words through its stop_words parameter; setting it to 'english' uses the built-in list of common English stop words.
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]
vectorizer = CountVectorizer(stop_words='english')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)
Output:
[[0 1 1 0 0 1 0 1]
[0 1 1 1 1 0 0 0]
[1 0 0 0 0 0 1 0]]
{u'duke': 2, u'basketball': 1, u'lost': 4, u'played': 5, u'game': 3, u'sandwich': 6, u'unc': 7, u'ate': 0}
# Stemming and lemmatization
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]
vectorizer = CountVectorizer(binary=True, stop_words='english')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)
Output:
[[1 0 0 1]
[0 1 1 0]]
{u'sandwich': 2, u'ate': 0, u'sandwiches': 3, u'eaten': 1}
### Let's examine the lemmatization of the word gathering:
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.'
]
import nltk
# Download the NLTK resources used below (tokenizer, POS tagger, WordNet).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
wordnet_tags = ['n', 'v']
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]
stemmer = PorterStemmer()
print('Stemmed:', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus])
Output:
('Stemmed:', [[u'He', u'ate', u'the', u'sandwich'], [u'Everi', u'sandwich', u'wa', u'eaten', u'by', u'him']])
def lemmatize(token, tag):
    # Lemmatize only nouns and verbs; return other tokens unchanged.
    if tag[0].lower() in wordnet_tags:
        return lemmatizer.lemmatize(token, tag[0].lower())
    return token

lemmatizer = WordNetLemmatizer()
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
print('Lemmatized:', [[lemmatize(token, tag) for token, tag in document] for document in tagged_corpus])
Output:
('Lemmatized:', [['He', u'eat', 'the', u'sandwich'], ['Every', 'sandwich', u'be', u'eat', 'by', 'him']])
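Returning to the gathering corpus above: a minimal sketch (reusing the lemmatizer defined earlier) showing how the part-of-speech tag changes the lemma that WordNetLemmatizer returns:

# 'gathering' is reduced to 'gather' when treated as a verb,
# but left unchanged when treated as a noun.
print(lemmatizer.lemmatize('gathering', 'v'))
print(lemmatizer.lemmatize('gathering', 'n'))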
## Extending bag-of-words with TF-IDF weights
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
vectorizer = CountVectorizer(stop_words='english')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)
Output:
[[2 1 3 1 1]]
{u'sandwich': 2, u'wizard': 4, u'dog': 1, u'transfigured': 3, u'ate': 0}
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich'
]
vectorizer = TfidfVectorizer(stop_words='english')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)
Output:
[[ 0.75458397 0.37729199 0.53689271 0. 0. ]
[ 0. 0. 0.44943642 0.6316672 0.6316672 ]]
{u'sandwich': 2, u'wizard': 4, u'dog': 1, u'transfigured': 3, u'ate': 0}
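To see where these weights come from, here is a minimal sketch that reproduces the first row by hand, assuming TfidfVectorizer's defaults (smoothed idf and L2 normalization):

import numpy as np
# Term counts for document 0 after stop-word removal, in vocabulary order:
# ate=2, dog=1, sandwich=2, transfigured=0, wizard=0
tf = np.array([2., 1., 2., 0., 0.])
# Document frequencies of each term across the 2-document corpus.
df = np.array([1., 1., 2., 1., 1.])
n_docs = 2
# Smoothed inverse document frequency: ln((1 + n) / (1 + df)) + 1
idf = np.log((1 + n_docs) / (1 + df)) + 1
tfidf = tf * idf
# L2-normalize the document vector, as TfidfVectorizer does by default.
print(tfidf / np.linalg.norm(tfidf))  # approximately [0.7546 0.3773 0.5369 0. 0.]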
## Feature vectors with the hashing trick
from sklearn.feature_extraction.text import HashingVectorizer
corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer = HashingVectorizer(n_features=6)
print(vectorizer.transform(corpus).todense())
Output:
[[-1. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 1. 0. 0.]
[ 0. 0. 0. 0. -1. 0.]
[ 0. 1. 0. 0. 0. 0.]]
n_features is set to 6 here only for demonstration; the default is 2**20. Also note that some of the token values are negative. Because hash collisions can occur, HashingVectorizer uses a signed hash function: a feature's value carries the same sign as its token's hash, so colliding tokens tend to cancel out rather than accumulate error. For example, if cats appears twice and hashes to -3, 2 is subtracted from the fourth element of the document's feature vector; if dogs also appears twice and hashes to +3, 2 is added to that same element.
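The sketch below is an illustrative toy, not HashingVectorizer's actual implementation (which uses MurmurHash3); it only shows the idea of a signed hash assigning each token an index and a sign within a fixed-length vector:

import hashlib
import numpy as np

def signed_hash_vector(tokens, n_features=6):
    # Toy signed-hashing vectorizer: the hash picks the index, one extra bit picks the sign.
    vec = np.zeros(n_features)
    for token in tokens:
        digest = int(hashlib.md5(token.encode('utf-8')).hexdigest(), 16)
        index = digest % n_features
        sign = 1 if (digest // n_features) % 2 == 0 else -1
        vec[index] += sign
    return vec

print(signed_hash_vector(['the', 'ate', 'bacon', 'cat']))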
## Image feature extraction
# Extracting features from pixel values
scikit-learn's digits dataset contains 1,797 images of the handwritten digits 0-9. Each image is 8x8 pixels, and each pixel takes an integer value from 0 to 16, where 0 is white and 16 is black, as shown below:
%matplotlib inline
from sklearn import datasets
import matplotlib.pyplot as plt
digits = datasets.load_digits()
print('Digit:', digits.target[0])
print(digits.images[0])
plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()
Output:
Digit: 0
[[ 0. 0. 5. 13. 9. 1. 0. 0.]
[ 0. 0. 13. 15. 10. 15. 5. 0.]
[ 0. 3. 15. 2. 0. 11. 8. 0.]
[ 0. 4. 12. 0. 0. 8. 8. 0.]
[ 0. 5. 8. 0. 0. 9. 8. 0.]
[ 0. 4. 11. 0. 1. 12. 7. 0.]
[ 0. 2. 14. 5. 10. 12. 0. 0.]
[ 0. 0. 6. 13. 10. 0. 0. 0.]]
(Figure: the first image in the digits dataset, an 8x8 grayscale rendering of the digit 0 produced by plt.imshow above.)
digits = datasets.load_digits()
print('Feature vector:\n', digits.images[0].reshape(-1, 64))
Output:
('Feature vector:\n', array([[ 0., 0., 5., 13., 9., 1., 0., 0., 0., 0., 13.,
15., 10., 15., 5., 0., 0., 3., 15., 2., 0., 11.,
8., 0., 0., 4., 12., 0., 0., 8., 8., 0., 0.,
5., 8., 0., 0., 9., 8., 0., 0., 4., 11., 0.,
1., 12., 7., 0., 0., 2., 14., 5., 10., 12., 0.,
0., 0., 0., 6., 13., 10., 0., 0., 0.]]))
%matplotlib inline
import numpy as np
from skimage.feature import corner_harris, corner_peaks
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
import skimage.io as io
from skimage.exposure import equalize_hist

def show_corners(corners, image):
    # Plot the grayscale image and mark each detected corner with a red dot.
    fig = plt.figure()
    plt.gray()
    plt.imshow(image)
    y_corner, x_corner = zip(*corners)
    plt.plot(x_corner, y_corner, 'or')
    plt.xlim(0, image.shape[1])
    plt.ylim(image.shape[0], 0)
    fig.set_size_inches(np.array(fig.get_size_inches()) * 1.5)
    plt.show()
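The block above only defines the show_corners helper. A minimal usage sketch, using skimage's bundled astronaut sample image rather than any image from the original text, might look like this:

from skimage import data
# Convert the sample RGB image to grayscale and equalize its histogram before detection.
image = equalize_hist(rgb2gray(data.astronaut()))
# Harris corner response, reduced to local peaks at least 2 pixels apart.
corners = corner_peaks(corner_harris(image), min_distance=2)
show_corners(corners, image)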