import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random
import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()
# Load the comma-separated hotel listing (decoded as latin-1) and display it.
data = pd.read_csv('./Seattle_Hotels.txt', encoding="latin-1")
data
|
name |
address |
desc |
0 |
Hilton Garden Seattle Downtown |
1821 Boren Avenue, Seattle Washington 98101 USA |
Located on the southern tip of Lake Union, the... |
1 |
Sheraton Grand Seattle |
1400 6th Avenue, Seattle, Washington 98101 USA |
Located in the city's vibrant core, the Sherat... |
2 |
Crowne Plaza Seattle Downtown |
1113 6th Ave, Seattle, WA 98101 |
Located in the heart of downtown Seattle, the ... |
3 |
Kimpton Hotel Monaco Seattle |
1101 4th Ave, Seattle, WA98101 |
What?s near our hotel downtown Seattle locatio... |
4 |
The Westin Seattle |
1900 5th Avenue, Seattle, Washington 98101 USA |
Situated amid incredible shopping and iconic a... |
... |
... |
... |
... |
147 |
The Halcyon Suite Du Jour |
1125 9th Ave W, Seattle, WA 98119 |
Located in Queen Anne district, The Halcyon Su... |
148 |
Vermont Inn |
2721 4th Ave, Seattle, WA 98121 |
Just a block from the world famous Space Needl... |
149 |
Stay Alfred on Wall Street |
2515 4th Ave, Seattle, WA 98121 |
Stay Alfred on Wall Street resides in the hear... |
150 |
Pike's Place Lux Suites by Barsala |
2nd Ave and Stewart St, Seattle, WA 98101 |
The perfect marriage of heightened convenience... |
151 |
citizenM Seattle South Lake Union hotel |
201 Westlake Ave N, Seattle, WA 98109 |
Yes, it's true. Every room at citizenM is the ... |
152 rows × 3 columns
# Dimensions of the loaded table as (rows, columns).
data.shape
(152, 3)
# Show the description text stored under label 100 of the 'desc' column.
data['desc'][100]
看一下酒店的主要描述信息
# Learn the vocabulary from the descriptions and build the document-term
# count matrix in a single call, then show it as a dense array.
vec = CountVectorizer()
bag_of_words = vec.fit_transform(data['desc'])
bag_of_words.toarray()
array([[0, 1, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 1, 0, 0]], dtype=int64)
# Size of the count matrix as (documents, vocabulary terms).
bag_of_words.shape
(152, 3200)
# Sum down the rows so each column holds one term's total count over all descriptions.
sum_words = bag_of_words.sum(axis =0 )
sum_words
matrix([[ 1, 11, 11, ..., 2, 6, 2]], dtype=int64)
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent tokens in ``corpus`` with their counts.

    Args:
        corpus: Iterable of text documents, passed to ``CountVectorizer``.
        n: Number of leading entries to keep. ``None`` (the default) keeps
            every vocabulary term.

    Returns:
        A list of ``(token, count)`` tuples sorted by descending count.
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column sums give each vocabulary term's total count over the corpus;
    # the result is a single-row matrix, hence the [0, idx] lookup below.
    sum_words = bag_of_words.sum(axis=0)
    # vocabulary_ maps each token to its column index in the count matrix.
    word_freqs = [
        (word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()
    ]
    word_freqs.sort(key=lambda pair: pair[1], reverse=True)
    return word_freqs[:n]
# The 20 most frequent tokens across all hotel descriptions
# (get_top_n_words applies no stop-word filtering).
common_words = get_top_n_words(data['desc'],20)
common_words
[('the', 1258),
('and', 1062),
('of', 536),
('seattle', 533),
('to', 471),
('in', 449),
('our', 359),
('you', 304),
('hotel', 295),
('with', 280),
('is', 271),
('at', 231),
('from', 224),
('for', 216),
('your', 186),
('or', 161),
('center', 151),
('are', 136),
('downtown', 133),
('on', 129)]
# Tabulate the (token, count) pairs: tokens go in column 'desc', counts in 'counts'.
df1 = pd.DataFrame(common_words,columns=['desc','counts'])
df1.head()
|
desc |
counts |
0 |
seattle |
533 |
1 |
hotel |
295 |
2 |
center |
151 |
3 |
downtown |
133 |
4 |
free |
123 |
# Horizontal bar chart of per-token counts, sorted in ascending order.
(
    df1.groupby('desc')
    .sum()['counts']
    .sort_values()
    .iplot(
        kind='barh',
        yTitle='counts',
        linecolor='black',
        title='top 20 before remove stopwords',
    )
)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· winform 绘制太阳,地球,月球 运作规律
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人