python酒店相似度推荐系统

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random

import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()

# Load the dataset: a comma-separated text file read with latin-1 decoding.
data = pd.read_csv('./Seattle_Hotels.txt', encoding="latin-1")
data

	name	address	desc
0	Hilton Garden Seattle Downtown	1821 Boren Avenue, Seattle Washington 98101 USA	Located on the southern tip of Lake Union, the...
1	Sheraton Grand Seattle	1400 6th Avenue, Seattle, Washington 98101 USA	Located in the city's vibrant core, the Sherat...
2	Crowne Plaza Seattle Downtown	1113 6th Ave, Seattle, WA 98101	Located in the heart of downtown Seattle, the ...
3	Kimpton Hotel Monaco Seattle	1101 4th Ave, Seattle, WA98101	What?s near our hotel downtown Seattle locatio...
4	The Westin Seattle	1900 5th Avenue, Seattle, Washington 98101 USA	Situated amid incredible shopping and iconic a...
...	...	...	...
147	The Halcyon Suite Du Jour	1125 9th Ave W, Seattle, WA 98119	Located in Queen Anne district, The Halcyon Su...
148	Vermont Inn	2721 4th Ave, Seattle, WA 98121	Just a block from the world famous Space Needl...
149	Stay Alfred on Wall Street	2515 4th Ave, Seattle, WA 98121	Stay Alfred on Wall Street resides in the hear...
150	Pike's Place Lux Suites by Barsala	2nd Ave and Stewart St, Seattle, WA 98101	The perfect marriage of heightened convenience...
151	citizenM Seattle South Lake Union hotel	201 Westlake Ave N, Seattle, WA 98109	Yes, it's true. Every room at citizenM is the ...

152 rows × 3 columns

# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(152, 3)

# Display the full description text of the row labelled 100 as a sample.
data['desc'][100]

'On a budget in Seattle or looking for something different? The historic charm and "home away from home" atmosphere of The Baroness will be sure to make you feel like one of the family. Conveniently located on First Hill, we are proud to be part of the Virginia Mason Hospital campus and only minutes from Harborview Medical Center and Swedish Hospital. The Baroness Hotel is a great option for short or long term medical, patient or family stays. Whether you are visiting the area\'s world-class medical facilities or on a budget vacation, our goal is to ensure a wonderful stay. Guest Amenities: Complimentary Internet access, Two twin, one or two queen studios with mini fridge and microwave, Two twin or one queen suites with full kitchens, Laundry facilities available, Flat screen cable television with HBO, Complimentary local calls, Ice and vending machines located in the lobby, Coffee maker and hairdryers in all guestrooms, Room service available seven days a week from the Rhododendron Cafe, Limited wheelchair accessibility, Guest library and business center, Printing & fax services available, 100% non-smoking and pet free, Rooms are not air conditioned - fans are available, Self-parking available at Virginia Mason hospital for a fee.'

看一下酒店的主要描述信息

# Learn the vocabulary from the hotel descriptions, then encode every
# description as a row of per-word counts (a document-term matrix).
vec = CountVectorizer()
vec.fit(data['desc'])
bag_of_words = vec.transform(data['desc'])

# Expand the result to a dense array for inspection.
bag_of_words.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

# Dimensions of the document-term matrix: (documents, vocabulary terms).
bag_of_words.shape

(152, 3200)

# Sum down the rows (axis 0) to get one total count per vocabulary column.
sum_words = bag_of_words.sum(axis =0 )

sum_words

matrix([[ 1, 11, 11, ...,  2,  6,  2]], dtype=int64)

def get_top_n_words(corpus, n=None):
    """Return the most frequent words in ``corpus`` with their total counts.

    Args:
        corpus: Iterable of text documents (for example a pandas Series of
            strings), tokenized with a default ``CountVectorizer``.
        n: How many leading ``(word, count)`` pairs to return. ``None``
            returns the whole vocabulary.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
    """
    vec = CountVectorizer()
    # fit_transform walks the corpus once. Calling fit() and then transform()
    # iterates it twice, so a one-shot iterable (e.g. a generator) would be
    # exhausted by fit() and every count would come back as zero.
    bag_of_words = vec.fit_transform(corpus)
    # Summing over axis 0 collapses the documents, leaving a single row with
    # one total per vocabulary column.
    sum_words = bag_of_words.sum(axis=0)
    # vocabulary_ maps each word to its column index in the matrix.
    word_freqs = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    ranked = sorted(word_freqs, key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Collect the 20 most frequent words across all hotel descriptions.
common_words = get_top_n_words(data['desc'], n=20)
common_words

[('the', 1258),
 ('and', 1062),
 ('of', 536),
 ('seattle', 533),
 ('to', 471),
 ('in', 449),
 ('our', 359),
 ('you', 304),
 ('hotel', 295),
 ('with', 280),
 ('is', 271),
 ('at', 231),
 ('from', 224),
 ('for', 216),
 ('your', 186),
 ('or', 161),
 ('center', 151),
 ('are', 136),
 ('downtown', 133),
 ('on', 129)]

# Put the (word, count) pairs into a two-column DataFrame for plotting;
# the word column is named 'desc'.
df1 = pd.DataFrame(common_words,columns=['desc','counts'])
df1.head()

	desc	counts
0	the	1258
1	and	1062
2	of	536
3	seattle	533
4	to	471

# Plot the per-word counts in df1 as a horizontal bar chart, sorted
# ascending by count.
# NOTE(review): with kind='barh' the counts presumably run along the x-axis,
# so yTitle='counts' may end up labelling the word axis - confirm on the
# rendered chart.
(
    df1.groupby('desc')['counts']
    .sum()
    .sort_values()
    .iplot(
        kind='barh',
        yTitle='counts',
        linecolor='black',
        title='top 20 before remove stopwords',
    )
)

posted @ 2024-03-10 18:16 AIIsFuture 阅读(43) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· python音乐推荐系统

· python信用卡欺诈检测

· 数据可视化-matplotlib-note

· DS / ML Basic Notes

· Python数据分析numpy、pandas、matplotlib包

阅读排行：
· 震惊！C++程序真的从main开始吗？99%的程序员都答错了
· winform 绘制太阳，地球，月球运作规律
· 【硬核科普】Trae如何「偷看」你的代码？零基础破解AI编程运行原理
· 上周热点回顾（3.3-3.9）
· 超详细：普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人

公告

昵称： AIIsFuture
园龄： 7年4个月
粉丝： 1
关注： 2

+加关注

2025年3月

日

一

二

三

四

五

六

随笔分类

大数据(1)

随笔档案

阅读排行榜

评论排行榜

1. python音乐推荐系统(1)

AI Is Future

微信公众号：AI Is Future，深耕数字化、AI践行者

python酒店相似度推荐系统

看一下酒店的主要描述信息

公告

搜索

常用链接

积分与排名

随笔分类

随笔档案

阅读排行榜

评论排行榜

最新评论