python酒店相似度推荐系统

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random

import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()

# Load the dataset: a comma-separated text file, decoded as latin-1.
data = pd.read_csv('./Seattle_Hotels.txt', encoding="latin-1")
data

name address desc
0 Hilton Garden Seattle Downtown 1821 Boren Avenue, Seattle Washington 98101 USA Located on the southern tip of Lake Union, the...
1 Sheraton Grand Seattle 1400 6th Avenue, Seattle, Washington 98101 USA Located in the city's vibrant core, the Sherat...
2 Crowne Plaza Seattle Downtown 1113 6th Ave, Seattle, WA 98101 Located in the heart of downtown Seattle, the ...
3 Kimpton Hotel Monaco Seattle 1101 4th Ave, Seattle, WA98101 What?s near our hotel downtown Seattle locatio...
4 The Westin Seattle 1900 5th Avenue, Seattle, Washington 98101 USA Situated amid incredible shopping and iconic a...
... ... ... ...
147 The Halcyon Suite Du Jour 1125 9th Ave W, Seattle, WA 98119 Located in Queen Anne district, The Halcyon Su...
148 Vermont Inn 2721 4th Ave, Seattle, WA 98121 Just a block from the world famous Space Needl...
149 Stay Alfred on Wall Street 2515 4th Ave, Seattle, WA 98121 Stay Alfred on Wall Street resides in the hear...
150 Pike's Place Lux Suites by Barsala 2nd Ave and Stewart St, Seattle, WA 98101 The perfect marriage of heightened convenience...
151 citizenM Seattle South Lake Union hotel 201 Westlake Ave N, Seattle, WA 98109 Yes, it's true. Every room at citizenM is the ...

152 rows × 3 columns


data.shape
(152, 3)

data['desc'][100]
'On a budget in Seattle or looking for something different? The historic charm and "home away from home" atmosphere of The Baroness will be sure to make you feel like one of the family. Conveniently located on First Hill, we are proud to be part of the Virginia Mason Hospital campus and only minutes from Harborview Medical Center and Swedish Hospital. The Baroness Hotel is a great option for short or long term medical, patient or family stays. Whether you are visiting the area\'s world-class medical facilities or on a budget vacation, our goal is to ensure a wonderful stay. Guest Amenities: Complimentary Internet access, Two twin, one or two queen studios with mini fridge and microwave, Two twin or one queen suites with full kitchens, Laundry facilities available, Flat screen cable television with HBO, Complimentary local calls, Ice and vending machines located in the lobby, Coffee maker and hairdryers in all guestrooms, Room service available seven days a week from the Rhododendron Cafe, Limited wheelchair accessibility, Guest library and business center, Printing & fax services available, 100% non-smoking and pet free, Rooms are not air conditioned - fans are available, Self-parking available at Virginia Mason hospital for a fee.'

看一下酒店的主要描述信息

# Learn the vocabulary from the descriptions and encode each description
# as a row of per-token counts in one step.
vec = CountVectorizer()
bag_of_words = vec.fit_transform(data['desc'])

bag_of_words.toarray()
array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

bag_of_words.shape
(152, 3200)
# Sum down the rows: one total per vocabulary column.
sum_words = bag_of_words.sum(axis=0)

sum_words
matrix([[ 1, 11, 11, ...,  2,  6,  2]], dtype=int64)





def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent tokens in ``corpus`` with their counts.

    Tokens are produced by a default-configured ``CountVectorizer``.

    Args:
        corpus: Iterable of text documents (e.g. the ``desc`` column).
        n: Number of leading entries to return; ``None`` returns an entry
            for every token in the vocabulary.

    Returns:
        A list of ``(token, count)`` tuples sorted by count, highest first.
    """
    vec = CountVectorizer()
    # fit_transform reads the corpus a single time, so a one-shot iterable
    # (e.g. a generator) is not exhausted before the counts are built.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total number of occurrences of each token.
    sum_words = bag_of_words.sum(axis=0)
    # vocabulary_ maps each token to its column index in the count matrix.
    word_freqs = [
        (word, sum_words[0, col]) for word, col in vec.vocabulary_.items()
    ]
    word_freqs.sort(key=lambda pair: pair[1], reverse=True)
    return word_freqs[:n]



# Twenty most frequent tokens across all hotel descriptions
# (stop words are still included at this point).
common_words = get_top_n_words(data['desc'], n=20)
common_words
[('the', 1258),
 ('and', 1062),
 ('of', 536),
 ('seattle', 533),
 ('to', 471),
 ('in', 449),
 ('our', 359),
 ('you', 304),
 ('hotel', 295),
 ('with', 280),
 ('is', 271),
 ('at', 231),
 ('from', 224),
 ('for', 216),
 ('your', 186),
 ('or', 161),
 ('center', 151),
 ('are', 136),
 ('downtown', 133),
 ('on', 129)]


# Put the (token, count) pairs into a two-column frame for plotting.
df1 = pd.DataFrame(data=common_words, columns=['desc', 'counts'])
df1.head()
desc counts
0 the 1258
1 and 1062
2 of 536
3 seattle 533
4 to 471



# Horizontal bar chart of the token counts, sorted from smallest to largest.
(
    df1.groupby('desc')
    .sum()['counts']
    .sort_values()
    .iplot(
        kind='barh',
        yTitle='counts',
        linecolor='black',
        title='top 20 before remove stopwords',
    )
)
posted @ 2024-03-10 18:16  AIIsFuture  阅读(38)  评论(0编辑  收藏  举报