Python音乐推荐系统

import pandas as pd
import numpy as np
import time
import sqlite3

data_home = 'E:/python学习/项目/python推荐系统/Python实现音乐推荐系统/'

读取数据


triplet_dataset = pd.read_csv(filepath_or_buffer=data_home+'train_triplets.txt', 
                              sep='\t', header=None, 
                              names=['user','song','play_count'])


triplet_dataset.shape
(48373586, 3)

## 数据占用内存和各指标
triplet_dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48373586 entries, 0 to 48373585
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   user        object
 1   song        object
 2   play_count  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ GB

triplet_dataset.head(10)
user song play_count
0 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOAKIMP12A8C130995 1
1 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOAPDEY12A81C210A9 1
2 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBBMDR12A8C13253B 2
3 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBFNSP12AF72A0E22 1
4 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBFOVM12A58A7D494 1
5 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBNZDC12A6D4FC103 1
6 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBSUJE12A6D4F8CF5 2
7 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBVFZR12A6D4F8AE3 1
8 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBXALG12A8C13C108 1
9 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBXHDL12A81C204C0 1

对每一个用户,分别统计他的播放总量

# Accumulate the total play count of every user by streaming the
# tab-separated triplet file (user, song, play_count) line by line.
output_dict = {}
with open(data_home+'train_triplets.txt') as f:
    for line in f:
        # Split each line once; int() tolerates the trailing newline.
        fields = line.split('\t')
        user = fields[0]
        # Add this record's plays to the running total (0 for a new user).
        output_dict[user] = output_dict.get(user, 0) + int(fields[2])
# One record per user: user -> total play count
output_list = [{'user':k,'play_count':v} for k,v in output_dict.items()]
# Convert to a DataFrame; naming the columns explicitly keeps the frame
# sortable even when the input file contained no rows.
play_count_df = pd.DataFrame(output_list, columns=['user','play_count'])
# Order users from the highest to the lowest total play count
play_count_df = play_count_df.sort_values(by = 'play_count', ascending = False)

play_count_df.to_csv(path_or_buf='user_playcount_df.csv', index = False)

对于每一首歌,分别统计它的播放总量

# Accumulate the total play count of every song by streaming the
# tab-separated triplet file (user, song, play_count) line by line.
output_dict = {}
with open(data_home+'train_triplets.txt') as f:
    for line in f:
        # Split each line once; int() tolerates the trailing newline.
        fields = line.split('\t')
        song = fields[1]
        # Add this record's plays to the running total (0 for a new song).
        output_dict[song] = output_dict.get(song, 0) + int(fields[2])
# One record per song: song -> total play count
output_list = [{'song':k,'play_count':v} for k,v in output_dict.items()]
# Convert to a DataFrame; naming the columns explicitly keeps the frame
# sortable even when the input file contained no rows.
song_count_df = pd.DataFrame(output_list, columns=['song','play_count'])
# Order songs from the highest to the lowest total play count
song_count_df = song_count_df.sort_values(by = 'play_count', ascending = False)
#统计方法跟上述类似

song_count_df.to_csv(path_or_buf='song_playcount_df.csv', index = False)

看看目前的排行情况

play_count_df = pd.read_csv(filepath_or_buffer='user_playcount_df.csv')
play_count_df.head(n =10)
user play_count
0 093cb74eb3c517c5179ae24caf0ebec51b24d2a2 13132
1 119b7c88d58d0c6eb051365c103da5caf817bea6 9884
2 3fa44653315697f42410a30cb766a4eb102080bb 8210
3 a2679496cd0af9779a92a13ff7c6af5c81ea8c7b 7015
4 d7d2d888ae04d16e994d6964214a1de81392ee04 6494
5 4ae01afa8f2430ea0704d502bc7b57fb52164882 6472
6 b7c24f770be6b802805ac0e2106624a517643c17 6150
7 113255a012b2affeab62607563d03fbdf31b08e7 5656
8 6d625c6557df84b60d90426c0116138b617b9449 5620
9 99ac3d883681e21ea68071019dba828ce76fe94d 5602

song_count_df = pd.read_csv(filepath_or_buffer='song_playcount_df.csv')
song_count_df.head(10)
song play_count
0 SOBONKR12A58A7A7E0 726885
1 SOAUWYT12A81C206F1 648239
2 SOSXLTC12AF72A7F54 527893
3 SOFRQTD12A81C233C0 425463
4 SOEGIYH12A6D4FC0E3 389880
5 SOAXGDH12A8C13F8A1 356533
6 SONYKOW12AB01849C9 292642
7 SOPUCYA12A8C13A694 274627
8 SOUFTBI12AB0183F65 268353
9 SOVDSJC12A58A7A271 244730

最受欢迎的一首歌曲有726885次播放。刚才大家也看到了,这个音乐数据集十分庞大,考虑到执行过程的时间消耗以及矩阵稀疏性问题,我们依据播放量指标对数据集进行截取。因为有些注册用户可能只是关注了一下之后就不再登录平台,这些用户对我们建模不会起促进作用,反而会增大矩阵的稀疏性。对于歌曲也是同理,可能有些歌曲根本无人问津。由于之前已经对用户与歌曲的播放情况进行了排序,所以我们分别选择其中排名靠前的10W名用户和3W首歌曲。关于截取的合适比例,大家也可以通过观察所选数据的播放量占总体的比例来设置。


取其中一部分数据(已经按播放量从大到小排好序,排在前面的应该是比较重要的数据),作为我们的实验数据

#10W名用户的播放量占总体的比例
total_play_count = sum(song_count_df.play_count)
print ((float(play_count_df.head(n=100000).play_count.sum())/total_play_count)*100)
play_count_subset = play_count_df.head(n=100000)
40.8807280500655
(float(song_count_df.head(n=30000).play_count.sum())/total_play_count)*100
78.39315366645269
song_count_subset = song_count_df.head(n=30000)

取10W个用户,3W首歌

user_subset = list(play_count_subset.user)
song_subset = list(song_count_subset.song)

过滤掉其他用户数据

# Reload the complete (user, song, play_count) triplets from disk
triplet_dataset = pd.read_csv(
    filepath_or_buffer=data_home+'train_triplets.txt',
    sep='\t',
    header=None,
    names=['user','song','play_count'],
)
# First pass: keep only the rows whose user is listed in user_subset
user_mask = triplet_dataset.user.isin(user_subset)
triplet_dataset_sub = triplet_dataset[user_mask]
# The full table is no longer needed once the user filter has been applied
del triplet_dataset, user_mask
# Second pass: of the remaining rows, keep only songs listed in song_subset
song_mask = triplet_dataset_sub.song.isin(song_subset)
triplet_dataset_sub_song = triplet_dataset_sub[song_mask]
del triplet_dataset_sub, song_mask
# Persist the filtered subset so later steps can reload it from CSV
triplet_dataset_sub_song.to_csv(path_or_buf=data_home+'triplet_dataset_sub_song.csv', index=False)
triplet_dataset_sub_song.shape
(10774558, 3)

此时数据样本的个数还不到原来的1/4,但被过滤掉的样本都是不利于建模的稀疏数据。所以拿到数据之后,对数据进行清洗和预处理还是非常有必要的:这不仅能提升计算速度,还会影响最终的结果。

triplet_dataset_sub_song.head(n=10)
user song play_count
498 d6589314c0a9bcbca4fee0c93b14bc402363afea SOADQPP12A67020C82 12
499 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAFTRR12AF72A8D4D 1
500 d6589314c0a9bcbca4fee0c93b14bc402363afea SOANQFY12AB0183239 1
501 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAYATB12A6701FD50 1
502 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBOAFP12A8C131F36 7
503 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBONKR12A58A7A7E0 26
504 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBZZDU12A6310D8A3 7
505 d6589314c0a9bcbca4fee0c93b14bc402363afea SOCAHRT12A8C13A1A4 5
506 d6589314c0a9bcbca4fee0c93b14bc402363afea SODASIJ12A6D4F5D89 1
507 d6589314c0a9bcbca4fee0c93b14bc402363afea SODEAWL12AB0187032 8

加入音乐详细信息

conn = sqlite3.connect(data_home+'track_metadata.db')
cur = conn.cursor()
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
cur.fetchall()
[('songs',)]
# Load the whole `songs` table and keep only the rows whose song_id is
# listed in song_subset.
track_metadata_df = pd.read_sql(con=conn, sql='select * from songs')
track_metadata_df_sub = track_metadata_df[track_metadata_df.song_id.isin(song_subset)]
# The table is now fully in memory, so release the database handle instead
# of leaving the connection open for the rest of the session.
conn.close()

track_metadata_df_sub.to_csv(path_or_buf=data_home+'track_metadata_df_sub.csv', index=False)

track_metadata_df_sub.shape
(30447, 14)

我们现有的数据

triplet_dataset_sub_song = pd.read_csv(filepath_or_buffer=data_home+'triplet_dataset_sub_song.csv',encoding = "ISO-8859-1")
track_metadata_df_sub = pd.read_csv(filepath_or_buffer=data_home+'track_metadata_df_sub.csv',encoding = "ISO-8859-1")
triplet_dataset_sub_song.head()
user song play_count
0 d6589314c0a9bcbca4fee0c93b14bc402363afea SOADQPP12A67020C82 12
1 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAFTRR12AF72A8D4D 1
2 d6589314c0a9bcbca4fee0c93b14bc402363afea SOANQFY12AB0183239 1
3 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAYATB12A6701FD50 1
4 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBOAFP12A8C131F36 7
track_metadata_df_sub.head()
track_id title song_id release artist_id artist_mbid artist_name duration artist_familiarity artist_hotttnesss year track_7digitalid shs_perf shs_work
0 TRMMGCB128E079651D Get Along (Feat: Pace Won) (Instrumental) SOHNWIM12A67ADF7D9 Charango ARU3C671187FB3F71B 067102ea-9519-4622-9077-57ca4164cfbb Morcheeba 227.47383 0.819087 0.533117 2002 185967 -1 0
1 TRMMGTX128F92FB4D9 Viejo SOECFIW12A8C144546 Caraluna ARPAAPH1187FB3601B f69d655c-ffd6-4bee-8c2a-3086b2be2fc6 Bacilos 307.51302 0.595554 0.400705 0 6825058 -1 0
2 TRMMGDP128F933E59A I Say A Little Prayer SOGWEOB12AB018A4D0 The Legendary Hi Records Albums_ Volume 3: Ful... ARNNRN31187B9AE7B7 fb7272ba-f130-4f0a-934d-6eeea4c18c9a Al Green 133.58975 0.779490 0.599210 1978 5211723 -1 11898
3 TRMMHBF12903CF6E59 At the Ball_ That's All SOJGCRL12A8C144187 Best of Laurel & Hardy - The Lonesome Pine AR1FEUF1187B9AF3E3 4a8ae4fd-ad6f-4912-851f-093f12ee3572 Laurel & Hardy 123.71546 0.438709 0.307120 0 8645877 -1 0
4 TRMMHKG12903CDB1B5 Black Gold SOHNFBA12AB018CD1D Total Life Forever ARVXV1J1187FB5BF88 6a65d878-fcd0-42cf-aff9-ca1d636a8bcc Foals 386.32444 0.842578 0.514523 2010 9007438 -1 0

清洗数据集

# Remove the two identifier columns that are not used, then keep a single
# metadata row per song_id.
track_metadata_df_sub = (
    track_metadata_df_sub
    .drop(columns=['track_id', 'artist_mbid'])
    .drop_duplicates(['song_id'])
)
# Attach the song metadata to every play record (left join: song -> song_id)
triplet_dataset_sub_song_merged = pd.merge(
    triplet_dataset_sub_song,
    track_metadata_df_sub,
    how='left',
    left_on='song',
    right_on='song_id',
)
# Rename play_count to listen_count and discard the metadata columns that
# are not kept.
triplet_dataset_sub_song_merged = (
    triplet_dataset_sub_song_merged
    .rename(columns={'play_count':'listen_count'})
    .drop(columns=[
        'song_id',
        'artist_id',
        'duration',
        'artist_familiarity',
        'artist_hotttnesss',
        'track_7digitalid',
        'shs_perf',
        'shs_work',
    ])
)
triplet_dataset_sub_song_merged.head(n=10)
user song listen_count title release artist_name year
0 d6589314c0a9bcbca4fee0c93b14bc402363afea SOADQPP12A67020C82 12 You And Me Jesus Tribute To Jake Hess Jake Hess 2004
1 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAFTRR12AF72A8D4D 1 Harder Better Faster Stronger Discovery Daft Punk 2007
2 d6589314c0a9bcbca4fee0c93b14bc402363afea SOANQFY12AB0183239 1 Uprising Uprising Muse 0
3 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAYATB12A6701FD50 1 Breakfast At Tiffany's Home Deep Blue Something 1993
4 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBOAFP12A8C131F36 7 Lucky (Album Version) We Sing. We Dance. We Steal Things. Jason Mraz & Colbie Caillat 0
5 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBONKR12A58A7A7E0 26 You're The One If There Was A Way Dwight Yoakam 1990
6 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBZZDU12A6310D8A3 7 Don't Dream It's Over Recurring Dream_ Best Of Crowded House (Domest... Crowded House 1986
7 d6589314c0a9bcbca4fee0c93b14bc402363afea SOCAHRT12A8C13A1A4 5 S.O.S. SOS Jonas Brothers 2007
8 d6589314c0a9bcbca4fee0c93b14bc402363afea SODASIJ12A6D4F5D89 1 The Invisible Man The Invisible Man Michael Cretu 1985
9 d6589314c0a9bcbca4fee0c93b14bc402363afea SODEAWL12AB0187032 8 American Idiot [feat. Green Day & The Cast Of ... The Original Broadway Cast Recording 'American... Green Day 0

展示最流行的歌曲

import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
# Sum the listen counts of all play records that share the same song title
popular_songs = (
    triplet_dataset_sub_song_merged[['title','listen_count']]
    .groupby('title')
    .sum()
    .reset_index()
)
# Keep the 20 titles with the highest totals, largest first
popular_songs_top_20 = popular_songs.sort_values('listen_count', ascending=False).head(20)

# Bar labels, bar heights and bar positions for the chart
objects = popular_songs_top_20['title'].tolist()
performance = popular_songs_top_20['listen_count'].tolist()
y_pos = np.arange(len(objects))

# One bar per title, with the title written vertically under its bar
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular songs')

plt.show()

最受欢迎的releases

# Sum the listen counts of all play records that share the same release name
popular_release = (
    triplet_dataset_sub_song_merged[['release','listen_count']]
    .groupby('release')
    .sum()
    .reset_index()
)
# Keep the 20 releases with the highest totals, largest first
popular_release_top_20 = popular_release.sort_values('listen_count', ascending=False).head(20)

# Bar labels, bar heights and bar positions for the chart
objects = popular_release_top_20['release'].tolist()
performance = popular_release_top_20['listen_count'].tolist()
y_pos = np.arange(len(objects))

# One bar per release, with the name written vertically under its bar
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular Release')

plt.show()

最受欢迎的歌手

# Sum the listen counts of all play records that share the same artist name
popular_artist = (
    triplet_dataset_sub_song_merged[['artist_name','listen_count']]
    .groupby('artist_name')
    .sum()
    .reset_index()
)
# Keep the 20 artists with the highest totals, largest first
popular_artist_top_20 = popular_artist.sort_values('listen_count', ascending=False).head(20)

# Bar labels, bar heights and bar positions for the chart
objects = popular_artist_top_20['artist_name'].tolist()
performance = popular_artist_top_20['listen_count'].tolist()
y_pos = np.arange(len(objects))

# One bar per artist, with the name written vertically under its bar
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular Artists')

plt.show()

用户播放过歌曲量的分布

user_song_count_distribution = triplet_dataset_sub_song_merged[['user','title']].groupby('user').count().reset_index().sort_values(
by='title',ascending = False)
user_song_count_distribution.title.describe()
count    99996.000000
mean       107.749890
std         79.742561
min          1.000000
25%         53.000000
50%         89.000000
75%        141.000000
max       1189.000000
Name: title, dtype: float64
x = user_song_count_distribution.title
n, bins, patches = plt.hist(x, 50, facecolor='green', alpha=0.75)
plt.xlabel('Play Counts')
plt.ylabel('Num of Users')
plt.title(r'$\mathrm{Histogram\ of\ User\ Play\ Count\ Distribution}\ $')
plt.grid(True)
plt.show()

绝大多数用户播放歌曲的数量在100左右,关于数据的处理和介绍已经给大家都分析过了,接下来我们要做的就是构建一个能实际进行推荐的程序了。

开始构建推荐系统

import Recommenders as Recommenders
from sklearn.model_selection import train_test_split
triplet_dataset_sub_song_merged_set = triplet_dataset_sub_song_merged
train_data, test_data = train_test_split(triplet_dataset_sub_song_merged_set, test_size = 0.40, random_state=0)
train_data.head()
user song listen_count title release artist_name year
1901799 28866ea8a809d5d46273cd0989c5515c660ef8c7 SOEYVHS12AB0181D31 1 Monster The Fame Monster Lady GaGa 2009
4815185 c9608a24a2a40e0ec38993a70532e7bb56eff22b SOKIYKQ12A8AE464FC 2 Fight For Your Life Made In NYC The Casualties 2000
10513026 24f0b09c133a6a0fe42f097734215dceb468d449 SOETFVO12AB018DFF3 1 Free Style (feat. Kevo_ Mussilini & Lyrical 187) A Bad Azz Mix Tape Z-RO 0
2659073 4da3c59a0af73245cea000fd5efa30384182bfcb SOAXJOU12A6D4F6685 1 Littlest Things Alright_ Still Lily Allen 2006
5506263 b46c5ed385cad7ecea8af6214f440d19de6eb6c2 SOXBCAY12AB0189EE0 1 La trama y el desenlace Amar la trama Jorge Drexler 2010
def create_popularity_recommendation(train_data, user_id, item_id, top_n=20):
    """Build a popularity leaderboard of the items with the most play records.

    Args:
        train_data: DataFrame with one row per play record.
        user_id: Name of the column whose non-null entries are counted per item.
        item_id: Name of the column to group by (e.g. song title, release
            name or artist name).
        top_n: Number of leaderboard rows to return. Defaults to 20, the
            size that used to be hard-coded.

    Returns:
        DataFrame with the columns [item_id, 'score', 'Rank'], ordered by
        descending score with ties broken by ascending item_id. 'Rank' is a
        float column starting at 1.0 for the most popular item.
    """
    # Score of an item = number of non-null user entries recorded for it
    train_data_grouped = (
        train_data.groupby(item_id)[user_id].count().reset_index(name='score')
    )

    # A leaderboard must be ordered: best score first, item name as tie-break
    train_data_sort = train_data_grouped.sort_values(
        ['score', item_id], ascending=[False, True]
    )

    # method='first' ranks tied scores in the order in which they appear,
    # i.e. in the sorted order established above
    train_data_sort['Rank'] = train_data_sort['score'].rank(
        ascending=False, method='first'
    )

    # Return only the requested number of top entries
    popularity_recommendations = train_data_sort.head(top_n)
    return popularity_recommendations
recommendations = create_popularity_recommendation(triplet_dataset_sub_song_merged,'user','title')
recommendations
title score Rank
19580 Sehr kosmisch 18626 1.0
5780 Dog Days Are Over (Radio Edit) 17635 2.0
27314 You're The One 16085 3.0
19542 Secrets 15138 4.0
18636 Revelry 14945 5.0
25070 Undo 14687 6.0
7530 Fireflies 13085 7.0
9640 Hey_ Soul Sister 12993 8.0
25216 Use Somebody 12793 9.0
9921 Horn Concerto No. 4 in E flat K495: II. Romanc... 12346 10.0
24291 Tive Sim 11831 11.0
3629 Canada 11598 12.0
23468 The Scientist 11529 13.0
4194 Clocks 11357 14.0
12135 Just Dance 11058 15.0
26974 Yellow 10919 16.0
16438 OMG 10818 17.0
9844 Home 10512 18.0
3295 Bulletproof 10383 19.0
4760 Creep (Explicit) 10246 20.0

基于歌曲相似度的推荐

song_count_subset = song_count_df.head(n=5000)
user_subset = list(play_count_subset.user)
song_subset = list(song_count_subset.song)
triplet_dataset_sub_song_merged_sub = triplet_dataset_sub_song_merged[triplet_dataset_sub_song_merged.song.isin(song_subset)]

triplet_dataset_sub_song_merged_sub.head()
user song listen_count title release artist_name year
0 d6589314c0a9bcbca4fee0c93b14bc402363afea SOADQPP12A67020C82 12 You And Me Jesus Tribute To Jake Hess Jake Hess 2004
1 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAFTRR12AF72A8D4D 1 Harder Better Faster Stronger Discovery Daft Punk 2007
2 d6589314c0a9bcbca4fee0c93b14bc402363afea SOANQFY12AB0183239 1 Uprising Uprising Muse 0
3 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAYATB12A6701FD50 1 Breakfast At Tiffany's Home Deep Blue Something 1993
4 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBOAFP12A8C131F36 7 Lucky (Album Version) We Sing. We Dance. We Steal Things. Jason Mraz & Colbie Caillat 0

计算相似度得到推荐结果

import Recommenders as Recommenders
train_data, test_data = train_test_split(triplet_dataset_sub_song_merged_sub, test_size = 0.30, random_state=0)
is_model = Recommenders.item_similarity_recommender_py()
is_model.create(train_data, 'user', 'title')
user_id = list(train_data.user)[7]
user_items = is_model.get_user_items(user_id)
#执行推荐
is_model.recommend(user_id)
No. of unique songs for the user: 66
no. of unique songs in the training set: 4879
Non zero values in cooccurence_matrix :290327
user_id song score rank
0 a974fc428825ed071281302d6976f59bfa95fe7e Put Your Head On My Shoulder (Album Version) 0.026334 1
1 a974fc428825ed071281302d6976f59bfa95fe7e The Strength To Go On 0.025176 2
2 a974fc428825ed071281302d6976f59bfa95fe7e Come Fly With Me (Album Version) 0.024447 3
3 a974fc428825ed071281302d6976f59bfa95fe7e Moondance (Album Version) 0.024118 4
4 a974fc428825ed071281302d6976f59bfa95fe7e Kotov Syndrome 0.023311 5
5 a974fc428825ed071281302d6976f59bfa95fe7e Use Somebody 0.023104 6
6 a974fc428825ed071281302d6976f59bfa95fe7e Lucky (Album Version) 0.022930 7
7 a974fc428825ed071281302d6976f59bfa95fe7e Secrets 0.022889 8
8 a974fc428825ed071281302d6976f59bfa95fe7e Clocks 0.022562 9
9 a974fc428825ed071281302d6976f59bfa95fe7e Sway (Album Version) 0.022359 10

基于矩阵分解(SVD)的推荐

triplet_dataset_sub_song_merged_sum_df = triplet_dataset_sub_song_merged[['user','listen_count']].groupby('user').sum().reset_index()
triplet_dataset_sub_song_merged_sum_df.rename(columns={'listen_count':'total_listen_count'},inplace=True)
triplet_dataset_sub_song_merged = pd.merge(triplet_dataset_sub_song_merged,triplet_dataset_sub_song_merged_sum_df)
triplet_dataset_sub_song_merged.head()
user song listen_count title release artist_name year total_listen_count
0 d6589314c0a9bcbca4fee0c93b14bc402363afea SOADQPP12A67020C82 12 You And Me Jesus Tribute To Jake Hess Jake Hess 2004 329
1 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAFTRR12AF72A8D4D 1 Harder Better Faster Stronger Discovery Daft Punk 2007 329
2 d6589314c0a9bcbca4fee0c93b14bc402363afea SOANQFY12AB0183239 1 Uprising Uprising Muse 0 329
3 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAYATB12A6701FD50 1 Breakfast At Tiffany's Home Deep Blue Something 1993 329
4 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBOAFP12A8C131F36 7 Lucky (Album Version) We Sing. We Dance. We Steal Things. Jason Mraz & Colbie Caillat 0 329
triplet_dataset_sub_song_merged['fractional_play_count'] = triplet_dataset_sub_song_merged['listen_count']/triplet_dataset_sub_song_merged['total_listen_count']

triplet_dataset_sub_song_merged[triplet_dataset_sub_song_merged.user =='d6589314c0a9bcbca4fee0c93b14bc402363afea'][['user','song','listen_count','fractional_play_count']].head()
user song listen_count fractional_play_count
0 d6589314c0a9bcbca4fee0c93b14bc402363afea SOADQPP12A67020C82 12 0.036474
1 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAFTRR12AF72A8D4D 1 0.003040
2 d6589314c0a9bcbca4fee0c93b14bc402363afea SOANQFY12AB0183239 1 0.003040
3 d6589314c0a9bcbca4fee0c93b14bc402363afea SOAYATB12A6701FD50 1 0.003040
4 d6589314c0a9bcbca4fee0c93b14bc402363afea SOBOAFP12A8C131F36 7 0.021277

from scipy.sparse import coo_matrix

small_set = triplet_dataset_sub_song_merged
# Distinct users and songs in order of first appearance. reset_index() keeps
# the original row label in an 'index' column, renamed here to *_index.
user_codes = (
    small_set.user.drop_duplicates()
    .reset_index()
    .rename(columns={'index':'user_index'})
)
song_codes = (
    small_set.song.drop_duplicates()
    .reset_index()
    .rename(columns={'index':'song_index'})
)
# Consecutive 0..n-1 codes (the fresh row index) used as matrix coordinates
song_codes['so_index_value'] = song_codes.index
user_codes['us_index_value'] = user_codes.index
# Left joins without an explicit key: pandas joins on the columns both
# frames share, which attaches the codes to every play record.
small_set = small_set.merge(song_codes, how='left')
small_set = small_set.merge(user_codes, how='left')
# (row, column, value) triples of the user-by-song matrix
mat_candidate = small_set[['us_index_value','so_index_value','fractional_play_count']]
row_array = mat_candidate['us_index_value'].values
col_array = mat_candidate['so_index_value'].values
data_array = mat_candidate['fractional_play_count'].values

# Sparse matrix in COOrdinate format: users as rows, songs as columns
data_sparse = coo_matrix((data_array, (row_array, col_array)), dtype=float)

data_sparse
<99996x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 10774558 stored elements in COOrdinate format>

user_codes[user_codes.user =='2a2f776cbac6df64d6cb505e7e834e01684673b6']
user_index user us_index_value
27516 2981434 2a2f776cbac6df64d6cb505e7e834e01684673b6 27516

使用SVD方法来进行矩阵分解

import math as mt
from scipy.sparse.linalg import * #used for matrix multiplication
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix

def compute_svd(urm, K):
    """Run a truncated SVD on `urm` and return the factors as CSC matrices.

    Args:
        urm: Matrix to factorise; handed unchanged to scipy's `svds`.
        K: Number of singular values/vectors requested from `svds`.

    Returns:
        Tuple (U, S, Vt) of float32 CSC matrices. S is a K x K diagonal
        matrix holding the *square roots* of the singular values, not the
        singular values themselves.
    """
    left_vectors, singular_values, right_vectors = svds(urm, K)

    # Diagonal of sqrt(singular value), computed in double precision and
    # then stored as float32
    root_diagonal = np.diag(
        [mt.sqrt(value) for value in singular_values]
    ).astype(np.float32)

    return (
        csc_matrix(left_vectors, dtype=np.float32),
        csc_matrix(root_diagonal, dtype=np.float32),
        csc_matrix(right_vectors, dtype=np.float32),
    )

def compute_estimated_matrix(urm, U, S, Vt, uTest, K, test):
    """Rank the items for the given users from the factor matrices.

    For every user index in `uTest` the estimated ratings are computed as
    ``U[user] @ S @ Vt`` and the item indices are ordered from the highest to
    the lowest estimate.

    Args:
        urm: User-by-item matrix; only its shape is read.
        U: Sparse user-factor matrix (one row per user).
        S: Sparse square matrix multiplied onto `Vt`.
        Vt: Sparse item-factor matrix (one column per item).
        uTest: Iterable of user row indices to produce recommendations for.
        K: Unused; kept so that existing callers keep working.
        test: Unused; kept so that existing callers keep working.

    Returns:
        int32 array of shape (n_users, width) with width = min(250, n_items).
        Row `u` holds the recommended item indices for user `u`, best first.
        Rows of users that are not listed in `uTest` stay all zero.
    """
    # Take the dimensions from the matrix itself instead of module globals
    n_users, n_items = urm.shape
    max_recommendation = 250
    # Never ask for more recommendations than there are items
    width = min(max_recommendation, n_items)

    right_term = S @ Vt
    # Item indices must be stored as integers: a float16 buffer represents
    # integers exactly only up to 2048 and silently corrupts larger indices.
    recommended = np.zeros(shape=(n_users, width), dtype=np.int32)
    for user in uTest:
        # Materialise one dense row of estimates at a time rather than a
        # full n_users x n_items matrix.
        scores = (U[user, :] @ right_term).toarray().ravel()
        # Negate so the highest estimate comes first; a stable sort keeps
        # tied items in index order.
        recommended[user, :] = np.argsort(-scores, kind='stable')[:width]
    return recommended

K=50
urm = data_sparse
MAX_PID = urm.shape[1]
MAX_UID = urm.shape[0]

U, S, Vt = compute_svd(urm, K)

uTest = [4,5,6,7,8,873,23]

uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, K, True)

# Print the ten best recommendations (title and artist) for every test user
for user in uTest:
    print("当前待推荐用户编号 {}".format(user))
    top_items = uTest_recommended_items[user, 0:10]
    for rank_value, item_code in enumerate(top_items, start=1):
        # All play records of this song code, reduced to a single row
        matches = small_set[small_set.so_index_value == item_code]
        song_details = matches.drop_duplicates('so_index_value')[['title','artist_name']]
        title = song_details['title'].iloc[0]
        artist = song_details['artist_name'].iloc[0]
        print("推荐编号: {} 推荐歌曲: {} 作者: {}".format(rank_value, title, artist))
当前待推荐用户编号 4
推荐编号: 1 推荐歌曲: Fireflies 作者: Charttraxx Karaoke
推荐编号: 2 推荐歌曲: Hey_ Soul Sister 作者: Train
推荐编号: 3 推荐歌曲: OMG 作者: Usher featuring will.i.am
推荐编号: 4 推荐歌曲: Lucky (Album Version) 作者: Jason Mraz & Colbie Caillat
推荐编号: 5 推荐歌曲: Vanilla Twilight 作者: Owl City
推荐编号: 6 推荐歌曲: Crumpshit 作者: Philippe Rochard
推荐编号: 7 推荐歌曲: Billionaire [feat. Bruno Mars]  (Explicit Album Version) 作者: Travie McCoy
推荐编号: 8 推荐歌曲: Love Story 作者: Taylor Swift
推荐编号: 9 推荐歌曲: TULENLIEKKI 作者: M.A. Numminen
推荐编号: 10 推荐歌曲: Use Somebody 作者: Kings Of Leon
当前待推荐用户编号 5
推荐编号: 1 推荐歌曲: Sehr kosmisch 作者: Harmonia
推荐编号: 2 推荐歌曲: Ain't Misbehavin 作者: Sam Cooke
推荐编号: 3 推荐歌曲: Dog Days Are Over (Radio Edit) 作者: Florence + The Machine
推荐编号: 4 推荐歌曲: Revelry 作者: Kings Of Leon
推荐编号: 5 推荐歌曲: Undo 作者: Björk
推荐编号: 6 推荐歌曲: Cosmic Love 作者: Florence + The Machine
推荐编号: 7 推荐歌曲: Home 作者: Edward Sharpe & The Magnetic Zeros
推荐编号: 8 推荐歌曲: You've Got The Love 作者: Florence + The Machine
推荐编号: 9 推荐歌曲: Bring Me To Life 作者: Evanescence
推荐编号: 10 推荐歌曲: Tighten Up 作者: The Black Keys
当前待推荐用户编号 6
推荐编号: 1 推荐歌曲: Crumpshit 作者: Philippe Rochard
推荐编号: 2 推荐歌曲: Marry Me 作者: Train
推荐编号: 3 推荐歌曲: Hey_ Soul Sister 作者: Train
推荐编号: 4 推荐歌曲: Lucky (Album Version) 作者: Jason Mraz & Colbie Caillat
推荐编号: 5 推荐歌曲: One On One 作者: the bird and the bee
推荐编号: 6 推荐歌曲: I Never Told You 作者: Colbie Caillat
推荐编号: 7 推荐歌曲: Canada 作者: Five Iron Frenzy
推荐编号: 8 推荐歌曲: Fireflies 作者: Charttraxx Karaoke
推荐编号: 9 推荐歌曲: TULENLIEKKI 作者: M.A. Numminen
推荐编号: 10 推荐歌曲: Bring Me To Life 作者: Evanescence
当前待推荐用户编号 7
推荐编号: 1 推荐歌曲: Behind The Sea [Live In Chicago] 作者: Panic At The Disco
推荐编号: 2 推荐歌曲: The City Is At War (Album Version) 作者: Cobra Starship
推荐编号: 3 推荐歌曲: Dead Souls 作者: Nine Inch Nails
推荐编号: 4 推荐歌曲: Una Confusion 作者: LU
推荐编号: 5 推荐歌曲: Home 作者: Edward Sharpe & The Magnetic Zeros
推荐编号: 6 推荐歌曲: Climbing Up The Walls 作者: Radiohead
推荐编号: 7 推荐歌曲: Tighten Up 作者: The Black Keys
推荐编号: 8 推荐歌曲: Tive Sim 作者: Cartola
推荐编号: 9 推荐歌曲: West One (Shine On Me) 作者: The Ruts
推荐编号: 10 推荐歌曲: Cosmic Love 作者: Florence + The Machine
当前待推荐用户编号 8
推荐编号: 1 推荐歌曲: Undo 作者: Björk
推荐编号: 2 推荐歌曲: Canada 作者: Five Iron Frenzy
推荐编号: 3 推荐歌曲: Better To Reign In Hell 作者: Cradle Of Filth
推荐编号: 4 推荐歌曲: Unite (2009 Digital Remaster) 作者: Beastie Boys
推荐编号: 5 推荐歌曲: Behind The Sea [Live In Chicago] 作者: Panic At The Disco
推荐编号: 6 推荐歌曲: Rockin' Around The Christmas Tree 作者: Brenda Lee
推荐编号: 7 推荐歌曲: Devil's Slide 作者: Joe Satriani
推荐编号: 8 推荐歌曲: Revelry 作者: Kings Of Leon
推荐编号: 9 推荐歌曲: 16 Candles 作者: The Crests
推荐编号: 10 推荐歌曲: Catch You Baby (Steve Pitron & Max Sanna Radio Edit) 作者: Lonnie Gordon
当前待推荐用户编号 873
推荐编号: 1 推荐歌曲: The Scientist 作者: Coldplay
推荐编号: 2 推荐歌曲: Yellow 作者: Coldplay
推荐编号: 3 推荐歌曲: Clocks 作者: Coldplay
推荐编号: 4 推荐歌曲: Fix You 作者: Coldplay
推荐编号: 5 推荐歌曲: In My Place 作者: Coldplay
推荐编号: 6 推荐歌曲: Shiver 作者: Coldplay
推荐编号: 7 推荐歌曲: Speed Of Sound 作者: Coldplay
推荐编号: 8 推荐歌曲: Creep (Explicit) 作者: Radiohead
推荐编号: 9 推荐歌曲: Sparks 作者: Coldplay
推荐编号: 10 推荐歌曲: Use Somebody 作者: Kings Of Leon
当前待推荐用户编号 23
推荐编号: 1 推荐歌曲: Garden Of Eden 作者: Guns N' Roses
推荐编号: 2 推荐歌曲: Don't Speak 作者: John Dahlbäck
推荐编号: 3 推荐歌曲: Master Of Puppets 作者: Metallica
推荐编号: 4 推荐歌曲: TULENLIEKKI 作者: M.A. Numminen
推荐编号: 5 推荐歌曲: Bring Me To Life 作者: Evanescence
推荐编号: 6 推荐歌曲: Kryptonite 作者: 3 Doors Down
推荐编号: 7 推荐歌曲: Make Her Say 作者: Kid Cudi / Kanye West / Common
推荐编号: 8 推荐歌曲: Night Village 作者: Deep Forest
推荐编号: 9 推荐歌曲: Better To Reign In Hell 作者: Cradle Of Filth
推荐编号: 10 推荐歌曲: Xanadu 作者: Olivia Newton-John;Electric Light Orchestra
uTest = [27513]
#Get estimated rating for test user
print("Predictied ratings:")
uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, K, True)
Predictied ratings:
# Print the ten best recommendations (title and artist) for every test user
for user in uTest:
    print("当前待推荐用户编号 {}".format(user))
    top_items = uTest_recommended_items[user, 0:10]
    for rank_value, item_code in enumerate(top_items, start=1):
        # All play records of this song code, reduced to a single row
        matches = small_set[small_set.so_index_value == item_code]
        song_details = matches.drop_duplicates('so_index_value')[['title','artist_name']]
        title = song_details['title'].iloc[0]
        artist = song_details['artist_name'].iloc[0]
        print("推荐编号: {} 推荐歌曲: {} 作者: {}".format(rank_value, title, artist))
当前待推荐用户编号 27513
推荐编号: 1 推荐歌曲: Master Of Puppets 作者: Metallica
推荐编号: 2 推荐歌曲: Garden Of Eden 作者: Guns N' Roses
推荐编号: 3 推荐歌曲: Bring Me To Life 作者: Evanescence
推荐编号: 4 推荐歌曲: Kryptonite 作者: 3 Doors Down
推荐编号: 5 推荐歌曲: Make Her Say 作者: Kid Cudi / Kanye West / Common
推荐编号: 6 推荐歌曲: Night Village 作者: Deep Forest
推荐编号: 7 推荐歌曲: Savior 作者: Rise Against
推荐编号: 8 推荐歌曲: Good Things 作者: Rich Boy / Polow Da Don / Keri Hilson
推荐编号: 9 推荐歌曲: Bleed It Out [Live At Milton Keynes] 作者: Linkin Park
推荐编号: 10 推荐歌曲: Uprising 作者: Muse
posted @ 2024-02-06 14:52  AIIsFuture  阅读(85)  评论(1编辑  收藏  举报