A Music Recommendation System Case Study

#!/usr/bin/env python
# coding: utf-8

# # Recommendation System
# 
# - Processing the music data
# 
# - Recommendation based on item similarity
# 
# - Recommendation based on SVD matrix factorization

# In[1]:


import os
os.getcwd()


# ## Reading the Data

# In[2]:


import pandas as pd
import numpy as np
import time
import sqlite3

data_home = './'


# From the data we only need the user, the song, and the play count

# In[4]:


get_ipython().run_cell_magic('time', '', "triplet_dataset = pd.read_csv(filepath_or_buffer=data_home+'train_triplets.txt', \n                              sep='\\t', header=None, \n                              names=['user','song','play_count'])")


# In[5]:


# %%time
triplet_dataset.shape


# In[6]:


# %%time
triplet_dataset.info()


# In[7]:


# %%time
triplet_dataset.head(n=10)


# In[8]:


get_ipython().run_cell_magic('time', '', "triplet_dataset['play_count'] = triplet_dataset['play_count'].astype('int32')")


# In[9]:


triplet_dataset.info()


# In[10]:


triplet_dataset.head(n=10)


# In[11]:


# %%time
triplet_dataset.count()


# ## For Each User, Compute the Total Play Count

# In[12]:


get_ipython().run_cell_magic('time', '', "output_dict = {}\n# b80344d063b5ccb3212f76538f3d9e43d87dca9e \tSOBXHDL12A81C204C0 \t1\nwith open(data_home+'train_triplets.txt') as f:\n    for line_number, line in enumerate(f):\n        user = line.split('\\t')[0] # b80344d063b5ccb3212f76538f3d9e43d87dca9e\n        play_count = int(line.split('\\t')[2]) # 1\n        if user in output_dict: # 如果这个用在字典中 \n            play_count +=output_dict[user] # 将play_count播放量累加\n            output_dict.update({user:play_count}) # 更新该用户的累计播放量\n        output_dict.update({user:play_count}) # 给该用户初始赋值\noutput_list = [{'user':k,'play_count':v} for k,v in output_dict.items()] # 将字典转换成一个列表\nplay_count_df = pd.DataFrame(output_list) # 将列表转换成dataframe\nplay_count_df = play_count_df.sort_values(by = 'play_count', ascending = False)\nplay_count_df.head(10)")


# In[16]:


play_count_df.count()


# In[13]:


play_count_df.head(10)


# In[14]:


get_ipython().run_cell_magic('time', '', "play_count_df.to_csv(path_or_buf='user_playcount_df.csv', index = False)")


# ## For Each Song, Compute the Total Play Count

# In[15]:


get_ipython().run_cell_magic('time', '', "output_dict = {}\nwith open(data_home+'train_triplets.txt') as f:\n    for line_number, line in enumerate(f):\n        song = line.split('\\t')[1]\n        play_count = int(line.split('\\t')[2])\n        if song in output_dict:\n            play_count +=output_dict[song]\n            output_dict.update({song:play_count})\n        output_dict.update({song:play_count})\noutput_list = [{'song':k,'play_count':v} for k,v in output_dict.items()]\nsong_count_df = pd.DataFrame(output_list)\nsong_count_df = song_count_df.sort_values(by = 'play_count', ascending = False)")


# In[18]:


song_count_df.head(10)
# song_count_df.tail(10)


# In[19]:


song_count_df.count()


# In[20]:


get_ipython().run_cell_magic('time', '', "song_count_df.to_csv(path_or_buf='song_playcount_df.csv', index = False)")


# ## A Look at the Current Rankings

# In[21]:


play_count_df = pd.read_csv(filepath_or_buffer='user_playcount_df.csv')
play_count_df.head(n=10)


# In[22]:


song_count_df = pd.read_csv(filepath_or_buffer='song_playcount_df.csv')
song_count_df.head(10)


# ## Take a Subset as Our Experimental Data (already sorted by play count, so these should be the most informative records)

# In[23]:


# %%time
total_play_count = sum(song_count_df.play_count)  # total plays across all songs
# Share of all plays contributed by the 100,000 most active users
print((float(play_count_df.head(n=100000).play_count.sum())/total_play_count)*100)
play_count_subset = play_count_df.head(n=100000)  # keep the top 100,000 users as a new frame


# In[24]:


# %%time
# Share of all plays contributed by the 30,000 most played songs
(float(song_count_df.head(n=30000).play_count.sum())/total_play_count)*100


# In[25]:


song_count_subset = song_count_df.head(n=30000)


# Keep the 100,000 users and the 30,000 songs

# In[26]:


user_subset = list(play_count_subset.user)
song_subset = list(song_count_subset.song)


# Filter out records from all other users

# In[ ]:


get_ipython().run_cell_magic('time', '', "triplet_dataset = pd.read_csv(filepath_or_buffer=data_home+'train_triplets.txt',sep='\\t', \n                              header=None, names=['user','song','play_count'])\ntriplet_dataset_sub = triplet_dataset[triplet_dataset.user.isin(user_subset) ]\ndel(triplet_dataset)\ntriplet_dataset_sub_song = triplet_dataset_sub[triplet_dataset_sub.song.isin(song_subset)]\ndel(triplet_dataset_sub)")


# In[18]:


triplet_dataset_sub_song.to_csv(path_or_buf=data_home+'triplet_dataset_sub_song.csv', index=False)


# The size of our data

# In[19]:


triplet_dataset_sub_song.shape


# In[20]:


triplet_dataset_sub_song.head(n=10)


# ## Adding Song Metadata

# The .db file needs a bit of processing to convert it to CSV

# In[21]:


conn = sqlite3.connect(data_home+'track_metadata.db')
cur = conn.cursor()
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
cur.fetchall()


# In[22]:


track_metadata_df = pd.read_sql(con=conn, sql='select * from songs')
track_metadata_df_sub = track_metadata_df[track_metadata_df.song_id.isin(song_subset)]


# In[23]:


track_metadata_df_sub.to_csv(path_or_buf=data_home+'track_metadata_df_sub.csv', index=False)


# In[24]:


track_metadata_df_sub.shape


# ## The Data We Now Have

# In[25]:


# Users, songs, play counts: 100,000 users and 30,000 songs
triplet_dataset_sub_song = pd.read_csv(filepath_or_buffer=data_home+'triplet_dataset_sub_song.csv', encoding="ISO-8859-1")
# Metadata for the 30,000 songs
track_metadata_df_sub = pd.read_csv(filepath_or_buffer=data_home+'track_metadata_df_sub.csv', encoding="ISO-8859-1")


# In[26]:



triplet_dataset_sub_song.head()


# In[27]:


# The song metadata
track_metadata_df_sub.head()


# ## Cleaning the Dataset

# Drop useless columns and duplicate rows

# In[28]:


del(track_metadata_df_sub['track_id'])
del(track_metadata_df_sub['artist_mbid'])  # drop columns we will not use
track_metadata_df_sub = track_metadata_df_sub.drop_duplicates(['song_id'])  # deduplicate songs
# Merge: triplet_dataset_sub_song holds (user, song, play count); track_metadata_df_sub holds song id plus metadata
triplet_dataset_sub_song_merged = pd.merge(triplet_dataset_sub_song, track_metadata_df_sub, how='left', left_on='song', right_on='song_id')
triplet_dataset_sub_song_merged.rename(columns={'play_count':'listen_count'}, inplace=True)


# In[29]:


# Drop metadata columns we will not need downstream
del(triplet_dataset_sub_song_merged['song_id'])
del(triplet_dataset_sub_song_merged['artist_id'])
del(triplet_dataset_sub_song_merged['duration'])
del(triplet_dataset_sub_song_merged['artist_familiarity'])
del(triplet_dataset_sub_song_merged['artist_hotttnesss'])
del(triplet_dataset_sub_song_merged['track_7digitalid'])
del(triplet_dataset_sub_song_merged['shs_perf'])
del(triplet_dataset_sub_song_merged['shs_work'])


# The data is ready

# In[31]:


triplet_dataset_sub_song_merged.head(n=10)


# ## Exploring the Music Dataset

# ### The most popular songs

# In[32]:


# Total play count per song title
popular_songs = triplet_dataset_sub_song_merged[['title','listen_count']].groupby('title').sum().reset_index()
# Sort by play count, descending, and keep the top 20
popular_songs_top_20 = popular_songs.sort_values('listen_count', ascending=False).head(n=20)

import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
 
objects = (list(popular_songs_top_20['title']))           # the 20 song titles
y_pos = np.arange(len(objects))                           # positions 0-19
performance = list(popular_songs_top_20['listen_count'])  # the 20 play counts
 
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular songs')
 
plt.show()


# ### The most popular releases

# In[33]:


popular_release = triplet_dataset_sub_song_merged[['release','listen_count']].groupby('release').sum().reset_index()
popular_release_top_20 = popular_release.sort_values('listen_count', ascending=False).head(n=20)

objects = (list(popular_release_top_20['release']))
y_pos = np.arange(len(objects))
performance = list(popular_release_top_20['listen_count'])
 
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular Release')
 
plt.show()


# ### The most popular artists

# In[34]:


popular_artist = triplet_dataset_sub_song_merged[['artist_name','listen_count']].groupby('artist_name').sum().reset_index()
popular_artist_top_20 = popular_artist.sort_values('listen_count', ascending=False).head(n=20)

objects = (list(popular_artist_top_20['artist_name']))
y_pos = np.arange(len(objects))
performance = list(popular_artist_top_20['listen_count'])
 
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular Artists')
 
plt.show()


# ## Distribution of User Play Counts

# In[35]:


# Count the number of song records per user (each row of the merged frame is one user-song pair)
user_song_count_distribution = triplet_dataset_sub_song_merged[['user','title']].groupby('user').count().reset_index().sort_values(
    by='title', ascending=False)
user_song_count_distribution.title.describe()


# In[36]:


x = user_song_count_distribution.title
n, bins, patches = plt.hist(x, 50, facecolor='green', alpha=0.75)
plt.xlabel('Play Counts')
plt.ylabel('Num of Users')
plt.title(r'$\mathrm{Histogram\ of\ User\ Play\ Count\ Distribution}\ $')
plt.grid(True)
plt.show()


# # Recommendation Systems

# In[38]:


import Recommenders as Recommenders
from sklearn.model_selection import train_test_split


# ## Simple and Brute-Force: Leaderboard Recommendation

# In[39]:


triplet_dataset_sub_song_merged_set = triplet_dataset_sub_song_merged
train_data, test_data = train_test_split(triplet_dataset_sub_song_merged_set, test_size = 0.40, random_state=0)


# In[40]:


train_data.head()


# In[41]:


def create_popularity_recommendation(train_data, user_id, item_id):
    # Count the number of users for each unique item and use it as the recommendation
    # score; the item column can be the song title, the release, or the artist name
    train_data_grouped = train_data.groupby([item_id]).agg({user_id: 'count'}).reset_index()
    # Rename the user count to 'score' for readability
    train_data_grouped.rename(columns={user_id: 'score'}, inplace=True)

    # A leaderboard needs to be sorted by score
    train_data_sort = train_data_grouped.sort_values(['score', item_id], ascending=[0, 1])

    # Add a rank column to express recommendation priority
    train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first')

    # Return the top 20 recommendations
    popularity_recommendations = train_data_sort.head(20)
    return popularity_recommendations


# In[42]:


recommendations = create_popularity_recommendation(triplet_dataset_sub_song_merged,'user','title')


# In[43]:


recommendations
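
# The same function works for any item column; for example, an artist leaderboard built
# from the training split (illustrative call):
create_popularity_recommendation(train_data, 'user', 'artist_name')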


# ## Recommendation Based on Song Similarity

# Pick a small subset of songs to experiment with

# In[44]:


song_count_subset = song_count_df.head(n=5000)  # the 5,000 most played songs
user_subset = list(play_count_subset.user)      # the 100,000 users from before
song_subset = list(song_count_subset.song)      # the 5,000 song ids
# triplet_dataset_sub_song_merged holds (user, song, play count) plus song metadata
triplet_dataset_sub_song_merged_sub = triplet_dataset_sub_song_merged[triplet_dataset_sub_song_merged.song.isin(song_subset)]


# In[45]:


triplet_dataset_sub_song_merged_sub.head()


# In[46]:


train_data, test_data = train_test_split(triplet_dataset_sub_song_merged_sub, test_size=0.30, random_state=0)
is_model = Recommenders.item_similarity_recommender_py()
is_model.create(train_data, 'user', 'title')
user_id = list(train_data.user)[7]  # pick an arbitrary user from the training set
user_items = is_model.get_user_items(user_id)


# In[47]:


#Recommend songs for the user using personalized model
is_model.recommend(user_id)
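
# The Recommenders module used above is an external helper whose source is not listed here.
# As a rough, unoptimized sketch of the idea it implements: score each candidate song by its
# average Jaccard similarity (overlap of listener sets) with the songs the target user already
# plays. All names below are illustrative, not the module's actual API.

def jaccard_recommend_sketch(train_data, target_user, top_n=10):
    # Map each song title to the set of users who played it
    users_by_song = train_data.groupby('title')['user'].agg(set)
    user_songs = set(train_data[train_data.user == target_user]['title'])
    scores = {}
    for song, listeners in users_by_song.items():
        if song in user_songs:
            continue  # do not recommend songs the user already plays
        sims = []
        for owned in user_songs:
            owned_listeners = users_by_song[owned]
            union = len(listeners | owned_listeners)
            sims.append(len(listeners & owned_listeners) / union if union else 0.0)
        scores[song] = sum(sims) / len(sims) if sims else 0.0
    # Titles of the top_n highest-scoring candidates
    return sorted(scores, key=scores.get, reverse=True)[:top_n]

# e.g. jaccard_recommend_sketch(train_data, user_id)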


# ## Recommendation Based on Matrix Factorization (SVD)

# <img src="1.png" style="width:550px;height:280px;float:left">
# <img src="5.png" style="width:350px;height:280px;float:left">

# Applying SVD to the matrix yields U, S, and V

# <img src="2.png" style="width:500px;height:380px;float:left"><img src="3.png" style="width:400px;height:200px;float:left">

# Recomputing U*S*V gives A2; comparing A2 with the original A shows some differences, but they are small, so the low-rank product can stand in for A (a small numpy sketch of this follows the figures below)

# <img src="4.png" style="width:330px;height:220px;float:left">
# <img src="5.png" style="width:330px;height:220px;float:left">

# <img src="6.png" style="width:650px;height:480px;float:left">

# <img src="7.png" style="width:650px;height:480px;float:left">

# First compute each song's plays by the user divided by the user's total plays, and use that fraction as the score

# In[48]:


triplet_dataset_sub_song_merged_sum_df = triplet_dataset_sub_song_merged[['user','listen_count']].groupby('user').sum().reset_index()
triplet_dataset_sub_song_merged_sum_df.rename(columns={'listen_count':'total_listen_count'},inplace=True)
triplet_dataset_sub_song_merged = pd.merge(triplet_dataset_sub_song_merged,triplet_dataset_sub_song_merged_sum_df)
triplet_dataset_sub_song_merged.head()


# In[49]:


triplet_dataset_sub_song_merged['fractional_play_count'] = triplet_dataset_sub_song_merged['listen_count']/triplet_dataset_sub_song_merged['total_listen_count']
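
# Sanity check (illustrative): each user's fractional play counts should sum to 1
assert np.allclose(
    triplet_dataset_sub_song_merged.groupby('user')['fractional_play_count'].sum(), 1.0)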


# It looks roughly like this

# In[50]:


triplet_dataset_sub_song_merged[triplet_dataset_sub_song_merged.user =='d6589314c0a9bcbca4fee0c93b14bc402363afea'][['user','song','listen_count','fractional_play_count']].head()


# In[51]:


from scipy.sparse import coo_matrix

small_set = triplet_dataset_sub_song_merged
# Assign each user and each song a dense integer index (the matrix row/column ids)
user_codes = small_set.user.drop_duplicates().reset_index()
song_codes = small_set.song.drop_duplicates().reset_index()
user_codes.rename(columns={'index':'user_index'}, inplace=True)
song_codes.rename(columns={'index':'song_index'}, inplace=True)
song_codes['so_index_value'] = list(song_codes.index)
user_codes['us_index_value'] = list(user_codes.index)
small_set = pd.merge(small_set, song_codes, how='left')
small_set = pd.merge(small_set, user_codes, how='left')
# (row, column, value) triples for the sparse user-song matrix
mat_candidate = small_set[['us_index_value','so_index_value','fractional_play_count']]
data_array = mat_candidate.fractional_play_count.values
row_array = mat_candidate.us_index_value.values
col_array = mat_candidate.so_index_value.values

data_sparse = coo_matrix((data_array, (row_array, col_array)), dtype=float)
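
# COO ("coordinate") format is the natural fit here, since it is built directly from the three
# aligned (row, column, value) arrays; svds below accepts the sparse matrix as-is.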


# In[52]:


data_sparse


# In[53]:


user_codes[user_codes.user =='2a2f776cbac6df64d6cb505e7e834e01684673b6']


# In[54]:


import math as mt
from scipy.sparse.linalg import svds  # truncated SVD for sparse matrices
from scipy.sparse import csc_matrix


# In[55]:


def compute_svd(urm, K):
    # Truncated SVD: keep only the K largest singular values
    U, s, Vt = svds(urm, K)

    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        # Square roots of the singular values on the diagonal -- a damped variant;
        # the exact rank-K reconstruction would use s[i] itself
        S[i, i] = mt.sqrt(s[i])

    U = csc_matrix(U, dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)

    return U, S, Vt

def compute_estimated_matrix(urm, U, S, Vt, uTest, K, test):
    rightTerm = S*Vt
    max_recommendation = 250
    # Song indices must be stored as integers (a float16 buffer cannot
    # represent indices beyond 2048 exactly)
    recomendRatings = np.zeros(shape=(MAX_UID, max_recommendation), dtype=np.int32)
    for userTest in uTest:
        # Estimated preference scores of this user for every song
        prod = U[userTest, :]*rightTerm
        estimatedRatings = np.asarray(prod.todense()).ravel()
        # Indices of the highest-scoring songs, best first
        recomendRatings[userTest, :] = (-estimatedRatings).argsort()[:max_recommendation]
    return recomendRatings


# In[56]:


K=50
urm = data_sparse
MAX_PID = urm.shape[1]
MAX_UID = urm.shape[0]

U, S, Vt = compute_svd(urm, K)
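
# K=50 latent factors is a heuristic choice: a larger K reconstructs the play-count matrix
# more faithfully but costs more to compute and can overfit noise, while a smaller K
# compresses (and smooths) the matrix more aggressively.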


# In[57]:


uTest = [4, 5, 6, 7, 8, 873, 23]  # internal indices of a few sample users

uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, K, True)


# In[58]:


for user in uTest:
    print("Recommendation for user with user id {}".format(user))
    rank_value = 1
    for i in uTest_recommended_items[user, 0:10]:
        song_details = small_set[small_set.so_index_value == i].drop_duplicates('so_index_value')[['title','artist_name']]
        print("The number {} recommended song is {} BY {}".format(rank_value, list(song_details['title'])[0], list(song_details['artist_name'])[0]))
        rank_value += 1


# In[59]:


uTest = [27513]
# Get estimated ratings for the test user
print("Predicted ratings:")
uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, K, True)


# In[60]:


for user in uTest:
    print("Recommendation for user with user id {}".format(user))
    rank_value = 1
    for i in uTest_recommended_items[user, 0:10]:
        song_details = small_set[small_set.so_index_value == i].drop_duplicates('so_index_value')[['title','artist_name']]
        print("The number {} recommended song is {} BY {}".format(rank_value, list(song_details['title'])[0], list(song_details['artist_name'])[0]))
        rank_value += 1
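
# To map an internal index such as 27513 back to the original user id, the lookup from
# earlier can be reversed (illustrative):
user_codes[user_codes.us_index_value == 27513]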

 
