sql实现协同过滤2
准备数据
create table table_origin_data( users varchar(100), item varchar(100), score number(20,4) ); insert into table_origin_data(users,item,score) VALUES ('张三','M1',5) insert into table_origin_data(users,item,score) VALUES ('李四','M2',2) insert into table_origin_data(users,item,score) VALUES ('王二','M1',4) insert into table_origin_data(users,item,score) VALUES ('张三','M2',3) insert into table_origin_data(users,item,score) VALUES ('张三','M3',1)
表table_origin_data
step1-1计算分子(乘积)
在计算的这个数据集的基础之上按item1,item2进行分组,sum(score_prod),把计算的结果存入到table_cov表中
CREATE TABLE table_cov AS SELECT item1, item2,sum(score_prod) AS prod FROM (SELECT t1.item AS item1, t2.item AS item2, t1.score*t2.score as score_prod FROM table_origin_data t1 INNER JOIN table_origin_data t2 ON t1.users=t2.users) GROUP BY item1, item2;
step1_2:计算余弦相似度
CREATE TABLE table_corr AS SELECT t1.item1, t1.item2, t1.prod/SQRT(t2.prod*t3.prod) AS corr FROM table_cov t1 INNER JOIN table_cov t2 ON t1.item1=t2.item1 AND t1.item1=t2.item2 INNER JOIN table_cov t3 ON t1.item2=t3.item1 AND t1.item2=t3.item2;
table_cov表inner join关联自己两次,第一次关联条件分别用t1表的item1列=自己t2表的item1 and item1列=自己t2表的item2
第二次关联条件t1表的item2列=自己t2表的item1 and item1列=自己t2表的item2
select查询的列就是t1表的item1,item2,和t1表的prod/根号下t2的prod乘以t3的prod
第三步:协同过滤
CREATE TABLE table_unkown AS SELECT t1.users, t2.item FROM (SELECT DISTINCT users, 1 AS joiner FROM table_origin_data) t1 INNER JOIN (SELECT DISTINCT item, 1 AS joiner FROM table_origin_data) t2 ON t1.joiner=t2.joiner LEFT JOIN table_origin_data t3 ON t1.users=t3.users AND t2.item=t3.item ;
把计算出的结果存入table_unkown表中,算最终的结果
SELECT users, item, SUM(score) AS score FROM ( SELECT t1.users, t1.item, t2.score*t3.corr AS score FROM table_unkown t1 INNER JOIN table_origin_data t2 ON t1.users=t2.users INNER JOIN table_corr t3 ON t1.item=t3.item1 AND t2.item=t3.item2 ) GROUP BY users, item;
https://blog.csdn.net/guofei9987/article/details/111993311 原博文