数据清洗
用 Python 处理 MovieLens-100K（ml-100k）数据集，并将结果保存为 MATLAB 的 .mat 文件。
# coding: utf-8
# ml-k100.py
import pandas as pd
import numpy as np
import time
import scipy.io as sio
def to_date(x):
    """Convert a raw release-date field into a fractional year.

    Args:
        x: Byte string holding a date formatted as ``DD-Mon-YYYY``
            (parsed with ``'%d-%b-%Y'``), or an empty byte string.

    Returns:
        The year plus ``day_of_year / 366`` rounded to three decimals,
        or the constant ``1995`` when ``x`` is empty.
    """
    if len(x) > 0:
        parsed = time.strptime(x.decode(), '%d-%b-%Y')
        # A fixed divisor of 366 is used for every year, leap or not.
        year_fraction = round(parsed.tm_yday / 366, 3)
        return parsed.tm_year + year_fraction
    # Empty field: fall back to a fixed placeholder year.
    return 1995
def parse_line(line):
    """Turn one ``|``-separated byte line into a flat numeric row.

    The first field is read as the integer item id and the third as a date
    (converted by ``to_date``). The second, fourth and fifth fields are
    discarded. Every remaining field is converted with ``int``.

    Args:
        line: Byte string with at least five ``|``-separated fields.

    Returns:
        ``[item_id, date_value, flag_0, flag_1, ...]`` as a list.
    """
    movie_id, _title, release, _skip_a, _skip_b, *flags = line.split(b'|')
    row = [int(movie_id), to_date(release)]
    row.extend(int(flag) for flag in flags)
    return row
def transform(age):
    """Replace each value by the cumulative fraction of its quantile bin.

    The values are cut into (up to) ten quantile bins; each value is then
    mapped to the share of all entries that fall in its own bin or any
    lower bin, i.e. a coarse empirical CDF evaluated per bin.

    Args:
        age: pandas Series of numeric values. The result of ``pd.qcut`` is
            accessed through ``.cat``, so a Series is expected.

    Returns:
        1-D float NumPy array with one entry per element of ``age``.
        Entries that ``pd.qcut`` assigns to no bin (code ``-1``, e.g. NaN
        inputs) are NaN.
    """
    # duplicates='drop' keeps heavily tied data from raising on repeated
    # quantile edges; with unique edges the bins are unchanged.
    bins = pd.qcut(age, 10, duplicates='drop')
    codes = np.asarray(bins.cat.codes, dtype=np.intp)
    in_bin = codes >= 0

    # Per-bin counts in category (interval) order, including empty bins.
    counts = np.bincount(codes[in_bin], minlength=len(bins.cat.categories))
    cumulative = counts.cumsum() / len(age)

    # Plain array lookup instead of Categorical.from_codes: the latter
    # requires unique categories and fails when an empty bin makes two
    # cumulative fractions equal.
    result = np.full(len(codes), np.nan)
    result[in_bin] = cumulative[codes[in_bin]]
    return result
# --- Item table -----------------------------------------------------------
# u.item is read as raw bytes and split into lines; the context manager
# closes the file handle as soon as the content has been read.
with open('ml-100k/u.item', 'rb') as item_file:
    D = item_file.read().splitlines()
D1 = pd.DataFrame(
    data=list(map(parse_line, D)),
    columns=['item', 'date'] + ['f{}'.format(i) for i in range(19)],
)
# Replace the fractional release year by its binned cumulative fraction.
D1['date'] = transform(D1['date'])

# --- User table -----------------------------------------------------------
D2 = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    header=None,
    names=['user', 'age', 'gender', 'occupation', 'zipcode'],
)
D2['age'] = transform(D2['age'])
# Drop the zipcode and one-hot encode the two categorical columns.
D3 = pd.get_dummies(D2.drop('zipcode', axis=1), columns=['gender', 'occupation'])

# --- Ratings --------------------------------------------------------------
D4 = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=['user', 'item', 'rate', 'timestamp'],
)
D4.drop('timestamp', axis=1, inplace=True)

# --- Export ---------------------------------------------------------------
# Recent pandas versions emit boolean dummy columns, which would turn
# D3.values into an object array; casting to float keeps the saved 'user'
# entry a plain numeric matrix regardless of the pandas version.
sio.savemat(
    'ml-100k.mat',
    {
        'user': D3.values.astype(float),
        'item': D1.values,
        'rate': D4.values,
    },
)
--- 她说, 她是仙,她不是神