http://www.csie.ntu.edu.tw/~r95007/thesis/svdnetflix/report/report.pdf
http://eecs.wsu.edu/~vjakkula/MLProject.pdf
http://michielvanwezel.com/papers/kagie_vdloos_vwezelV2.pdf
http://cseweb.ucsd.edu/users/elkan/KddNetflixWorkshop.pdf
http://www.cs.uic.edu/~liub/KDD-cup-2007/proceedings/The-Netflix-Prize-Bennett.pdf
准备数据集
1. shell：将所有训练数据集文件合并为一个文件
#!/bin/bash
# Append every per-movie training file to a single ratings.txt.
# The whole loop is sent to the background by the trailing '&', so wait for
# it to finish before reading ratings.txt.
for x in netflix/training_set/mv_*.txt ;
# Quote the expansion so a file name is never word-split or re-globbed.
do cat "$x" >> ratings.txt ;
done &
http://www.netflixprize.com/community/viewtopic.php?id=87
需要下载path模块
#!/usr/bin/env python
import sys
import csv
from path import path
# Two-character marker (backslash + N) emitted in place of a missing value.
# The backslash is escaped explicitly: the value is unchanged on Python 2,
# and the literal stays valid on Python 3, where a bare backslash-N must
# introduce a named-character escape.
# NOTE(review): presumably the NULL token expected by the database bulk
# loader that consumes the dumped files -- confirm.
NULL = '\\N'
class Dialect(csv.excel):
    """Tab-separated csv dialect used as the default for ``csvDump``.

    Inherits from ``csv.excel`` and overrides the delimiter (tab), the line
    terminator (a single newline), and the quoting/escaping settings below.
    """

    delimiter = '\t'
    lineterminator = '\n'
    # Quote doubling is off and no escape character is set; per the csv
    # module documentation the writer raises csv.Error for a field that
    # contains the quote character under this combination.
    doublequote = False
    escapechar = None
    quoting = csv.QUOTE_MINIMAL
def csvDump(iter_rows_func, basename, dir='.', csvdir='csv', dialect=Dialect):
    """Dump the rows read from ``dir/basename`` into ``csvdir/<namebase>.csv``.

    iter_rows_func -- callable taking the input path and returning an
                      iterable of rows (each row a sequence of fields).
    basename       -- name of the input file or directory inside ``dir``.
    dir            -- directory containing the input (default: current).
    csvdir         -- output directory; created when it does not exist.
    dialect        -- csv dialect handed to ``csv.writer``.

    Nothing is written when the output file already exists.  A progress
    message goes to stderr before the rows are written.
    """
    dir, csvdir = path(dir), path(csvdir)
    if not csvdir.exists():
        csvdir.mkdir()
    inpath = dir / basename
    # NOTE(review): namebase is presumably the file name without its
    # extension (path module semantics) -- confirm.
    outfile = (csvdir / inpath.namebase) + '.csv'
    if outfile.exists():
        return
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        out = open(outfile, 'wb')
    else:
        out = open(outfile, 'w', newline='')
    # The with-block closes (and flushes) the output even if a row fails.
    with out:
        write_row = csv.writer(out, dialect).writerow
        sys.stderr.write('Writing %s ...\n' % outfile)
        for row in iter_rows_func(inpath):
            write_row(row)
def iterMovieRows(path):
    """Yield one ``(movie_id, year, title)`` tuple per line of ``path``.

    Each line is split as ``id,year,title`` on the first two commas only, so
    commas inside the title are preserved.  The id is converted with
    ``int``.  A year equal to the text ``NULL`` is replaced by the
    module-level ``NULL`` marker; any other year is converted with ``int``.
    Trailing whitespace is stripped from the line before splitting.

    The file is closed once the generator is exhausted or closed.
    Raises ValueError for a line with fewer than three fields or with a
    non-numeric id/year.
    """
    with open(path) as titles:
        for line in titles:
            movie_id, year, title = line.rstrip().split(',', 2)
            # A real conditional expression: the old ``and ... or`` idiom
            # would have replaced a legitimate year of 0 by NULL.
            year = NULL if year == 'NULL' else int(year)
            yield (int(movie_id), year, title)
def iterTrainingSetRows(dir):
    """Yield ``(movie_id, user_id, date, rating)`` for every rating line.

    ``dir`` must provide a ``walkfiles()`` method returning the files to
    read.  In each file the first line holds the movie id followed by one
    trailing character, which is dropped before ``int`` is applied; every
    following line is split as ``user_id,rating,date``.  Ids are yielded as
    ints, the rating as a float and the date as the unmodified text.

    Empty files are skipped.  Each file is closed before the next is opened.
    Raises ValueError for a malformed header or rating line.
    """
    for filename in dir.walkfiles():
        with open(filename) as ratings:
            lines = (line.strip() for line in ratings)
            # Builtin next() instead of the Python-2-only .next() method.
            # The default keeps an empty file from raising StopIteration
            # inside the generator, which would end iteration over all the
            # remaining files (or raise RuntimeError on Python 3.7+).
            header = next(lines, None)
            if header is None:
                continue
            movie_id = int(header[:-1])
            for line in lines:
                user_id, rating, date = line.split(',', 2)
                yield (movie_id, int(user_id), date, float(rating))
def iterProbeSetRows(path):
    """Yield ``(movie_id, user_id)`` pairs read from the file at ``path``.

    A line that parses as an integer is taken as a user id and paired with
    the most recently seen movie id.  Any other line is treated as a movie
    header: its last character is dropped and the rest converted with
    ``int``.  Surrounding whitespace is stripped from every line first.

    The file is closed once the generator is exhausted or closed.
    Raises ValueError for a line that is neither form, and
    UnboundLocalError when a user line appears before any movie header.
    """
    with open(path) as probe:
        for raw in probe:
            entry = raw.strip()
            try:
                user_id = int(entry)
            except ValueError:
                # Not a bare integer, so this is a movie header line.
                movie_id = int(entry[:-1])
            else:
                yield (movie_id, user_id)
def iterQualifyingSetRows(path):
for line in (line.strip() for line in open(path)):
try:
user_id,date = line.split(',')
except ValueError:
movie_id = int(line[:-1])
else:
http://eecs.wsu.edu/~vjakkula/MLProject.pdf
http://michielvanwezel.com/papers/kagie_vdloos_vwezelV2.pdf
http://cseweb.ucsd.edu/users/elkan/KddNetflixWorkshop.pdf
http://www.cs.uic.edu/~liub/KDD-cup-2007/proceedings/The-Netflix-Prize-Bennett.pdf
准备数据集
1shell 将所有测试数据集文件合并为一个文件
#!/bin/bash
for x in netflix/training_set/mv_*.txt ;
done &
http://www.netflixprize.com/community/viewtopic.php?id=87
需要下载path模块
#!/usr/bin/env python
import sys
import csv
from path import path
NULL = '\N'
class Dialect(csv.excel):
def csvDump(iter_rows_func, basename, dir='.', csvdir='csv', dialect=Dialect):
def iterMovieRows(path):
def iterTrainingSetRows(dir):
def iterProbeSetRows(path):
def iterQualifyingSetRows(path):