数据分析 (电影数据)
import pandas as pd
# Locations of the three '::'-delimited input tables.
fuser = '/home/yunpiao/data/1M/users.dat'
fmovie = '/home/yunpiao/data/1M/movies.dat'
fratings = '/home/yunpiao/data/1M/ratings.dat'


def _read_dat(path, columns):
    """Load one headerless '::'-separated file into a DataFrame.

    path    -- file to read.
    columns -- column names to assign, in file order.
    """
    # A separator longer than one character is only handled by the Python
    # parsing engine, so it is requested explicitly.
    return pd.read_table(path, sep='::', header=None, names=columns,
                         engine='python')


uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
pusers = _read_dat(fuser, uname)
uname = ['user_id', 'movie_id', 'rating', 'timestamp']
prating = _read_dat(fratings, uname)
uname = ['movie_id', 'title', 'genres']
# Bind pmovie with a plain assignment: a name assigned inside an IPython
# %timeit statement stays local to the timing run and is not visible to the
# cells that follow.
pmovie = _read_dat(fmovie, uname)
# To benchmark the load in IPython, time the bare call instead, e.g.
#   %timeit pd.read_table(fmovie, sep='::', header=None, names=uname, engine='python')
100 loops, best of 3: 11.5 ms per loop
切片
# Slice demo: the first five rows of the users table, selected by position.
pusers.iloc[0:5]
|
user_id |
gender |
age |
occupation |
zip |
0 |
1 |
F |
1 |
10 |
48067 |
1 |
2 |
M |
56 |
16 |
70072 |
2 |
3 |
M |
25 |
15 |
55117 |
3 |
4 |
M |
45 |
7 |
02460 |
4 |
5 |
M |
25 |
20 |
55455 |
# The first five rows of the ratings table, selected by position.
prating.iloc[0:5]
|
user_id |
movie_id |
rating |
timestamp |
0 |
1 |
1193 |
5 |
978300760 |
1 |
1 |
661 |
3 |
978302109 |
2 |
1 |
914 |
3 |
978301968 |
3 |
1 |
3408 |
4 |
978300275 |
4 |
1 |
2355 |
5 |
978824291 |
# Stepped slice: every fourth row of the movies table from position 1 up to
# (not including) position 10.
pmovie.iloc[1:10:4]
|
movie_id |
title |
genres |
1 |
2 |
Jumanji (1995) |
Adventure|Children's|Fantasy |
5 |
6 |
Heat (1995) |
Action|Crime|Thriller |
9 |
10 |
GoldenEye (1995) |
Action|Adventure|Thriller |
# Attach each rater's profile to their ratings (shared column: user_id), then
# attach the movie metadata (shared column: movie_id). The keys are spelled
# out so the join does not depend on which column names happen to overlap.
ratings_with_users = pd.merge(prating, pusers, on='user_id')
data = pd.merge(ratings_with_users, pmovie, on='movie_id')
# Label-based lookup of the row labelled 6. The .ix indexer is no longer
# available in current pandas; .loc is the label-based replacement.
print(data.loc[6])
user_id 19
movie_id 1193
rating 5
timestamp 982730936
gender M
age 1
occupation 10
zip 48073
title One Flew Over the Cuckoo's Nest (1975)
genres Drama
Name: 6, dtype: object
# Mean rating per title, spread into one column per gender value.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.iloc[:5]
gender |
F |
M |
title |
|
|
$1,000,000 Duck (1971) |
3.375000 |
2.761905 |
'Night Mother (1986) |
3.388889 |
3.352941 |
'Til There Was You (1997) |
2.675676 |
2.733333 |
'burbs, The (1989) |
2.793478 |
2.962085 |
...And Justice for All (1979) |
3.828571 |
3.689024 |
# Count how many rows of `data` belong to each title.
rows_per_title = data.groupby('title')
rating_by_title = rows_per_title.size()
rating_by_title.iloc[:4]
title
$1,000,000 Duck (1971) 37
'Night Mother (1986) 70
'Til There Was You (1997) 52
'burbs, The (1989) 303
dtype: int64
# Keep only the titles whose row count is at least 250.
has_enough_ratings = rating_by_title >= 250
active_title = rating_by_title.index[has_enough_ratings]
print(active_title)
Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)',
u'101 Dalmatians (1961)', u'101 Dalmatians (1996)',
u'12 Angry Men (1957)', u'13th Warrior, The (1999)',
u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)',
u'2001: A Space Odyssey (1968)', u'2010 (1984)',
...
u'X-Men (2000)', u'Year of Living Dangerously (1982)',
u'Yellow Submarine (1968)', u'You've Got Mail (1998)',
u'Young Frankenstein (1974)', u'Young Guns (1988)',
u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)',
u'Zero Effect (1998)', u'eXistenZ (1999)'],
dtype='object', name=u'title', length=1216)
# Restrict the per-gender means to the titles listed in active_title.
# .loc performs the label-based row selection; the .ix indexer is no longer
# available in current pandas.
mean_ratings = mean_ratings.loc[active_title]
mean_ratings[:3]
gender |
F |
M |
title |
|
|
'burbs, The (1989) |
2.793478 |
2.962085 |
10 Things I Hate About You (1999) |
3.646552 |
3.311966 |
101 Dalmatians (1961) |
3.791444 |
3.500000 |
# Titles ordered by the 'M' column, highest mean first.
top_male_ratings = mean_ratings.sort_values(by='M', ascending=False)
# Earlier spelling of the name, kept so existing references keep working.
top_demale_ratings = top_male_ratings
top_male_ratings['M'][:3]
title
Godfather, The (1972) 4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) 4.576628
Shawshank Redemption, The (1994) 4.560625
Name: M, dtype: float64
# Add a column with the 'M' mean minus the 'F' mean for each title;
# positive values mean the 'M' column is higher.
male_mean = mean_ratings['M']
female_mean = mean_ratings['F']
mean_ratings['diff'] = male_mean - female_mean
mean_ratings.iloc[:5]
gender |
F |
M |
diff |
title |
|
|
|
'burbs, The (1989) |
2.793478 |
2.962085 |
0.168607 |
10 Things I Hate About You (1999) |
3.646552 |
3.311966 |
-0.334586 |
101 Dalmatians (1961) |
3.791444 |
3.500000 |
-0.291444 |
101 Dalmatians (1996) |
3.240000 |
2.911215 |
-0.328785 |
12 Angry Men (1957) |
4.184397 |
4.328421 |
0.144024 |
# Titles ordered by the 'diff' column, largest value first; show the top four.
top_diff = mean_ratings.sort_values(by='diff', ascending=False)
top_diff.iloc[:4]
gender |
F |
M |
diff |
title |
|
|
|
Good, The Bad and The Ugly, The (1966) |
3.494949 |
4.221300 |
0.726351 |
Kentucky Fried Movie, The (1977) |
2.878788 |
3.555147 |
0.676359 |
Dumb & Dumber (1994) |
2.697987 |
3.336595 |
0.638608 |
Longest Day, The (1962) |
3.411765 |
4.031447 |
0.619682 |
# Standard deviation of the rating column within each title.
rating_std_by_title = data.groupby('title')['rating'].std()
# Keep only the titles listed in active_title. .loc performs the label-based
# selection; the .ix indexer is no longer available in current pandas.
rating_std_by_title = rating_std_by_title.loc[active_title]
# The ten titles with the largest standard deviation.
rating_std_by_title.sort_values(ascending=False)[:10]
title
Dumb & Dumber (1994) 1.321333
Blair Witch Project, The (1999) 1.316368
Natural Born Killers (1994) 1.307198
Tank Girl (1995) 1.277695
Rocky Horror Picture Show, The (1975) 1.260177
Eyes Wide Shut (1999) 1.259624
Evita (1996) 1.253631
Billy Madison (1995) 1.249970
Fear and Loathing in Las Vegas (1998) 1.246408
Bicentennial Man (1999) 1.245533
Name: rating, dtype: float64