In [1]:
import pandas as pd
import numpy as np

# %-formatting keeps this line valid, with identical output, on both Python 2
# and Python 3; the statement form `print a, b` is a SyntaxError on Python 3.
print("Pandas version: %s" % pd.__version__)

from pandas import Series, DataFrame

from os import getenv

# The data root is taken from the environment; the files read by this notebook
# live in a fixed sub-directory below it.
SUBDIR = '/PUBLIC/movielens/ml-1m'
DATADIR = getenv("DATADIR")
if DATADIR is None:
    # Fail with an explicit message instead of a TypeError from `None += str`.
    raise RuntimeError(
        "Environment variable DATADIR is not set; "
        "it must name the directory that contains %s" % SUBDIR
    )
DATADIR += SUBDIR


Pandas version: 0.20.2

In [2]:
#--------------------------------------------------
import plotly as plotly
# %-formatting prints the same text on Python 2 and Python 3.
print("Plotly version %s" % plotly.__version__)  # version >1.9.4 required
import plotly.graph_objs as go
from plotly import tools

# plotly.offline.init_notebook_mode() # run at the start of every notebook
# iplot() renders a figure inline in the notebook; plot() (not imported here)
# writes a standalone HTML file instead.
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot #, plot
init_notebook_mode() # run at the start of every ipython notebook to use plotly.offline
                     # this injects the plotly.js source files into the notebook
#--------------------------------------------------
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
#--------------------------------------------------


Plotly version 2.0.12

Read data into DataFrames


In [3]:
# Load the user table. Column names are supplied explicitly, so pandas treats
# the first line of the file as data rather than as a header row.
usersDF = pd.read_csv(
    "%s/users.csv" % DATADIR,
    sep=',',
    header=None,
    names=['UserID', 'Gender', 'Age', 'Occupation', 'ZipCode'],
)

In [4]:
# Load the ratings table. Column names are supplied explicitly, so pandas
# treats the first line of the file as data rather than as a header row.
ratingsDF = pd.read_csv(
    "%s/ratings.csv" % DATADIR,
    sep=',',
    header=None,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
)

In [6]:
# Ratings histogram: how many ratings were given for each rating value.
ratingsHistogram = (
    ratingsDF
    .groupby('Rating')['UserID']
    .count()                # non-null UserID entries per rating value
    .rename('Cnt')
    .reset_index()          # back to a flat frame with columns Rating, Cnt
    .sort_values(by='Rating', ascending=True)
)
ratingsHistogram


Out[6]:
Rating Cnt
0 1 56174
1 2 107557
2 3 261197
3 4 348971
4 5 226310

Join DataFrames


In [7]:
# Attach the user attributes to every rating (inner join on UserID).
ratingsWithUserDataDF = pd.merge(ratingsDF, usersDF, how='inner', on='UserID')

In [8]:
# Preview the first rows of the joined ratings/users frame.
ratingsWithUserDataDF.head(n=3)


Out[8]:
UserID MovieID Rating Timestamp Gender Age Occupation ZipCode
0 1 1193 5 978300760 F 1 10 48067
1 1 661 3 978302109 F 1 10 48067
2 1 914 3 978301968 F 1 10 48067

In [9]:
# Ratings histogram split by gender, as a flat frame (Rating, Gender, Cnt).
ratingsHistogram = (
    ratingsWithUserDataDF
    .groupby(['Rating', 'Gender'])['UserID']
    .count()                # non-null UserID entries per (Rating, Gender)
    .rename('Cnt')
    .reset_index()
    .sort_values(by=['Rating', 'Gender'])
)
ratingsHistogram.head(10)


Out[9]:
Rating Gender Cnt
0 1 F 13347
1 1 M 42827
2 2 F 24548
3 2 M 83009
4 3 F 62966
5 3 M 198231
6 4 F 87033
7 4 M 261938
8 5 F 58546
9 5 M 167764

In [10]:
# Same histogram by gender, this time keeping (Rating, Gender) as the index
# and a single 'Cnt' column.
ratingsHistogram = (
    ratingsWithUserDataDF
    .groupby(['Rating', 'Gender'])['UserID']
    .count()
    .to_frame(name='Cnt')
    .sort_index(ascending=[True, True])
)
ratingsHistogram.head(10)


Out[10]:
Cnt
Rating Gender
1 F 13347
M 42827
2 F 24548
M 83009
3 F 62966
M 198231
4 F 87033
M 261938
5 F 58546
M 167764

In [11]:
# Total number of ratings per gender. The grouping is computed once, on the
# single column that is needed, and reused for both lookups.
genderCounts = ratingsWithUserDataDF.groupby('Gender')['UserID'].count()
fRatingsNr = genderCounts['F']
mRatingsNr = genderCounts['M']

# %-formatting prints the same text on Python 2 and Python 3.
print("Nr. of ratings by female users: %s" % fRatingsNr)
print("Nr. of ratings by male users:   %s" % mRatingsNr)


Nr. of ratings by female users: 246440
Nr. of ratings by male users:   753769

In [12]:
# Bring Rating/Gender back as ordinary columns. Guarded so that re-running
# this cell does not call reset_index() again and add a spurious 'index' column.
if 'Gender' not in ratingsHistogram.columns:
    ratingsHistogram = ratingsHistogram.reset_index()

# Share of each rating value within its gender group. Vectorised form of the
# rule "divide by fRatingsNr for 'F' rows, otherwise by mRatingsNr".
ratingsHistogram['CntNormalized'] = (
    1. * ratingsHistogram['Cnt']
    / np.where(ratingsHistogram['Gender'] == 'F', fRatingsNr, mRatingsNr)
)

In [13]:
# Display the histogram including the new CntNormalized column.
ratingsHistogram


Out[13]:
Rating Gender Cnt CntNormalized
0 1 F 13347 0.054159
1 1 M 42827 0.056817
2 2 F 24548 0.099610
3 2 M 83009 0.110125
4 3 F 62966 0.255502
5 3 M 198231 0.262986
6 4 F 87033 0.353161
7 4 M 261938 0.347504
8 5 F 58546 0.237567
9 5 M 167764 0.222567

In [14]:
# Per-gender totals; CntNormalized is expected to add up to 1 for each gender.
ratingsHistogram.groupby('Gender')[['Cnt', 'CntNormalized']].sum()


Out[14]:
Cnt CntNormalized
Gender
F 246440 1.0
M 753769 1.0

Read movies data


In [15]:
# Load the movie table; this file uses '+' as its field separator. Column
# names are supplied explicitly, so the first line is read as data.
moviesDF = pd.read_csv(
    "%s/movies.csv" % DATADIR,
    sep='+',
    header=None,
    names=['MovieID', 'Title', 'Genres'],
)

In [16]:
# Preview the raw movie table (Genres is still a single '|'-separated string).
moviesDF.head(n=3)


Out[16]:
MovieID Title Genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance

In [17]:
# Split the '|'-separated Genres string into one column per genre, then stack
# those columns into a Series indexed by (movie row, genre position).
stackedGenres = (
    moviesDF['Genres']
    .str.split(pat="|", expand=True)
    .stack()
)

In [18]:
# Preview the stacked genres with both index levels exposed as columns.
stackedGenres.reset_index().head(n=5)


Out[18]:
level_0 level_1 0
0 0 0 Animation
1 0 1 Children's
2 0 2 Comedy
3 1 0 Adventure
4 1 1 Children's

In [19]:
# Turn the movie table into one row per (movie, genre): join each movie row
# to its stacked genres via the original row position, then drop the helper
# columns and the original '|'-separated Genres string.
#
# This cell rebinds `moviesDF`, so it is guarded on the presence of the
# 'Genres' column: re-running it after it has already run is a no-op instead
# of failing on the column that was dropped the first time.
if 'Genres' in moviesDF.columns:
    moviesDF = (
        moviesDF.reset_index()
        .merge(stackedGenres.reset_index(), how='inner', left_on='index', right_on='level_0')
        .drop(['index','Genres','level_0','level_1'], axis=1)
        .rename(columns={0:'Genre'})
    )

In [20]:
# Preview the movie table after the genre split (one row per movie and genre).
moviesDF.head(n=6)


Out[20]:
MovieID Title Genre
0 1 Toy Story (1995) Animation
1 1 Toy Story (1995) Children's
2 1 Toy Story (1995) Comedy
3 2 Jumanji (1995) Adventure
4 2 Jumanji (1995) Children's
5 2 Jumanji (1995) Fantasy

In [21]:
# Attach title and genre to every rating (inner join on MovieID). moviesDF
# holds one row per (movie, genre), so a rating is repeated once per genre
# of its movie.
ratingsWithUserAndMovieDataDF = pd.merge(
    ratingsWithUserDataDF, moviesDF, how='inner', on='MovieID'
)

In [22]:
# Preview the fully joined ratings/users/movies frame.
ratingsWithUserAndMovieDataDF.head(n=5)


Out[22]:
UserID MovieID Rating Timestamp Gender Age Occupation ZipCode Title Genre
0 1 1193 5 978300760 F 1 10 48067 One Flew Over the Cuckoo's Nest (1975) Drama
1 2 1193 5 978298413 M 56 16 70072 One Flew Over the Cuckoo's Nest (1975) Drama
2 12 1193 4 978220179 M 25 12 32793 One Flew Over the Cuckoo's Nest (1975) Drama
3 15 1193 4 978199279 M 25 7 22903 One Flew Over the Cuckoo's Nest (1975) Drama
4 17 1193 5 978158471 M 50 1 95350 One Flew Over the Cuckoo's Nest (1975) Drama

In [23]:
# Mean rating and number of ratings for every (Gender, Genre) pair.
# In the result, the 'UserID' column holds the count of non-null UserID
# entries in the group, not a user id.
ratingsByGenderAndGenreDF = (
    ratingsWithUserAndMovieDataDF
    .groupby(by=['Gender', 'Genre'])
    .agg({'Rating': 'mean', 'UserID': 'count'})
    .sort_index()
)

In [24]:
# Display the full (Gender, Genre) summary: mean rating and rating count.
ratingsByGenderAndGenreDF


Out[24]:
Rating UserID
Gender Genre
F Action 3.490252 45650
Adventure 3.512879 27332
Animation 3.744702 12221
Children's 3.572548 21317
Comedy 3.571938 96271
Crime 3.689332 16442
Documentary 3.946392 1940
Drama 3.765662 98153
Fantasy 3.513076 8718
Film-Noir 4.018087 4202
Horror 3.202870 14635
Musical 3.809108 13505
Mystery 3.686548 9976
Romance 3.673579 50297
Sci-Fi 3.450255 27400
Thriller 3.573360 40308
War 3.893138 14093
Western 3.551913 3477
M Action 3.491386 211807
Adventure 3.468125 106621
Animation 3.661335 31072
Children's 3.358961 50869
Comedy 3.503667 260309
Crime 3.713720 63099
Documentary 3.928811 5970
Drama 3.766589 256376
Fantasy 3.426603 27583
Film-Noir 4.092254 14059
Horror 3.217891 61751
Musical 3.596332 28028
Mystery 3.662009 30202
Romance 3.573262 97226
Sci-Fi 3.469952 129894
Thriller 3.569685 149372
War 3.893375 54434
Western 3.655120 17206

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: