In [1]:

    
import pandas as pd
import numpy as np

# A single parenthesised argument with %-formatting prints the same text under the
# Python 2 print statement and the Python 3 print function.
print("Pandas version: %s" % pd.__version__)

from pandas import Series, DataFrame

from os import getenv

# Root of the data tree, taken from the DATADIR environment variable.
# When the variable is not set, fall back to the current directory instead of
# failing on `None += str` with an unrelated-looking TypeError; a wrong location
# then surfaces as a "file does not exist" error that shows the full path.
DATADIR = getenv("DATADIR", ".")
SUBDIR = '/PUBLIC/movielens/ml-1m'
# SUBDIR starts with '/', so it is appended by plain concatenation
# (os.path.join would discard the base in front of an absolute-looking component).
DATADIR += SUBDIR









    



Pandas version: 0.20.2



In [2]:

    
#--------------------------------------------------
# Plotly setup for offline (in-notebook) rendering.
import plotly
# A single parenthesised argument with %-formatting prints the same text under
# Python 2 and Python 3.
print("Plotly version %s" % plotly.__version__)  # version >1.9.4 required
import plotly.graph_objs as go
from plotly import tools

# plotly.offline.plot() writes a standalone HTML file, while iplot() renders the
# figure inline in the notebook output cell; only iplot is needed here.
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode() # run at the start of every ipython notebook to use plotly.offline
                     # this injects the plotly.js source files into the notebook
#--------------------------------------------------
# Alternative static plotting stack (currently disabled):
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
#--------------------------------------------------









    



Plotly version 2.0.12

Read data into DataFrames



In [3]:

    
# Load the user table. Because names= is given, pandas treats every line of the
# file as data and uses these labels as the column names.
usersDF = pd.read_csv(
    "%s/users.csv" % DATADIR, sep=',',
    names=['UserID','Gender','Age','Occupation','ZipCode'],
    # ZIP codes are identifiers, not quantities: read them as text so that dtype
    # inference can never turn e.g. "02139" into the integer 2139.
    dtype={'ZipCode': str}
)



In [4]:

    
# Load the ratings table; names= supplies the column labels, so every line of the
# file is read as data.
ratingsPath = "%s/ratings.csv" % DATADIR
ratingsColumns = ['UserID','MovieID','Rating','Timestamp']
ratingsDF = pd.read_csv(ratingsPath, sep=',', names=ratingsColumns)



In [6]:

    
# Ratings histogram: how many ratings were given for each rating value.
ratingsHistogram = (
    ratingsDF.groupby('Rating')[['UserID']].count()  # non-null UserID entries per rating
    .rename(columns={"UserID": "Cnt"})
    .reset_index()
    .sort_values('Rating')
)
ratingsHistogram

Join DataFrames



In [7]:

    
# Attach the user attributes to every rating (inner join on the shared UserID column).
ratingsWithUserDataDF = ratingsDF.merge(
    right=usersDF,
    on='UserID',
    how='inner'
)



In [8]:

    
# Preview the first three rows of the ratings/users join.
ratingsWithUserDataDF.head(3)



In [9]:

    
# Ratings histogram split by gender, as a flat table with columns Rating, Gender, Cnt.
genderRatingCounts = ratingsWithUserDataDF.groupby(['Rating','Gender'])[['UserID']].count()
ratingsHistogram = (
    genderRatingCounts
    .rename(columns={'UserID':'Cnt'})
    .reset_index()
    .sort_values(['Rating','Gender'])
)
ratingsHistogram.head(10)



In [10]:

    
# Same histogram by gender, this time kept indexed by (Rating, Gender) with a single
# 'Cnt' column.
ratingsHistogram = (
    ratingsWithUserDataDF
    .groupby(['Rating','Gender'])[['UserID']]
    .count()
    .rename(columns={'UserID':'Cnt'})
    .sort_index(ascending=[True,True])
)
ratingsHistogram.head(10)



In [11]:

    
# Total number of ratings per gender. The grouping is computed once (and only for
# the UserID column) and then looked up for each gender.
ratingsPerGender = ratingsWithUserDataDF.groupby(['Gender'])['UserID'].count()
fRatingsNr = ratingsPerGender['F']
mRatingsNr = ratingsPerGender['M']
# A single parenthesised argument with %-formatting prints the same text under
# Python 2 and Python 3.
print("Nr. of ratings by female users: %s" % fRatingsNr)
print("Nr. of ratings by male users:   %s" % mRatingsNr)









    



Nr. of ratings by female users: 246440
Nr. of ratings by male users:   753769



In [12]:

    
# Flatten the (Rating, Gender) index into ordinary columns. Guarded so that
# re-running this cell does not call reset_index() a second time and add a
# spurious 'index' column.
if 'Gender' not in ratingsHistogram.columns:
    ratingsHistogram = ratingsHistogram.reset_index()

# Normalize each count by the total number of ratings of the same gender, so the
# female and male distributions each sum to 1 and can be compared directly.
# Vectorized lookup instead of a row-wise apply; a Gender value other than
# 'F'/'M' yields NaN rather than being silently counted as male.
ratingsTotalByGender = {'F': fRatingsNr, 'M': mRatingsNr}
ratingsHistogram = ratingsHistogram.assign(
    CntNormalized=(
        ratingsHistogram['Cnt'].astype(float)  # force true division on Python 2 as well
        / ratingsHistogram['Gender'].map(ratingsTotalByGender)
    )
)



In [13]:

    
# Display the histogram by rating and gender, including the normalized counts.
ratingsHistogram









    Out[13]:







  
    
      
      Rating
      Gender
      Cnt
      CntNormalized
    
  
  
    
      0
      1
      F
      13347
      0.054159
    
    
      1
      1
      M
      42827
      0.056817
    
    
      2
      2
      F
      24548
      0.099610
    
    
      3
      2
      M
      83009
      0.110125
    
    
      4
      3
      F
      62966
      0.255502
    
    
      5
      3
      M
      198231
      0.262986
    
    
      6
      4
      F
      87033
      0.353161
    
    
      7
      4
      M
      261938
      0.347504
    
    
      8
      5
      F
      58546
      0.237567
    
    
      9
      5
      M
      167764
      0.222567



In [14]:

    
# Sanity check: per gender, Cnt should add up to the total number of ratings and
# CntNormalized should add up to 1.
ratingsHistogram.groupby('Gender')[['Cnt','CntNormalized']].sum()









    Out[14]:







  
    
      
      Cnt
      CntNormalized
    
    
      Gender
      
      
    
  
  
    
      F
      246440
      1.0
    
    
      M
      753769
      1.0

Read movies data



In [15]:

    
# Load the movies table. This file is read with '+' as the field separator
# (presumably because titles can contain commas -- verify against the file).
moviesPath = "%s/movies.csv" % DATADIR
moviesColumns = ['MovieID','Title','Genres']
moviesDF = pd.read_csv(moviesPath, sep='+', names=moviesColumns)



In [16]:

    
# Preview the raw movies table: one row per movie, genres still pipe-separated in 'Genres'.
moviesDF.head(3)









    Out[16]:







  
    
      
      MovieID
      Title
      Genres
    
  
  
    
      0
      1
      Toy Story (1995)
      Animation|Children's|Comedy
    
    
      1
      2
      Jumanji (1995)
      Adventure|Children's|Fantasy
    
    
      2
      3
      Grumpier Old Men (1995)
      Comedy|Romance



In [17]:

    
# Split the pipe-separated 'Genres' string into one column per genre, then stack
# those columns into a single Series indexed by (movie row, position in the list).
genresWide = moviesDF['Genres'].str.split("|", expand=True)
stackedGenres = genresWide.stack()



In [18]:

    
# Peek at the stacked genres with the index flattened into columns:
# level_0 = row label of the movie in moviesDF, level_1 = position within that
# movie's genre list, 0 = the genre name.
stackedGenres.reset_index().head(5)



In [19]:

    
# Explode moviesDF to one row per (movie, genre) pair by joining every movie row
# (via its row label) with its entries in stackedGenres, then dropping the helper
# columns and the original pipe-separated 'Genres' column.
#
# This cell rebinds moviesDF, so it is guarded: once 'Genres' has been replaced by
# 'Genre', re-running the cell is a no-op instead of merging the already-exploded
# rows again and then failing on the missing 'Genres' column.
if 'Genres' in moviesDF.columns:
    moviesDF = (
        moviesDF.reset_index()
        .merge(stackedGenres.reset_index(), how='inner', left_on='index', right_on='level_0')
        .drop(['index','Genres','level_0','level_1'], axis=1)
        .rename(columns={0:'Genre'})
    )



In [20]:

    
# Preview the exploded movies table: one row per (movie, genre) pair.
moviesDF.head(6)









    Out[20]:







  
    
      
      MovieID
      Title
      Genre
    
  
  
    
      0
      1
      Toy Story (1995)
      Animation
    
    
      1
      1
      Toy Story (1995)
      Children's
    
    
      2
      1
      Toy Story (1995)
      Comedy
    
    
      3
      2
      Jumanji (1995)
      Adventure
    
    
      4
      2
      Jumanji (1995)
      Children's
    
    
      5
      2
      Jumanji (1995)
      Fantasy



In [21]:

    
# Attach title and genre to every rating (inner join on MovieID). moviesDF holds one
# row per (movie, genre), so a rating of a multi-genre movie appears once per genre.
ratingsWithUserAndMovieDataDF = ratingsWithUserDataDF.merge(
    right=moviesDF,
    on='MovieID',
    how='inner'
)



In [22]:

    
# Preview the fully joined table: rating, user attributes, movie title and genre.
ratingsWithUserAndMovieDataDF.head(5)









    Out[22]:







  
    
      
      UserID
      MovieID
      Rating
      Timestamp
      Gender
      Age
      Occupation
      ZipCode
      Title
      Genre
    
  
  
    
      0
      1
      1193
      5
      978300760
      F
      1
      10
      48067
      One Flew Over the Cuckoo's Nest (1975)
      Drama
    
    
      1
      2
      1193
      5
      978298413
      M
      56
      16
      70072
      One Flew Over the Cuckoo's Nest (1975)
      Drama
    
    
      2
      12
      1193
      4
      978220179
      M
      25
      12
      32793
      One Flew Over the Cuckoo's Nest (1975)
      Drama
    
    
      3
      15
      1193
      4
      978199279
      M
      25
      7
      22903
      One Flew Over the Cuckoo's Nest (1975)
      Drama
    
    
      4
      17
      1193
      5
      978158471
      M
      50
      1
      95350
      One Flew Over the Cuckoo's Nest (1975)
      Drama



In [23]:

    
# Mean rating and number of ratings for every (Gender, Genre) pair.
# Note: the rating count ends up in the column that is still named 'UserID'.
genderGenreAggregations = {
    'Rating': 'mean',
    'UserID': 'count'
}
ratingsByGenderAndGenreDF = (
    ratingsWithUserAndMovieDataDF
    .groupby(['Gender','Genre'])
    .agg(genderGenreAggregations)
    .sort_index()
)



In [24]:

    
# Display the mean rating ('Rating') and rating count ('UserID') per (Gender, Genre).
ratingsByGenderAndGenreDF









    Out[24]:







  
    
      
      
      Rating
      UserID
    
    
      Gender
      Genre
      
      
    
  
  
    
      F
      Action
      3.490252
      45650
    
    
      Adventure
      3.512879
      27332
    
    
      Animation
      3.744702
      12221
    
    
      Children's
      3.572548
      21317
    
    
      Comedy
      3.571938
      96271
    
    
      Crime
      3.689332
      16442
    
    
      Documentary
      3.946392
      1940
    
    
      Drama
      3.765662
      98153
    
    
      Fantasy
      3.513076
      8718
    
    
      Film-Noir
      4.018087
      4202
    
    
      Horror
      3.202870
      14635
    
    
      Musical
      3.809108
      13505
    
    
      Mystery
      3.686548
      9976
    
    
      Romance
      3.673579
      50297
    
    
      Sci-Fi
      3.450255
      27400
    
    
      Thriller
      3.573360
      40308
    
    
      War
      3.893138
      14093
    
    
      Western
      3.551913
      3477
    
    
      M
      Action
      3.491386
      211807
    
    
      Adventure
      3.468125
      106621
    
    
      Animation
      3.661335
      31072
    
    
      Children's
      3.358961
      50869
    
    
      Comedy
      3.503667
      260309
    
    
      Crime
      3.713720
      63099
    
    
      Documentary
      3.928811
      5970
    
    
      Drama
      3.766589
      256376
    
    
      Fantasy
      3.426603
      27583
    
    
      Film-Noir
      4.092254
      14059
    
    
      Horror
      3.217891
      61751
    
    
      Musical
      3.596332
      28028
    
    
      Mystery
      3.662009
      30202
    
    
      Romance
      3.573262
      97226
    
    
      Sci-Fi
      3.469952
      129894
    
    
      Thriller
      3.569685
      149372
    
    
      War
      3.893375
      54434
    
    
      Western
      3.655120
      17206



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	Rating	Cnt
0	1	56174
1	2	107557
2	3	261197
3	4	348971
4	5	226310

	UserID	MovieID	Rating	Timestamp	Gender	Age	Occupation	ZipCode
0	1	1193	5	978300760	F	1	10	48067
1	1	661	3	978302109	F	1	10	48067
2	1	914	3	978301968	F	1	10	48067

	Rating	Gender	Cnt
0	1	F	13347
1	1	M	42827
2	2	F	24548
3	2	M	83009
4	3	F	62966
5	3	M	198231
6	4	F	87033
7	4	M	261938
8	5	F	58546
9	5	M	167764

		Cnt
Rating	Gender
1	F	13347
1	M	42827
2	F	24548
2	M	83009
3	F	62966
3	M	198231
4	F	87033
4	M	261938
5	F	58546
5	M	167764

	Rating	Gender	Cnt	CntNormalized
0	1	F	13347	0.054159
1	1	M	42827	0.056817
2	2	F	24548	0.099610
3	2	M	83009	0.110125
4	3	F	62966	0.255502
5	3	M	198231	0.262986
6	4	F	87033	0.353161
7	4	M	261938	0.347504
8	5	F	58546	0.237567
9	5	M	167764	0.222567

	MovieID	Title	Genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance

	MovieID	Title	Genre
0	1	Toy Story (1995)	Animation
1	1	Toy Story (1995)	Children's
2	1	Toy Story (1995)	Comedy
3	2	Jumanji (1995)	Adventure
4	2	Jumanji (1995)	Children's
5	2	Jumanji (1995)	Fantasy

		Rating	UserID
Gender	Genre
F	Action	3.490252	45650
	Adventure	3.512879	27332
	Animation	3.744702	12221
	Children's	3.572548	21317
	Comedy	3.571938	96271
	Crime	3.689332	16442
	Documentary	3.946392	1940
	Drama	3.765662	98153
	Fantasy	3.513076	8718
	Film-Noir	4.018087	4202
	Horror	3.202870	14635
	Musical	3.809108	13505
	Mystery	3.686548	9976
	Romance	3.673579	50297
	Sci-Fi	3.450255	27400
	Thriller	3.573360	40308
	War	3.893138	14093
	Western	3.551913	3477
M	Action	3.491386	211807
	Adventure	3.468125	106621
	Animation	3.661335	31072
	Children's	3.358961	50869
	Comedy	3.503667	260309
	Crime	3.713720	63099
	Documentary	3.928811	5970
	Drama	3.766589	256376
	Fantasy	3.426603	27583
	Film-Noir	4.092254	14059
	Horror	3.217891	61751
	Musical	3.596332	28028
	Mystery	3.662009	30202
	Romance	3.573262	97226
	Sci-Fi	3.469952	129894
	Thriller	3.569685	149372
	War	3.893375	54434
	Western	3.655120	17206