In [2]:

    
# print "Spark version:",sc.version

# import pandas as pd
# import numpy as np

# from pandas import Series, DataFrame

from os import getenv

# Location of the MovieLens 1M files below the data root.
SUBDIR = '/PUBLIC/movielens/ml-1m'

# The data root is taken from the environment. Fail early with a clear message:
# without this check an unset variable makes `None += str` raise an opaque TypeError.
DATADIR = getenv("DATADIR")
if not DATADIR:
	raise RuntimeError("Environment variable DATADIR is not set; it must point to the data root directory")
# SUBDIR starts with '/', so plain concatenation is used on purpose
# (os.path.join would discard the base when the second part is absolute).
DATADIR += SUBDIR



In [3]:

    
from sframe import SFrame
from sframe import SArray
from sframe import aggregate as agg



In [4]:

    
#--------------------------------------------------
# Plotting setup: plotly in offline mode, initialised for use inside the notebook.
import plotly as plotly
print "Plotly version", plotly.__version__  # version >1.9.4 required
import plotly.graph_objs as go
from plotly import tools

# plotly.offline.init_notebook_mode() # run at the start of every notebook
# `plot` is not imported; only `iplot` is called in this notebook.
# NOTE(review): per the plotly.offline docs, plot() writes a standalone HTML file while
# iplot() renders inside the notebook -- confirm against the installed plotly version.
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode() # run at the start of every ipython notebook to use plotly.offline
                     # this injects the plotly.js source files into the notebook
#--------------------------------------------------
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
#--------------------------------------------------









    



Plotly version 2.0.12

Read data into SFrames



In [4]:

    
usersSF = SFrame.read_csv(
	"%s/users.dat" % DATADIR, delimiter='::', header=False, verbose=False,
	column_type_hints = [int, str, int, int, str]
)
usersSF = usersSF.rename({
	'X1': 'UserID',
	'X2': 'Gender',
	'X3': 'Age',
	'X4': 'Occupation',
	'X5': 'ZipCode',
})
usersDescSF=dict(zip(usersSF.column_names(), usersSF.column_types()))
print usersDescSF









    



[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1506593496.log






    



{'Gender': <type 'str'>, 'Age': <type 'int'>, 'UserID': <type 'int'>, 'ZipCode': <type 'str'>, 'Occupation': <type 'int'>}



In [5]:

    
# Load the ratings table (four integer columns, '::'-delimited, no header row)
# and rename the columns X1..X4 to the fields they hold.
ratingsSF = SFrame.read_csv(
	DATADIR + "/ratings.dat",
	delimiter='::',
	header=False,
	verbose=False,
	column_type_hints=[int, int, int, int]
)
ratingsSF = ratingsSF.rename(dict(zip(
	['X1', 'X2', 'X3', 'X4'],
	['UserID', 'MovieID', 'Rating', 'Timestamp']
)))



In [6]:

    
# Ratings histogram: number of ratings per rating value, ordered by rating value.
ratingCounts = ratingsSF.groupby(['Rating'], {'Cnt': agg.COUNT()})
ratingsHistogram = ratingCounts.sort('Rating')
ratingsHistogram.print_rows()









    



+--------+--------+
| Rating |  Cnt   |
+--------+--------+
|   1    | 56174  |
|   2    | 107557 |
|   3    | 261197 |
|   4    | 348971 |
|   5    | 226310 |
+--------+--------+
[5 rows x 2 columns]

Join SFrames



In [7]:

    
# Attach each rater's user columns (Gender, Age, Occupation, ZipCode) to every
# rating row by inner-joining on UserID.
ratingsWithUserDataSF = ratingsSF.join(usersSF, on='UserID', how='inner')



In [8]:

    
# Preview the first 3 rows of the joined ratings + users table.
ratingsWithUserDataSF.head(3)









    Out[8]:





    
        UserID
        MovieID
        Rating
        Timestamp
        Gender
        Age
        Occupation
        ZipCode
    
    
        1
        1193
        5
        978300760
        F
        1
        10
        48067
    
    
        1
        661
        3
        978302109
        F
        1
        10
        48067
    
    
        1
        914
        3
        978301968
        F
        1
        10
        48067
    

[3 rows x 8 columns]



In [9]:

    
# Ratings histogram split by gender: one row per (Rating, Gender) pair with its count,
# ordered by rating value and then by gender.
countsByRatingAndGender = ratingsWithUserDataSF.groupby(['Rating','Gender'], {'Cnt': agg.COUNT()})
ratingsHistogram = countsByRatingAndGender.sort(['Rating','Gender'])
ratingsHistogram.print_rows()









    



+--------+--------+--------+
| Gender | Rating |  Cnt   |
+--------+--------+--------+
|   F    |   1    | 13347  |
|   M    |   1    | 42827  |
|   F    |   2    | 24548  |
|   M    |   2    | 83009  |
|   F    |   3    | 62966  |
|   M    |   3    | 198231 |
|   F    |   4    | 87033  |
|   M    |   4    | 261938 |
|   F    |   5    | 58546  |
|   M    |   5    | 167764 |
+--------+--------+--------+
[10 rows x 3 columns]



In [10]:

    
# Grouped bar chart of absolute rating counts, one trace per gender.
# Each gender's rows are filtered once and reused for both axes.
maleHistogram = ratingsHistogram.filter_by('M','Gender')
femaleHistogram = ratingsHistogram.filter_by('F','Gender')
data = [
	go.Bar(
		x = maleHistogram['Rating'].to_numpy(),
		y = maleHistogram['Cnt'].to_numpy(),
		name = 'Male'
	),
	go.Bar(
		x = femaleHistogram['Rating'].to_numpy(),
		y = femaleHistogram['Cnt'].to_numpy(),
		name = 'Female'
	),
]
layout = go.Layout(
	title='Distribution of ratings by gender',
	xaxis={'title': 'Rating'},
	yaxis={'title': 'Cnt'}
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)



In [11]:

    
fRatingsNr,mRatingsNr=ratingsWithUserDataSF.groupby(['Gender'], {'Cnt': agg.COUNT()}).sort('Gender')['Cnt']
print "Nr. of ratings by female users:",fRatingsNr
print "Nr. of ratings by male users:  ",mRatingsNr









    



Nr. of ratings by female users: 246440
Nr. of ratings by male users:   753769



In [12]:

    
# Share of each rating value within its own gender: the count divided by that gender's
# total number of ratings. Any Gender other than 'F' is divided by the male total.
ratingsHistogram['CntNormalized'] = ratingsHistogram.apply(
	lambda row: 1.*row['Cnt']/(fRatingsNr if row['Gender']=='F' else mRatingsNr)
)



In [13]:

    
# Show the by-gender histogram again, now including the added 'CntNormalized' column.
ratingsHistogram.print_rows()









    



+--------+--------+--------+-----------------+
| Gender | Rating |  Cnt   |  CntNormalized  |
+--------+--------+--------+-----------------+
|   F    |   1    | 13347  | 0.0541592273981 |
|   M    |   1    | 42827  | 0.0568171415911 |
|   F    |   2    | 24548  | 0.0996104528486 |
|   M    |   2    | 83009  |  0.110125250574 |
|   F    |   3    | 62966  |  0.255502353514 |
|   M    |   3    | 198231 |  0.262986405649 |
|   F    |   4    | 87033  |  0.353161012823 |
|   M    |   4    | 261938 |  0.347504341516 |
|   F    |   5    | 58546  |  0.237566953417 |
|   M    |   5    | 167764 |  0.22256686067  |
+--------+--------+--------+-----------------+
[10 rows x 4 columns]



In [14]:

    
# Sanity check: the normalized shares are summed per gender; each sum is expected to be 1.0.
ratingsHistogram.groupby('Gender', agg.SUM('CntNormalized'))









    Out[14]:





    
        Gender
        Sum of CntNormalized
    
    
        M
        1.0
    
    
        F
        1.0
    

[2 rows x 2 columns]



In [15]:

    
# Grouped bar chart of the per-gender normalized rating shares, one trace per gender.
# Each gender's rows are filtered once and reused for both axes.
maleHistogram = ratingsHistogram.filter_by('M','Gender')
femaleHistogram = ratingsHistogram.filter_by('F','Gender')
data = [
	go.Bar(
		x = maleHistogram['Rating'].to_numpy(),
		y = maleHistogram['CntNormalized'].to_numpy(),
		name = 'Male'
	),
	go.Bar(
		x = femaleHistogram['Rating'].to_numpy(),
		y = femaleHistogram['CntNormalized'].to_numpy(),
		name = 'Female'
	),
]
layout = go.Layout(
	title='Normalized distribution of ratings by gender',
	xaxis={'title': 'Rating'},
	yaxis={'title': 'Pct'}
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

Read movies data



In [16]:

    
# Load the movies table ('::'-delimited, no header row) and rename the
# columns X1..X3 to the fields they hold.
moviesSF = SFrame.read_csv(
	DATADIR + "/movies.dat",
	delimiter='::',
	header=False,
	verbose=False,
	column_type_hints=[int, str, str]
)
moviesSF = moviesSF.rename(dict(zip(
	['X1', 'X2', 'X3'],
	['MovieID', 'Title', 'Genres']
)))



In [17]:

    
# Peek at the first 4 movies. In a top-to-bottom run 'Genres' is still a single
# '|'-separated string at this point.
moviesSF.print_rows(4)









    



+---------+--------------------------+------------------------------+
| MovieID |          Title           |            Genres            |
+---------+--------------------------+------------------------------+
|    1    |     Toy Story (1995)     | Animation|Children's|Comedy  |
|    2    |      Jumanji (1995)      | Adventure|Children's|Fantasy |
|    3    | Grumpier Old Men (1995)  |        Comedy|Romance        |
|    4    | Waiting to Exhale (1995) |         Comedy|Drama         |
+---------+--------------------------+------------------------------+
[3883 rows x 3 columns]



In [18]:

    
# Turn the '|'-separated genre string into a list of genres.
# Guarded on the column type so the cell is idempotent: after the first run 'Genres'
# already holds lists, and calling .split on them again would raise.
if dict(zip(moviesSF.column_names(), moviesSF.column_types()))['Genres'] == str:
	moviesSF['Genres'] = moviesSF['Genres'].apply(lambda x: x.split("|"))



In [19]:

    
# Same 4-row preview after the split; the column/row width limits are raised,
# presumably so the genre lists are printed in full.
moviesSF.print_rows(4, max_column_width=50, max_row_width=130)









    



+---------+--------------------------+----------------------------------+
| MovieID |          Title           |              Genres              |
+---------+--------------------------+----------------------------------+
|    1    |     Toy Story (1995)     | [Animation, Children's, Comedy]  |
|    2    |      Jumanji (1995)      | [Adventure, Children's, Fantasy] |
|    3    | Grumpier Old Men (1995)  |        [Comedy, Romance]         |
|    4    | Waiting to Exhale (1995) |         [Comedy, Drama]          |
+---------+--------------------------+----------------------------------+
[3883 rows x 3 columns]



In [20]:

    
moviesLongSF=moviesSF.stack('Genres', new_column_name='Genre')
print 'Number of distinct genres:',moviesLongSF['Genre'].unique().size()









    



Number of distinct genres: 18



In [21]:

    
# First 10 rows of the long-format table: one row per (movie, genre) pair.
moviesLongSF.print_rows(10)









    



+---------+--------------------------+------------+
| MovieID |          Title           |   Genre    |
+---------+--------------------------+------------+
|    1    |     Toy Story (1995)     | Animation  |
|    1    |     Toy Story (1995)     | Children's |
|    1    |     Toy Story (1995)     |   Comedy   |
|    2    |      Jumanji (1995)      | Adventure  |
|    2    |      Jumanji (1995)      | Children's |
|    2    |      Jumanji (1995)      |  Fantasy   |
|    3    | Grumpier Old Men (1995)  |   Comedy   |
|    3    | Grumpier Old Men (1995)  |  Romance   |
|    4    | Waiting to Exhale (1995) |   Comedy   |
|    4    | Waiting to Exhale (1995) |   Drama    |
+---------+--------------------------+------------+
[6408 rows x 3 columns]



In [22]:

    
# Add the movie columns (Title, Genres) to every rating + user row by inner-joining on MovieID.
ratingsWithUserAndMovieDataSF = ratingsWithUserDataSF.join(moviesSF, on='MovieID', how='inner')



In [23]:

    
# Rating counts per (Gender, Rating, Genre): keep only the needed columns, expand the
# 'Genres' list to one row per genre, count, and order by genre, gender and rating.
ratingsByGenderAndGenreSF = (
	ratingsWithUserAndMovieDataSF[['Gender','Rating','Genres']]
	.stack('Genres', new_column_name='Genre')
	.groupby(['Gender','Rating','Genre'], {'Cnt': agg.COUNT()})
	.sort(['Genre','Gender','Rating'])
)



In [24]:

    
# Average rating per (Gender, Genre): keep only the needed columns, expand the
# 'Genres' list to one row per genre, average, and order by genre and gender.
avgRatingsByGenderAndGenreSF = (
	ratingsWithUserAndMovieDataSF[['Gender','Rating','Genres']]
	.stack('Genres', new_column_name='Genre')
	.groupby(['Gender','Genre'], {'AvgRating': agg.AVG('Rating')})
	.sort(['Genre','Gender'])
)



In [25]:

    
genres = ratingsByGenderAndGenreSF['Genre'].unique().sort()

# One chart per genre: the share of each rating value among male vs. female raters,
# with each gender's average rating for that genre shown in the legend.
for g in genres:
	r = ratingsByGenderAndGenreSF.filter_by(g,'Genre')

	# Per-genre totals and averages, keyed by gender label instead of unpacked by position.
	# Loop-specific names are used on purpose: assigning to fRatingsNr/mRatingsNr here
	# would overwrite the dataset-wide totals that the ratingsHistogram normalization
	# cell reads, silently corrupting that cell when it is re-run.
	genreRatingsNr = dict(
		(row['Gender'], row['Cnt'])
		for row in r.groupby(['Gender'], {'Cnt': agg.SUM('Cnt')})
	)
	genreRatingsAvg = dict(
		(row['Gender'], row['AvgRating'])
		for row in avgRatingsByGenderAndGenreSF.filter_by(g,'Genre')
	)
	fGenreRatingsNr, mGenreRatingsNr = genreRatingsNr['F'], genreRatingsNr['M']
	r['CntNormalized'] = r.apply(lambda x: 1.*x['Cnt']/fGenreRatingsNr if x['Gender']=='F' else 1.*x['Cnt']/mGenreRatingsNr)

	# Filter once per gender and reuse the rows for both axes.
	data = []
	for genderCode, genderLabel in (('M', 'Male'), ('F', 'Female')):
		genderRows = r.filter_by(genderCode,'Gender')
		data.append(go.Bar(
			x = genderRows['Rating'].to_numpy(),
			y = genderRows['CntNormalized'].to_numpy(),
			name = '%s, Avg=%f' % (genderLabel, genreRatingsAvg[genderCode])
		))
	layout = go.Layout(
		title='%s - Normalized distribution of ratings by gender'%g,
		xaxis=dict(
			title='Rating',
		),
		yaxis=dict(
			title='Pct'
		)
	)
	fig = go.Figure(data=data, layout=layout)
	iplot(fig)



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

UserID	MovieID	Rating	Timestamp	Gender	Age	Occupation	ZipCode
1	1193	5	978300760	F	1	10	48067
1	661	3	978302109	F	1	10	48067
1	914	3	978301968	F	1	10	48067