In [3]:
import numpy as np
import pandas as pd
Read the data
In [4]:
# Raw user/movie ratings: a tab-separated file that is read as having no
# header row, so the column labels are supplied here.
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'u.data',
    sep='\t',
    header=None,
    names=column_names,
)
In [5]:
# Preview the first rows of the ratings table to confirm the columns parsed as expected.
df.head()
Out[5]:
Get movie titles
In [6]:
# Lookup table of movie titles (comma-separated, header row in the file);
# it is merged onto the ratings via 'item_id' further down.
movie_titles = pd.read_csv("Movie_Id_Titles", sep=',')
movie_titles.head()
Out[6]:
Merge the dataframes
In [7]:
# Attach each rating's movie title by merging on the shared 'item_id' key.
# The merge starts from the raw rating columns only, which keeps this cell
# idempotent: merging the already-merged `df` again would produce
# 'title_x'/'title_y' columns and break every later groupby('title').
# On a fresh kernel the result is identical to merging `df` directly.
df = pd.merge(df[column_names], movie_titles, on='item_id')
df.head()
Out[7]:
In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'white' style for the plots drawn after this cell.
sns.set_style('white')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
Create a ratings dataframe with average rating and number of ratings
In [9]:
# Mean rating per title, highest first (top five shown). No minimum number
# of ratings is applied here, so a title rated only once can rank first.
(
    df.groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)
Out[9]:
In [10]:
# Number of (non-null) ratings per title, most-rated first (top five shown).
(
    df.groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)
Out[10]:
In [11]:
# One row per title, holding the mean rating in a column named 'rating'.
ratings = df.groupby('title')['rating'].mean().to_frame()
ratings.head()
Out[11]:
Number of ratings column
In [12]:
# Per-title rating counts; the assignment aligns on the title index, so each
# count lands on the matching row of `ratings`.
ratings['num of ratings'] = df.groupby('title')['rating'].count()
ratings.head()
Out[12]:
Data Visualization: Histogram
In [13]:
# Histogram of how many ratings each movie received.
# An explicit figure/axes pair is used instead of implicit pyplot state, and
# the axes are labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 4))
ratings['num of ratings'].hist(bins=70, ax=ax)
ax.set(
    title='Number of ratings per movie',
    xlabel='Number of ratings',
    ylabel='Number of movies',
)
# Keep the Axes as the cell's value, as the original cell did.
ax
Out[13]:
In [14]:
# Histogram of the per-movie mean rating.
# An explicit figure/axes pair is used instead of implicit pyplot state, and
# the axes are labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 4))
ratings['rating'].hist(bins=70, ax=ax)
ax.set(
    title='Average rating per movie',
    xlabel='Average rating',
    ylabel='Number of movies',
)
# Keep the Axes as the cell's value, as the original cell did.
ax
Out[14]:
In [15]:
# Joint distribution of mean rating (x) against number of ratings (y);
# points are drawn semi-transparent (alpha=0.5).
sns.jointplot(
    data=ratings,
    x='rating',
    y='num of ratings',
    alpha=0.5,
)
Out[15]:
In [16]:
# User x movie matrix: one row per user_id, one column per title, each cell
# holding that user's rating for the title. Cells for movies a user never
# rated are NaN. 'mean' is pivot_table's default aggregation, spelled out.
moviemat = df.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
    aggfunc='mean',
)
moviemat.head()
Out[16]:
Most rated movies
In [17]:
# The ten titles with the most ratings.
(
    ratings
    .sort_values(by='num of ratings', ascending=False)
    .head(n=10)
)
Out[17]:
We choose two movies: Star Wars, a sci-fi movie, and Liar Liar, a comedy.
In [18]:
# Re-display the per-title summary (mean rating and number of ratings) for reference.
ratings.head()
Out[18]:
Now let's grab the user ratings for those two movies:
In [19]:
# Each selection is one column of the user x movie matrix: a Series indexed
# by user_id, NaN where the user did not rate that movie.
starwars_user_ratings = moviemat.loc[:, 'Star Wars (1977)']
liarliar_user_ratings = moviemat.loc[:, 'Liar Liar (1997)']
starwars_user_ratings.head()
Out[19]:
Use the `corrwith()` method to correlate every movie column of `moviemat` with the user-ratings Series of the chosen movie:
In [20]:
# Correlate every movie column in `moviemat` with the chosen movie's ratings.
# Each result is a Series indexed by title; entries are NaN where no
# correlation could be computed.
similar_to_starwars = moviemat.corrwith(other=starwars_user_ratings)
similar_to_liarliar = moviemat.corrwith(other=liarliar_user_ratings)
Clean the data by removing NaN values and using a DataFrame instead of a Series
In [21]:
# Turn the correlation Series into a one-column frame and drop titles whose
# correlation with Star Wars is NaN.
corr_starwars = similar_to_starwars.to_frame(name='Correlation').dropna()
corr_starwars.head()
Out[21]:
In [22]:
# Ten highest correlations with Star Wars, with no minimum-ratings filter.
(
    corr_starwars
    .sort_values(by='Correlation', ascending=False)
    .head(n=10)
)
Out[22]:
Filter out movies that have 100 or fewer ratings (this threshold was chosen based on the histogram above). This is needed to get more accurate results: a correlation computed from only a handful of shared ratings is unreliable.
In [23]:
# Add each title's rating count next to its correlation (joined on the title
# index). The join starts from the 'Correlation' column only, which keeps
# this cell idempotent: joining onto a frame that already has
# 'num of ratings' raises "ValueError: columns overlap but no suffix
# specified". On a fresh kernel the result is identical.
corr_starwars = corr_starwars[['Correlation']].join(ratings['num of ratings'])
corr_starwars.head()
Out[23]:
Now sort the values
In [24]:
# Keep only titles with more than 100 ratings, then show the five most
# strongly correlated with Star Wars.
(
    corr_starwars
    .loc[corr_starwars['num of ratings'] > 100]
    .sort_values(by='Correlation', ascending=False)
    .head()
)
Out[24]:
The same for the comedy Liar Liar:
In [25]:
# Same pipeline as for Star Wars: correlation frame -> drop NaN -> attach
# the per-title rating counts.
corr_liarliar = (
    similar_to_liarliar
    .to_frame(name='Correlation')
    .dropna()
    .join(ratings['num of ratings'])
)
# Keep titles with more than 100 ratings and show the five most strongly
# correlated with Liar Liar.
(
    corr_liarliar
    .loc[corr_liarliar['num of ratings'] > 100]
    .sort_values(by='Correlation', ascending=False)
    .head()
)
Out[25]: