In [ ]:
In [1]:
import numpy as np
import pandas as pd
Loading the movie data from Excel into a DataFrame.
In [2]:
# Workbook holding the ratings table; the path is resolved relative to the
# notebook's working directory.
ratings_workbook = 'Assign_3_data.xlsx'
mov_user_data = pd.read_excel(ratings_workbook)
Converting the column labels of the DataFrame from int to str, so that each user can be addressed by a string label.
In [3]:
# Re-label every column with its string form; the rest of the notebook
# addresses users by string labels such as '3712'.
mov_user_data.columns = mov_user_data.columns.map(str)
Looking at the head of the dataframe.
In [4]:
# Preview the first five rows of the ratings table.
mov_user_data.head(n=5)
Out[4]:
Creating a correlation coefficient dataframe
In [5]:
# Pairwise Pearson correlation between the columns of the ratings table
# (pandas excludes missing values pair by pair). 'pearson' is the pandas
# default, spelled out here for the reader.
corr_df = mov_user_data.corr(method='pearson')
Looking at the head of the correlation coefficient dataframe
In [6]:
# Preview the first five rows of the correlation matrix.
corr_df.head(n=5)
Out[6]:
Checking correlation coefficients with test values in assignment for consistency.
In [7]:
# Compare two entries of the correlation matrix with the reference values
# quoted in the assignment. The tolerance must be small (1e-3): the previous
# thresholds (1.0e3 and 1.03) were so large that the check could never fail.
# `.loc[row, column]` replaces the `.ix` indexer, which pandas has removed,
# and print(...) with a single argument behaves the same on Python 2 and 3.
corr_tol = 1.0e-3
print(abs(corr_df.loc['5136', '1648'] - 0.40298) < corr_tol)
print(abs(corr_df.loc['2824', '918'] - (-0.31706)) < corr_tol)
Top 5 neighbors for user 3712, used as a consistency check against the values given in the assignment. The consistency check passes.
In [8]:
# Five most strongly correlated *other* users for user 3712. The user's own
# entry is removed by label instead of assuming it sorts to position 0, and
# nlargest ignores missing correlations.
corr_df['3712'].drop('3712').nlargest(5)
Out[8]:
Top 5 neighbors for user 3867.
In [9]:
# Five most strongly correlated *other* users for user 3867. The user's own
# entry is removed by label instead of assuming it sorts to position 0, and
# nlargest ignores missing correlations.
corr_df['3867'].drop('3867').nlargest(5)
Out[9]:
Top 5 neighbors for user 89.
In [10]:
# Five most strongly correlated *other* users for user 89. The user's own
# entry is removed by label instead of assuming it sorts to position 0, and
# nlargest ignores missing correlations.
corr_df['89'].drop('89').nlargest(5)
Out[10]:
Initializing empty Series to store movie predictions (without and with normalization) for user 3712 from 5 nearest neighbors.
In [11]:
# One prediction slot per row of the ratings table, initialised to NaN.
# dtype is given explicitly: without it, recent pandas versions create an
# object-dtype Series when no data is passed, which is not what numeric
# predictions should be stored in.
pred_3712_no_norm = pd.Series(index=mov_user_data.index, dtype=float)
pred_3712_wi_norm = pd.Series(index=mov_user_data.index, dtype=float)
Storing the labels of the 5 nearest neighbor users and the correlation coefficients between each of these 5 nearest neighbors and user 3712.
In [12]:
# Five nearest neighbours of user 3712 and their correlations with that user.
# The user's own entry is dropped by label (rather than assuming it sorts to
# position 0) and the column is ranked once instead of twice.
top5_corr_3712 = corr_df['3712'].drop('3712').nlargest(5)
fiv_nn_3712 = list(top5_corr_3712.index)      # neighbour labels, strongest first
fiv_nn_3712_corr = top5_corr_3712.values      # matching correlation weights
Using the ratings of the 5 nearest neighbors, weighted by their correlations with user 3712, to predict the rating of user 3712 for every movie.
In [13]:
# Predict user 3712's rating for every movie from the five nearest neighbours,
# weighting each neighbour by its correlation with user 3712. Only neighbours
# that actually rated a movie contribute to that movie's prediction.
# Computed for all movies at once: this avoids the removed `.ix` indexer and
# recomputing each neighbour's mean rating once per movie.
nn_ratings_3712 = mov_user_data[fiv_nn_3712]                      # one column per neighbour
nn_weights_3712 = pd.Series(fiv_nn_3712_corr, index=fiv_nn_3712)  # weight per neighbour

# Per-movie total weight of the neighbours who rated it. A zero total is
# replaced by NaN so the prediction is NaN rather than a division-by-zero
# artefact.
rated_3712 = nn_ratings_3712.notnull().astype(float)
weight_sum_3712 = rated_3712.mul(nn_weights_3712, axis=1).sum(axis=1)
weight_sum_3712 = weight_sum_3712.where(weight_sum_3712 != 0)

# Without normalization: weighted average of the raw neighbour ratings
# (missing ratings are skipped by sum()).
pred_3712_no_norm = nn_ratings_3712.mul(nn_weights_3712, axis=1).sum(axis=1) / weight_sum_3712

# With normalization: weighted average of each neighbour's deviation from
# that neighbour's own mean rating, added to user 3712's mean rating.
nn_offsets_3712 = nn_ratings_3712 - nn_ratings_3712.mean()
pred_3712_wi_norm = ( mov_user_data['3712'].mean() +
                      nn_offsets_3712.mul(nn_weights_3712, axis=1).sum(axis=1) / weight_sum_3712 )
Printing the top 5 movies for user 3712 based on the ratings from the 5 nearest neighbors with no normalization.
In [14]:
# Five highest non-normalized predictions for user 3712.
pred_3712_no_norm.sort_values(ascending=False).head(5)
Out[14]:
Printing the top 5 movies for user 3712 based on the ratings from the 5 nearest neighbors with normalization.
In [15]:
# Five highest mean-normalized predictions for user 3712.
pred_3712_wi_norm.sort_values(ascending=False).head(5)
Out[15]:
Initializing empty Series to store movie predictions (without and with normalization) for user 3867 from 5 nearest neighbors.
In [16]:
# One prediction slot per row of the ratings table, initialised to NaN.
# dtype is given explicitly: without it, recent pandas versions create an
# object-dtype Series when no data is passed, which is not what numeric
# predictions should be stored in.
pred_3867_no_norm = pd.Series(index=mov_user_data.index, dtype=float)
pred_3867_wi_norm = pd.Series(index=mov_user_data.index, dtype=float)
Storing the labels of the 5 nearest neighbor users and the correlation coefficients between each of these 5 nearest neighbors and user 3867.
In [17]:
# Five nearest neighbours of user 3867 and their correlations with that user.
# The user's own entry is dropped by label (rather than assuming it sorts to
# position 0) and the column is ranked once instead of twice.
top5_corr_3867 = corr_df['3867'].drop('3867').nlargest(5)
fiv_nn_3867 = list(top5_corr_3867.index)      # neighbour labels, strongest first
fiv_nn_3867_corr = top5_corr_3867.values      # matching correlation weights
Using the ratings of the 5 nearest neighbors, weighted by their correlations with user 3867, to predict the rating of user 3867 for every movie.
In [18]:
# Predict user 3867's rating for every movie from the five nearest neighbours,
# weighting each neighbour by its correlation with user 3867. Only neighbours
# that actually rated a movie contribute to that movie's prediction.
# Computed for all movies at once: this avoids the removed `.ix` indexer and
# recomputing each neighbour's mean rating once per movie.
nn_ratings_3867 = mov_user_data[fiv_nn_3867]                      # one column per neighbour
nn_weights_3867 = pd.Series(fiv_nn_3867_corr, index=fiv_nn_3867)  # weight per neighbour

# Per-movie total weight of the neighbours who rated it. A zero total is
# replaced by NaN so the prediction is NaN rather than a division-by-zero
# artefact.
rated_3867 = nn_ratings_3867.notnull().astype(float)
weight_sum_3867 = rated_3867.mul(nn_weights_3867, axis=1).sum(axis=1)
weight_sum_3867 = weight_sum_3867.where(weight_sum_3867 != 0)

# Without normalization: weighted average of the raw neighbour ratings
# (missing ratings are skipped by sum()).
pred_3867_no_norm = nn_ratings_3867.mul(nn_weights_3867, axis=1).sum(axis=1) / weight_sum_3867

# With normalization: weighted average of each neighbour's deviation from
# that neighbour's own mean rating, added to user 3867's mean rating.
nn_offsets_3867 = nn_ratings_3867 - nn_ratings_3867.mean()
pred_3867_wi_norm = ( mov_user_data['3867'].mean() +
                      nn_offsets_3867.mul(nn_weights_3867, axis=1).sum(axis=1) / weight_sum_3867 )
Printing the top 5 movies for user 3867 based on the ratings from the 5 nearest neighbors with no normalization.
In [19]:
# Five highest non-normalized predictions for user 3867.
pred_3867_no_norm.sort_values(ascending=False).head(5)
Out[19]:
Printing the top 5 movies for user 3867 based on the ratings from the 5 nearest neighbors with normalization.
In [20]:
# Five highest mean-normalized predictions for user 3867.
pred_3867_wi_norm.sort_values(ascending=False).head(5)
Out[20]:
Initializing empty Series to store movie predictions (without and with normalization) for user 89 from 5 nearest neighbors.
In [21]:
# One prediction slot per row of the ratings table, initialised to NaN.
# dtype is given explicitly: without it, recent pandas versions create an
# object-dtype Series when no data is passed, which is not what numeric
# predictions should be stored in.
pred_89_no_norm = pd.Series(index=mov_user_data.index, dtype=float)
pred_89_wi_norm = pd.Series(index=mov_user_data.index, dtype=float)
Storing the labels of the 5 nearest neighbor users and the correlation coefficients between each of these 5 nearest neighbors and user 89.
In [22]:
# Five nearest neighbours of user 89 and their correlations with that user.
# The user's own entry is dropped by label (rather than assuming it sorts to
# position 0) and the column is ranked once instead of twice.
top5_corr_89 = corr_df['89'].drop('89').nlargest(5)
fiv_nn_89 = list(top5_corr_89.index)      # neighbour labels, strongest first
fiv_nn_89_corr = top5_corr_89.values      # matching correlation weights
Using the ratings of the 5 nearest neighbors, weighted by their correlations with user 89, to predict the rating of user 89 for every movie.
In [23]:
# Predict user 89's rating for every movie from the five nearest neighbours,
# weighting each neighbour by its correlation with user 89. Only neighbours
# that actually rated a movie contribute to that movie's prediction.
# Computed for all movies at once: this avoids the removed `.ix` indexer and
# recomputing each neighbour's mean rating once per movie.
nn_ratings_89 = mov_user_data[fiv_nn_89]                    # one column per neighbour
nn_weights_89 = pd.Series(fiv_nn_89_corr, index=fiv_nn_89)  # weight per neighbour

# Per-movie total weight of the neighbours who rated it. A zero total is
# replaced by NaN so the prediction is NaN rather than a division-by-zero
# artefact.
rated_89 = nn_ratings_89.notnull().astype(float)
weight_sum_89 = rated_89.mul(nn_weights_89, axis=1).sum(axis=1)
weight_sum_89 = weight_sum_89.where(weight_sum_89 != 0)

# Without normalization: weighted average of the raw neighbour ratings
# (missing ratings are skipped by sum()).
pred_89_no_norm = nn_ratings_89.mul(nn_weights_89, axis=1).sum(axis=1) / weight_sum_89

# With normalization: weighted average of each neighbour's deviation from
# that neighbour's own mean rating, added to user 89's mean rating.
nn_offsets_89 = nn_ratings_89 - nn_ratings_89.mean()
pred_89_wi_norm = ( mov_user_data['89'].mean() +
                    nn_offsets_89.mul(nn_weights_89, axis=1).sum(axis=1) / weight_sum_89 )
Printing the top 5 movies for user 89 based on the ratings from the 5 nearest neighbors with no normalization.
In [24]:
# Five highest non-normalized predictions for user 89.
pred_89_no_norm.sort_values(ascending=False).head(5)
Out[24]:
Printing the top 5 movies for user 89 based on the ratings from the 5 nearest neighbors with normalization.
In [25]:
# Five highest mean-normalized predictions for user 89.
pred_89_wi_norm.sort_values(ascending=False).head(5)
Out[25]:
In [ ]: