BUILDING A RECOMMENDER SYSTEM ON USER-USER COLLABORATIVE FILTERING (MOVIELENS DATASET)

We will load the data sets first.


In [17]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import math

# Column headers for the three MovieLens 100k files (names are supplied here and
# passed to read_csv via `names=`).
data_cols = ['user id', 'movie id', 'rating', 'timestamp']
item_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']

# Load the ratings, movies and users files into DataFrames.
data_df = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')
item_df = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
user_df = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')

# Drop the columns that are not used, by name rather than by position so the
# intent stays readable and does not silently change if a header list is edited.
# Ratings: the voting timestamp.
data_df = data_df.drop(['timestamp'], axis=1)
# Movies: title, video release date and IMDb URL.
item_df = item_df.drop(['movie title', 'video release date', 'IMDb URL'], axis=1)
# Users: occupation and zip code.
user_df = user_df.drop(['occupation', 'zip code'], axis=1)

In [18]:
# Preview the first five rating rows (user id, movie id, rating).
print(data_df.head(n=5))


   user id  movie id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1

In [19]:
# Preview the first five movie rows (release date is still a raw date string here).
print(item_df.head(n=5))


   movie id release date  unknown  Action  Adventure  Animation  Childrens  \
0         1  01-Jan-1995        0       0          0          1          1   
1         2  01-Jan-1995        0       1          1          0          0   
2         3  01-Jan-1995        0       0          0          0          0   
3         4  01-Jan-1995        0       1          0          0          0   
4         5  01-Jan-1995        0       0          0          0          0   

   Comedy  Crime  Documentary   ...     Fantasy  Film-Noir  Horror  Musical  \
0       1      0            0   ...           0          0       0        0   
1       0      0            0   ...           0          0       0        0   
2       0      0            0   ...           0          0       0        0   
3       1      0            0   ...           0          0       0        0   
4       0      1            0   ...           0          0       0        0   

   Mystery  Romance   Sci-Fi  Thriller  War  Western  
0        0         0       0         0    0        0  
1        0         0       0         1    0        0  
2        0         0       0         1    0        0  
3        0         0       0         0    0        0  
4        0         0       0         1    0        0  

[5 rows x 21 columns]

In [20]:
# Adjust the release date so that only the year is kept.
# errors='coerce' turns unparseable or missing dates into NaT, whose year is NaN.
item_df['release date'] = (
    item_df['release date']
    .pipe(pd.to_datetime, errors='coerce')
    .dt.year
)

In [21]:
# Preview the movie rows again: 'release date' now holds the year only.
print(item_df.head(n=5))


   movie id  release date  unknown  Action  Adventure  Animation  Childrens  \
0         1        1995.0        0       0          0          1          1   
1         2        1995.0        0       1          1          0          0   
2         3        1995.0        0       0          0          0          0   
3         4        1995.0        0       1          0          0          0   
4         5        1995.0        0       0          0          0          0   

   Comedy  Crime  Documentary   ...     Fantasy  Film-Noir  Horror  Musical  \
0       1      0            0   ...           0          0       0        0   
1       0      0            0   ...           0          0       0        0   
2       0      0            0   ...           0          0       0        0   
3       1      0            0   ...           0          0       0        0   
4       0      1            0   ...           0          0       0        0   

   Mystery  Romance   Sci-Fi  Thriller  War  Western  
0        0         0       0         0    0        0  
1        0         0       0         1    0        0  
2        0         0       0         1    0        0  
3        0         0       0         0    0        0  
4        0         0       0         1    0        0  

[5 rows x 21 columns]

In [22]:
# Preview the first five user rows (user id, age, gender).
print(user_df.head(n=5))


   user id  age gender
0        1   24      M
1        2   53      F
2        3   23      M
3        4   24      M
4        5   33      F

In [23]:
# Convert the gender column to numeric codes: F -> 1, M -> 2.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the column selection: that form mutates a
# temporary object, which warns on recent pandas and leaves user_df unchanged
# once Copy-on-Write is active.
GENDER_CODES = {'F': 1, 'M': 2}
# Values without a code (including already-converted 1/2 when the cell is
# re-run) are passed through untouched, so the cell stays idempotent.
user_df['gender'] = user_df['gender'].map(lambda g: GENDER_CODES.get(g, g))

In [24]:
# Mean release year, rounded to a whole year; used to replace the NaN years.
# Series.mean skips NaN entries (skipna=True is the default, spelled out here).
meanYear = int(round(item_df['release date'].mean(skipna=True)))
print(meanYear)


1989

In [25]:
# Fill the missing release years with the rounded mean year computed above.
item_df['release date'] = item_df['release date'].fillna(value=meanYear)

In [26]:
# Sanity check: prints False when no NaN is left in the release year column.
print(item_df['release date'].isnull().any())


False

In [27]:
# Merge everything into one frame: ratings -> movie features -> user features.
# Both joins are inner joins (the pandas default) on the shared id column.
data_item = data_df.merge(item_df, on='movie id')
data_item_user = data_item.merge(user_df, on='user id')
dataset = data_item_user

In [28]:
# Preview the first five rows of the merged ratings/movies/users frame.
print(dataset.head(n=5))


   user id  movie id  rating  release date  unknown  Action  Adventure  \
0      196       242       3        1997.0        0       0          0   
1      196       257       2        1997.0        0       1          1   
2      196       111       4        1996.0        0       0          0   
3      196        25       4        1996.0        0       0          0   
4      196       382       4        1994.0        0       0          0   

   Animation  Childrens  Comedy   ...    Horror  Musical  Mystery  Romance   \
0          0          0       1   ...         0        0        0         0   
1          0          0       1   ...         0        0        0         0   
2          0          0       1   ...         0        0        0         1   
3          0          0       1   ...         0        0        0         0   
4          0          0       1   ...         0        0        0         0   

   Sci-Fi  Thriller  War  Western  age  gender  
0       0         0    0        0   49       2  
1       1         0    0        0   49       2  
2       0         0    0        0   49       2  
3       0         0    0        0   49       2  
4       0         0    0        0   49       2  

[5 rows x 25 columns]

In [29]:
# Data distribution: summary statistics (count, mean, std, quartiles) per column.
# `display` is imported here because this cell runs before any other cell
# imports it; relying on it being predefined only works on IPython versions
# that inject it into the namespace.
from IPython.display import display

display(dataset.describe())


user id movie id rating release date unknown Action Adventure Animation Childrens Comedy ... Horror Musical Mystery Romance Sci-Fi Thriller War Western age gender
count 100000.00000 100000.000000 100000.000000 100000.000000 100000.0000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 ... 100000.000000 100000.000000 100000.000000 100000.000000 100000.00000 100000.00000 100000.000000 100000.000000 100000.000000 100000.000000
mean 462.48475 425.530130 3.529860 1987.956310 0.0001 0.255890 0.137530 0.036050 0.071820 0.298320 ... 0.053170 0.049540 0.052450 0.194610 0.12730 0.21872 0.093980 0.018540 32.969850 1.742600
std 266.61442 330.798356 1.125674 14.154889 0.0100 0.436362 0.344408 0.186416 0.258191 0.457523 ... 0.224373 0.216994 0.222934 0.395902 0.33331 0.41338 0.291802 0.134894 11.562623 0.437204
min 1.00000 1.000000 1.000000 1922.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 7.000000 1.000000
25% 254.00000 175.000000 3.000000 1986.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 24.000000 1.000000
50% 447.00000 322.000000 4.000000 1994.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 30.000000 2.000000
75% 682.00000 631.000000 4.000000 1996.000000 0.0000 1.000000 0.000000 0.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 40.000000 2.000000
max 943.00000 1682.000000 5.000000 1998.000000 1.0000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.00000 1.00000 1.000000 1.000000 73.000000 2.000000

8 rows × 25 columns


In [30]:
# Show the current structure of the merged dataset (ratings + movie features +
# user features). The whole frame is handed to display(); the rendering is
# truncated by pandas to the leading and trailing rows.
from IPython.display import display
display(dataset)


user id movie id rating release date unknown Action Adventure Animation Childrens Comedy ... Horror Musical Mystery Romance Sci-Fi Thriller War Western age gender
0 196 242 3 1997.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
1 196 257 2 1997.0 0 1 1 0 0 1 ... 0 0 0 0 1 0 0 0 49 2
2 196 111 4 1996.0 0 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 49 2
3 196 25 4 1996.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
4 196 382 4 1994.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
5 196 202 3 1993.0 0 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 49 2
6 196 153 5 1988.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
7 196 286 5 1996.0 0 0 0 0 0 0 ... 0 0 0 1 0 0 1 0 49 2
8 196 66 3 1995.0 0 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 49 2
9 196 845 4 1996.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
10 196 173 2 1987.0 0 1 1 0 0 1 ... 0 0 0 1 0 0 0 0 49 2
11 196 238 4 1987.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
12 196 94 3 1990.0 0 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 49 2
13 196 762 3 1996.0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 49 2
14 196 381 4 1994.0 0 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 49 2
15 196 306 4 1997.0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 49 2
16 196 8 5 1995.0 0 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 49 2
17 196 70 3 1994.0 0 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 49 2
18 196 655 5 1986.0 0 0 1 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
19 196 13 2 1995.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
20 196 692 5 1995.0 0 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 49 2
21 196 1022 4 1997.0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 49 2
22 196 287 3 1996.0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 49 2
23 196 269 3 1997.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
24 196 285 5 1996.0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 49 2
25 196 110 1 1995.0 0 1 1 0 0 1 ... 0 0 0 0 0 0 1 0 49 2
26 196 251 3 1997.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
27 196 393 4 1993.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
28 196 663 5 1979.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 49 2
29 196 580 2 1995.0 0 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 49 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99970 598 898 4 1997.0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 40 1
99971 598 243 2 1997.0 0 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 40 1
99972 598 308 4 1997.0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 40 1
99973 598 312 5 1997.0 0 0 0 0 0 1 ... 0 0 1 0 0 0 0 0 40 1
99974 598 313 5 1997.0 0 1 0 0 0 0 ... 0 0 0 1 0 0 0 0 40 1
99975 598 260 3 1997.0 0 1 0 0 0 0 ... 0 0 1 0 1 1 0 0 40 1
99976 598 895 2 1997.0 0 0 0 0 0 0 ... 1 0 0 0 0 1 0 0 40 1
99977 598 691 2 1998.0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 40 1
99978 598 349 4 1998.0 0 1 0 0 0 0 ... 0 0 0 0 0 1 0 0 40 1
99979 598 538 4 1997.0 0 0 0 1 1 0 ... 0 1 0 0 0 0 0 0 40 1
99980 873 294 4 1997.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 48 1
99981 873 328 4 1997.0 0 1 0 0 0 0 ... 0 0 1 1 0 1 0 0 48 1
99982 873 307 3 1997.0 0 0 0 0 0 0 ... 1 0 1 0 0 1 0 0 48 1
99983 873 750 3 1997.0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 1
99984 873 258 3 1997.0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 48 1
99985 873 339 3 1997.0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 1
99986 873 321 1 1996.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 48 1
99987 873 879 2 1997.0 0 1 0 0 0 0 ... 0 0 0 0 0 1 1 0 48 1
99988 873 286 2 1996.0 0 0 0 0 0 0 ... 0 0 0 1 0 0 1 0 48 1
99989 873 259 1 1997.0 0 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 48 1
99990 873 289 2 1996.0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 48 1
99991 873 292 5 1997.0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 1
99992 873 269 2 1997.0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 48 1
99993 873 875 1 1997.0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 48 1
99994 873 300 4 1997.0 0 1 0 0 0 0 ... 0 0 0 0 0 1 0 0 48 1
99995 873 313 5 1997.0 0 1 0 0 0 0 ... 0 0 0 1 0 0 0 0 48 1
99996 873 326 4 1997.0 0 1 0 0 0 0 ... 0 0 0 0 0 0 1 0 48 1
99997 873 348 3 1998.0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 48 1
99998 873 358 2 1997.0 0 1 1 0 0 0 ... 0 0 0 0 1 1 0 0 48 1
99999 873 342 4 1997.0 0 0 0 0 0 1 ... 0 0 1 0 0 0 0 0 48 1

100000 rows × 25 columns


In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Feature relevance: for every column of the dataset, train a decision tree that
# predicts that column from all the remaining columns and report its score
# (R^2 for a regressor) on a held-out 25% split. A low score means the column
# cannot be reconstructed from the others.
# Author's note (left unfinished in the original): User Id and Movie Id have a
# weak relation with the other features.
for name in dataset.columns:
    # Target: the current column. Predictors: every other column.
    # drop() already returns a new frame, so no defensive copy of the whole
    # dataset is needed on each iteration.
    y = dataset[name]
    X = dataset.drop(name, axis=1)
    # Split the data for model calibration; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=72)
    regressor = DecisionTreeRegressor(random_state=72)
    regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)
    # format() must be applied to the string, not to the return value of print().
    print('Genre {} score = {}'.format(name, score))

In [ ]:
# Based on the feature relevance scores:
# 1st: remove the user id (the first column) from a copy of the dataset.
smartdata = dataset.drop(dataset.columns[[0]], axis=1)
print(smartdata.head())

In [ ]:
# 2nd: translate the rating into a more discrete evaluation (like / dislike):
#   1 - 2.9 : DISLIKE
#   3 - 5   : LIKE
# TODO: the mapping above is not implemented yet; this cell only prints every
# rating, one per line (that is one line per row of smartdata).

# Iterate over the values directly: Series.iteritems() was removed in pandas 2.0
# and the index label it yielded was never used.
for values in smartdata['rating']:
    print(values)

In [32]:
# Scatter matrix over every pair of columns of the merged dataset, with a kernel
# density estimate on the diagonal. The trailing ';' suppresses the axes repr.
pd.plotting.scatter_matrix(
    dataset,
    figsize=(14, 8),
    alpha=0.3,
    diagonal='kde',
);



In [ ]:
# Feature scaling
# Scale the data with the natural logarithm. np.log1p (log(1 + x)) is used
# because the genre flags are 0/1 and a plain log(0) is -inf, which the KDE on
# the diagonal of the scatter matrix cannot handle.
log_data = np.log1p(dataset)
# No `samples` frame is defined anywhere in this notebook, so only the full
# dataset is transformed here.
# Produce a scatter matrix for each pair of newly-transformed features.
# pd.plotting.scatter_matrix is the supported location of this function; the
# top-level pd.scatter_matrix alias no longer exists in current pandas.
pd.plotting.scatter_matrix(log_data, alpha=0.3, figsize=(14, 8), diagonal='kde');

In [ ]:
# Build a lookup table with one row per movie: [movie id, mean rating, movie title].
# The titles are re-read from u.item because item_df no longer carries the
# 'movie title' column (it is dropped right after loading).
movie_titles = pd.read_csv(
    'ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1',
    usecols=['movie id', 'movie title'],
)
# Group by movie id (not by title): groupby sorts its keys, so the rows come out
# in ascending movie-id order, and two different movies that share a title are
# kept apart instead of having their ids averaged together.
item_list = (
    pd.merge(movie_titles, data_df, on='movie id')
    .groupby('movie id')
    .agg({'rating': 'mean', 'movie title': 'first'})
    .reset_index()
)
# Fix the column order explicitly, then convert to a plain 2-D array.
# NOTE(review): row k-1 corresponds to movie id k only if the ids run 1..N
# without gaps - TODO confirm for the data in use.
item_list = item_list[['movie id', 'rating', 'movie title']].values

In [ ]:
# Look up each recommended movie in item_list and show the recommendations
# ordered by mean rating, best first.
# NOTE(review): `recommendation` is not assigned in any cell of this notebook;
# from the `- 1` offset it is presumably an iterable of 1-based movie ids -
# TODO confirm. It is rebound to the result frame below, so this cell cannot be
# re-run without recomputing it.
recommendation_list = [item_list[movie_id - 1] for movie_id in recommendation]

recommendation = pd.DataFrame(
    recommendation_list,
    columns=['movie id', 'mean rating', 'movie title'],
).sort_values(by='mean rating', ascending=False)
print(recommendation[['mean rating', 'movie title']])

In [ ]: