BUILDING A RECOMMENDER SYSTEM ON USER-USER COLLABORATIVE FILTERING (MOVIELENS DATASET)

We will load the data sets firsts.


In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import math

#column headers for the dataset
data_cols = ['user id','movie id','rating','timestamp']
item_cols = ['movie id','movie title','release date','video release date','IMDb URL','unknown','Action',
'Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror',
'Musical','Mystery','Romance ','Sci-Fi','Thriller','War' ,'Western']
user_cols = ['user id','age','gender','occupation','zip code']

#importing the data files onto dataframes
data_df = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')
item_df = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
user_df = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')

#dropping unecessary columns
#Voting Timestamp - Removed
data_df.drop(data_df.columns[[3]], axis = 1, inplace = True)
#Movie Title, Video Release Date and IMDB URL - Removed
item_df.drop(item_df.columns[[1,3,4]], axis = 1, inplace = True)
#Occupation and Zip Code - Removed
user_df.drop(user_df.columns[[3,4]], axis = 1, inplace = True)

In [3]:
print(data_df.head())


   user id  movie id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1

In [4]:
print(item_df.head())


   movie id release date  unknown  Action  Adventure  Animation  Childrens  \
0         1  01-Jan-1995        0       0          0          1          1   
1         2  01-Jan-1995        0       1          1          0          0   
2         3  01-Jan-1995        0       0          0          0          0   
3         4  01-Jan-1995        0       1          0          0          0   
4         5  01-Jan-1995        0       0          0          0          0   

   Comedy  Crime  Documentary   ...     Fantasy  Film-Noir  Horror  Musical  \
0       1      0            0   ...           0          0       0        0   
1       0      0            0   ...           0          0       0        0   
2       0      0            0   ...           0          0       0        0   
3       1      0            0   ...           0          0       0        0   
4       0      1            0   ...           0          0       0        0   

   Mystery  Romance   Sci-Fi  Thriller  War  Western  
0        0         0       0         0    0        0  
1        0         0       0         1    0        0  
2        0         0       0         1    0        0  
3        0         0       0         0    0        0  
4        0         0       0         1    0        0  

[5 rows x 21 columns]

In [5]:
#Ajust release date to get only the year
item_df['release date'] = pd.to_datetime(item_df['release date'], errors='coerce').dt.year

In [6]:
print(item_df.head())


   movie id  release date  unknown  Action  Adventure  Animation  Childrens  \
0         1        1995.0        0       0          0          1          1   
1         2        1995.0        0       1          1          0          0   
2         3        1995.0        0       0          0          0          0   
3         4        1995.0        0       1          0          0          0   
4         5        1995.0        0       0          0          0          0   

   Comedy  Crime  Documentary   ...     Fantasy  Film-Noir  Horror  Musical  \
0       1      0            0   ...           0          0       0        0   
1       0      0            0   ...           0          0       0        0   
2       0      0            0   ...           0          0       0        0   
3       1      0            0   ...           0          0       0        0   
4       0      1            0   ...           0          0       0        0   

   Mystery  Romance   Sci-Fi  Thriller  War  Western  
0        0         0       0         0    0        0  
1        0         0       0         1    0        0  
2        0         0       0         1    0        0  
3        0         0       0         0    0        0  
4        0         0       0         1    0        0  

[5 rows x 21 columns]

In [7]:
print(user_df.head())


   user id  age gender
0        1   24      M
1        2   53      F
2        3   23      M
3        4   24      M
4        5   33      F

In [8]:
#Convert Gender column to numeric
user_df['gender'].replace('F', 1,inplace=True)
user_df['gender'].replace('M', 2,inplace=True)

In [9]:
#Adjust columns replacing NaN with the mean
meanYear = int(round(item_df['release date'].mean()))
print(meanYear)


1989

In [10]:
item_df['release date'] = item_df['release date'].fillna(meanYear)

In [11]:
print(item_df['release date'].hasnans)


False

In [12]:
#merge it all
data_item = pd.merge(data_df, item_df, left_on = "movie id", right_on = "movie id")
data_item_user = pd.merge(data_item, user_df, left_on = "user id", right_on = "user id")
dataset = data_item_user

In [13]:
print(dataset.head())


   user id  movie id  rating  release date  unknown  Action  Adventure  \
0      196       242       3        1997.0        0       0          0   
1      196       257       2        1997.0        0       1          1   
2      196       111       4        1996.0        0       0          0   
3      196        25       4        1996.0        0       0          0   
4      196       382       4        1994.0        0       0          0   

   Animation  Childrens  Comedy   ...    Horror  Musical  Mystery  Romance   \
0          0          0       1   ...         0        0        0         0   
1          0          0       1   ...         0        0        0         0   
2          0          0       1   ...         0        0        0         1   
3          0          0       1   ...         0        0        0         0   
4          0          0       1   ...         0        0        0         0   

   Sci-Fi  Thriller  War  Western  age  gender  
0       0         0    0        0   49       2  
1       1         0    0        0   49       2  
2       0         0    0        0   49       2  
3       0         0    0        0   49       2  
4       0         0    0        0   49       2  

[5 rows x 25 columns]

In [14]:
# Data distribution
display(dataset.describe())


user id movie id rating release date unknown Action Adventure Animation Childrens Comedy ... Horror Musical Mystery Romance Sci-Fi Thriller War Western age gender
count 100000.00000 100000.000000 100000.000000 100000.000000 100000.0000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 ... 100000.000000 100000.000000 100000.000000 100000.000000 100000.00000 100000.00000 100000.000000 100000.000000 100000.000000 100000.000000
mean 462.48475 425.530130 3.529860 1987.956310 0.0001 0.255890 0.137530 0.036050 0.071820 0.298320 ... 0.053170 0.049540 0.052450 0.194610 0.12730 0.21872 0.093980 0.018540 32.969850 1.742600
std 266.61442 330.798356 1.125674 14.154889 0.0100 0.436362 0.344408 0.186416 0.258191 0.457523 ... 0.224373 0.216994 0.222934 0.395902 0.33331 0.41338 0.291802 0.134894 11.562623 0.437204
min 1.00000 1.000000 1.000000 1922.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 7.000000 1.000000
25% 254.00000 175.000000 3.000000 1986.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 24.000000 1.000000
50% 447.00000 322.000000 4.000000 1994.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 30.000000 2.000000
75% 682.00000 631.000000 4.000000 1996.000000 0.0000 1.000000 0.000000 0.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 40.000000 2.000000
max 943.00000 1682.000000 5.000000 1998.000000 1.0000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.00000 1.00000 1.000000 1.000000 73.000000 2.000000

8 rows × 25 columns


In [15]:
data_matrix = dataset.pivot(index = 'user id').fillna(0)
data_matrix.head()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-15-92126870e8cf> in <module>()
----> 1 data_matrix = dataset.pivot(index = 'user id').fillna(0)
      2 data_matrix.head()

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in pivot(self, index, columns, values)
   3851         """
   3852         from pandas.core.reshape.reshape import pivot
-> 3853         return pivot(self, index=index, columns=columns, values=values)
   3854 
   3855     def stack(self, level=-1, dropna=True):

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/reshape/reshape.pyc in pivot(self, index, columns, values)
    367         cols = [columns] if index is None else [index, columns]
    368         append = index is None
--> 369         indexed = self.set_index(cols, append=append)
    370         return indexed.unstack(columns)
    371     else:

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in set_index(self, keys, drop, append, inplace, verify_integrity)
   2828                 names.append(None)
   2829             else:
-> 2830                 level = frame[col]._values
   2831                 names.append(col)
   2832                 if drop:

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1962             return self._getitem_multilevel(key)
   1963         else:
-> 1964             return self._getitem_column(key)
   1965 
   1966     def _getitem_column(self, key):

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1969         # get column
   1970         if self.columns.is_unique:
-> 1971             return self._get_item_cache(key)
   1972 
   1973         # duplicate columns & possible reduce dimensionality

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1643         res = cache.get(item)
   1644         if res is None:
-> 1645             values = self._data.get(item)
   1646             res = self._box_item_values(item, values)
   1647             cache[item] = res

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3597                         loc = indexer.item()
   3598                     else:
-> 3599                         raise ValueError("cannot label index with a null key")
   3600 
   3601             return self.iget(loc, fastpath=fastpath)

ValueError: cannot label index with a null key

In [ ]: