BUILDING A RECOMMENDER SYSTEM ON USER-USER COLLABORATIVE FILTERING (MOVIELENS DATASET)

We will load the data sets firsts.



In [2]:

    
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import math

#column headers for the dataset
data_cols = ['user id','movie id','rating','timestamp']
item_cols = ['movie id','movie title','release date','video release date','IMDb URL','unknown','Action',
'Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror',
'Musical','Mystery','Romance ','Sci-Fi','Thriller','War' ,'Western']
user_cols = ['user id','age','gender','occupation','zip code']

#importing the data files onto dataframes
data_df = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')
item_df = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
user_df = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')

#dropping unecessary columns
#Voting Timestamp - Removed
data_df.drop(data_df.columns[[3]], axis = 1, inplace = True)
#Movie Title, Video Release Date and IMDB URL - Removed
item_df.drop(item_df.columns[[1,3,4]], axis = 1, inplace = True)
#Occupation and Zip Code - Removed
user_df.drop(user_df.columns[[3,4]], axis = 1, inplace = True)



In [3]:

    
print(data_df.head())









    



   user id  movie id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1



In [4]:

    
print(item_df.head())









    



   movie id release date  unknown  Action  Adventure  Animation  Childrens  \
0         1  01-Jan-1995        0       0          0          1          1   
1         2  01-Jan-1995        0       1          1          0          0   
2         3  01-Jan-1995        0       0          0          0          0   
3         4  01-Jan-1995        0       1          0          0          0   
4         5  01-Jan-1995        0       0          0          0          0   

   Comedy  Crime  Documentary   ...     Fantasy  Film-Noir  Horror  Musical  \
0       1      0            0   ...           0          0       0        0   
1       0      0            0   ...           0          0       0        0   
2       0      0            0   ...           0          0       0        0   
3       1      0            0   ...           0          0       0        0   
4       0      1            0   ...           0          0       0        0   

   Mystery  Romance   Sci-Fi  Thriller  War  Western  
0        0         0       0         0    0        0  
1        0         0       0         1    0        0  
2        0         0       0         1    0        0  
3        0         0       0         0    0        0  
4        0         0       0         1    0        0  

[5 rows x 21 columns]



In [5]:

    
#Ajust release date to get only the year
item_df['release date'] = pd.to_datetime(item_df['release date'], errors='coerce').dt.year



In [6]:

    
print(item_df.head())









    



   movie id  release date  unknown  Action  Adventure  Animation  Childrens  \
0         1        1995.0        0       0          0          1          1   
1         2        1995.0        0       1          1          0          0   
2         3        1995.0        0       0          0          0          0   
3         4        1995.0        0       1          0          0          0   
4         5        1995.0        0       0          0          0          0   

   Comedy  Crime  Documentary   ...     Fantasy  Film-Noir  Horror  Musical  \
0       1      0            0   ...           0          0       0        0   
1       0      0            0   ...           0          0       0        0   
2       0      0            0   ...           0          0       0        0   
3       1      0            0   ...           0          0       0        0   
4       0      1            0   ...           0          0       0        0   

   Mystery  Romance   Sci-Fi  Thriller  War  Western  
0        0         0       0         0    0        0  
1        0         0       0         1    0        0  
2        0         0       0         1    0        0  
3        0         0       0         0    0        0  
4        0         0       0         1    0        0  

[5 rows x 21 columns]



In [7]:

    
print(user_df.head())









    



   user id  age gender
0        1   24      M
1        2   53      F
2        3   23      M
3        4   24      M
4        5   33      F



In [8]:

    
#Convert Gender column to numeric
user_df['gender'].replace('F', 1,inplace=True)
user_df['gender'].replace('M', 2,inplace=True)



In [9]:

    
#Adjust columns replacing NaN with the mean
meanYear = int(round(item_df['release date'].mean()))
print(meanYear)



In [10]:

    
item_df['release date'] = item_df['release date'].fillna(meanYear)



In [11]:

    
print(item_df['release date'].hasnans)









    



False



In [12]:

    
#merge it all
data_item = pd.merge(data_df, item_df, left_on = "movie id", right_on = "movie id")
data_item_user = pd.merge(data_item, user_df, left_on = "user id", right_on = "user id")
dataset = data_item_user



In [13]:

    
print(dataset.head())









    



   user id  movie id  rating  release date  unknown  Action  Adventure  \
0      196       242       3        1997.0        0       0          0   
1      196       257       2        1997.0        0       1          1   
2      196       111       4        1996.0        0       0          0   
3      196        25       4        1996.0        0       0          0   
4      196       382       4        1994.0        0       0          0   

   Animation  Childrens  Comedy   ...    Horror  Musical  Mystery  Romance   \
0          0          0       1   ...         0        0        0         0   
1          0          0       1   ...         0        0        0         0   
2          0          0       1   ...         0        0        0         1   
3          0          0       1   ...         0        0        0         0   
4          0          0       1   ...         0        0        0         0   

   Sci-Fi  Thriller  War  Western  age  gender  
0       0         0    0        0   49       2  
1       1         0    0        0   49       2  
2       0         0    0        0   49       2  
3       0         0    0        0   49       2  
4       0         0    0        0   49       2  

[5 rows x 25 columns]



In [14]:

    
# Data distribution
display(dataset.describe())









    







  
    
      
      user id
      movie id
      rating
      release date
      unknown
      Action
      Adventure
      Animation
      Childrens
      Comedy
      ...
      Horror
      Musical
      Mystery
      Romance
      Sci-Fi
      Thriller
      War
      Western
      age
      gender
    
  
  
    
      count
      100000.00000
      100000.000000
      100000.000000
      100000.000000
      100000.0000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      ...
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.00000
      100000.00000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
    
    
      mean
      462.48475
      425.530130
      3.529860
      1987.956310
      0.0001
      0.255890
      0.137530
      0.036050
      0.071820
      0.298320
      ...
      0.053170
      0.049540
      0.052450
      0.194610
      0.12730
      0.21872
      0.093980
      0.018540
      32.969850
      1.742600
    
    
      std
      266.61442
      330.798356
      1.125674
      14.154889
      0.0100
      0.436362
      0.344408
      0.186416
      0.258191
      0.457523
      ...
      0.224373
      0.216994
      0.222934
      0.395902
      0.33331
      0.41338
      0.291802
      0.134894
      11.562623
      0.437204
    
    
      min
      1.00000
      1.000000
      1.000000
      1922.000000
      0.0000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.00000
      0.00000
      0.000000
      0.000000
      7.000000
      1.000000
    
    
      25%
      254.00000
      175.000000
      3.000000
      1986.000000
      0.0000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.00000
      0.00000
      0.000000
      0.000000
      24.000000
      1.000000
    
    
      50%
      447.00000
      322.000000
      4.000000
      1994.000000
      0.0000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.00000
      0.00000
      0.000000
      0.000000
      30.000000
      2.000000
    
    
      75%
      682.00000
      631.000000
      4.000000
      1996.000000
      0.0000
      1.000000
      0.000000
      0.000000
      0.000000
      1.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.00000
      0.00000
      0.000000
      0.000000
      40.000000
      2.000000
    
    
      max
      943.00000
      1682.000000
      5.000000
      1998.000000
      1.0000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      ...
      1.000000
      1.000000
      1.000000
      1.000000
      1.00000
      1.00000
      1.000000
      1.000000
      73.000000
      2.000000
    
  

8 rows × 25 columns



In [15]:

    
data_matrix = dataset.pivot(index = 'user id').fillna(0)
data_matrix.head()









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-15-92126870e8cf> in <module>()
----> 1 data_matrix = dataset.pivot(index = 'user id').fillna(0)
      2 data_matrix.head()

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in pivot(self, index, columns, values)
   3851         """
   3852         from pandas.core.reshape.reshape import pivot
-> 3853         return pivot(self, index=index, columns=columns, values=values)
   3854 
   3855     def stack(self, level=-1, dropna=True):

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/reshape/reshape.pyc in pivot(self, index, columns, values)
    367         cols = [columns] if index is None else [index, columns]
    368         append = index is None
--> 369         indexed = self.set_index(cols, append=append)
    370         return indexed.unstack(columns)
    371     else:

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in set_index(self, keys, drop, append, inplace, verify_integrity)
   2828                 names.append(None)
   2829             else:
-> 2830                 level = frame[col]._values
   2831                 names.append(col)
   2832                 if drop:

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1962             return self._getitem_multilevel(key)
   1963         else:
-> 1964             return self._getitem_column(key)
   1965 
   1966     def _getitem_column(self, key):

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1969         # get column
   1970         if self.columns.is_unique:
-> 1971             return self._get_item_cache(key)
   1972 
   1973         # duplicate columns & possible reduce dimensionality

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1643         res = cache.get(item)
   1644         if res is None:
-> 1645             values = self._data.get(item)
   1646             res = self._box_item_values(item, values)
   1647             cache[item] = res

/Users/robertodias/Anaconda/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3597                         loc = indexer.item()
   3598                     else:
-> 3599                         raise ValueError("cannot label index with a null key")
   3600 
   3601             return self.iget(loc, fastpath=fastpath)

ValueError: cannot label index with a null key



In [ ]:

	user id	movie id	rating	release date	unknown	Action	Adventure	Animation	Childrens	Comedy	...	Horror	Musical	Mystery	Romance	Sci-Fi	Thriller	War	Western	age	gender
count	100000.00000	100000.000000	100000.000000	100000.000000	100000.0000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	...	100000.000000	100000.000000	100000.000000	100000.000000	100000.00000	100000.00000	100000.000000	100000.000000	100000.000000	100000.000000
mean	462.48475	425.530130	3.529860	1987.956310	0.0001	0.255890	0.137530	0.036050	0.071820	0.298320	...	0.053170	0.049540	0.052450	0.194610	0.12730	0.21872	0.093980	0.018540	32.969850	1.742600
std	266.61442	330.798356	1.125674	14.154889	0.0100	0.436362	0.344408	0.186416	0.258191	0.457523	...	0.224373	0.216994	0.222934	0.395902	0.33331	0.41338	0.291802	0.134894	11.562623	0.437204
min	1.00000	1.000000	1.000000	1922.000000	0.0000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.00000	0.000000	0.000000	7.000000	1.000000
25%	254.00000	175.000000	3.000000	1986.000000	0.0000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.00000	0.000000	0.000000	24.000000	1.000000
50%	447.00000	322.000000	4.000000	1994.000000	0.0000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.00000	0.000000	0.000000	30.000000	2.000000
75%	682.00000	631.000000	4.000000	1996.000000	0.0000	1.000000	0.000000	0.000000	0.000000	1.000000	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.00000	0.000000	0.000000	40.000000	2.000000
max	943.00000	1682.000000	5.000000	1998.000000	1.0000	1.000000	1.000000	1.000000	1.000000	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.00000	1.00000	1.000000	1.000000	73.000000	2.000000