In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from ntflib import betantf

%matplotlib inline
sns.set(style="white")

Defining functions for mapping and error


In [2]:
def mapper(array):
    array = np.sort(array)
    int_map = np.arange(len(np.unique(array))).astype(int)
    dict_map = dict(zip(np.sort(np.unique(array)), int_map))
    tmp = pd.Series(array)
    res = tmp.map(lambda x: dict_map[x])
    
    inv_dict_map = {v: k for k, v in dict_map.items()}
    return res.values, inv_dict_map

def rmse(x, y):
    return np.sqrt((x - y)**2.0).sum()

Grabbing Movie Lens data


In [2]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip


--2015-06-25 10:25:08--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org... 128.101.34.146
Connecting to files.grouplens.org|128.101.34.146|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917392 (5.6M) [application/zip]
Saving to: 'ml-1m.zip'

ml-1m.zip           100%[=====================>]   5.64M   668KB/s   in 13s    

2015-06-25 10:25:22 (437 KB/s) - 'ml-1m.zip' saved [5917392/5917392]

Parsing data and cleaning it up for NTFLib


In [3]:
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
ratings.Timestamp = ratings.Timestamp.map(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m'))
# movies = pd.read_table('ml-1m/movies.dat', sep='::', names=['MovieID', 'Title', 'Genres'])
# users = pd.read_table('ml-1m/users.dat', sep='::', names=['UserID' ,'Gender', 'Age', 'Occupation::Zip-code'])


/Users/eli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:648: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'.
  ParserWarning)

In [4]:
# Converting dates to integers
ratings['UserID'], inv_uid_dict = mapper(ratings['UserID'])
ratings['MovieID'], inv_mid_dict = mapper(ratings['MovieID'])
ratings['Timestamp'], inv_ts_dict = mapper(ratings['Timestamp'])

In [5]:
x_indices = ratings[['UserID', 'MovieID', 'Timestamp']].copy()
x_indices['UserID'] = x_indices['UserID'] - x_indices['UserID'].min() 
x_indices['MovieID'] = x_indices['MovieID'] - x_indices['MovieID'].min()
x_indices['Timestamp'] = x_indices['Timestamp'] - x_indices['Timestamp'].min() 

print x_indices.min()
x_indices = x_indices.values
x_vals = ratings['Rating'].values


UserID       0
MovieID      0
Timestamp    0
dtype: int64

In [6]:
print 'Number of unique movie IDs: {0}'.format(len(ratings['MovieID'].unique()))
print 'Max movie ID: {0}'.format(ratings['MovieID'].max())


Number of unique movie IDs: 3706
Max movie ID: 3705

In [7]:
indices_train, indices_test, val_train, val_test = train_test_split(
    x_indices, x_vals, test_size=0.40, random_state=42)

shape_uid = len(np.unique(x_indices[:,0]))
shape_mid = len(np.unique(x_indices[:,1]))
shape_ts = len(np.unique(x_indices[:,2]))
shape = [shape_uid, shape_mid, shape_ts]
shape


Out[7]:
[6040, 3706, 35]

In [8]:
indices_train


Out[8]:
array([[6022, 3683,   33],
       [5529, 3261,    9],
       [2908, 1544,    6],
       ..., 
       [ 853,  467,    2],
       [4032, 2268,    7],
       [ 785,  440,    2]])

In [9]:
# shape = [len(np.unique(ratings[x])) for x in ['UserID', 'MovieID', 'Timestamp']]
bnf = betantf.BetaNTF(shape, n_components=5, n_iters=10)
before = bnf.score(indices_train, val_train)
initial = bnf.impute(x_indices)
reconstructed = bnf.fit(indices_train, val_train)
after = bnf.score()
assert(after < before)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-9-64ff1eab406a> in <module>()
      3 before = bnf.score(indices_train, val_train)
      4 initial = bnf.impute(x_indices)
----> 5 reconstructed = bnf.fit(indices_train, val_train)
      6 after = bnf.score()
      7 assert(after < before)

/Users/eli/github/NTFLib/ntflib/betantf.pyc in fit(self, x_indices, x_vals)
     91         self.x_indices = x_indices
     92         self.x_vals = x_vals
---> 93         self._check_input(x_indices, x_vals)
     94         for it in range(self.n_iters):
     95             # Update each factor individually

/Users/eli/github/NTFLib/ntflib/betantf.pyc in _check_input(self, x_indices, x_vals)
     50         for col in range(x_indices.shape[1]):
     51             rank = x_indices[:, col]
---> 52             msg = msg % col
     53             if rank.max() + 1 != np.unique(rank).shape[0]:
     54                 warnings.warn(msg)

TypeError: not all arguments converted during string formatting

In [ ]:
debug


> /Users/eli/github/NTFLib/ntflib/betantf.py(52)_check_input()
     51             rank = x_indices[:, col]
---> 52             msg = msg % col
     53             if rank.max() + 1 != np.unique(rank).shape[0]:

ipdb> p col
1
ipdb> p msg
'Rank did not match shape; is column 0 starting with zero and strictly contiguous integers?'
ipdb> l
     47         we cannot tolerate a whole dimension with no data."""
     48         msg = "Rank did not match shape; is column %i "
     49         msg += "starting with zero and strictly contiguous integers?"
     50         for col in range(x_indices.shape[1]):
     51             rank = x_indices[:, col]
---> 52             msg = msg % col
     53             if rank.max() + 1 != np.unique(rank).shape[0]:
     54                 warnings.warn(msg)
     55         assert len(x_vals) == len(x_indices)
     56         assert np.all(np.isfinite(x_vals))
     57         assert np.all(x_vals >= 0)


In [83]:
prediction = bnf.impute(indices_test)

In [85]:
rmse(prediction, val_test) / float(prediction.shape[0])


Out[85]:
0.82303123616696161

In [11]:
!cat ml-1m/README


SUMMARY
================================================================================

These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
made by 6,040 MovieLens users who joined MovieLens in 2000.

USAGE LICENSE
================================================================================

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set.  The data set may be used for any research
purposes under the following conditions:

     * The user may not state or imply any endorsement from the
       University of Minnesota or the GroupLens Research Group.

     * The user must acknowledge the use of the data set in
       publications resulting from the use of the data set, and must
       send us an electronic or paper copy of those publications.

     * The user may not redistribute the data without separate
       permission.

     * The user may not use this information for any commercial or
       revenue-bearing purposes without first obtaining permission
       from a faculty member of the GroupLens Research Project at the
       University of Minnesota.

If you have any further questions or comments, please contact GroupLens
<grouplens-info@cs.umn.edu>. 

ACKNOWLEDGEMENTS
================================================================================

Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
set.

FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
================================================================================

The GroupLens Research Project is a research group in the Department of 
Computer Science and Engineering at the University of Minnesota. Members of 
the GroupLens Research Project are involved in many research projects related 
to the fields of information filtering, collaborative filtering, and 
recommender systems. The project is lead by professors John Riedl and Joseph 
Konstan. The project began to explore automated collaborative filtering in 
1992, but is most well known for its world wide trial of an automated 
collaborative filtering system for Usenet news in 1996. Since then the project 
has expanded its scope to research overall information filtering solutions, 
integrating in content-based methods as well as improving current collaborative 
filtering technology.

Further information on the GroupLens Research project, including research 
publications, can be found at the following web site:
        
        http://www.grouplens.org/

GroupLens Research currently operates a movie recommender based on 
collaborative filtering:

        http://www.movielens.org/

RATINGS FILE DESCRIPTION
================================================================================

All ratings are contained in the file "ratings.dat" and are in the
following format:

UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

USERS FILE DESCRIPTION
================================================================================

User information is in the file "users.dat" and is in the following
format:

UserID::Gender::Age::Occupation::Zip-code

All demographic information is provided voluntarily by the users and is
not checked for accuracy.  Only users who have provided some demographic
information are included in this data set.

- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

- Occupation is chosen from the following choices:

	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

MOVIES FILE DESCRIPTION
================================================================================

Movie information is in the file "movies.dat" and is in the following
format:

MovieID::Title::Genres

- Titles are identical to titles provided by the IMDB (including
year of release)
- Genres are pipe-separated and are selected from the following genres:

	* Action
	* Adventure
	* Animation
	* Children's
	* Comedy
	* Crime
	* Documentary
	* Drama
	* Fantasy
	* Film-Noir
	* Horror
	* Musical
	* Mystery
	* Romance
	* Sci-Fi
	* Thriller
	* War
	* Western

- Some MovieIDs do not correspond to a movie due to accidental duplicate
entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist

In [ ]:


In [ ]: