In [15]:
import numpy as np
import pandas as pd
import tables as tb
!find ./data
In [30]:
import pandas as pd
# Column names for the header-less, '::'-delimited files under data/ml-1m.
# A multi-character separator is interpreted by pandas as a regular expression,
# which only the Python parsing engine supports; requesting that engine
# explicitly avoids relying on an implicit fallback (and its ParserWarning).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None, names=unames, engine='python')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None, names=mnames, engine='python')
In [2]:
# subset version (hosted notebook)
# index_col=0 makes the first CSV column the row index of each frame.
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0)
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0)
# Single-argument print(...) prints the same text on Python 2 and is also
# valid on Python 3, where the bare `print x` statement is a SyntaxError.
print(movielens_train)
print(movielens_test)
In [3]:
# given the following ndarray, access its third element
arr = np.arange(10)
arr
Out[3]:
In [4]:
# exercise: from the 2-D ndarray below, pull out its last column
arr = np.array([
    [5, 4, 2, 5],
    [4, 5, 1, 12],
    [0, 1, 5, 4],
])
arr
Out[4]:
In [21]:
# exercise: from the ndarray below, select every element that is greater than zero
arr = np.array([
    [-0.28179535, 1.80896278, -1.08991099, -1.20264003, 0.61651465],
    [0.49983669, 0.28402664, -0.12685554, 0.81266623, 0.96586634],
])
arr
Out[21]:
In [22]:
# exercise: overwrite the last two elements of the ndarray below with 10
arr = np.array([1, 2, -10, 5, -6])
arr
Out[22]:
In [19]:
# exercise: compute the sum of the ndarray below (the integers 0 through 9)
arr = np.arange(0, 10)
arr
Out[19]:
In [20]:
# exercise: compute the mean of the ndarray below
arr = np.array([50, -79, 80, 35])
arr
Out[20]:
In [9]:
# exercise: find every element of the ndarray below that is a nan
# (ten nans are created first, then positions 2 and 3 are overwritten with 5)
arr = np.repeat(np.nan, 10)
arr[2:4] = 5
arr
Out[9]:
In [10]:
# exercise: add a new column to the single-column DataFrame below
df = pd.DataFrame([1, 2, 3, 4], columns=['col1'])
df
Out[10]:
In [11]:
# exercise: delete the row labelled 'd' from the DataFrame below
df = pd.DataFrame({'col1': [1, 2, 3, 4]}, index=list('abcd'))
df
Out[11]:
In [12]:
# exercise: build a DataFrame whose columns hold the three Series created below
# (each Series wraps six draws from np.random.randn, generated in order 1, 2, 3)
ser_1, ser_2, ser_3 = (pd.Series(np.random.randn(6)) for _ in range(3))
In [32]:
# given the DataFrame 'movielens_train' that we loaded in a previous step, try to index
# into the 'zip' column (replace the ? placeholder below)
movielens_train[?]
In [29]:
# using the same 'movielens' DataFrame, index into the row whose index is 593263
movielens_train.ix[?]
In [ ]:
# write an 'estimate' function that computes the mean rating of a particular user
def estimate(user_id, movie_id):
    """Exercise stub: estimate a rating as the mean rating given by `user_id`.

    Both arguments are accepted so the function can be called with a
    (user_id, movie_id) pair; the steps to implement are listed below.

    Raises:
        NotImplementedError: until the body has been written.
    """
    # first, index into all ratings by this user
    # second, compute the mean of those ratings
    # return
    # A function needs at least one statement to be valid Python; fail loudly
    # instead of with a syntax error while the exercise is still unsolved.
    raise NotImplementedError("estimate: fill in the body as described in the comments")
# try it out for a (user_id, movie_id) pair, once the body of 'estimate' has been written
estimate(4653, 2648)
In [ ]:
# write an 'estimate' function that computes the mean rating of a particular movie
def estimate(user_id, movie_id):
    """Exercise stub: estimate a rating as the mean rating received by `movie_id`.

    Both arguments are accepted so the function can be called with a
    (user_id, movie_id) pair; the steps to implement are listed below.

    Raises:
        NotImplementedError: until the body has been written.
    """
    # first, index into all ratings of this movie
    # second, compute the mean of those ratings
    # return
    # A function needs at least one statement to be valid Python; fail loudly
    # instead of with a syntax error while the exercise is still unsolved.
    raise NotImplementedError("estimate: fill in the body as described in the comments")
# try it out for a (user_id, movie_id) pair, once the body of 'estimate' has been written
estimate(4653, 2648)
In [13]:
def compute_rmse(y_pred, y_true):
    """Return the root mean squared error between predictions and true values.

    The element-wise difference `y_pred - y_true` is squared, averaged with
    np.mean, and the square root of that average is returned.
    """
    residuals = y_pred - y_true
    mean_squared = np.mean(np.power(residuals, 2))
    return np.sqrt(mean_squared)
In [14]:
def evaluate(estimate_f, test_set=None):
    """RMSE-based predictive performance evaluation with pandas.

    Args:
        estimate_f: callable invoked as estimate_f(user_id, movie_id) for every
            row of the test set; its return values are collected into an array.
        test_set: optional DataFrame with 'user_id', 'movie_id' and 'rating'
            columns. Defaults to the module-level `movielens_test`, which is
            what this function always used before the parameter existed.

    Returns:
        The value of compute_rmse(estimated, real) over the test set.
    """
    if test_set is None:
        test_set = movielens_test
    user_movie_pairs = zip(test_set['user_id'], test_set['movie_id'])
    estimated = np.array([estimate_f(u, i) for (u, i) in user_movie_pairs])
    real = test_set['rating'].values
    return compute_rmse(estimated, real)
In [ ]:
# write your estimate function here
def my_estimate_func(user_id, movie_id):
    """Exercise stub: return your estimated rating for (user_id, movie_id).

    Raises:
        NotImplementedError: until the body has been written.
    """
    # your code
    # A function needs at least one statement to be valid Python; fail loudly
    # instead of with a syntax error while the exercise is still unsolved.
    raise NotImplementedError("my_estimate_func: replace this line with your own estimate")
With those, you can test for performance with the following line, which assumes that your function is called my_estimate_func:
In [ ]:
# Single-argument print(...) prints the same text on Python 2 and is also
# valid on Python 3, where the bare `print x` statement is a SyntaxError.
print('RMSE for my estimate function: %s' % evaluate(my_estimate_func))
Once you are happy with your score, you can submit your RMSE by running this function (in the hosted notebook only):
In [ ]:
# NOTE(review): update_score comes from a module that is not defined in this
# notebook -- presumably supplied by the hosted environment; confirm it is
# importable before running this cell.
from update_score import update_score
# Passes the RMSE that evaluate() returns for my_estimate_func to update_score.
update_score(evaluate(my_estimate_func))
In [ ]:
# write your answer in this code block
In [ ]:
# write your answer in this code block
In [ ]:
# write your answer in this code block
In [ ]:
# write your answer in this code block