In [1]:
"""Required imports"""
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
p_csv = pd.read_csv("predictions.csv")
In [3]:
p_csv.columns = ["Id", "plays"]
In [4]:
p_csv
Out[4]:
In [5]:
p_csv.to_csv("predictions.csv", index=False)
In [4]:
import json
import urllib
# If set to true, most functions will print information about intermediate steps.
DEBUG = False
def get_freebase_id(query):
    """
    Given a Google Search query, computes the corresponding Freebase ID
    as determined by Google Knowledge Graph.
    Args:
        query: String, the query such as 'the liars' or '50 cent'
    Returns:
        String, the freebase id in string format.
    Raises:
        KeyError, IndexError: When no FreeBase ID can be found for the query.
    """
    # NOTE(review): hardcoded API key checked into source -- should be moved to
    # an environment variable / secrets store and rotated.
    api_key = 'AIzaSyBBY9bXofiXL9vbe_V6Y49NyAHRv46As60'
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
        'query': query,
        'limit': 10,
        'indent': True,
        'key': api_key,
    }
    # Repeated `types` query parameters can't be expressed in the dict above
    # (duplicate keys), so they are joined by hand and prepended to the URL.
    type_params = ['types=Person', 'types=MusicGroup']
    url = service_url + '?' + '&'.join(type_params) + '&' + urllib.urlencode(params)
    # Python 2 urllib; no timeout is set, so a hung request blocks the cell.
    response = json.loads(urllib.urlopen(url).read())
    try:
        # Take the top-ranked entity in the response.
        freebase_id = response['itemListElement'][0]['result']['@id']
    except (KeyError, IndexError) as e:
        if DEBUG:
            print "No key for query in Google Knowledge Graph for query: %s." % response
        raise
    # Drop the first three characters (presumably the 'kg:' prefix of the
    # '@id' field) -- TODO confirm against a live API response.
    return freebase_id[3:]
In [5]:
def get_wikidata_id(mid):
    """
    Calculate the WikiData ID based on the MID (Freebase ID).
    Args:
        mid: String, the mid to be found in WikiData.
    Returns:
        String, the WikiData ID.
    Raises:
        KeyError, IndexError: When the MID cannot be found in WikiData.
    """
    service_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    params = {
        'format': 'json',
        # SPARQL: select the subject ?s whose Freebase-ID property (P646)
        # equals the given mid.
        'query':
            'PREFIX wd: <http://www.wikidata.org/entity/>\n'
            'PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n'
            'PREFIX wikibase: <http://wikiba.se/ontology#>\n'
            'SELECT ?s ?sLabel ?p ?o ?oLabel WHERE {\n'
            '?s ?p ?o .\n'
            '?s wdt:P646 "' + mid + '" .\n'
            'SERVICE wikibase:label {\n'
            'bd:serviceParam wikibase:language "en" .\n'
            '}\n'
            '}\n'
    }
    # NOTE(review): `mid` is spliced into the SPARQL string unescaped. Fine for
    # trusted '/m/...' ids but would break (or inject) on quotes in the input.
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    try:
        object_url = response['results']['bindings'][0]['s']['value']
    except (KeyError, IndexError) as e:
        if DEBUG:
            print "Unable to get wikidata id for mid: %s. We used url: %s." % (mid, url)
        raise
    # The entity URL ends in the QID, e.g. '.../entity/Q42' -> 'Q42'.
    qid = object_url.split('/')[-1]
    return qid
In [6]:
def get_genres(qid):
"""
Returns the Music Genres if found corresponding to the WikiData ID.
Args:
qid: String, the WikiData ID.
Returns:
[String], the list of genres if any.
Raises:
"""
service_url = 'https://www.wikidata.org/w/api.php'
params = {
'action': 'wbgetentities',
'ids': qid,
'format': 'json'
}
url = service_url + '?' + urllib.urlencode(params)
response = json.loads(urllib.urlopen(url).read())
try:
# Genre property is P136
genre_snaks = response['entities'][qid]['claims']['P136']
except KeyError as e:
if DEBUG:
print "Unable to extract genre property for qid: %s." % qid
raise
# Convert snacks into list of ids to query
genre_ids = []
for snak in genre_snaks:
try:
genre_id = snak['mainsnak']['datavalue']['value']['id']
genre_ids.append(genre_id)
except KeyError as e:
if DEBUG:
print "Unable to extract genre id from snak for qid: %s. Skipping." % (qid)
if len(genre_ids) == 0:
raise Exception("No genre ids for qid: %s." % qid)
# Fetch genre information
params = {
'action': 'wbgetentities',
'ids': '|'.join(genre_ids),
'format': 'json'
}
url = service_url + '?' + urllib.urlencode(params)
response = json.loads(urllib.urlopen(url).read())
# Extract the english labels for the entities.
genres = []
for genre_id in genre_ids:
try:
genre_object = response['entities'][genre_id]
except KeyError as e:
if DEBUG:
print "Unable to extract genre_id with qid: %s." % (genre_id)
continue
try:
genres.append(genre_object['labels']['en']['value'])
except KeyError as e:
if DEBUG:
print "Unable to extract english label for qid: %s with genre qid %s." % (qid, genre_id)
continue
if len(genres) == 0:
raise Exception("Could not extract english labels for genres for qid %s.", qid)
return genres
In [7]:
def get_lastfm_tags(mbid):
    """
    Returns the top three Last FM tags corresponding to the Music Brainz ID.
    Args:
        mbid: String, the Music Brainz ID to be found.
    Returns:
        [String], the top three tags for the mbid.
    Raises:
        KeyError: If the mbid cannot be found in the Last FM page.
    """
    service_url = 'http://ws.audioscrobbler.com/2.0/'
    # NOTE(review): hardcoded API key checked into source -- should live in an
    # environment variable / secrets store and be rotated.
    params = {
        'method': 'artist.gettoptags',
        'mbid': mbid,
        'api_key': 'f2fac19fc4abcebe0d1729429137037e',
        'format': 'json',
    }
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    # Extract the top 3 tags if possible (slicing tolerates fewer than 3).
    try:
        tags = response['toptags']['tag'][:3]
    except KeyError:
        if DEBUG:
            print "Failed to extract tags from last fm for mbid: %s." % mbid
        raise
    return [tag['name'] for tag in tags]
In [8]:
## Read the artists. Not necessary for our model.
#artists = pd.read_csv('artists.csv')
#artists.head(10)
In [9]:
## Read the profiles. Not necessary for our model.
#profiles = pd.read_csv('profiles.csv')
#profiles.head(10)
In [10]:
# Load the training data and attach the 'log_plays' column.
train = pd.read_csv('train.csv')
train['log_plays'] = np.log(train['plays'])
train.head(10)
Out[10]:
In [11]:
# Some sanity checks to make sure we've loaded the data correctly.
from collections import Counter
artist_count = Counter(train['artist'])
user_count = Counter(train['user'])
assert len(artist_count) == 2000
assert len(user_count) == 233286
In [ ]:
# Histogram for artist count for sanity check.
plt.hist(artist_count.values(), bins=100)
In [ ]:
# Histogram for user counts for sanity check.
plt.hist(user_count.values(), bins=100)
In [ ]:
# Histogram for log plays for sanity check.
plt.hist(train['plays'], bins=100)
In [ ]:
# Histogram for log plays for sanity check.
plt.hist(train['log_plays'], bins=100)
In [12]:
# Per-artist and per-user means of plays / log_plays.
# Bug fix: by_artist was commented out here, but later cells plot
# by_artist['plays'] and by_artist['log_plays'], which would raise NameError.
by_artist = train.groupby('artist').mean()
by_user = train.groupby('user').mean()
In [ ]:
# plot the average for plays.
plt.hist(by_user['plays'], bins=100)
In [ ]:
# Plot the average for log_plays.
plt.hist(by_user['log_plays'], bins=100)
In [ ]:
# Plot the plays by artist.
plt.hist(by_artist['plays'], bins=100)
In [ ]:
# Plot the average log plays (aka, 'rating').
plt.hist(by_artist['log_plays'], bins=100)
In [13]:
# Per user, let's take a look at median by users.
by_user_median = train.groupby('user').quantile(0.5)
In [15]:
# Merges the mean and median computations on user, since this is what we will use to determine outliers.
merged = pd.merge(by_user.reset_index(), by_user_median.reset_index(), on="user", suffixes=["_mean", "_median"])
In [16]:
# Let's calculate the gap_plays (ie, mean - median) for each user.
merged['gap_plays'] = merged['plays_mean'] - merged['plays_median']
In [ ]:
# Let's see how the data is distributed.
plt.hist(merged['gap_plays'], bins=100)
In [17]:
# Candidate outliers have a gap > 1000 or < -1000
min_gap = -1000
max_gap = 1000
candidates = merged[(merged['gap_plays'] < min_gap) | (merged['gap_plays'] > max_gap)]
In [18]:
# Merge the candidates so we can get the log_plays.
candidate_removals = pd.merge(train, candidates, on="user")
In [19]:
# We remove only if log_play > 10 (ie, extreme outliers).
candidate_removals = candidate_removals[candidate_removals['log_plays'] > 10]
In [20]:
# Remove the candidates.
cleaned_train = train[~((train['user'].isin(candidate_removals['user']))
& (train['artist'].isin(candidate_removals['artist']))
& (train['plays'].isin(candidate_removals['plays'])))]
In [21]:
# Removed exactly what we wanted. Make some basic checks.
assert len(train) - len(cleaned_train) == len(candidate_removals)
assert len(candidate_removals) == 313
print "Removed a total of %s." % len(candidate_removals)
In [22]:
# Separate into train and validation. We take a random 10 percent of the data to be validation.
# We have to make sure that the train_set has all of the users (and artists), since the model
# only has parameters for ids seen in training.
# NOTE(review): the sampling is unseeded, so this split is not reproducible across runs.
all_users_and_artists_in_both = False
expected_num_users = len(set(cleaned_train['user']))
expected_num_artist = len(set(cleaned_train['artist']))
# Rejection-sample shuffles until the 90% train slice covers every user and artist.
while not all_users_and_artists_in_both:
    shuffled_set = cleaned_train.sample(frac=1).reset_index(drop=True)
    train_length = int(0.9 * len(shuffled_set))
    train_set = shuffled_set[:train_length]
    validation_set = shuffled_set[train_length:]
    print "Attempted..."
    if (len(set(train_set['user'])) == expected_num_users
        and len(set(train_set['artist'])) == expected_num_artist):
        print "Found!"
        all_users_and_artists_in_both = True
In [23]:
expected_num_users, len(set(train_set['user'])), len(set(validation_set['user']))
Out[23]:
In [24]:
def extract_expected_and_preds(predictions, df, column='log_plays'):
    """
    Converts the output `predictions` from a Model and the `df`
    of expected values into two flat numpy arrays.
    Args:
        predictions: dict[user -> dict[artist -> float]], the predictions
            returned by the Model.
        df: pandas.DataFrame, the df containing the expected results.
        column: String, the column in df with the expected values.
    Returns:
        np.array(float), np.array(float): Two numpy arrays of
            preds and expected values where each entry in preds
            corresponds to expected.
    Raises:
        Exception: If column is not 'log_plays' or 'plays'.
        KeyError: If a (user, artist) pair in df is missing from predictions.
    """
    if column not in ['log_plays', 'plays']:
        # Bug fix: the original passed column as a second Exception argument
        # instead of %-formatting it into the message.
        raise Exception("Unsupported column name %s." % column)
    expected = np.array(df[column])
    preds = np.zeros(len(expected))
    # enumerate() replaces the original's manual counter.
    for i, (_, row) in enumerate(df.iterrows()):
        preds[i] = predictions[row['user']][row['artist']]
    return preds, expected
In [25]:
def rmse(preds, expected):
    """Root-mean-squared error between two equal-length numpy arrays."""
    squared_errors = (preds - expected) ** 2
    return np.sqrt(np.mean(squared_errors))
In [26]:
def absolute_error(preds, expected):
    """Mean absolute error between two equal-length numpy arrays."""
    return np.mean(np.abs(preds - expected))
In [27]:
class Model(object):
"""
Our custom model class which performs basic learning on the data.
See http://www.netflixprize.com/assets/GrandPrize2009_BPC_BellKor.pdf for details on the model.
As an overview, the model assumes the following:
pred = bias + bias_user[user] + bias_artist[artist] + np.dot(P[user],Q[artist])
We have a global bias, a bias for each user, and a bias for each artist. We also have an interaction
term which is calculated by taking \sum_k P[user][k] * Q[artist][k].
The interpretation is that there are K hidden factors which define the interaction between the user
the artist.
The model learning attempts to minimize the following loss functions:
loss(bias, bias_user, bias_artist, P, Q) =
\sum_{user, artist} (actual - pred)^2 + \lambda[||P||
+ ||Q|| + ||bias_user|| + ||bias_artist|| + ||bias||]
The learning takes place wit SGD.
Properties:
name: String, the name of the model, based on input parameters. Used for saving and loading files.
learn_rate: float, the learning rate for SGD (gamma).
regularization: float, the normalization factor (lambda).
K: int, the number of latent variables the model assumes.
column: string, the name of the column in data which we wish to predict. Currently
only support 'log_plays' and 'plays'.
"""
def __init__(self, data = None, learning_rate = 0.02, penalization = 0.1, hidden_factors=50, column='log_plays'):
"""
Initializes a model.
Args:
data: Opt[pandas.DataFrame], the training data for the model. This should be None if we plan
to load the parameters from a file, as the data is loaded too.
learning_rate: float, the learning rate to use for SGD.
penalization: float, the regularization factor.
hidden_factors: int, the number latent variables for the mode.
column: String, the model will attempt to predict data[column]. Only 'log_plays' and 'plays'
currently supported.
"""
if column not in ['log_plays', 'plays']:
raise Exception("Unsupported column %s." % column)
self.name = "model_learn_rate_%s_penalization_%s_k_%s_on_column_%s" % (learning_rate,
penalization,
hidden_factors,
column)
self.learn_rate = learning_rate
self.regularization = penalization
self.K = hidden_factors
self.train = data
self.column = column
# A dictionary mapping R[user][artist] to values.
self._R = None
# A dictionary mapping P[user] = np.array(K)
self._P = None
# A dictionary mapping Q[artist] = np.array(K)
self._Q = None
# A constant bias term.
self._mu = None
# A dictionary mapping _mu_user[user] to a contants bias term.
self._mu_user = None
# A dictionary mapping _mu_artist[artist] to a contant bias term.
self._mu_artist = None
# cache results.
self._predictions = None
self._expected = None
def initialize_params(self):
"""
Initializes the parameters for the model, P,Q, mu, mu_user, and mu_artist to the defaults.
"""
from collections import defaultdict
bias = self.train[self.column].mean()
users = self.train.groupby('user').mean().reset_index()
artists = self.train.groupby('artist').mean().reset_index()
# Default user and artist bias.
user_bias = {user: val - bias for (user, val) in zip(users['user'], users[self.column])}
artist_bias = {artist: val - bias for (artist, val) in zip(artists['artist'], artists[self.column])}
# The hard part of initializing the P and Q sparse matrices (aka, dictionaries ^_^)
P, Q, R = {}, {}, defaultdict(dict)
i = 0
for _, row in self.train.iterrows():
i += 1
user = row['user']
artist = row['artist']
rating = row[self.column]
P[user] = np.random.normal(size=self.K)
Q[artist] = np.random.normal(size=self.K)
R[user][artist] = rating
if i % (len(self.train) / 10) == 0:
print "Done with %.2f percent." % (100 * float(i) / len(self.train))
self._R, self._P, self._Q = dict(R), P, Q
self._mu, self._mu_user, self._mu_artist = bias, user_bias, artist_bias
def load_params(self, filename=None):
"""
Loads the parameters into the model. This is useful to load a previously trained model.
Args:
filename: Opt[String], the filename from which to load the parameters.
If None, assumes the parameters are contained in the file named self.name.
"""
import pickle
if not filename:
filename = "%s_params" % self.name
with open("%s.pk" % filename) as handle:
(self.train,
self._R, self._P, self._Q,
self._mu, self._mu_user, self._mu_artist) = pickle.load(handle)
# sanity checks to verify data loaded is for the correct model.
assert len(self._P.itervalues().next()) == self.K
assert len(self._Q.itervalues().next()) == self.K
def save_params(self, filename=None):
"""
Saves the parameters of the model.
Args:
filename: Opt[String], the filename into which to save the parameters.
If None, assumes the parameters should be saved in the file named self.name.
"""
import pickle
if not filename:
filename = "%s_params" % self.name
obj = (self.train, self._R, self._P, self._Q, self._mu, self._mu_user, self._mu_artist)
with open("%s.pk" % filename, 'w') as handle:
pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
def _getParamGradients(self, user, artist, pred):
self._loss = self._R[user][artist] - pred
return (-self._loss * self._Q[artist] + self.regularization * self._P[user],
-self._loss * self._P[user] + self.regularization * self._Q[artist],
-self._loss,
-self._loss + self.regularization * self._mu_user[user],
-self._loss + self.regularization * self._mu_artist[artist])
def learn(self, passes = 1, test=None):
"""
Train the model using the training data.
Args:
passes: Int, the number of full passes to do over the input data.
test: df.DataFrame, a validation set. If given, calculates the ABSERROR
after each half pass.
"""
for j in xrange(passes):
i = 0
for _, row in self.train.iterrows():
i += 1
user = row['user']
artist = row['artist']
self._user = user
self._artist = artist
pred = self._predict_single(user, artist)
# Update in the parameters based on data points
Pgrad, Qgrad, mugrad, mu_user_grad, mu_artist_grad = self._getParamGradients(user, artist, pred)
self._P[user] -= self.learn_rate * Pgrad
self._Q[artist] -= self.learn_rate * Qgrad
self._mu -= self.learn_rate * mugrad
self._mu_user[user] -= self.learn_rate * mu_user_grad
self._mu_artist[artist] -= self.learn_rate * mu_artist_grad
if i % (len(self.train) / 10) == 0:
print "Loss for current item %s." % str(self._loss)
print "Done with %.2f percent." % (100 * float(i) / len(self.train))
print("Finised pass %s." % str(j + 1))
# Calcualte the loss after this pass.
if validation_set is not None:
p, e = self.calculate_abs_error(validation_set)
print("Absolute mean error %s.", absolute_error(p, e))
def _predict_single(self, user, artist):
# interesting...optinimzations could be done here.
return (np.dot(self._P[user], self._Q[artist])
+ self._mu_user[user]
+ self._mu_artist[artist]
+ self._mu)
def calculate_abs_error(self, test):
"""
Note that this caches results, so it will ignore test in subsequent calls.
"""
expected = np.array(test[self.column])
preds = np.zeros(len(expected))
# Reuse predictions.
predictions = self.predict(test)
i = 0
for _, row in test.iterrows():
artist =row['artist']
user = row['user']
preds[i] = predictions[user][artist]
i += 1
return preds, expected
def predict(self, test):
"""
Predicts for the values in the test DataFrame.
Args:
test: pandas.DataFrame, the dataframe containing user/artist rows to predict.
Returns:
dict[String -> dict[String -> Float]]: A dictionary mapping
return_val[user][artist] to the predicted values.
"""
from collections import defaultdict
predictions = defaultdict(dict)
i = 0
for _, row in test.iterrows():
i += 1
user = row['user']
artist = row['artist']
predictions[user][artist] = self._predict_single(user, artist)
if i % (len(test) / 10) == 0:
print "Done with %.2f percent." % (100 * float(i) / len(test))
self._predictions = dict(predictions)
return self._predictions
def save(self, filename = None):
"""
Save the model to disk.
"""
if not filename:
filename = self.name
import pickle
with open("%s.pk" % filename, 'w') as handle:
pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
@staticmethod
def load(filename):
"""
Load a model from disk.
"""
import pickle
with open("%s.pk" % filename) as handle:
return pickle.load(handle)
In [28]:
##See the below for loading a model. We suggest you load the parameters only, and not the entire model.
## Note that column specified the `column` in data which the model is attempting to predict.
# model = Model(data = None, learning_rate = 0.01, penalization = 0.001, hidden_factors=100, column='log_plays')
## Load the parameters for the model. These parameters should match hidden_factors and column to be relevant.
# model.load_params(filename)
## Alternatively, you can initialize the parameters randomly if data is NOT None.
# model.initialize_params()
## If you wish, you can run the model over the data again, to improve the predictions. passes determines how many
## iterations over the data to perform.
# model.learn(passes=3)
## At some point, we recommend you save your current set of model parameters so you can reuse later without retraining.
# model.save_params(filename)
## You can also save the entire model (which includes saving the training data), but this can be a bit finicky.
# model.save(filename)
## If you do save the entire model, you can load it again with the following.
# model = Model.load(filename)
## Once you're ready to make predictions, simply do where df is the dataframe of user/artist ids.
## The model returns a python dictionary such that preds[user][artist] gives the predicted value.
# preds = model.predict(df)
## To make things fast, you can calculate RMSE against a validation dataframe (ie, user/artist/column) dataframe.
## First, extract everything into numpy. Column should be one of 'plays' or 'log_plays'
# p, e = extract_expected_and_preds(preds, expected_df, column)
## Then calculate the error.
# rm = rmse(p,e)
# ae = absolute_error(p,e)
## At the end, we recommend you save the entire model...
In [29]:
model_last_logs = Model()
#model_last_logs.initialize_params()
In [73]:
model_last_logs.load_params("final_params_model_trained_115_passes")
In [31]:
train_set = model_last_logs.train
in_train_set = {(user, artist, plays, log_plays): True for (user, artist, plays, log_plays) in zip(train_set['user'],
train_set['artist'],
train_set['plays'],
train_set['log_plays'])}
In [32]:
bool_index = cleaned_train.apply(lambda row: (row['user'],
row['artist'],
row['plays'],
row['log_plays']) in in_train_set, axis=1)
In [33]:
validation_set = cleaned_train[~bool_index]
In [34]:
len(validation_set) + len(train_set) == len(cleaned_train)
Out[34]:
In [ ]:
# Learned first 100.
model_last_logs.learn(passes=100, test=validation_set)
In [60]:
# Continue training with smaller penalization.
model_last_logs.learn_rate = 0.02
model_last_logs.regularization = 0.1
model_last_logs.learn(passes=100, test=validation_set)
In [74]:
# Just optimize parameters that work.
model_last_logs.regularization = 0.09
model_last_logs.learn(passes=100, test=validation_set)
In [ ]:
# Let's calculate our regularization factor compared to our prediction...?
In [71]:
model_last_logs.save_params("final_params_model_trained_115_passes")
In [38]:
p_dict_l = model_last_logs.predict(validation_set)
In [39]:
p_l, e_l = extract_expected_and_preds(p_dict_l, validation_set)
In [35]:
# Compare to the user median train set.
user_median = train_set.groupby('user').quantile(.5)
artist_median = train_set.groupby('artist').quantile(.5)
user_mean = train_set.groupby('user').mean()
artist_mean = train_set.groupby('artist').mean()
In [36]:
user_median_bias = user_median['log_plays'] - train_set['log_plays'].quantile(.5)
artist_median_bias = artist_median['log_plays'] - train_set['log_plays'].quantile(.5)
In [40]:
# Make predictions for the validation set based on median, mean, geometric mean, and log media.
from collections import defaultdict
median_preds = defaultdict(dict)
mean_preds = defaultdict(dict)
geometric_median_preds = defaultdict(dict)
geometric_mean_preds = defaultdict(dict)
for _, row in validation_set.iterrows():
user = row['user']
artist = row['artist']
median_preds[user][artist] = user_median['plays'][user]
mean_preds[user][artist] = user_mean['plays'][user]
geometric_median_preds[user][artist] = np.exp(user_median['log_plays'][user])
geometric_mean_preds[user][artist] = np.exp(user_mean['log_plays'][user])
median_preds, _ = extract_expected_and_preds(median_preds, validation_set, column='plays')
mean_preds, _ = extract_expected_and_preds(mean_preds, validation_set, column='plays')
geometric_median_preds, _ = extract_expected_and_preds(geometric_median_preds, validation_set, column='plays')
geometric_mean_preds, _ = extract_expected_and_preds(geometric_mean_preds, validation_set, column='plays')
In [42]:
# Calculate ABS for all of the above.
print "ABS Median User: %s." % absolute_error(median_preds, np.array(validation_set['plays']))
print "ABS Mean User: %s." % absolute_error(mean_preds, np.array(validation_set['plays']))
print "ABS Log Median User: %s." % absolute_error(geometric_median_preds, e_l)
print "ABS Geometric Mean User: %s." % absolute_error(geometric_mean_preds, e_l)
# Our model loss.
print "Log Model ABS: %s." % absolute_error(p_l, e_l)
print "Normal Model ABS: %s." % absolute_error(np.exp(p_l), validation_set['plays'])
In [43]:
plt.hist(model_last_logs._mu_artist.values(),bins = 100, alpha=0.5, label="artist_bias")
plt.hist(artist_median_bias, bins=100, alpha=0.5, label="start_bias")
plt.legend(loc='upper right')
plt.show()
In [44]:
plt.hist(model_last_logs._mu_user.values(),bins = 100, alpha=0.5, label="user_bias")
plt.hist(user_median_bias, bins=100, alpha=0.5, label="start_bias")
plt.legend(loc='upper right')
plt.show()
In [45]:
# Calculate interaction terms.
interaction_term = train_set.apply(lambda row: np.dot(
model_last_logs._P[row['user']], model_last_logs._Q[row['artist']]), axis=1)
In [48]:
plt.hist(interaction_term,bins = 100, alpha=0.5, label="interaction", range=(-.5,.5))
plt.legend(loc='upper right')
plt.show()
In [69]:
# This is the error.
np.sum((p_l - e_l)**2)
Out[69]:
In [68]:
# This is the regularization term we're using.
np.sum(np.array(interaction_term) ** 2) + np.sum(np.array(model_last_logs._mu_user.values()) ** 2) + np.sum(np.array(model_last_logs._mu_artist.values()) ** 2) + np.sum(np.array(model_last_logs._mu) ** 2)
Out[68]:
In [49]:
plt.hist(p_l, 100, alpha=0.5, label='predicted')
plt.hist(e_l, 100, alpha=0.5, label='expected')
plt.legend(loc='upper right')
plt.show()
In [50]:
p_f = np.exp(p_l)
_, e_f = extract_expected_and_preds(p_dict_l, validation_set, column='plays')
In [51]:
plt.hist(p_f, bins=100, alpha=0.5, label='predicted', range=(0,1000))
plt.hist(e_f, bins=100, alpha=0.5, label='expected', range=(0,1000))
plt.legend(loc='upper right')
plt.show()
In [ ]:
print "Model ABS ERR: %s" % absolute_error(p_f, e_f)
In [ ]:
print "Model RMSE: %s" % rmse(p_f, e_f)
In [ ]:
print "Median ABS ERR: %s" % absolute_error(median_preds, e_f)
In [59]:
plt.hist(median_preds, bins=100, alpha=0.5, label='median', range=(0,1000))
#plt.hist(mean_preds, bins=100, alpha=0.5, label='mean', range=(0,1000))
#plt.hist(geometric_median_preds, bins=100, alpha=0.5, label='geo median', range=(0,1000))
#plt.hist(geometric_mean_preds, bins=100, alpha=0.5, label='geo mean', range=(0,1000))
plt.hist(e_f, bins=100, alpha=0.5, label='expected', range=(0,1000))
plt.legend(loc='upper right')
plt.show()
In [ ]:
model_last = Model(train_set, column='plays')
In [ ]:
model_last.initialize_params()
In [ ]:
model_last.regularization = 0.0
In [ ]:
model_last.learn(passes=100,test=validation_set)
In [ ]:
# Train directly on play count.
model_plays = Model(train_set, column='plays')
model_plays.initialize_params()
In [ ]:
## Save params for initialization.
#model_plays.save_params()
In [ ]:
## Load already saved params.
model_plays = Model(column='plays')
model_plays.load_params('model_learn_rate_0.0001_penalization_0.001_k_100_on_column_plays_params_passes_2')
In [ ]:
np.seterr(over='raise')
In [ ]:
model_plays.learn_rate = 0.00001 # slow learn rate
model_plays.regularization = 0.01 # avoid overfitting.
In [ ]:
# Total passes = 2.
model_plays.learn(passes=2)
In [ ]:
model_plays.save_params('model_learn_rate_0.0001_penalization_0.001_k_100_on_column_plays_params_passes_2')
In [ ]:
# Make the predictions
preds = model_plays.predict(validation_set)
In [ ]:
# Extract the predictions.
predictions, expected = extract_expected_and_preds(preds, validation_set, column='plays')
In [ ]:
# We set all negative values to 1.
predictions[predictions < 1] = 1
In [ ]:
# Compare to the user median model.
user_median = train_set.groupby('user').quantile(.5)
# Make predictions for the validation set.
from collections import defaultdict
median_preds = defaultdict(dict)
for _, row in validation_set.iterrows():
user = row['user']
artist = row['artist']
median_preds[user][artist] = user_median['plays'][user]
median_preds, _ = extract_expected_and_preds(median_preds, validation_set, column='plays')
In [ ]:
absolute_error(median_preds, expected), absolute_error(predictions, expected)
In [ ]:
absolute_error(predictions, expected)
In [ ]:
# Do another pass.
model_plays.learn()
In [ ]:
# Predict again
preds2 = model_plays.predict(validation_set)
predictions2, expected = extract_expected_and_preds(preds2, validation_set, column='plays')
predictions2[predictions2 < 1] = 1
In [ ]:
absolute_error(median_preds, expected), absolute_error(predictions2, expected)
In [ ]:
# We'll try our new model now.
new_model = Model(train_set)
In [ ]:
new_model.initialize_params()
In [ ]:
new_model.save_params('test')
In [ ]:
new_model = Model()
new_model.load_params('test')
In [ ]:
# Test the predictions.
ps = new_model.predict(validation_set)
pp, ee = extract_expected_and_preds(ps, validation_set)
absolute_error(pp, ee)
In [ ]:
rmse(pp, ee)
In [ ]:
np.dot(new_model._P[new_model._user],new_model._Q[new_model._artist])
In [ ]:
new_model.learn(passes=100)
In [ ]:
# New model for plays.
model_plays_test = Model(train_set, column='plays')
# model_plays_test.initialize_params()
In [ ]:
model_plays_test.load_params('test_plays')
In [ ]:
model_plays_test.save_params('test_plays')
In [ ]:
user = model_plays_test._user
artist = model_plays_test._artist
In [ ]:
plt.hist([sum(value) for value in model_plays_test._P.values()], bins=100)
In [ ]:
plt.hist([sum(value) for value in model_plays_test._Q.values()], bins=100)
In [ ]:
model_plays_test.loss
In [ ]:
model_plays_test.learn(passes=100)
In [ ]:
# Calculate RMSE and ABSE
np.mean(predictions)
In [ ]:
# Load the trained model on the entire data set with log plays.
full_model_log_plays = Model(train)
In [ ]:
full_model_log_plays.load_params('params_k_100_learn_log')
In [ ]:
full_model_log_plays._mu_user
In [ ]:
train['log_plays'].mean()
In [ ]:
# Train the model on full.
model = Model(train_set)
In [ ]:
model.load_params('params_k_100_learn_log')
In [ ]:
In [ ]:
model.initialize_params()
In [ ]:
model.save_params("params_k_100_learn")
In [ ]:
model.save("model_k_100_learn_log")
In [ ]:
# Run the learning algorithm with one pass.
model.learn(passes=1)
In [ ]:
# Test how bad default predictions might be.
predictions = model.predict(validation_set)
In [ ]:
def rmse2(predictions, validation_set, log_plays=True):
    """
    Convenience wrapper: RMSE of `predictions` against `validation_set`.
    Args:
        predictions: dict[user][artist] -> predicted value, as returned by Model.predict.
        validation_set: pandas.DataFrame with the expected values.
        log_plays: Bool, if True score against 'log_plays', otherwise 'plays'.
            Bug fix: the original accepted this flag but ignored it; the default
            preserves the old behavior.
    Returns:
        float, the RMSE.
    """
    column = 'log_plays' if log_plays else 'plays'
    return rmse(*extract_expected_and_preds(predictions, validation_set, column=column))
In [ ]:
rmse2(predictions, validation_set)
In [ ]:
preds_log, expected_log = extract_expected_and_preds(predictions, validation_set)
In [ ]:
preds, expected = extract_expected_and_preds(predictions, validation_set, column='plays')
In [ ]:
rmse(preds, expected)
In [ ]:
rmse(preds_log, expected_log)
In [ ]:
absolute_error(preds, expected)
In [ ]:
absolute_error(preds_log, expected_log)
In [ ]:
model_validation_log
In [ ]:
# Now we implement gradient descent on our customized algorithm. Iterate over the data and
# use a single sample to estimate the gradient and update.
def train_algorithm(passes, R, P, Q, bias, user_bias, artist_bias):
    """
    One-sample-at-a-time SGD sketch over the module-level `cleaned_train` DataFrame.
    Bug fixes vs. the original: removed the duplicate `passes` parameter (which was
    a SyntaxError) and fixed the print precedence ('%i' % i + 1 tried to add 1 to
    a string instead of incrementing i).
    NOTE(review): this computes `pred` but never updates any parameters -- it looks
    like an abandoned draft superseded by Model.learn.
    """
    for i in xrange(passes):
        for _, row in cleaned_train.iterrows():
            user = row['user']
            artist = row['artist']
            pred = np.dot(P[user], Q[artist]) + user_bias[user] + artist_bias[artist] + bias
        print("Finished pass %i." % (i + 1))
In [ ]:
def calculate_rmse()
In [ ]:
np.dot(P[user], Q[artist])
In [ ]:
user
In [ ]:
test = pd.read_csv('test.csv')
test.head(10)
In [ ]:
artist_genre = {}
artist_mid = {}
artist_qid = {}
for (_, row) in artists[:10].iterrows():
mid = None
qid = None
name = row['name']
mid = row['artist']
try:
mid = get_freebase_id(name)
qid = get_wikidata_id(mid)
artist_genre[name] = get_genres(qid)
except:
print "Fallback to lastfm genre for name: %s, mid: %s, qid: %s." % (name, mid, qid)
artist_genre[name] = get_lastfm_tags(row.artist)
artist_mid[name] = mid
artist_qid[name] = qid
In [ ]:
test.head(10)
In [ ]:
# Make the predictions.
final_log_predictions = model_last_logs.predict(test)
In [ ]:
# Flatten into a numpy array.
# Bug fix: the original's comprehension clauses were reversed
# ([val for val in v.values() for v in ...]), which raises a NameError because
# `v` is used before its for-clause binds it; clauses must read outer-to-inner.
preds_final_log = np.array([val for v in final_log_predictions.values() for val in v.values()])
In [ ]:
test['log_predictions'] = test.apply(lambda row: final_log_predictions[row['user']][row['artist']], axis=1)
In [ ]:
test['prediction'] = np.exp(test['log_predictions'])
In [ ]:
max(test['prediction'])
In [ ]:
vals = final_log_predictions.values()
vals = [v.values() for v in vals]
In [ ]:
flat = [y for x in vals for y in x]
In [ ]:
flat = np.array(flat)
In [ ]:
len(flat[flat < 3])
In [ ]:
plt.hist(test['log_predictions'], bins=100)
In [ ]:
plt.hist(cleaned_train['log_plays'], bins=100)
In [ ]:
output = test[['Id','prediction']]
In [ ]:
len(test)
In [ ]:
output.to_csv("predictions.csv", index=False)
In [ ]:
output.to_csv("predictions_no_header.csv", index=False, header=False)
In [ ]: