In [1]:
"""Required imports"""
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
p_csv = pd.read_csv("predictions.csv")

In [3]:
p_csv.columns = ["Id", "plays"]

In [4]:
p_csv


Out[4]:
Id plays
0 1 93.565102
1 2 154.017971
2 3 216.456968
3 4 359.470934
4 5 118.769531
5 6 131.482198
6 7 231.391997
7 8 118.004769
8 9 468.329748
9 10 106.941526
10 11 11.058392
11 12 213.800186
12 13 13.045515
13 14 120.691451
14 15 237.291399
15 16 178.692406
16 17 108.174588
17 18 545.453791
18 19 165.248310
19 20 148.012265
20 21 50.673083
21 22 115.050378
22 23 75.189822
23 24 883.220945
24 25 171.198869
25 26 215.505742
26 27 157.525291
27 28 18.319610
28 29 65.512346
29 30 233.665189
... ... ...
4154774 4154775 678.062091
4154775 4154776 113.110890
4154776 4154777 32.564302
4154777 4154778 73.435753
4154778 4154779 459.799568
4154779 4154780 41.853396
4154780 4154781 418.129722
4154781 4154782 27.435197
4154782 4154783 66.071727
4154783 4154784 257.462091
4154784 4154785 56.247032
4154785 4154786 17.359583
4154786 4154787 67.364016
4154787 4154788 45.144167
4154788 4154789 16.148844
4154789 4154790 129.515620
4154790 4154791 234.349911
4154791 4154792 47.734475
4154792 4154793 648.616435
4154793 4154794 29.131306
4154794 4154795 144.731214
4154795 4154796 45.230527
4154796 4154797 20.230446
4154797 4154798 62.153981
4154798 4154799 216.481956
4154799 4154800 29.844646
4154800 4154801 333.076001
4154801 4154802 125.266609
4154802 4154803 263.941871
4154803 4154804 93.929988

4154804 rows × 2 columns


In [5]:
p_csv.to_csv("predictions.csv", index=False)

In [4]:
import json
import urllib

# If set to true, most functions will print information about intermediate steps.
DEBUG = False

def get_freebase_id(query):
    """
    Given a Google Search query, computes the corresponding Freebase ID
    as determined by Google Knowledge Graph.
    
    Args:
        query: String, the query such as 'the liars' or '50 cent'
    Returns:
        String, the freebase id in string format.
    Raises:
        KeyError, IndexError: When no Freebase ID can be found for the query.
    """
    api_key = 'AIzaSyBBY9bXofiXL9vbe_V6Y49NyAHRv46As60'
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
        'query': query,
        'limit': 10,
        'indent': True,
        'key': api_key,
    }
    type_params = ['types=Person', 'types=MusicGroup']
    url = service_url + '?' + '&'.join(type_params) + '&' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    try: 
        freebase_id = response['itemListElement'][0]['result']['@id']
    except (KeyError, IndexError) as e:
        if DEBUG:
            print "No key for query in Google Knowledge Graph for query: %s." % response
        raise
        
    return freebase_id[3:]

In [5]:
def get_wikidata_id(mid):
    """
    Calculate the WikiData ID based on the MID (Freebase ID).
    
    Args:
        mid: String, the mid to be found in WikiData.
    Returns:
        String, the WikiData ID.
    Raises:
        KeyError, IndexError: When the MID cannot be found in WikiData.
    """
    service_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    params = {
        'format': 'json',
        'query': 
            'PREFIX wd: <http://www.wikidata.org/entity/>\n'
            'PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n'
            'PREFIX wikibase: <http://wikiba.se/ontology#>\n'
                'SELECT  ?s ?sLabel ?p ?o ?oLabel WHERE {\n'
                    '?s ?p ?o .\n'
                    '?s wdt:P646 "' + mid + '" .\n'
                    'SERVICE wikibase:label {\n'
                        'bd:serviceParam wikibase:language "en" .\n'
                    '}\n'
                '}\n'
    }
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    try:
        object_url = response['results']['bindings'][0]['s']['value']
    except (KeyError, IndexError) as e:
        if DEBUG:
            print "Unable to get wikidata id for mid: %s. We used url: %s." % (mid, url)
        raise
    
    qid = object_url.split('/')[-1]
    return qid

In [6]:
def get_genres(qid):
    """
    Returns the music genres, if any, corresponding to the WikiData ID.
    
    Args:
        qid: String, the WikiData ID.
        
    Returns:
        [String], the list of genres if any.
    Raises:
        KeyError: When the genre property (P136) cannot be found for the qid.
        Exception: When no genre ids or English labels can be extracted.
    """
    service_url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbgetentities',
        'ids': qid,
        'format': 'json'
    }
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    
    try:
        # Genre property is P136
        genre_snaks = response['entities'][qid]['claims']['P136']
    except KeyError as e:
        if DEBUG:
            print "Unable to extract genre property for qid: %s." % qid
        raise
        
    # Convert snaks into a list of genre ids to query.
    genre_ids = []
    for snak in genre_snaks:
        try:
            genre_id = snak['mainsnak']['datavalue']['value']['id']
            genre_ids.append(genre_id)
        except KeyError as e:
            if DEBUG:
                print "Unable to extract genre id from snak for qid: %s. Skipping." % (qid)
            
    if len(genre_ids) == 0:
        raise Exception("No genre ids for qid: %s." % qid)
        
    # Fetch genre information
    params = {
        'action': 'wbgetentities',
        'ids': '|'.join(genre_ids),
        'format': 'json'
    }
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    
    # Extract the english labels for the entities.
    genres = []
    for genre_id in genre_ids:
        try:
            genre_object = response['entities'][genre_id]
        except KeyError as e:
            if DEBUG:
                print "Unable to extract genre_id with qid: %s." % (genre_id)
            continue
        try:
            genres.append(genre_object['labels']['en']['value'])
        except KeyError as e:
            if DEBUG:
                print "Unable to extract english label for qid: %s with genre qid %s." % (qid, genre_id)
            continue
    if len(genres) == 0:
        raise Exception("Could not extract english labels for genres for qid %s.", qid)
            
    return genres

In [7]:
def get_lastfm_tags(mbid):
    """
    Returns the top three Last.fm tags corresponding to the MusicBrainz ID.
    
    Args:
        mbid: String, the MusicBrainz ID to look up.
    Returns:
        [String], the top three tags for the mbid.
    Raises:
        KeyError: If no tags can be found for the mbid in the Last.fm response.
    """
    service_url = 'http://ws.audioscrobbler.com/2.0/'
    params = {
        'method': 'artist.gettoptags',
        'mbid': mbid,
        'api_key': 'f2fac19fc4abcebe0d1729429137037e',
        'format': 'json',
    }
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    
    # Extract top 3 tags if possible
    try:
        tags = response['toptags']['tag'][:3]
    except KeyError:
        if DEBUG:
            print "Failed to extract tags from last fm for mbid: %s." % mbid
        raise
    return [tag['name'] for tag in tags]
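
The three lookups above are meant to be chained: query -> Freebase MID -> WikiData QID -> genres, with the Last.fm tags as a fallback (the actual loop over the artists appears near the end of this notebook). Below is a minimal usage sketch of that chain for a single query; the query string is illustrative only and any of the external calls may fail.

In [ ]:
# Hedged usage sketch of the lookup chain defined above. The query string is
# just an example; the external services may return nothing for it.
example_query = 'the beatles'
try:
    example_mid = get_freebase_id(example_query)
    example_qid = get_wikidata_id(example_mid)
    print "Genres for '%s': %s" % (example_query, get_genres(example_qid))
except Exception:
    # In the real pipeline we fall back to get_lastfm_tags(mbid) here, which
    # needs the MusicBrainz id from artists.csv rather than the search query.
    print "Knowledge Graph / WikiData lookup failed for '%s'." % example_query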

In [8]:
## Read the artists. Not necessary for our model.
#artists = pd.read_csv('artists.csv')
#artists.head(10)

In [9]:
## Read the profiles. Not necessary for our model.
#profiles = pd.read_csv('profiles.csv')
#profiles.head(10)

In [10]:
# Load the training data and attach the 'log_plays' column.
train = pd.read_csv('train.csv')
train['log_plays'] = np.log(train['plays'])
train.head(10)


Out[10]:
user artist plays log_plays
0 eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03 5a8e07d5-d932-4484-a7f7-e700793a9c94 554 6.317165
1 44ce793a6cd9d20f13f4a576a818ef983314bb5d a3a92047-be1c-4f3e-8960-c4f8570984df 81 4.394449
2 da9cf3f557161d54b76f24db64be9cc76db008e3 eeb1195b-f213-4ce1-b28c-8565211f8e43 708 6.562444
3 8fa49ab25d425edcf05d44bfc1d5aea895287d81 a1419808-65d3-4d40-998c-1a0bac65eabc 265 5.579730
4 b85fcaef67d2669cd99b334b5e8c8705263db2cf a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432 220 5.393628
5 feed7a0dc74c5251283a1505adf453a2061d08f7 1cc5adcd-1422-4b5c-a3cd-3ecd4f43f506 2113 7.655864
6 cbb86d88a8d2d0bab8956807c6c45cd0c752324b 9c9f1380-2516-4fc9-a3e6-f9f61941d090 127 4.844187
7 5641e1e6f04868a61dc29f7227e34f4640163e9b 832a43c7-aa7d-439b-a6b4-4f1afa671c24 305 5.720312
8 9f748976d303db79f61bf570d9549d6335b11b0e 2fddb92d-24b2-46a5-bf28-3aed46f4684c 705 6.558198
9 056d5d2467dc63c4520963323e2ebf9576b58229 847e8284-8582-4b0e-9c26-b042a4f49e57 7 1.945910
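
One thing to keep in mind with the `np.log` transform above: a play count of 0 would become `-inf`. A quick hedged check that the transform is safe on this data:

In [ ]:
# Sanity check for the log transform: np.log(0) is -inf and np.log of a
# negative count is nan, so verify neither occurs in 'log_plays'.
assert (train['plays'] > 0).all()
assert np.isfinite(train['log_plays']).all()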

In [11]:
# Some sanity checks to make sure we've loaded the data correctly.
from collections import Counter
artist_count = Counter(train['artist'])
user_count = Counter(train['user'])
assert len(artist_count) == 2000
assert len(user_count) == 233286

In [ ]:
# Histogram for artist count for sanity check.
plt.hist(artist_count.values(), bins=100)

In [ ]:
# Histogram for user counts for sanity check.
plt.hist(user_count.values(), bins=100)

In [ ]:
# Histogram for plays for sanity check.
plt.hist(train['plays'], bins=100)

In [ ]:
# Histogram for log plays for sanity check.
plt.hist(train['log_plays'], bins=100)

In [12]:
# Let's take a look at the distributions averaged by artist and by user.
by_artist = train.groupby('artist').mean()
by_user = train.groupby('user').mean()

In [ ]:
# Plot the average for plays.
plt.hist(by_user['plays'], bins=100)

In [ ]:
# Plot the average for log_plays.
plt.hist(by_user['log_plays'], bins=100)

In [ ]:
# Plot the plays by artist.
plt.hist(by_artist['plays'], bins=100)

In [ ]:
# Plot the average log plays (aka, 'rating').
plt.hist(by_artist['log_plays'], bins=100)

In [13]:
# Per user, let's take a look at the median plays.
by_user_median = train.groupby('user').quantile(0.5)

In [15]:
# Merges the mean and median computations on user, since this is what we will use to determine outliers.
merged = pd.merge(by_user.reset_index(), by_user_median.reset_index(), on="user", suffixes=["_mean", "_median"])

In [16]:
# Let's calculate the gap_plays (ie, mean - median) for each user.
merged['gap_plays'] = merged['plays_mean'] - merged['plays_median']

In [ ]:
# Let's see how the data is distributed.
plt.hist(merged['gap_plays'], bins=100)

In [17]:
# Candidate outliers have a gap > 1000 or < -1000
min_gap = -1000
max_gap = 1000
candidates = merged[(merged['gap_plays'] < min_gap) | (merged['gap_plays'] > max_gap)]

In [18]:
# Merge the candidates so we can get the log_plays.
candidate_removals = pd.merge(train, candidates, on="user")

In [19]:
# We remove only if log_plays > 10 (ie, extreme outliers).
candidate_removals = candidate_removals[candidate_removals['log_plays'] > 10]

In [20]:
# Remove the candidates.
cleaned_train = train[~((train['user'].isin(candidate_removals['user']))
                      & (train['artist'].isin(candidate_removals['artist']))
                      & (train['plays'].isin(candidate_removals['plays'])))]

In [21]:
# Removed exactly what we wanted. Make some basic checks.
assert len(train) - len(cleaned_train) ==  len(candidate_removals)
assert len(candidate_removals) == 313
print "Removed a total of %s." % len(candidate_removals)


Removed a total of 313.
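
The removal above filters with three independent `isin` checks on user, artist, and plays; the asserts confirm it dropped exactly the 313 candidate rows here. As a point of comparison, here is a sketch of the same removal expressed as a merge "anti-join" on the full (user, artist, plays) key; the row counts printed at the end are only a cross-check, not a guarantee that the two approaches always coincide.

In [ ]:
# Alternative removal via a merge anti-join on the identifying columns.
# This matches on the full (user, artist, plays) triple instead of three
# independent isin() filters.
key_cols = ['user', 'artist', 'plays']
marked = train.merge(candidate_removals[key_cols].drop_duplicates(),
                     on=key_cols, how='left', indicator=True)
cleaned_train_alt = train[(marked['_merge'] == 'left_only').values]
print "isin-based: %d rows, anti-join: %d rows." % (len(cleaned_train), len(cleaned_train_alt))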

In [22]:
# Separate into train and validation. We take a random 10 percent of the data to be validation.
# We have to make sure that the train_set has all of the users.
all_users_and_artists_in_both = False
expected_num_users = len(set(cleaned_train['user']))
expected_num_artist = len(set(cleaned_train['artist']))
while not all_users_and_artists_in_both: 
    shuffled_set = cleaned_train.sample(frac=1).reset_index(drop=True)
    train_length = int(0.9 * len(shuffled_set))
    train_set = shuffled_set[:train_length]
    validation_set = shuffled_set[train_length:]

    print "Attempted..."
    if (len(set(train_set['user'])) == expected_num_users
        and len(set(train_set['artist'])) == expected_num_artist):
        print "Found!"
        all_users_and_artists_in_both = True


Attempted...
Found!
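
The retry loop above samples splits until every user and artist happens to land in the training part. As a sketch of an alternative that guarantees coverage in a single shot (at the cost of a slightly approximate 90/10 ratio), one can seed the training split with one row per user and per artist and then sample the remainder:

In [ ]:
# Coverage-guaranteed split sketch: seed the train split with one row per
# user and one row per artist, then fill up to roughly 90% with a random
# sample of the remaining rows.
seed = pd.concat([cleaned_train.groupby('user', group_keys=False).head(1),
                  cleaned_train.groupby('artist', group_keys=False).head(1)])
seed = seed[~seed.index.duplicated()]
rest = cleaned_train.drop(seed.index)
frac = max(0.0, (0.9 * len(cleaned_train) - len(seed)) / float(len(rest)))
train_set_alt = pd.concat([seed, rest.sample(frac=frac)])
validation_set_alt = cleaned_train.drop(train_set_alt.index)
assert len(set(train_set_alt['user'])) == expected_num_users
assert len(set(train_set_alt['artist'])) == expected_num_artist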

In [23]:
expected_num_users, len(set(train_set['user'])), len(set(validation_set['user']))


Out[23]:
(233286, 233286, 193677)

In [24]:
def extract_expected_and_preds(predictions, df, column='log_plays'):
    """
    Converts the output `predictions` from a Model and the `df` of
    expected values into two flat numpy arrays.
    
    Args:
        predictions: The predictions returned by the Model.
        df: pandas.DataFrame, the df containing the expected results
        column: String, the column in df with the expected values.
        
    Returns:
        np.array(float), np.array(float): Two numpy arrays of
            preds and expected values where each entry in preds
            corresponds to expected.
    """
    if column not in ['log_plays', 'plays']:
        raise Exception("Unsupported column name %s.", column)
        
    expected = np.array(df[column])
    preds = np.zeros(len(expected))
    i = 0
    for _, row in df.iterrows():
        artist = row['artist']
        user = row['user']
        preds[i] = predictions[user][artist]
        i += 1
    return preds, expected
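
Since `iterrows` is slow on several hundred thousand validation rows, here is a hedged sketch of an equivalent, faster extraction that zips the key columns directly; it assumes, as above, that `predictions[user][artist]` exists for every row of `df`.

In [ ]:
# Faster equivalent of extract_expected_and_preds: build the preds array by
# zipping the user/artist columns instead of calling iterrows().
def extract_expected_and_preds_fast(predictions, df, column='log_plays'):
    if column not in ['log_plays', 'plays']:
        raise Exception("Unsupported column name %s." % column)
    expected = np.array(df[column])
    preds = np.array([predictions[user][artist]
                      for user, artist in zip(df['user'], df['artist'])])
    return preds, expected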

In [25]:
def rmse(preds, expected):
    return (np.sum((preds - expected)**2 / float(len(expected)))) ** 0.5

In [26]:
def absolute_error(preds, expected):
    return np.sum(np.abs(preds - expected) / float(len(expected)))
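
A tiny sanity check of the two metrics on toy arrays; the expected values follow directly from the formulas above.

In [ ]:
# Toy check of the metric helpers: errors of (+1, -1) give RMSE 1.0 and
# mean absolute error 1.0.
toy_preds = np.array([2.0, 0.0])
toy_expected = np.array([1.0, 1.0])
print rmse(toy_preds, toy_expected)            # 1.0
print absolute_error(toy_preds, toy_expected)  # 1.0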

In [27]:
class Model(object):
    """
    Our custom model class which performs basic learning on the data.
    
    See http://www.netflixprize.com/assets/GrandPrize2009_BPC_BellKor.pdf for details on the model.
    
    As an overview, the model assumes the following:
        pred = bias + bias_user[user] + bias_artist[artist] + np.dot(P[user],Q[artist])
        
    We have a global bias, a bias for each user, and a bias for each artist. We also have an interaction
    term which is calculated by taking \sum_k P[user][k] * Q[artist][k]. 
    
    The interpretation is that there are K hidden factors which define the interaction between the user
    and the artist.
    
    The model learning attempts to minimize the following loss function:
    
        loss(bias, bias_user, bias_artist, P, Q) =
            \sum_{user, artist} (actual - pred)^2 + \lambda[||P|| 
                + ||Q|| + ||bias_user|| + ||bias_artist|| + ||bias||]
        
    The learning takes place with SGD.
    
    Properties:
        name: String, the name of the model, based on input parameters. Used for saving and loading files.
        learn_rate: float, the learning rate for SGD (gamma).
        regularization: float, the normalization factor (lambda).
        K: int, the number of latent variables the model assumes.
        column: string, the name of the column in data which we wish to predict. Currently
            only 'log_plays' and 'plays' are supported.
        
    """
    def __init__(self, data=None, learning_rate=0.02, penalization=0.1, hidden_factors=50, column='log_plays'):
        """
        Initializes a model.
        
        Args:
            data: Opt[pandas.DataFrame], the training data for the model. This should be None if we plan
                to load the parameters from a file, as the data is loaded too.
            learning_rate: float, the learning rate to use for SGD.
            penalization: float, the regularization factor.
            hidden_factors: int, the number of latent variables for the model.
            column: String, the model will attempt to predict data[column]. Only 'log_plays' and 'plays'
                currently supported.
        """
        if column not in ['log_plays', 'plays']:
            raise Exception("Unsupported column %s." % column)
        self.name = "model_learn_rate_%s_penalization_%s_k_%s_on_column_%s" % (learning_rate,
                                                                               penalization,
                                                                               hidden_factors,
                                                                               column)
        self.learn_rate = learning_rate
        self.regularization = penalization
        self.K = hidden_factors
        self.train = data
        self.column = column
        
        # A dictionary mapping R[user][artist] to values.
        self._R = None
        # A dictionary mapping P[user] = np.array(K)
        self._P = None
        # A dictionary mapping Q[artist] = np.array(K)
        self._Q = None
        # A constant bias term.
        self._mu = None
        # A dictionary mapping _mu_user[user] to a constant bias term.
        self._mu_user = None
        # A dictionary mapping _mu_artist[artist] to a constant bias term.
        self._mu_artist = None
        
        # cache results.
        self._predictions = None
        self._expected = None
        
    def initialize_params(self):
        """
        Initializes the parameters for the model, P,Q, mu, mu_user, and mu_artist to the defaults.
        """
        from collections import defaultdict
        bias = self.train[self.column].mean()
        users = self.train.groupby('user').mean().reset_index()
        artists = self.train.groupby('artist').mean().reset_index()
        
        # Default user and artist bias.
        user_bias = {user: val - bias for (user, val) in zip(users['user'], users[self.column])}
        artist_bias = {artist: val - bias for (artist, val) in zip(artists['artist'], artists[self.column])}
        # The hard part of initializing the P and Q sparse matrices (aka, dictionaries ^_^)
        P, Q, R = {}, {}, defaultdict(dict)
        i = 0
        for _, row in self.train.iterrows():
            i += 1
            user = row['user']
            artist = row['artist']
            rating = row[self.column]
            P[user] = np.random.normal(size=self.K)
            Q[artist] = np.random.normal(size=self.K)
            R[user][artist] = rating
            
            if i % (len(self.train) / 10) == 0:
                print "Done with %.2f percent." % (100 * float(i) / len(self.train))
            
        self._R, self._P, self._Q = dict(R), P, Q
        self._mu, self._mu_user, self._mu_artist = bias, user_bias, artist_bias
        
    def load_params(self, filename=None):
        """
        Loads the parameters into the model. This is useful to load a previously trained model.
        
        Args:
            filename: Opt[String], the filename from which to load the parameters.
                If None, assumes the parameters are contained in the file named self.name.
        """
        import pickle
        if not filename:
            filename = "%s_params" % self.name
        with open("%s.pk" % filename) as handle:
            (self.train,
             self._R, self._P, self._Q,
             self._mu, self._mu_user, self._mu_artist) = pickle.load(handle)
        # sanity checks to verify data loaded is for the correct model.
        assert len(self._P.itervalues().next()) == self.K
        assert len(self._Q.itervalues().next()) == self.K
        
    def save_params(self, filename=None):
        """
        Saves the parameters of the model.
        
        Args:
            filename: Opt[String], the filename into which to save the parameters.
                If None, assumes the parameters should be saved in the file named self.name.
        """
        import pickle
        if not filename:
            filename = "%s_params" % self.name
        obj = (self.train, self._R, self._P, self._Q, self._mu, self._mu_user, self._mu_artist)
        with open("%s.pk" % filename, 'w') as handle:
            pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    def _getParamGradients(self, user, artist, pred):
        self._loss = self._R[user][artist] - pred
        return (-self._loss * self._Q[artist] + self.regularization * self._P[user],
                -self._loss * self._P[user] + self.regularization * self._Q[artist],
                -self._loss,
                -self._loss + self.regularization * self._mu_user[user],
                -self._loss + self.regularization * self._mu_artist[artist])
    
    def learn(self, passes = 1, test=None):
        """
        Train the model using the training data.
        
        Args:
            passes: Int, the number of full passes to do over the input data.
            test: pandas.DataFrame, a validation set. If given, calculates the absolute error
                 after each pass.
        """
        for j in xrange(passes):
            i = 0
            for _, row in self.train.iterrows():
                i += 1
                user = row['user']
                artist = row['artist']
                self._user = user
                self._artist = artist
                pred = self._predict_single(user, artist)
                
                # Update in the parameters based on data points
                Pgrad, Qgrad, mugrad, mu_user_grad, mu_artist_grad = self._getParamGradients(user, artist, pred)
                self._P[user] -= self.learn_rate * Pgrad 
                self._Q[artist] -= self.learn_rate * Qgrad
                self._mu -= self.learn_rate * mugrad
                self._mu_user[user] -= self.learn_rate * mu_user_grad
                self._mu_artist[artist] -= self.learn_rate * mu_artist_grad
                
                if i % (len(self.train) / 10) == 0:
                    print "Loss for current item %s." % str(self._loss)
                    print "Done with %.2f percent." % (100 * float(i) / len(self.train))
                
            print("Finised pass %s." % str(j + 1))
            # Calcualte the loss after this pass.
            if validation_set is not None:
                p, e = self.calculate_abs_error(validation_set)
                print("Absolute mean error %s.", absolute_error(p, e))
                
    def _predict_single(self, user, artist):
        # Interesting... optimizations could be done here.
        return (np.dot(self._P[user], self._Q[artist])
                                         + self._mu_user[user]
                                         + self._mu_artist[artist]
                                         + self._mu)
    
    def calculate_abs_error(self, test):
        """
        Note that this caches results, so it will ignore test in subsequent calls.
        """
        expected = np.array(test[self.column])
        preds = np.zeros(len(expected))
        # Reuse predictions.
        predictions = self.predict(test)
        i = 0
        for _, row in test.iterrows():
            artist = row['artist']
            user = row['user']
            preds[i] = predictions[user][artist]
            i += 1
        return preds, expected
    
    def predict(self, test):
        """
        Predicts for the values in the test DataFrame.
        
        Args:
            test: pandas.DataFrame, the dataframe containing user/artist rows to predict.
        
        Returns:
            dict[String -> dict[String -> Float]]: A dictionary mapping
                return_val[user][artist] to the predicted values.
        """
        from collections import defaultdict
        predictions = defaultdict(dict)
        i = 0
        for _, row in test.iterrows():
            i += 1
            user = row['user']
            artist = row['artist']
            predictions[user][artist] = self._predict_single(user, artist)
            if i % (len(test) / 10) == 0:
                print "Done with %.2f percent." % (100 * float(i) / len(test))
        self._predictions = dict(predictions)
        return self._predictions
            
    def save(self, filename = None):
        """
        Save the model to disk.
        """
        if not filename:
            filename = self.name
        import pickle
        with open("%s.pk" % filename, 'w') as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
    @staticmethod
    def load(filename):
        """
        Load a model from disk.
        """
        import pickle
        with open("%s.pk" % filename) as handle:
            return pickle.load(handle)
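
As a check on the update rules in `_getParamGradients`, the per-example objective they correspond to is 0.5*(r - pred)^2 + 0.5*lambda*(||P_u||^2 + ||Q_a||^2 + mu_u^2 + mu_a^2). The sketch below compares the analytic gradient for P[user] against a finite-difference estimate on a tiny hand-built model; the toy ids 'u' and 'a' and all numbers are made up purely for this check.

In [ ]:
# Finite-difference check of the analytic P-gradient on a toy 1-user,
# 1-artist model (K = 3). No training data is attached; parameters are set
# by hand for illustration only.
toy = Model(data=None, hidden_factors=3)
toy._P = {'u': np.array([0.1, -0.2, 0.3])}
toy._Q = {'a': np.array([0.4, 0.1, -0.3])}
toy._mu, toy._mu_user, toy._mu_artist = 3.0, {'u': 0.2}, {'a': -0.1}
toy._R = {'u': {'a': 3.5}}

def toy_objective():
    err = toy._R['u']['a'] - toy._predict_single('u', 'a')
    reg = (np.sum(toy._P['u'] ** 2) + np.sum(toy._Q['a'] ** 2)
           + toy._mu_user['u'] ** 2 + toy._mu_artist['a'] ** 2)
    return 0.5 * err ** 2 + 0.5 * toy.regularization * reg

Pgrad = toy._getParamGradients('u', 'a', toy._predict_single('u', 'a'))[0]
eps = 1e-6
numeric = np.zeros(3)
for k in range(3):
    toy._P['u'][k] += eps
    up = toy_objective()
    toy._P['u'][k] -= 2 * eps
    down = toy_objective()
    toy._P['u'][k] += eps
    numeric[k] = (up - down) / (2 * eps)
print "Analytic P-gradient: %s" % Pgrad
print "Numeric  P-gradient: %s" % numeric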

In [28]:
## See the below for loading a model. We suggest you load the parameters only, and not the entire model.
## Note that `column` specifies the column in data which the model is attempting to predict.
# model = Model(data = None, learning_rate = 0.01, penalization = 0.001, hidden_factors=100, column='log_plays')
## Load the parameters for the model. These parameters should match hidden_factors and column to be relevant.
# model.load_params(filename)
## Alternatively, you can initialize the parameters randomly if data is NOT None.
# model.initialize_params()
## If you wish, you can run the model over the data again, to improve the predictions. passes determines how many
## iterations over the data to perform.
# model.learn(passes=3)
## At some point, we recommend you save your current set of model parameters so you can reuse later without retraining.
# model.save_params(filename)
## You can also save the entire model (which includes saving the training data), but this can be a bit finicky.
# model.save(filename)
## If you do save the entire model, you can load it again with the following.
# model = Model.load(filename)
## Once you're ready to make predictions, simply call the following, where df is the dataframe of user/artist ids.
## The model returns a python dictionary such that preds[user][artist] gives the predicted value.
# preds = model.predict(df)
## To evaluate, you can calculate RMSE against a validation dataframe (ie, a user/artist/column dataframe).
## First, extract everything into numpy. Column should be one of 'plays' or 'log_plays'
# p, e = extract_expected_and_preds(preds, expected_df, column)
## Then calculate the error.
# rm = rmse(p,e)
# ae = absolute_error(p,e)
## At the end, we recommend you save the entire model...
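
For reference, here is the commented workflow above condensed into a single runnable cell using the `train_set`/`validation_set` split defined earlier; the filename is a placeholder, and even one SGD pass over the full training set is slow.

In [ ]:
# Condensed version of the workflow sketched in the comments above.
# 'demo_params' is a placeholder filename; a single pass over the full
# training set already takes a while.
demo = Model(data=train_set, learning_rate=0.02, penalization=0.1,
             hidden_factors=50, column='log_plays')
demo.initialize_params()
demo.learn(passes=1, test=validation_set)
demo.save_params('demo_params')
demo_preds = demo.predict(validation_set)
p_demo, e_demo = extract_expected_and_preds(demo_preds, validation_set)
print "RMSE: %s, ABS: %s" % (rmse(p_demo, e_demo), absolute_error(p_demo, e_demo))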

In [29]:
model_last_logs = Model()
#model_last_logs.initialize_params()

In [73]:
model_last_logs.load_params("final_params_model_trained_115_passes")

In [31]:
train_set = model_last_logs.train
in_train_set = {(user, artist, plays, log_plays): True for (user, artist, plays, log_plays) in zip(train_set['user'],
                                                                      train_set['artist'],
                                                                      train_set['plays'],
                                                                      train_set['log_plays'])}

In [32]:
bool_index = cleaned_train.apply(lambda row: (row['user'],
                                              row['artist'],
                                              row['plays'],
                                              row['log_plays']) in in_train_set, axis=1)

In [33]:
validation_set = cleaned_train[~bool_index]

In [34]:
len(validation_set) + len(train_set) == len(cleaned_train)


Out[34]:
True

In [ ]:
# Learned first 100.
model_last_logs.learn(passes=100, test=validation_set)

In [60]:
# Continue training with smaller penalization.
model_last_logs.learn_rate = 0.02
model_last_logs.regularization = 0.1
model_last_logs.learn(passes=100, test=validation_set)


Loss for current item 0.0713301698526.
Done with 10.00 percent.
Loss for current item -0.346096471099.
Done with 20.00 percent.
Loss for current item 0.201617718404.
Done with 30.00 percent.
Loss for current item 0.391093916246.
Done with 40.00 percent.
Loss for current item 0.329401172381.
Done with 50.00 percent.
Loss for current item 0.552185287577.
Done with 60.00 percent.
Loss for current item 1.32315781906.
Done with 70.00 percent.
Loss for current item 0.166981734387.
Done with 80.00 percent.
Loss for current item -0.105536270882.
Done with 90.00 percent.
Loss for current item -0.317814100525.
Done with 100.00 percent.
Finised pass 1.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55846115870013535)
Loss for current item 0.0703309038217.
Done with 10.00 percent.
Loss for current item -0.34592546589.
Done with 20.00 percent.
Loss for current item 0.199180547983.
Done with 30.00 percent.
Loss for current item 0.392607832875.
Done with 40.00 percent.
Loss for current item 0.328494951457.
Done with 50.00 percent.
Loss for current item 0.55238899246.
Done with 60.00 percent.
Loss for current item 1.31715359249.
Done with 70.00 percent.
Loss for current item 0.165955252833.
Done with 80.00 percent.
Loss for current item -0.104618678715.
Done with 90.00 percent.
Loss for current item -0.315802797633.
Done with 100.00 percent.
Finised pass 2.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55841876695723947)
Loss for current item 0.0693193828254.
Done with 10.00 percent.
Loss for current item -0.345857011113.
Done with 20.00 percent.
Loss for current item 0.196742173514.
Done with 30.00 percent.
Loss for current item 0.394062010023.
Done with 40.00 percent.
Loss for current item 0.327668961276.
Done with 50.00 percent.
Loss for current item 0.552595204613.
Done with 60.00 percent.
Loss for current item 1.31130995678.
Done with 70.00 percent.
Loss for current item 0.164908190536.
Done with 80.00 percent.
Loss for current item -0.103735339014.
Done with 90.00 percent.
Loss for current item -0.313784434251.
Done with 100.00 percent.
Finised pass 3.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55838037921824601)
Loss for current item 0.0682945103341.
Done with 10.00 percent.
Loss for current item -0.345886340961.
Done with 20.00 percent.
Loss for current item 0.194306358103.
Done with 30.00 percent.
Loss for current item 0.395457550895.
Done with 40.00 percent.
Loss for current item 0.326917479189.
Done with 50.00 percent.
Loss for current item 0.552801320877.
Done with 60.00 percent.
Loss for current item 1.30562525099.
Done with 70.00 percent.
Loss for current item 0.163843441041.
Done with 80.00 percent.
Loss for current item -0.102885353613.
Done with 90.00 percent.
Loss for current item -0.3117610769.
Done with 100.00 percent.
Finised pass 4.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.5583451881575745)
Loss for current item 0.0672553620319.
Done with 10.00 percent.
Loss for current item -0.346008658005.
Done with 20.00 percent.
Loss for current item 0.191876727968.
Done with 30.00 percent.
Loss for current item 0.396795618193.
Done with 40.00 percent.
Loss for current item 0.326235129179.
Done with 50.00 percent.
Loss for current item 0.553004880923.
Done with 60.00 percent.
Loss for current item 1.30009770871.
Done with 70.00 percent.
Loss for current item 0.162763732693.
Done with 80.00 percent.
Loss for current item -0.102067798359.
Done with 90.00 percent.
Loss for current item -0.309734609291.
Done with 100.00 percent.
Finised pass 5.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55831304587269559)
Loss for current item 0.0662011773378.
Done with 10.00 percent.
Loss for current item -0.346219161541.
Done with 20.00 percent.
Loss for current item 0.189456766914.
Done with 30.00 percent.
Loss for current item 0.398077425275.
Done with 40.00 percent.
Loss for current item 0.325616867322.
Done with 50.00 percent.
Loss for current item 0.55320356614.
Done with 60.00 percent.
Loss for current item 1.29472546558.
Done with 70.00 percent.
Loss for current item 0.161671636205.
Done with 80.00 percent.
Loss for current item -0.101281729553.
Done with 90.00 percent.
Loss for current item -0.307706748082.
Done with 100.00 percent.
Finised pass 6.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55828381804492699)
Loss for current item 0.0651313508238.
Done with 10.00 percent.
Loss for current item -0.346513073113.
Done with 20.00 percent.
Loss for current item 0.187049811822.
Done with 30.00 percent.
Loss for current item 0.399304227689.
Done with 40.00 percent.
Loss for current item 0.325057967019.
Done with 50.00 percent.
Loss for current item 0.553395198051.
Done with 60.00 percent.
Loss for current item 1.2895065668.
Done with 70.00 percent.
Loss for current item 0.160569571987.
Done with 80.00 percent.
Loss for current item -0.100526189852.
Done with 90.00 percent.
Loss for current item -0.305679058022.
Done with 100.00 percent.
Finised pass 7.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55825777544832678)
Loss for current item 0.0640454235853.
Done with 10.00 percent.
Loss for current item -0.346885659314.
Done with 20.00 percent.
Loss for current item 0.184659049084.
Done with 30.00 percent.
Loss for current item 0.400477315135.
Done with 40.00 percent.
Loss for current item 0.324554004159.
Done with 50.00 percent.
Loss for current item 0.553577736307.
Done with 60.00 percent.
Loss for current item 1.28443897434.
Done with 70.00 percent.
Loss for current item 0.159459817204.
Done with 80.00 percent.
Loss for current item -0.0998002136433.
Done with 90.00 percent.
Loss for current item -0.303652966393.
Done with 100.00 percent.
Finised pass 8.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55823406804625308)
Loss for current item 0.0629430746142.
Done with 10.00 percent.
Loss for current item -0.347332252004.
Done with 20.00 percent.
Loss for current item 0.182287511878.
Done with 30.00 percent.
Loss for current item 0.401598003899.
Done with 40.00 percent.
Loss for current item 0.324100842334.
Done with 50.00 percent.
Loss for current item 0.553749276348.
Done with 60.00 percent.
Loss for current item 1.27952057402.
Done with 70.00 percent.
Loss for current item 0.158344512576.
Done with 80.00 percent.
Loss for current item -0.0991028319004.
Done with 90.00 percent.
Loss for current item -0.301629776683.
Done with 100.00 percent.
Finised pass 9.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55821258259919926)
Loss for current item 0.0618241122209.
Done with 10.00 percent.
Loss for current item -0.347848266082.
Done with 20.00 percent.
Loss for current item 0.17993807823.
Done with 30.00 percent.
Loss for current item 0.402667629788.
Done with 40.00 percent.
Loss for current item 0.323694618213.
Done with 50.00 percent.
Loss for current item 0.553908046762.
Done with 60.00 percent.
Loss for current item 1.27474918243.
Done with 70.00 percent.
Loss for current item 0.15722566888.
Done with 80.00 percent.
Loss for current item -0.0984330765592.
Done with 90.00 percent.
Loss for current item -0.299610681419.
Done with 100.00 percent.
Finised pass 10.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55819342587923204)
Loss for current item 0.0606884655446.
Done with 10.00 percent.
Loss for current item -0.348429214933.
Done with 20.00 percent.
Loss for current item 0.177613469773.
Done with 30.00 percent.
Loss for current item 0.40368754158.
Done with 40.00 percent.
Loss for current item 0.323331727157.
Done with 50.00 percent.
Loss for current item 0.554052406407.
Done with 60.00 percent.
Loss for current item 1.2701225536.
Done with 70.00 percent.
Loss for current item 0.156105173182.
Done with 80.00 percent.
Loss for current item -0.0977899844203.
Done with 90.00 percent.
Loss for current item -0.297596774143.
Done with 100.00 percent.
Finised pass 11.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55817648230146311)
Loss for current item 0.0595361761885.
Done with 10.00 percent.
Loss for current item -0.349070723708.
Done with 20.00 percent.
Loss for current item 0.175316251157.
Done with 30.00 percent.
Loss for current item 0.404659095001.
Done with 40.00 percent.
Loss for current item 0.32300880916.
Done with 50.00 percent.
Loss for current item 0.554180841331.
Done with 60.00 percent.
Loss for current item 1.26563838547.
Done with 70.00 percent.
Loss for current item 0.154984794787.
Done with 80.00 percent.
Loss for current item -0.097172600612.
Done with 90.00 percent.
Loss for current item -0.295589060499.
Done with 100.00 percent.
Finised pass 12.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55816165237556348)
Loss for current item 0.0583673900104.
Done with 10.00 percent.
Loss for current item -0.349768540563.
Done with 20.00 percent.
Loss for current item 0.17304883004.
Done with 30.00 percent.
Loss for current item 0.405583647244.
Done with 40.00 percent.
Loss for current item 0.32272273516.
Done with 50.00 percent.
Loss for current item 0.554291961539.
Done with 60.00 percent.
Loss for current item 1.26129432626.
Done with 70.00 percent.
Loss for current item 0.153866190921.
Done with 80.00 percent.
Loss for current item -0.0965799816351.
Done with 90.00 percent.
Loss for current item -0.293588468422.
Done with 100.00 percent.
Finised pass 13.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55814874725477626)
Loss for current item 0.0571823490977.
Done with 10.00 percent.
Loss for current item -0.350518545996.
Done with 20.00 percent.
Loss for current item 0.170813457606.
Done with 30.00 percent.
Loss for current item 0.406462551998.
Done with 40.00 percent.
Loss for current item 0.322470593775.
Done with 50.00 percent.
Loss for current item 0.554384497633.
Done with 60.00 percent.
Loss for current item 1.25708798044.
Done with 70.00 percent.
Loss for current item 0.152750912152.
Done with 80.00 percent.
Loss for current item -0.0960111980188.
Done with 90.00 percent.
Loss for current item -0.291595857438.
Done with 100.00 percent.
Finised pass 14.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55813734774592616)
Loss for current item 0.0559813839476.
Done with 10.00 percent.
Loss for current item -0.351316760433.
Done with 20.00 percent.
Loss for current item 0.168612229583.
Done with 30.00 percent.
Loss for current item 0.407297155002.
Done with 40.00 percent.
Loss for current item 0.322249678501.
Done with 50.00 percent.
Loss for current item 0.554457297347.
Done with 60.00 percent.
Loss for current item 1.25301691469.
Done with 70.00 percent.
Loss for current item 0.151640407559.
Done with 80.00 percent.
Loss for current item -0.0954653366114.
Done with 90.00 percent.
Loss for current item -0.28961202709.
Done with 100.00 percent.
Finised pass 15.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.5581278792548704)
Loss for current item 0.0547649058754.
Done with 10.00 percent.
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-60-a406ec961f72> in <module>()
      2 model_last_logs.learn_rate = 0.02
      3 model_last_logs.regularization = 0.1
----> 4 model_last_logs.learn(passes=100, test=validation_set)

<ipython-input-27-e52c8642977e> in learn(self, passes, test)
    156         for j in xrange(passes):
    157             i = 0
--> 158             for _, row in self.train.iterrows():
    159                 i += 1
    160                 user = row['user']

/Users/nautilik/.virtualenvs/test/lib/python2.7/site-packages/pandas/core/frame.pyc in iterrows(self)
    697         klass = self._constructor_sliced
    698         for k, v in zip(self.index, self.values):
--> 699             s = klass(v, index=columns, name=k)
    700             yield k, s
    701 

/Users/nautilik/.virtualenvs/test/lib/python2.7/site-packages/pandas/core/series.pyc in __init__(self, data, index, dtype, name, copy, fastpath)
    245                 data = SingleBlockManager(data, index, fastpath=True)
    246 
--> 247         generic.NDFrame.__init__(self, data, fastpath=True)
    248 
    249         self.name = name

/Users/nautilik/.virtualenvs/test/lib/python2.7/site-packages/pandas/core/generic.pyc in __init__(self, data, axes, copy, dtype, fastpath)
    127         object.__setattr__(self, 'is_copy', None)
    128         object.__setattr__(self, '_data', data)
--> 129         object.__setattr__(self, '_item_cache', {})
    130 
    131     def _validate_dtype(self, dtype):

KeyboardInterrupt: 

In [74]:
# Just optimize parameters that work.
model_last_logs.regularization = 0.09
model_last_logs.learn(passes=100, test=validation_set)


Loss for current item 0.0821658857431.
Done with 10.00 percent.
Loss for current item -0.345926037405.
Done with 20.00 percent.
Loss for current item 0.135337722098.
Done with 30.00 percent.
Loss for current item 0.392568813847.
Done with 40.00 percent.
Loss for current item 0.309499020915.
Done with 50.00 percent.
Loss for current item 0.559152270682.
Done with 60.00 percent.
Loss for current item 1.21056694676.
Done with 70.00 percent.
Loss for current item 0.135502248586.
Done with 80.00 percent.
Loss for current item -0.0821793701507.
Done with 90.00 percent.
Loss for current item -0.289432330059.
Done with 100.00 percent.
Finised pass 1.
Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.
('Absolute mean error %s.', 0.55864651220250228)
Loss for current item 0.0588775225637.
Done with 10.00 percent.
Loss for current item -0.346668326433.
Done with 20.00 percent.
Loss for current item 0.135745189329.
Done with 30.00 percent.
Loss for current item 0.382065793302.
Done with 40.00 percent.
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-74-1b3ed53739b9> in <module>()
      1 # Just optimize parameters that work.
      2 model_last_logs.regularization = 0.09
----> 3 model_last_logs.learn(passes=100, test=validation_set)

<ipython-input-27-e52c8642977e> in learn(self, passes, test)
    159                 i += 1
    160                 user = row['user']
--> 161                 artist = row['artist']
    162                 self._user = user
    163                 self._artist = artist

/Users/nautilik/.virtualenvs/test/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    599 
    600     def __getitem__(self, key):
--> 601         key = com._apply_if_callable(key, self)
    602         try:
    603             result = self.index.get_value(self, key)

/Users/nautilik/.virtualenvs/test/lib/python2.7/site-packages/pandas/core/common.pyc in _apply_if_callable(maybe_callable, obj, **kwargs)
    450     otherwise return as it is
    451     """
--> 452     if callable(maybe_callable):
    453         return maybe_callable(obj, **kwargs)
    454     return maybe_callable

KeyboardInterrupt: 

In [ ]:
# Let's calculate our regularization term and compare it to our prediction error (a few cells below).

In [71]:
model_last_logs.save_params("final_params_model_trained_115_passes")

In [38]:
p_dict_l = model_last_logs.predict(validation_set)


Done with 10.00 percent.
Done with 20.00 percent.
Done with 30.00 percent.
Done with 40.00 percent.
Done with 50.00 percent.
Done with 60.00 percent.
Done with 70.00 percent.
Done with 80.00 percent.
Done with 90.00 percent.
Done with 100.00 percent.

In [39]:
p_l, e_l = extract_expected_and_preds(p_dict_l, validation_set)

In [35]:
# Compute per-user and per-artist median/mean baselines on the train set for comparison.
user_median = train_set.groupby('user').quantile(.5)
artist_median = train_set.groupby('artist').quantile(.5)
user_mean = train_set.groupby('user').mean()
artist_mean = train_set.groupby('artist').mean()

In [36]:
user_median_bias = user_median['log_plays'] - train_set['log_plays'].quantile(.5)
artist_median_bias = artist_median['log_plays'] - train_set['log_plays'].quantile(.5)

In [40]:
# Make predictions for the validation set based on the per-user median, mean, geometric median, and geometric mean.
from collections import defaultdict
median_preds = defaultdict(dict)
mean_preds = defaultdict(dict)
geometric_median_preds = defaultdict(dict)
geometric_mean_preds = defaultdict(dict)

for _, row in validation_set.iterrows():
    user = row['user']
    artist = row['artist']
    median_preds[user][artist] = user_median['plays'][user]
    mean_preds[user][artist] = user_mean['plays'][user]
    geometric_median_preds[user][artist] = np.exp(user_median['log_plays'][user])
    geometric_mean_preds[user][artist] = np.exp(user_mean['log_plays'][user])
    
median_preds, _ = extract_expected_and_preds(median_preds, validation_set, column='plays')
mean_preds, _ = extract_expected_and_preds(mean_preds, validation_set, column='plays')
geometric_median_preds, _ = extract_expected_and_preds(geometric_median_preds, validation_set, column='plays')
geometric_mean_preds, _ = extract_expected_and_preds(geometric_mean_preds, validation_set, column='plays')
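
The per-row dictionaries above can also be built in a vectorized way by mapping each validation user to its per-user statistic; a sketch that should produce the same four arrays in the same row order:

In [ ]:
# Vectorized equivalent of the baseline construction above: Series.map looks
# up each validation user in the per-user statistics computed on train_set.
val_users = validation_set['user']
median_preds_v = val_users.map(user_median['plays']).values
mean_preds_v = val_users.map(user_mean['plays']).values
geometric_median_preds_v = np.exp(val_users.map(user_median['log_plays']).values)
geometric_mean_preds_v = np.exp(val_users.map(user_mean['log_plays']).values)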

In [42]:
# Calculate ABS for all of the above.
print "ABS Median User: %s." % absolute_error(median_preds, np.array(validation_set['plays']))
print "ABS Mean User: %s." % absolute_error(mean_preds, np.array(validation_set['plays']))
print "ABS Log Median User: %s." % absolute_error(geometric_median_preds, e_l)
print "ABS Geometric Mean User: %s." % absolute_error(geometric_mean_preds, e_l)
# Our model loss.
print "Log Model ABS: %s." % absolute_error(p_l, e_l)
print "Normal Model ABS: %s." % absolute_error(np.exp(p_l), validation_set['plays'])


ABS Median User: 136.089596823.
ABS Mean User: 157.97147812.
ABS Log Median User: 168.958126748.
ABS Geometric Mean User: 187.43924901.
Log Model ABS: 0.558507205596.
Normal Model ABS: 135.863216125.

In [43]:
plt.hist(model_last_logs._mu_artist.values(), bins=100, alpha=0.5, label="artist_bias")
plt.hist(artist_median_bias, bins=100, alpha=0.5, label="start_bias")
plt.legend(loc='upper right')
plt.show()



In [44]:
plt.hist(model_last_logs._mu_user.values(), bins=100, alpha=0.5, label="user_bias")
plt.hist(user_median_bias, bins=100, alpha=0.5, label="start_bias")
plt.legend(loc='upper right')
plt.show()



In [45]:
# Calculate interaction terms.
interaction_term = train_set.apply(lambda row: np.dot(
    model_last_logs._P[row['user']], model_last_logs._Q[row['artist']]), axis=1)

In [48]:
plt.hist(interaction_term, bins=100, alpha=0.5, label="interaction", range=(-.5, .5))
plt.legend(loc='upper right')
plt.show()



In [69]:
# This is the sum of squared errors on the validation set.
np.sum((p_l - e_l)**2)


Out[69]:
223630.9990018071

In [68]:
# This is the regularization term we're using.
(np.sum(np.array(interaction_term) ** 2)
 + np.sum(np.array(model_last_logs._mu_user.values()) ** 2)
 + np.sum(np.array(model_last_logs._mu_artist.values()) ** 2)
 + np.sum(np.array(model_last_logs._mu) ** 2))


Out[68]:
430517.15226164827
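
Combining the two quantities above with the model's current lambda gives something in the spirit of the regularized objective from the `Model` docstring. Note this is only illustrative: the squared error here is measured on the validation set, and the interaction term stands in for the ||P|| and ||Q|| penalties.

In [ ]:
# Illustrative combination of the two cells above into a single regularized
# objective (squared error + lambda * regularization term).
squared_error = np.sum((p_l - e_l) ** 2)
reg_term = (np.sum(np.array(interaction_term) ** 2)
            + np.sum(np.array(model_last_logs._mu_user.values()) ** 2)
            + np.sum(np.array(model_last_logs._mu_artist.values()) ** 2)
            + np.sum(np.array(model_last_logs._mu) ** 2))
print "Regularized objective: %s" % (squared_error + model_last_logs.regularization * reg_term)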

In [49]:
plt.hist(p_l, 100, alpha=0.5, label='predicted')
plt.hist(e_l, 100, alpha=0.5, label='expected')
plt.legend(loc='upper right')
plt.show()



In [50]:
p_f = np.exp(p_l)
_, e_f = extract_expected_and_preds(p_dict_l, validation_set, column='plays')

In [51]:
plt.hist(p_f, bins=100, alpha=0.5, label='predicted', range=(0,1000))
plt.hist(e_f, bins=100, alpha=0.5, label='expected', range=(0,1000))
plt.legend(loc='upper right')
plt.show()



In [ ]:
print "Model ABS ERR: %s" % absolute_error(p_f, e_f)

In [ ]:
print "Model RMSE: %s" % rmse(p_f, e_f)

In [ ]:
print "Median ABS ERR: %s" % absolute_error(median_preds, e_f)

In [59]:
plt.hist(median_preds, bins=100, alpha=0.5, label='median', range=(0,1000))
#plt.hist(mean_preds, bins=100, alpha=0.5, label='mean', range=(0,1000))
#plt.hist(geometric_median_preds, bins=100, alpha=0.5, label='geo median', range=(0,1000))
#plt.hist(geometric_mean_preds, bins=100, alpha=0.5, label='geo mean', range=(0,1000))
plt.hist(e_f, bins=100, alpha=0.5, label='expected', range=(0,1000))
plt.legend(loc='upper right')
plt.show()



In [ ]:
model_last = Model(train_set, column='plays')

In [ ]:
model_last.initialize_params()

In [ ]:
model_last.regularization = 0.0

In [ ]:
model_last.learn(passes=100,test=validation_set)

In [ ]:
# Train directly on play count.
model_plays = Model(train_set, column='plays')
model_plays.initialize_params()

In [ ]:
## Save params for initialization.
#model_plays.save_params()

In [ ]:
## Load already saved params.
model_plays = Model(column='plays')
model_plays.load_params('model_learn_rate_0.0001_penalization_0.001_k_100_on_column_plays_params_passes_2')

In [ ]:
np.seterr(over='raise')

In [ ]:
model_plays.learn_rate = 0.00001 # slow learn rate
model_plays.regularization = 0.01 # avoid overfitting.

In [ ]:
# Total passes = 2.
model_plays.learn(passes=2)

In [ ]:
model_plays.save_params('model_learn_rate_0.0001_penalization_0.001_k_100_on_column_plays_params_passes_2')

In [ ]:
# Make the predictions
preds = model_plays.predict(validation_set)

In [ ]:
# Extract the predictions.
predictions, expected = extract_expected_and_preds(preds, validation_set, column='plays')

In [ ]:
# Clip all predictions below 1 up to 1.
predictions[predictions < 1] = 1
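
The boolean assignment above is equivalent to clipping with a lower bound of 1; for reference:

In [ ]:
# Equivalent clipping with np.clip (lower bound 1, no upper bound).
predictions = np.clip(predictions, 1, None)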

In [ ]:
# Compare to the user median model.
user_median = train_set.groupby('user').quantile(.5)
# Make predictions for the validation set.
from collections import defaultdict
median_preds = defaultdict(dict)
for _, row in validation_set.iterrows():
    user = row['user']
    artist = row['artist']
    median_preds[user][artist] = user_median['plays'][user]
median_preds, _ = extract_expected_and_preds(median_preds, validation_set, column='plays')

In [ ]:
absolute_error(median_preds, expected), absolute_error(predictions, expected)

In [ ]:
absolute_error(predictions, expected)

In [ ]:
# Do another pass.
model_plays.learn()

In [ ]:
# Predict again
preds2 = model_plays.predict(validation_set)
predictions2, expected = extract_expected_and_preds(preds2, validation_set, column='plays')
predictions2[predictions2 < 1] = 1

In [ ]:
absolute_error(median_preds, expected), absolute_error(predictions2, expected)

In [ ]:
# We'll try our new model now.
new_model = Model(train_set)

In [ ]:
new_model.initialize_params()

In [ ]:
new_model.save_params('test')

In [ ]:
new_model = Model()
new_model.load_params('test')

In [ ]:
# Test the predictions.
ps = new_model.predict(validation_set)
pp, ee = extract_expected_and_preds(ps, validation_set)
absolute_error(pp, ee)

In [ ]:
rmse(pp, ee)

In [ ]:
np.dot(new_model._P[new_model._user],new_model._Q[new_model._artist])

In [ ]:
new_model.learn(passes=100)

In [ ]:
# New model for plays.
model_plays_test = Model(train_set, column='plays')
# model_plays_test.initialize_params()

In [ ]:
model_plays_test.load_params('test_plays')

In [ ]:
model_plays_test.save_params('test_plays')

In [ ]:
user = model_plays_test._user
artist = model_plays_test._artist

In [ ]:
plt.hist([sum(value) for value in model_plays_test._P.values()], bins=100)

In [ ]:
plt.hist([sum(value) for value in model_plays_test._Q.values()], bins=100)

In [ ]:
model_plays_test._loss

In [ ]:
model_plays_test.learn(passes=100)

In [ ]:
# Quick look at the mean of the predictions.
np.mean(predictions)

In [ ]:
# Load the trained model on the entire data set with log plays.
full_model_log_plays = Model(train)

In [ ]:
full_model_log_plays.load_params('params_k_100_learn_log')

In [ ]:
full_model_log_plays._mu_user

In [ ]:
train['log_plays'].mean()

In [ ]:
# Train the model on full. 
model = Model(train_set)

In [ ]:
model.load_params('params_k_100_learn_log')

In [ ]:


In [ ]:
model.initialize_params()

In [ ]:
model.save_params("params_k_100_learn")

In [ ]:
model.save("model_k_100_learn_log")

In [ ]:
# Run the learning algorithm with one pass.
model.learn(passes=1)

In [ ]:
# Test how bad default predictions might be.
predictions = model.predict(validation_set)

In [ ]:
def rmse2(predictions, validation_set, log_plays = True):
    return rmse(*extract_expected_and_preds(predictions, validation_set))

In [ ]:
rmse2(predictions, validation_set)

In [ ]:
preds_log, expected_log = extract_expected_and_preds(predictions, validation_set)

In [ ]:
preds, expected = extract_expected_and_preds(predictions, validation_set, column='plays')

In [ ]:
rmse(preds, expected)

In [ ]:
rmse(preds_log, expected_log)

In [ ]:
absolute_error(preds, expected)

In [ ]:
absolute_error(preds_log, expected_log)

In [ ]:
model_validation_log

In [ ]:
# Now we implement gradient descent on our customized algorithm. Iterate over the data and use a single
# sample to estimate the gradient and update (early draft; superseded by the Model class above).
def train_algorithm(R, P, Q, bias, user_bias, artist_bias, passes=1):
    for i in xrange(passes):
        for _, row in cleaned_train.iterrows():
            user = row['user']
            artist = row['artist']
            pred = np.dot(P[user], Q[artist]) + user_bias[user] + artist_bias[artist] + bias
        print("Finished pass %i." % (i + 1))

In [ ]:
def calculate_rmse(preds, expected):
    return rmse(preds, expected)

In [ ]:
np.dot(P[user], Q[artist])

In [ ]:
user

In [ ]:
test = pd.read_csv('test.csv')
test.head(10)

In [ ]:
artist_genre = {}
artist_mid = {}
artist_qid = {}
artists = pd.read_csv('artists.csv')  # Needed here even though the load cell above is commented out.
for (_, row) in artists[:10].iterrows():
    mid = None
    qid = None
    name = row['name']
    mid = row['artist']
    try:
        mid = get_freebase_id(name)
        qid = get_wikidata_id(mid)
        artist_genre[name] = get_genres(qid)
    except Exception:
        print "Fallback to lastfm genre for name: %s, mid: %s, qid: %s." % (name, mid, qid)
        artist_genre[name] = get_lastfm_tags(row.artist)
    artist_mid[name] = mid
    artist_qid[name] = qid

In [ ]:
test.head(10)

In [ ]:
# Make the predictions.
final_log_predictions = model_last_logs.predict(test)

In [ ]:
# Flatten into a numpy array.
preds_final_log = np.array([val for v in final_log_predictions.values() for val in v.values()])

In [ ]:
test['log_predictions'] = test.apply(lambda row: final_log_predictions[row['user']][row['artist']], axis=1)

In [ ]:
test['prediction'] = np.exp(test['log_predictions'])

In [ ]:
max(test['prediction'])

In [ ]:
vals = final_log_predictions.values()
vals = [v.values() for v in vals]

In [ ]:
flat = [y for x in vals for y in x]

In [ ]:
flat = np.array(flat)

In [ ]:
len(flat[flat < 3])

In [ ]:
plt.hist(test['log_predictions'], bins=100)

In [ ]:
plt.hist(cleaned_train['log_plays'], bins=100)

In [ ]:
output = test[['Id','prediction']]

In [ ]:
len(test)

In [ ]:
output.to_csv("predictions.csv", index=False)

In [ ]:
output.to_csv("predictions_no_header.csv", index=False, header=False)

In [ ]: