In [1]:
"""Required imports"""
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
p_csv = pd.read_csv("predictions.csv")
In [3]:
p_csv.columns = ["Id", "plays"]
In [4]:
p_csv
Out[4]:
In [5]:
p_csv.to_csv("predictions.csv", index=False)
In [4]:
import json
import urllib
# If set to true, most functions will print information about intermediate steps.
DEBUG = False
def get_freebase_id(query):
    """
    Given a Google Search query, computes the corresponding Freebase ID
    as determined by Google Knowledge Graph.
    Args:
        query: String, the query such as 'the liars' or '50 cent'
    Returns:
        String, the freebase id in string format.
    Raises:
        KeyError, IndexError: When no FreeBase ID can be found for the query.
    """
    # NOTE(review): hardcoded API key checked into source -- should be moved to
    # an environment variable / secrets store and rotated.
    api_key = 'AIzaSyBBY9bXofiXL9vbe_V6Y49NyAHRv46As60'
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
        'query': query,
        'limit': 10,
        'indent': True,
        'key': api_key,
    }
    # Repeated `types` query parameters can't be expressed in the dict above
    # (duplicate keys), so they are joined by hand and prepended to the URL.
    type_params = ['types=Person', 'types=MusicGroup']
    url = service_url + '?' + '&'.join(type_params) + '&' + urllib.urlencode(params)
    # Python 2 urllib; no timeout is set, so a hung request blocks the cell.
    response = json.loads(urllib.urlopen(url).read())
    try:
        # Take the top-ranked entity in the response.
        freebase_id = response['itemListElement'][0]['result']['@id']
    except (KeyError, IndexError) as e:
        if DEBUG:
            print "No key for query in Google Knowledge Graph for query: %s." % response
        raise
    # Drop the first three characters (presumably the 'kg:' prefix of the
    # '@id' field) -- TODO confirm against a live API response.
    return freebase_id[3:]
In [5]:
def get_wikidata_id(mid):
    """
    Calculate the WikiData ID based on the MID (Freebase ID).
    Args:
        mid: String, the mid to be found in WikiData.
    Returns:
        String, the WikiData ID.
    Raises:
        KeyError, IndexError: When the MID cannot be found in WikiData.
    """
    service_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    params = {
        'format': 'json',
        # SPARQL: select the subject ?s whose Freebase-ID property (P646)
        # equals the given mid.
        'query':
            'PREFIX wd: <http://www.wikidata.org/entity/>\n'
            'PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n'
            'PREFIX wikibase: <http://wikiba.se/ontology#>\n'
            'SELECT ?s ?sLabel ?p ?o ?oLabel WHERE {\n'
            '?s ?p ?o .\n'
            '?s wdt:P646 "' + mid + '" .\n'
            'SERVICE wikibase:label {\n'
            'bd:serviceParam wikibase:language "en" .\n'
            '}\n'
            '}\n'
    }
    # NOTE(review): `mid` is spliced into the SPARQL string unescaped. Fine for
    # trusted '/m/...' ids but would break (or inject) on quotes in the input.
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    try:
        object_url = response['results']['bindings'][0]['s']['value']
    except (KeyError, IndexError) as e:
        if DEBUG:
            print "Unable to get wikidata id for mid: %s. We used url: %s." % (mid, url)
        raise
    # The entity URL ends in the QID, e.g. '.../entity/Q42' -> 'Q42'.
    qid = object_url.split('/')[-1]
    return qid
In [6]:
def get_genres(qid):
"""
Returns the Music Genres if found corresponding to the WikiData ID.
Args:
qid: String, the WikiData ID.
Returns:
[String], the list of genres if any.
Raises:
"""
service_url = 'https://www.wikidata.org/w/api.php'
params = {
'action': 'wbgetentities',
'ids': qid,
'format': 'json'
}
url = service_url + '?' + urllib.urlencode(params)
response = json.loads(urllib.urlopen(url).read())
try:
# Genre property is P136
genre_snaks = response['entities'][qid]['claims']['P136']
except KeyError as e:
if DEBUG:
print "Unable to extract genre property for qid: %s." % qid
raise
# Convert snacks into list of ids to query
genre_ids = []
for snak in genre_snaks:
try:
genre_id = snak['mainsnak']['datavalue']['value']['id']
genre_ids.append(genre_id)
except KeyError as e:
if DEBUG:
print "Unable to extract genre id from snak for qid: %s. Skipping." % (qid)
if len(genre_ids) == 0:
raise Exception("No genre ids for qid: %s." % qid)
# Fetch genre information
params = {
'action': 'wbgetentities',
'ids': '|'.join(genre_ids),
'format': 'json'
}
url = service_url + '?' + urllib.urlencode(params)
response = json.loads(urllib.urlopen(url).read())
# Extract the english labels for the entities.
genres = []
for genre_id in genre_ids:
try:
genre_object = response['entities'][genre_id]
except KeyError as e:
if DEBUG:
print "Unable to extract genre_id with qid: %s." % (genre_id)
continue
try:
genres.append(genre_object['labels']['en']['value'])
except KeyError as e:
if DEBUG:
print "Unable to extract english label for qid: %s with genre qid %s." % (qid, genre_id)
continue
if len(genres) == 0:
raise Exception("Could not extract english labels for genres for qid %s.", qid)
return genres
In [7]:
def get_lastfm_tags(mbid):
    """
    Returns the top three Last FM tags corresponding to the Music Brainz ID.
    Args:
        mbid: String, the Music Brainz ID to be found.
    Returns:
        [String], the top three tags for the mbid.
    Raises:
        KeyError: If the mbid cannot be found in the Last FM page.
    """
    service_url = 'http://ws.audioscrobbler.com/2.0/'
    # NOTE(review): hardcoded API key checked into source -- should live in an
    # environment variable / secrets store and be rotated.
    params = {
        'method': 'artist.gettoptags',
        'mbid': mbid,
        'api_key': 'f2fac19fc4abcebe0d1729429137037e',
        'format': 'json',
    }
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    # Extract the top 3 tags if possible (slicing tolerates fewer than 3).
    try:
        tags = response['toptags']['tag'][:3]
    except KeyError:
        if DEBUG:
            print "Failed to extract tags from last fm for mbid: %s." % mbid
        raise
    return [tag['name'] for tag in tags]
In [8]:
## Read the artists. Not necessary for our model.
#artists = pd.read_csv('artists.csv')
#artists.head(10)
In [9]:
## Read the profiles. Not necessary for our model.
#profiles = pd.read_csv('profiles.csv')
#profiles.head(10)
In [10]:
# Load the training data and attach the 'log_plays' column.
train = pd.read_csv('train.csv')
train['log_plays'] = np.log(train['plays'])
train.head(10)
Out[10]:
In [11]:
# Some sanity checks to make sure we've loaded the data correctly.
from collections import Counter
artist_count = Counter(train['artist'])
user_count = Counter(train['user'])
assert len(artist_count) == 2000
assert len(user_count) == 233286
In [ ]:
# Histogram for artist count for sanity check.
plt.hist(artist_count.values(), bins=100)
In [ ]:
# Histogram for user counts for sanity check.
plt.hist(user_count.values(), bins=100)
In [ ]:
# Histogram for log plays for sanity check.
plt.hist(train['plays'], bins=100)
In [ ]:
# Histogram for log plays for sanity check.
plt.hist(train['log_plays'], bins=100)
In [12]:
# Per-artist and per-user means of plays / log_plays.
# Bug fix: by_artist was commented out here, but later cells plot
# by_artist['plays'] and by_artist['log_plays'], which would raise NameError.
by_artist = train.groupby('artist').mean()
by_user = train.groupby('user').mean()
In [ ]:
# plot the average for plays.
plt.hist(by_user['plays'], bins=100)
In [ ]:
# Plot the average for log_plays.
plt.hist(by_user['log_plays'], bins=100)
In [ ]:
# Plot the plays by artist.
plt.hist(by_artist['plays'], bins=100)
In [ ]:
# Plot the average log plays (aka, 'rating').
plt.hist(by_artist['log_plays'], bins=100)
In [13]:
# Per user, let's take a look at median by users.
by_user_median = train.groupby('user').quantile(0.5)
In [15]:
# Merges the mean and median computations on user, since this is what we will use to determine outliers.
merged = pd.merge(by_user.reset_index(), by_user_median.reset_index(), on="user", suffixes=["_mean", "_median"])
In [16]:
# Let's calculate the gap_plays (ie, mean - median) for each user.
merged['gap_plays'] = merged['plays_mean'] - merged['plays_median']
In [ ]:
# Let's see how the data is distributed.
plt.hist(merged['gap_plays'], bins=100)
In [17]:
# Candidate outliers have a gap > 1000 or < -1000
min_gap = -1000
max_gap = 1000
candidates = merged[(merged['gap_plays'] < min_gap) | (merged['gap_plays'] > max_gap)]
In [18]:
# Merge the candidates so we can get the log_plays.
candidate_removals = pd.merge(train, candidates, on="user")
In [19]:
# We remove only if log_play > 10 (ie, extreme outliers).
candidate_removals = candidate_removals[candidate_removals['log_plays'] > 10]
In [20]:
# Remove the candidates.
cleaned_train = train[~((train['user'].isin(candidate_removals['user']))
& (train['artist'].isin(candidate_removals['artist']))
& (train['plays'].isin(candidate_removals['plays'])))]
In [21]:
# Removed exactly what we wanted. Make some basic checks.
assert len(train) - len(cleaned_train) == len(candidate_removals)
assert len(candidate_removals) == 313
print "Removed a total of %s." % len(candidate_removals)
In [22]:
# Separate into train and validation. We take a random 10 percent of the data to be validation.
# We have to make sure that the train_set has all of the users (and artists), since the model
# only has parameters for ids seen in training.
# NOTE(review): the sampling is unseeded, so this split is not reproducible across runs.
all_users_and_artists_in_both = False
expected_num_users = len(set(cleaned_train['user']))
expected_num_artist = len(set(cleaned_train['artist']))
# Rejection-sample shuffles until the 90% train slice covers every user and artist.
while not all_users_and_artists_in_both:
    shuffled_set = cleaned_train.sample(frac=1).reset_index(drop=True)
    train_length = int(0.9 * len(shuffled_set))
    train_set = shuffled_set[:train_length]
    validation_set = shuffled_set[train_length:]
    print "Attempted..."
    if (len(set(train_set['user'])) == expected_num_users
        and len(set(train_set['artist'])) == expected_num_artist):
        print "Found!"
        all_users_and_artists_in_both = True
In [23]:
expected_num_users, len(set(train_set['user'])), len(set(validation_set['user']))
Out[23]:
In [24]:
def extract_expected_and_preds(predictions, df, column='log_plays'):
    """
    Converts the output `predictions` from a Model and the `df`
    of expected values into two flat numpy arrays.
    Args:
        predictions: dict[user -> dict[artist -> float]], the predictions
            returned by the Model.
        df: pandas.DataFrame, the df containing the expected results.
        column: String, the column in df with the expected values.
    Returns:
        np.array(float), np.array(float): Two numpy arrays of
            preds and expected values where each entry in preds
            corresponds to expected.
    Raises:
        Exception: If column is not 'log_plays' or 'plays'.
        KeyError: If a (user, artist) pair in df is missing from predictions.
    """
    if column not in ['log_plays', 'plays']:
        # Bug fix: the original passed column as a second Exception argument
        # instead of %-formatting it into the message.
        raise Exception("Unsupported column name %s." % column)
    expected = np.array(df[column])
    preds = np.zeros(len(expected))
    # enumerate() replaces the original's manual counter.
    for i, (_, row) in enumerate(df.iterrows()):
        preds[i] = predictions[row['user']][row['artist']]
    return preds, expected
In [25]:
def rmse(preds, expected):
    """Root-mean-squared error between two equal-length numpy arrays."""
    squared_errors = (preds - expected) ** 2
    return np.sqrt(np.mean(squared_errors))
In [26]:
def absolute_error(preds, expected):
    """Mean absolute error between two equal-length numpy arrays."""
    return np.mean(np.abs(preds - expected))
In [27]:
class Model(object):
"""
Our custom model class which performs basic learning on the data.
See http://www.netflixprize.com/assets/GrandPrize2009_BPC_BellKor.pdf for details on the model.
As an overview, the model assumes the following:
pred = bias + bias_user[user] + bias_artist[artist] + np.dot(P[user],Q[artist])
We have a global bias, a bias for each user, and a bias for each artist. We also have an interaction
term which is calculated by taking \sum_k P[user][k] * Q[artist][k].
The interpretation is that there are K hidden factors which define the interaction between the user
the artist.
The model learning attempts to minimize the following loss functions:
loss(bias, bias_user, bias_artist, P, Q) =
\sum_{user, artist} (actual - pred)^2 + \lambda[||P||
+ ||Q|| + ||bias_user|| + ||bias_artist|| + ||bias||]
The learning takes place wit SGD.
Properties:
name: String, the name of the model, based on input parameters. Used for saving and loading files.
learn_rate: float, the learning rate for SGD (gamma).
regularization: float, the normalization factor (lambda).
K: int, the number of latent variables the model assumes.
column: string, the name of the column in data which we wish to predict. Currently
only support 'log_plays' and 'plays'.
"""
def __init__(self, data = None, learning_rate = 0.02, penalization = 0.1, hidden_factors=50, column='log_plays'):
"""
Initializes a model.
Args:
data: Opt[pandas.DataFrame], the training data for the model. This should be None if we plan
to load the parameters from a file, as the data is loaded too.
learning_rate: float, the learning rate to use for SGD.
penalization: float, the regularization factor.
hidden_factors: int, the number latent variables for the mode.
column: String, the model will attempt to predict data[column]. Only 'log_plays' and 'plays'
currently supported.
"""
if column not in ['log_plays', 'plays']:
raise Exception("Unsupported column %s." % column)
self.name = "model_learn_rate_%s_penalization_%s_k_%s_on_column_%s" % (learning_rate,
penalization,
hidden_factors,
column)
self.learn_rate = learning_rate
self.regularization = penalization
self.K = hidden_factors
self.train = data
self.column = column
# A dictionary mapping R[user][artist] to values.
self._R = None
# A dictionary mapping P[user] = np.array(K)
self._P = None
# A dictionary mapping Q[artist] = np.array(K)
self._Q = None
# A constant bias term.
self._mu = None
# A dictionary mapping _mu_user[user] to a contants bias term.
self._mu_user = None
# A dictionary mapping _mu_artist[artist] to a contant bias term.
self._mu_artist = None
# cache results.
self._predictions = None
self._expected = None
def initialize_params(self):
"""
Initializes the parameters for the model, P,Q, mu, mu_user, and mu_artist to the defaults.
"""
from collections import defaultdict
bias = self.train[self.column].mean()
users = self.train.groupby('user').mean().reset_index()
artists = self.train.groupby('artist').mean().reset_index()
# Default user and artist bias.
user_bias = {user: val - bias for (user, val) in zip(users['user'], users[self.column])}
artist_bias = {artist: val - bias for (artist, val) in zip(artists['artist'], artists[self.column])}
# The hard part of initializing the P and Q sparse matrices (aka, dictionaries ^_^)
P, Q, R = {}, {}, defaultdict(dict)
i = 0
for _, row in self.train.iterrows():
i += 1
user = row['user']
artist = row['artist']
rating = row[self.column]
P[user] = np.random.normal(size=self.K)
Q[artist] = np.random.normal(size=self.K)
R[user][artist] = rating
if i % (len(self.train) / 10) == 0:
print "Done with %.2f percent." % (100 * float(i) / len(self.train))
self._R, self._P, self._Q = dict(R), P, Q
self._mu, self._mu_user, self._mu_artist = bias, user_bias, artist_bias
def load_params(self, filename=None):
"""
Loads the parameters into the model. This is useful to load a previously trained model.
Args:
filename: Opt[String], the filename from which to load the parameters.
If None, assumes the parameters are contained in the file named self.name.
"""
import pickle
if not filename:
filename = "%s_params" % self.name
with open("%s.pk" % filename) as handle:
(self.train,
self._R, self._P, self._Q,
self._mu, self._mu_user, self._mu_artist) = pickle.load(handle)
# sanity checks to verify data loaded is for the correct model.
assert len(self._P.itervalues().next()) == self.K
assert len(self._Q.itervalues().next()) == self.K
def save_params(self, filename=None):
"""
Saves the parameters of the model.
Args:
filename: Opt[String], the filename into which to save the parameters.
If None, assumes the parameters should be saved in the file named self.name.
"""
import pickle
if not filename:
filename = "%s_params" % self.name
obj = (self.train, self._R, self._P, self._Q, self._mu, self._mu_user, self._mu_artist)
with open("%s.pk" % filename, 'w') as handle:
pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
def _getParamGradients(self, user, artist, pred):
self._loss = self._R[user][artist] - pred
return (-self._loss * self._Q[artist] + self.regularization * self._P[user],
-self._loss * self._P[user] + self.regularization * self._Q[artist],
-self._loss,
-self._loss + self.regularization * self._mu_user[user],
-self._loss + self.regularization * self._mu_artist[artist])
def learn(self, passes = 1, test=None):
"""
Train the model using the training data.
Args:
passes: Int, the number of full passes to do over the input data.
test: df.DataFrame, a validation set. If given, calculates the ABSERROR
after each half pass.
"""
for j in xrange(passes):
i = 0
for _, row in self.train.iterrows():
i += 1
user = row['user']
artist = row['artist']
self._user = user
self._artist = artist
pred = self._predict_single(user, artist)
# Update in the parameters based on data points
Pgrad, Qgrad, mugrad, mu_user_grad, mu_artist_grad = self._getParamGradients(user, artist, pred)
self._P[user] -= self.learn_rate * Pgrad
self._Q[artist] -= self.learn_rate * Qgrad
self._mu -= self.learn_rate * mugrad
self._mu_user[user] -= self.learn_rate * mu_user_grad
self._mu_artist[artist] -= self.learn_rate * mu_artist_grad
if i % (len(self.train) / 10) == 0:
print "Loss for current item %s." % str(self._loss)
print "Done with %.2f percent." % (100 * float(i) / len(self.train))
print("Finised pass %s." % str(j + 1))
# Calcualte the loss after this pass.
if validation_set is not None:
p, e = self.calculate_abs_error(validation_set)
print("Absolute mean error %s.", absolute_error(p, e))
def _predict_single(self, user, artist):
# interesting...optinimzations could be done here.
return (np.dot(self._P[user], self._Q[artist])
+ self._mu_user[user]
+ self._mu_artist[artist]
+ self._mu)
def calculate_abs_error(self, test):
"""
Note that this caches results, so it will ignore test in subsequent calls.
"""
expected = np.array(test[self.column])
preds = np.zeros(len(expected))
# Reuse predictions.
predictions = self.predict(test)
i = 0
for _, row in test.iterrows():
artist =row['artist']
user = row['user']
preds[i] = predictions[user][artist]
i += 1
return preds, expected
def predict(self, test):
"""
Predicts for the values in the test DataFrame.
Args:
test: pandas.DataFrame, the dataframe containing user/artist rows to predict.
Returns:
dict[String -> dict[String -> Float]]: A dictionary mapping
return_val[user][artist] to the predicted values.
"""
from collections import defaultdict
predictions = defaultdict(dict)
i = 0
for _, row in test.iterrows():
i += 1
user = row['user']
artist = row['artist']
predictions[user][artist] = self._predict_single(user, artist)
if i % (len(test) / 10) == 0:
print "Done with %.2f percent." % (100 * float(i) / len(test))
self._predictions = dict(predictions)
return self._predictions
def save(self, filename = None):
"""
Save the model to disk.
"""
if not filename:
filename = self.name
import pickle
with open("%s.pk" % filename, 'w') as handle:
pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
@staticmethod
def load(filename):
"""
Load a model from disk.
"""
import pickle
with open("%s.pk" % filename) as handle:
return pickle.load(handle)
In [28]:
##See the below for loading a model. We suggest you load the parameters only, and not the entire model.
## Note that column specified the `column` in data which the model is attempting to predict.
# model = Model(data = None, learning_rate = 0.01, penalization = 0.001, hidden_factors=100, column='log_plays')
## Load the parameters for the model. These parameters should match hidden_factors and column to be relevant.
# model.load_params(filename)
## Alternatively, you can initialize the parameters randomly if data is NOT None.
# model.initialize_params()
## If you wish, you can run the model over the data again, to improve the predictions. passes determines how many
## iterations over the data to perform.
# model.learn(passes=3)
## At some point, we recommend you save your current set of model parameters so you can reuse later without retraining.
# model.save_params(filename)
## You can also save the entire model (which includes saving the training data), but this can be a bit finicky.
# model.save(filename)
## If you do save the entire model, you can load it again with the following.
# model = Model.load(filename)
## Once you're ready to make predictions, simply do where df is the dataframe of user/artist ids.
## The model returns a python dictionary such that preds[user][artist] gives the predicted value.
# preds = model.predict(df)
## To make things fast, you can calculate RMSE against a validation dataframe (ie, user/artist/column) dataframe.
## First, extract everything into numpy. Column should be one of 'plays' or 'log_plays'
# p, e = extract_expected_and_preds(preds, expected_df, column)
## Then calculate the error.
# rm = rmse(p,e)
# ae = absolute_error(p,e)
## At the end, we recommend you save the entire model...
In [29]:
model_last_logs = Model()
#model_last_logs.initialize_params()
In [73]:
model_last_logs.load_params("final_params_model_trained_115_passes")
In [31]:
train_set = model_last_logs.train
in_train_set = {(user, artist, plays, log_plays): True for (user, artist, plays, log_plays) in zip(train_set['user'],
train_set['artist'],
train_set['plays'],
train_set['log_plays'])}
In [32]:
bool_index = cleaned_train.apply(lambda row: (row['user'],
row['artist'],
row['plays'],
row['log_plays']) in in_train_set, axis=1)
In [33]:
validation_set = cleaned_train[~bool_index]
In [34]:
len(validation_set) + len(train_set) == len(cleaned_train)
Out[34]:
In [ ]:
# Learned first 100.
model_last_logs.learn(passes=100, test=validation_set)
In [60]:
# Continue training with smaller penalization.
model_last_logs.learn_rate = 0.02
model_last_logs.regularization = 0.1
model_last_logs.learn(passes=100, test=validation_set)
In [74]:
# Just optimize parameters that work.
model_last_logs.regularization = 0.09
model_last_logs.learn(passes=100, test=validation_set)
In [ ]:
# Let's calculate our regularization factor compared to our prediction...?
In [71]:
model_last_logs.save_params("final_params_model_trained_115_passes")
In [38]:
p_dict_l = model_last_logs.predict(validation_set)
In [39]:
p_l, e_l = extract_expected_and_preds(p_dict_l, validation_set)
In [35]:
# Compare to the user median train set.
user_median = train_set.groupby('user').quantile(.5)
artist_median = train_set.groupby('artist').quantile(.5)
user_mean = train_set.groupby('user').mean()
artist_mean = train_set.groupby('artist').mean()
In [36]:
user_median_bias = user_median['log_plays'] - train_set['log_plays'].quantile(.5)
artist_median_bias = artist_median['log_plays'] - train_set['log_plays'].quantile(.5)
In [40]:
# Make predictions for the validation set based on median, mean, geometric mean, and log media.
from collections import defaultdict
median_preds = defaultdict(dict)
mean_preds = defaultdict(dict)
geometric_median_preds = defaultdict(dict)
geometric_mean_preds = defaultdict(dict)
for _, row in validation_set.iterrows():
user = row['user']
artist = row['artist']
median_preds[user][artist] = user_median['plays'][user]
mean_preds[user][artist] = user_mean['plays'][user]
geometric_median_preds[user][artist] = np.exp(user_median['log_plays'][user])
geometric_mean_preds[user][artist] = np.exp(user_mean['log_plays'][user])
median_preds, _ = extract_expected_and_preds(median_preds, validation_set, column='plays')
mean_preds, _ = extract_expected_and_preds(mean_preds, validation_set, column='plays')
geometric_median_preds, _ = extract_expected_and_preds(geometric_median_preds, validation_set, column='plays')
geometric_mean_preds, _ = extract_expected_and_preds(geometric_mean_preds, validation_set, column='plays')
In [42]:
# Calculate ABS for all of the above.
print "ABS Median User: %s." % absolute_error(median_preds, np.array(validation_set['plays']))
print "ABS Mean User: %s." % absolute_error(mean_preds, np.array(validation_set['plays']))
print "ABS Log Median User: %s." % absolute_error(geometric_median_preds, e_l)
print "ABS Geometric Mean User: %s." % absolute_error(geometric_mean_preds, e_l)
# Our model loss.
print "Log Model ABS: %s." % absolute_error(p_l, e_l)
print "Normal Model ABS: %s." % absolute_error(np.exp(p_l), validation_set['plays'])
In [43]:
plt.hist(model_last_logs._mu_artist.values(),bins = 100, alpha=0.5, label="artist_bias")
plt.hist(artist_median_bias, bins=100, alpha=0.5, label="start_bias")
plt.legend(loc='upper right')
plt.show()
In [44]:
plt.hist(model_last_logs._mu_user.values(),bins = 100, alpha=0.5, label="user_bias")
plt.hist(user_median_bias, bins=100, alpha=0.5, label="start_bias")
plt.legend(loc='upper right')
plt.show()
In [45]:
# Calculate interaction terms.
interaction_term = train_set.apply(lambda row: np.dot(
model_last_logs._P[row['user']], model_last_logs._Q[row['artist']]), axis=1)
In [48]:
plt.hist(interaction_term,bins = 100, alpha=0.5, label="interaction", range=(-.5,.5))
plt.legend(loc='upper right')
plt.show()
In [69]:
# This is the error.
np.sum((p_l - e_l)**2)
Out[69]:
In [68]:
# This is the regularization term we're using.
np.sum(np.array(interaction_term) ** 2) + np.sum(np.array(model_last_logs._mu_user.values()) ** 2) + np.sum(np.array(model_last_logs._mu_artist.values()) ** 2) + np.sum(np.array(model_last_logs._mu) ** 2)
Out[68]:
In [49]:
plt.hist(p_l, 100, alpha=0.5, label='predicted')
plt.hist(e_l, 100, alpha=0.5, label='expected')
plt.legend(loc='upper right')
plt.show()
In [50]:
p_f = np.exp(p_l)
_, e_f = extract_expected_and_preds(p_dict_l, validation_set, column='plays')
In [51]:
plt.hist(p_f, bins=100, alpha=0.5, label='predicted', range=(0,1000))
plt.hist(e_f, bins=100, alpha=0.5, label='expected', range=(0,1000))
plt.legend(loc='upper right')
plt.show()
In [ ]:
print "Model ABS ERR: %s" % absolute_error(p_f, e_f)
In [ ]:
print "Model RMSE: %s" % rmse(p_f, e_f)
In [ ]:
print "Median ABS ERR: %s" % absolute_error(median_preds, e_f)
In [59]:
plt.hist(median_preds, bins=100, alpha=0.5, label='median', range=(0,1000))
#plt.hist(mean_preds, bins=100, alpha=0.5, label='mean', range=(0,1000))
#plt.hist(geometric_median_preds, bins=100, alpha=0.5, label='geo median', range=(0,1000))
#plt.hist(geometric_mean_preds, bins=100, alpha=0.5, label='geo mean', range=(0,1000))
plt.hist(e_f, bins=100, alpha=0.5, label='expected', range=(0,1000))
plt.legend(loc='upper right')
plt.show()
In [ ]:
model_last = Model(train_set, column='plays')
In [ ]:
model_last.initialize_params()
In [ ]:
model_last.regularization = 0.0
In [ ]:
model_last.learn(passes=100,test=validation_set)
In [ ]:
# Train directly on play count.
model_plays = Model(train_set, column='plays')
model_plays.initialize_params()
In [ ]:
## Save params for initialization.
#model_plays.save_params()
In [ ]:
## Load already saved params.
model_plays = Model(column='plays')
model_plays.load_params('model_learn_rate_0.0001_penalization_0.001_k_100_on_column_plays_params_passes_2')
In [ ]:
np.seterr(over='raise')
In [ ]:
model_plays.learn_rate = 0.00001 # slow learn rate
model_plays.regularization = 0.01 # avoid overfitting.
In [ ]:
# Total passes = 2.
model_plays.learn(passes=2)
In [ ]:
model_plays.save_params('model_learn_rate_0.0001_penalization_0.001_k_100_on_column_plays_params_passes_2')
In [ ]:
# Make the predictions
preds = model_plays.predict(validation_set)
In [ ]:
# Extract the predictions.
predictions, expected = extract_expected_and_preds(preds, validation_set, column='plays')
In [ ]:
# We set all negative values to 1.
predictions[predictions < 1] = 1
In [ ]:
# Compare to the user median model.
user_median = train_set.groupby('user').quantile(.5)
# Make predictions for the validation set.
from collections import defaultdict
median_preds = defaultdict(dict)
for _, row in validation_set.iterrows():
user = row['user']
artist = row['artist']
median_preds[user][artist] = user_median['plays'][user]
median_preds, _ = extract_expected_and_preds(median_preds, validation_set, column='plays')
In [ ]:
absolute_error(median_preds, expected), absolute_error(predictions, expected)
In [ ]:
absolute_error(predictions, expected)
In [ ]:
# Do another pass.
model_plays.learn()
In [ ]:
# Predict again
preds2 = model_plays.predict(validation_set)
predictions2, expected = extract_expected_and_preds(preds2, validation_set, column='plays')
predictions2[predictions2 < 1] = 1
In [ ]:
absolute_error(median_preds, expected), absolute_error(predictions2, expected)
In [ ]:
# We'll try our new model now.
new_model = Model(train_set)
In [ ]:
new_model.initialize_params()
In [ ]:
new_model.save_params('test')
In [ ]:
new_model = Model()
new_model.load_params('test')
In [ ]:
# Test the predictions.
ps = new_model.predict(validation_set)
pp, ee = extract_expected_and_preds(ps, validation_set)
absolute_error(pp, ee)
In [ ]:
rmse(pp, ee)
In [ ]:
np.dot(new_model._P[new_model._user],new_model._Q[new_model._artist])
In [ ]:
new_model.learn(passes=100)
In [ ]:
# New model for plays.
model_plays_test = Model(train_set, column='plays')
# model_plays_test.initialize_params()
In [ ]:
model_plays_test.load_params('test_plays')
In [ ]:
model_plays_test.save_params('test_plays')
In [ ]:
user = model_plays_test._user
artist = model_plays_test._artist
In [ ]:
plt.hist([sum(value) for value in model_plays_test._P.values()], bins=100)
In [ ]:
plt.hist([sum(value) for value in model_plays_test._Q.values()], bins=100)
In [ ]:
model_plays_test.loss
In [ ]:
model_plays_test.learn(passes=100)
In [ ]:
# Calculate RMSE and ABSE
np.mean(predictions)
In [ ]:
# Load the trained model on the entire data set with log plays.
full_model_log_plays = Model(train)
In [ ]:
full_model_log_plays.load_params('params_k_100_learn_log')
In [ ]:
full_model_log_plays._mu_user
In [ ]:
train['log_plays'].mean()
In [ ]:
# Train the model on full.
model = Model(train_set)
In [ ]:
model.load_params('params_k_100_learn_log')
In [ ]:
In [ ]:
model.initialize_params()
In [ ]:
model.save_params("params_k_100_learn")
In [ ]:
model.save("model_k_100_learn_log")
In [ ]:
# Run the learning algorithm with one pass.
model.learn(passes=1)
In [ ]:
# Test how bad default predictions might be.
predictions = model.predict(validation_set)
In [ ]:
def rmse2(predictions, validation_set, log_plays=True):
    """
    Convenience wrapper: RMSE of `predictions` against `validation_set`.
    Args:
        predictions: dict[user][artist] -> predicted value, as returned by Model.predict.
        validation_set: pandas.DataFrame with the expected values.
        log_plays: Bool, if True score against 'log_plays', otherwise 'plays'.
            Bug fix: the original accepted this flag but ignored it; the default
            preserves the old behavior.
    Returns:
        float, the RMSE.
    """
    column = 'log_plays' if log_plays else 'plays'
    return rmse(*extract_expected_and_preds(predictions, validation_set, column=column))
In [ ]:
rmse2(predictions, validation_set)
In [ ]:
preds_log, expected_log = extract_expected_and_preds(predictions, validation_set)
In [ ]:
preds, expected = extract_expected_and_preds(predictions, validation_set, column='plays')
In [ ]:
rmse(preds, expected)
In [ ]:
rmse(preds_log, expected_log)
In [ ]:
absolute_error(preds, expected)
In [ ]:
absolute_error(preds_log, expected_log)
In [ ]:
model_validation_log
In [ ]:
# Now we implement gradient descent on our customized algorithm. Iterate over the data and
# use a single sample to estimate the gradient and update.
def train_algorithm(passes, R, P, Q, bias, user_bias, artist_bias):
    """
    One-sample-at-a-time SGD sketch over the module-level `cleaned_train` DataFrame.
    Bug fixes vs. the original: removed the duplicate `passes` parameter (which was
    a SyntaxError) and fixed the print precedence ('%i' % i + 1 tried to add 1 to
    a string instead of incrementing i).
    NOTE(review): this computes `pred` but never updates any parameters -- it looks
    like an abandoned draft superseded by Model.learn.
    """
    for i in xrange(passes):
        for _, row in cleaned_train.iterrows():
            user = row['user']
            artist = row['artist']
            pred = np.dot(P[user], Q[artist]) + user_bias[user] + artist_bias[artist] + bias
        print("Finished pass %i." % (i + 1))
In [ ]:
def calculate_rmse()
In [ ]:
np.dot(P[user], Q[artist])
In [ ]:
user
In [ ]:
test = pd.read_csv('test.csv')
test.head(10)
In [ ]:
artist_genre = {}
artist_mid = {}
artist_qid = {}
for (_, row) in artists[:10].iterrows():
mid = None
qid = None
name = row['name']
mid = row['artist']
try:
mid = get_freebase_id(name)
qid = get_wikidata_id(mid)
artist_genre[name] = get_genres(qid)
except:
print "Fallback to lastfm genre for name: %s, mid: %s, qid: %s." % (name, mid, qid)
artist_genre[name] = get_lastfm_tags(row.artist)
artist_mid[name] = mid
artist_qid[name] = qid
In [ ]:
test.head(10)
In [ ]:
# Make the predictions.
final_log_predictions = model_last_logs.predict(test)
In [ ]:
# Flatten into a numpy array.
# Bug fix: the original's comprehension clauses were reversed
# ([val for val in v.values() for v in ...]), which raises a NameError because
# `v` is used before its for-clause binds it; clauses must read outer-to-inner.
preds_final_log = np.array([val for v in final_log_predictions.values() for val in v.values()])
In [ ]:
test['log_predictions'] = test.apply(lambda row: final_log_predictions[row['user']][row['artist']], axis=1)
In [ ]:
test['prediction'] = np.exp(test['log_predictions'])
In [ ]:
max(test['prediction'])
In [ ]:
vals = final_log_predictions.values()
vals = [v.values() for v in vals]
In [ ]:
flat = [y for x in vals for y in x]
In [ ]:
flat = np.array(flat)
In [ ]:
len(flat[flat < 3])
In [ ]:
plt.hist(test['log_predictions'], bins=100)
In [ ]:
plt.hist(cleaned_train['log_plays'], bins=100)
In [ ]:
output = test[['Id','prediction']]
In [ ]:
len(test)
In [ ]:
output.to_csv("predictions.csv", index=False)
In [ ]:
output.to_csv("predictions_no_header.csv", index=False, header=False)
In [ ]: