In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import scipy.sparse
%matplotlib inline
import matplotlib.pyplot as plt
import pickle

from pyechonest import artist

from pyechonest import config
config.ECHO_NEST_API_KEY='EIVX1I4WCCD7FQRFV'

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

artists = pd.read_csv("artists.csv")
profiles = pd.read_csv("profiles.csv")

# dictionaries between artist "hash" ids and human-readable names,
# to help interpret groups
artist_names = {artists.artist[i]: artists.name[i] for i in xrange(len(artists))}
artist_ids = {artists.name[i]: artists.artist[i] for i in xrange(len(artists))}
bands = artist_names  # alias kept for convenience
names_array = np.array(artists.name)

# map artist id -> row index
artist_indices = {artist: i for i, artist in enumerate(artists.artist)}

# map user id -> row index
user_indices = {user: i for i, user in enumerate(profiles.user)}

In [3]:
user_nums = np.array([user_indices[user] for user in train.user])
artist_nums = np.array([artist_indices[artist] for artist in train.artist])

In [4]:
# work in log space to tame the heavy right tail of play counts
train['plays'] = np.log(train.plays)

In [5]:
# collect each user's logged play counts (and played artists) so we can
# compute per-user medians
users_median = {}
artist_support = {}
for i,row in train.iterrows():
    user = row['user']
    try:
        artist_support[user].append(row['artist'])
        users_median[user].append(row['plays'])
    except KeyError:
        artist_support[user] = [row['artist']]
        users_median[user] = [row['plays']]
    if i % 100000 == 0:
        print "Iteration {} is now done.".format(i)


Iteration 0 is now done.
Iteration 100000 is now done.
...
Iteration 4100000 is now done.

In [7]:
# calculate the actual medians for each user
umedians = {user:np.median(plays) for user, plays in users_median.iteritems()}
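
As a sanity check, the same per-user medians drop out of a one-line pandas groupby (a minimal sketch; umedians_check is a name introduced here):

In [ ]:
# equivalent one-liner: per-user median of the logged plays
umedians_check = train.groupby('user')['plays'].median().to_dict()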

In [16]:
# seed the test cells with each user's median of the logged plays
testpredictions = np.array([umedians[row['user']] for i, row in test.iterrows()])

test_user_nums = np.array([user_indices[user] for user in test.user])
test_artist_nums = np.array([artist_indices[artist] for artist in test.artist])

iindexes = np.concatenate((user_nums, test_user_nums))
jindexes = np.concatenate((artist_nums, test_artist_nums))

# values for the sparse matrix: observed train plays followed by the
# median-seeded test guesses
data = np.concatenate((np.array(train.plays), testpredictions))

np.array(train.plays).shape, testpredictions.shape


Out[16]:
((4154804,), (4154804,))

In [75]:
len(iindexes), len(jindexes), len(data)


Out[75]:
(8309608, 8309608, 8309608)

In [77]:
A = csr_matrix((data, (iindexes, jindexes)),shape=(len(profiles), len(artists)))
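
The (data, (i, j)) constructor fills only the listed cells, sums any duplicate (i, j) pairs, and leaves every other cell at an implicit zero; a toy sketch of the same call:

In [ ]:
# toy example of the (data, (i, j)) constructor: a 2x3 matrix with two entries
toy = csr_matrix(([5.0, 3.0], ([0, 1], [2, 0])), shape=(2, 3))
print toy.toarray()
# [[ 0.  0.  5.]
#  [ 3.  0.  0.]]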

In [48]:
# Do NOT run this: nimfa on the full matrix will exhaust memory and crash
# the machine.
#import nimfa
#nmf = nimfa.Nmf(A, seed="nndsvd", rank=10, max_iter=12, update='euclidean', objective='fro')
#nmf_fit = nmf()
def dump_data(data):
    '''
    Input: data - a dictionary of filename: object items to be dumped using pickle.
    '''
    for f,o in data.iteritems():
        pickle.dump(o,open(f + '.p','wb'))
        
dump_data({'pmatrix' : A})
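
The pickled matrix can then be reloaded in a later session (a sketch; the filename follows the key passed to dump_data above):

In [ ]:
# reload the sparse play matrix from disk
A = pickle.load(open('pmatrix.p', 'rb'))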

In [80]:
import sklearn.decomposition

In [81]:
model = sklearn.decomposition.NMF(n_components=100, max_iter=500)

In [ ]:
W = model.fit_transform(A)

In [52]:
H = model.components_

In [56]:
H.shape


Out[56]:
(32, 2000)
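
fit_transform returns the user factors W (one row per user) and components_ holds the artist factors H (one column per artist), so the model's score for a single user-artist pair is a plain dot product; a minimal sketch using the mappings defined above:

In [ ]:
# reconstructed score for one (user, artist) pair: A is approximated by W.dot(H)
u = user_indices[train.user[0]]
a = artist_indices[train.artist[0]]
print np.dot(W[u], H[:, a])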

In [27]:
W.shape


Out[27]:
(233286, 75)

In [57]:
# Make predictions for the training set: use the reconstructed score
# W[u] . H[:, a] as a percentile into that user's own play distribution
preds = [np.NaN]*len(train)
for row in train.iterrows():
    if row[0] % 100000 == 0:
        print row[0]
    user, artist = row[1].user, row[1].artist
    preds[row[0]] = np.percentile(users_median[user], np.dot(W[user_indices[user]], H.T[artist_indices[artist]]))


0
100000
200000
...
4100000
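
The row-by-row loop above is slow on four million rows; the raw reconstruction scores can instead be computed in one batched dot product (a sketch, using the user_nums/artist_nums index arrays from In [3]; the per-user percentile lookup still needs a loop):

In [ ]:
# vectorized reconstruction scores for every training row
raw_scores = np.einsum('ij,ij->i', W[user_nums], H.T[artist_nums])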

In [58]:
#Score on training set
np.mean(abs(np.array(preds) - train.plays))


Out[58]:
252.92273341392888

In [73]:
# Make the actual test-set predictions (raw reconstructed scores)
real_preds = [np.NaN]*len(test)
for row in test.iterrows():
    if row[0] % 100000 == 0:
        print row[0]
    user, artist = row[1].user, row[1].artist
    real_preds[row[0]] = np.dot(W[user_indices[user]], H.T[artist_indices[artist]])

In [72]:
# Write out the test predictions
import csv
soln_file = 'predictions.csv'  # name your own output file!
with open(soln_file, 'wb') as soln_fh:
    soln_csv = csv.writer(soln_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    for i in xrange(len(test)):
        soln_csv.writerow([i + 1, real_preds[i]])
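
Equivalently, pandas can write the same submission file in one call (a sketch, reusing the soln_file name above):

In [ ]:
# same submission file via pandas
submission = pd.DataFrame({'Id': np.arange(1, len(test) + 1), 'plays': real_preds})
submission.to_csv(soln_file, index=False, columns=['Id', 'plays'])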

In [ ]:
# map each play value to its percentile rank within the user's sorted
# play list (rank sits one position above the value's index)
percentiles = {}
j = 0
for user in user_plays.iterkeys():
    l = sorted(user_plays[user]['list'])
    percentiles[user] = {el: float(i + 1) / len(l) for i, el in enumerate(l)}
    if j % 50000 == 0:
        print "Finished with the {}-th".format(j)
    j += 1
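
These percentile ranks invert cleanly with np.percentile, turning a predicted rank back into a play count on the user's own scale (a sketch, assuming user_plays[user]['list'] holds that user's play values as in the cell above; rank_to_plays is a name introduced here):

In [ ]:
# map a predicted percentile rank (0-1) back onto a user's play distribution
def rank_to_plays(user, q):
    return np.percentile(user_plays[user]['list'], 100.0 * q)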