In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import scipy.sparse
%matplotlib inline
import matplotlib.pyplot as plt
import pickle

from pyechonest import artist

from pyechonest import config
config.ECHO_NEST_API_KEY='EIVX1I4WCCD7FQRFV'

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

artists = pd.read_csv("artists.csv")
profiles = pd.read_csv("profiles.csv")

#dictionary of artists "hash" to name to help interpret groups
bands = {artists.artist[i]:artists.name[i] for i in xrange(len(artists))}

#dictionary of artists "hash" to name to help interpret groups
artist_names = {artists.artist[i]:artists.name[i] for i in xrange(len(artists))}
artist_ids = {artists.name[i]:artists.artist[i] for i in xrange(len(artists))}
names_array = np.array(artists.name)

#create mapping of artist id to a index
artist_indices = {}
for row in artists.iterrows():
    artist_indices[row[1].artist] = row[0]

#create mapping of user id to a index
user_indices = {}
for row in profiles.iterrows():
    user_indices[row[1].user] = row[0]

In [3]:
user_nums = np.array([user_indices[user] for user in train.user])
artist_nums = np.array([artist_indices[artist] for artist in train.artist])

In [41]:
# Now we calculate the percentiles
user_plays = {}
for i,row in train.iterrows():
    user = row['user']
    try:
        user_plays[user]['list'].append(row['plays'])
    except KeyError:
        user_plays[user] = {'list':[row['plays']]}
    if i % 50000 == 0:
        print "Finished with the {}-th user".format(i)


Finished with the 0-th user
Finished with the 50000-th user
Finished with the 100000-th user
Finished with the 150000-th user
Finished with the 200000-th user
Finished with the 250000-th user
Finished with the 300000-th user
Finished with the 350000-th user
Finished with the 400000-th user
Finished with the 450000-th user
Finished with the 500000-th user
Finished with the 550000-th user
Finished with the 600000-th user
Finished with the 650000-th user
Finished with the 700000-th user
Finished with the 750000-th user
Finished with the 800000-th user
Finished with the 850000-th user
Finished with the 900000-th user
Finished with the 950000-th user
Finished with the 1000000-th user
Finished with the 1050000-th user
Finished with the 1100000-th user
Finished with the 1150000-th user
Finished with the 1200000-th user
Finished with the 1250000-th user
Finished with the 1300000-th user
Finished with the 1350000-th user
Finished with the 1400000-th user
Finished with the 1450000-th user
Finished with the 1500000-th user
Finished with the 1550000-th user
Finished with the 1600000-th user
Finished with the 1650000-th user
Finished with the 1700000-th user
Finished with the 1750000-th user
Finished with the 1800000-th user
Finished with the 1850000-th user
Finished with the 1900000-th user
Finished with the 1950000-th user
Finished with the 2000000-th user
Finished with the 2050000-th user
Finished with the 2100000-th user
Finished with the 2150000-th user
Finished with the 2200000-th user
Finished with the 2250000-th user
Finished with the 2300000-th user
Finished with the 2350000-th user
Finished with the 2400000-th user
Finished with the 2450000-th user
Finished with the 2500000-th user
Finished with the 2550000-th user
Finished with the 2600000-th user
Finished with the 2650000-th user
Finished with the 2700000-th user
Finished with the 2750000-th user
Finished with the 2800000-th user
Finished with the 2850000-th user
Finished with the 2900000-th user
Finished with the 2950000-th user
Finished with the 3000000-th user
Finished with the 3050000-th user
Finished with the 3100000-th user
Finished with the 3150000-th user
Finished with the 3200000-th user
Finished with the 3250000-th user
Finished with the 3300000-th user
Finished with the 3350000-th user
Finished with the 3400000-th user
Finished with the 3450000-th user
Finished with the 3500000-th user
Finished with the 3550000-th user
Finished with the 3600000-th user
Finished with the 3650000-th user
Finished with the 3700000-th user
Finished with the 3750000-th user
Finished with the 3800000-th user
Finished with the 3850000-th user
Finished with the 3900000-th user
Finished with the 3950000-th user
Finished with the 4000000-th user
Finished with the 4050000-th user
Finished with the 4100000-th user
Finished with the 4150000-th user

In [42]:
# now we need to sort each of the lists while maintaining
# percentiles is a mapping from #plays to the percentile that play receives
# now that we remove repeat plays
j = 0
percentiles = {}
for key in user_plays.iterkeys():
    l = list(user_plays[key]['list'])
    l.sort()
    p = {el: float(i+1) / len(l) for i,el in enumerate(l)}
    percentiles[key] = p
    if j % 50000 == 0:
        print "Finished with the {}-th".format(j)
    j+=1


Finished with the 0-th
Finished with the 50000-th
Finished with the 100000-th
Finished with the 150000-th
Finished with the 200000-th

In [43]:
def get_user_percentiles(row):
    try:
        return percentiles[row['user']][row['plays']]
    except KeyError:
        print "{}, {}".format(row['user'], row['plays'])
        return

In [46]:
matrixdata = train.apply(get_user_percentiles, axis=1)

In [74]:
# include testing data as having values of 0.5 percentile
testpredictions = np.array([0.5 for _ in xrange(len(test))])

test_user_nums = np.array([user_indices[user] for user in test.user])
test_artist_nums = np.array([artist_indices[artist] for artist in test.artist])

iindexes = np.concatenate((user_nums,test_user_nums))
jindexes = np.concatenate((artist_nums,test_artist_nums))

data = np.concatenate((matrixdata, testpredictions))

In [75]:
len(iindexes), len(jindexes), len(data)


Out[75]:
(8309608, 8309608, 8309608)

In [77]:
A = csr_matrix((data, (iindexes, jindexes)),shape=(len(profiles), len(artists)))

In [48]:
#Do NOT RUN THIS WILL CRASH YOUR COMP
#import nimfa
#nmf = nimfa.Nmf(A, seed="nndsvd", rank=10, max_iter=12, update='euclidean',objective='fro')
#nmf_fit = nmf()
def dump_data(data):
    '''
    Input: data - a dictionary of filename: object items to be dumped using pickle.
    '''
    for f,o in data.iteritems():
        pickle.dump(o,open(f + '.p','wb'))
        
dump_data({'pmatrix' : A})

In [80]:
import sklearn.decomposition

In [81]:
model = sklearn.decomposition.NMF(n_components=100, max_iter=500)

In [ ]:
W = model.fit_transform(A)

In [52]:
H = model.components_

In [56]:
H.shape


Out[56]:
(32, 2000)

In [26]:
components.shape


Out[26]:
(50, 2000)

In [27]:
W.shape


Out[27]:
(233286, 75)

In [62]:
# calculate the user meadian rating for all
users_median = {}
artist_support = {}
for i,row in train.iterrows():
    user = row['user']
    try:
        artist_support[user].append(row['artist'])
        users_median[user].append(row['plays'])
    except KeyError:
        artist_support[user] = [row['artist']]
        users_median[user] = [row['plays']]
    if i % 100000 == 0:
        print "Iteration {} is now done.".format(i)


Iteration 0 is now done.
Iteration 50000 is now done.
Iteration 100000 is now done.
Iteration 150000 is now done.
Iteration 200000 is now done.
Iteration 250000 is now done.
Iteration 300000 is now done.
Iteration 350000 is now done.
Iteration 400000 is now done.
Iteration 450000 is now done.
Iteration 500000 is now done.
Iteration 550000 is now done.
Iteration 600000 is now done.
Iteration 650000 is now done.
Iteration 700000 is now done.
Iteration 750000 is now done.
Iteration 800000 is now done.
Iteration 850000 is now done.
Iteration 900000 is now done.
Iteration 950000 is now done.
Iteration 1000000 is now done.
Iteration 1050000 is now done.
Iteration 1100000 is now done.
Iteration 1150000 is now done.
Iteration 1200000 is now done.
Iteration 1250000 is now done.
Iteration 1300000 is now done.
Iteration 1350000 is now done.
Iteration 1400000 is now done.
Iteration 1450000 is now done.
Iteration 1500000 is now done.
Iteration 1550000 is now done.
Iteration 1600000 is now done.
Iteration 1650000 is now done.
Iteration 1700000 is now done.
Iteration 1750000 is now done.
Iteration 1800000 is now done.
Iteration 1850000 is now done.
Iteration 1900000 is now done.
Iteration 1950000 is now done.
Iteration 2000000 is now done.
Iteration 2050000 is now done.
Iteration 2100000 is now done.
Iteration 2150000 is now done.
Iteration 2200000 is now done.
Iteration 2250000 is now done.
Iteration 2300000 is now done.
Iteration 2350000 is now done.
Iteration 2400000 is now done.
Iteration 2450000 is now done.
Iteration 2500000 is now done.
Iteration 2550000 is now done.
Iteration 2600000 is now done.
Iteration 2650000 is now done.
Iteration 2700000 is now done.
Iteration 2750000 is now done.
Iteration 2800000 is now done.
Iteration 2850000 is now done.
Iteration 2900000 is now done.
Iteration 2950000 is now done.
Iteration 3000000 is now done.
Iteration 3050000 is now done.
Iteration 3100000 is now done.
Iteration 3150000 is now done.
Iteration 3200000 is now done.
Iteration 3250000 is now done.
Iteration 3300000 is now done.
Iteration 3350000 is now done.
Iteration 3400000 is now done.
Iteration 3450000 is now done.
Iteration 3500000 is now done.
Iteration 3550000 is now done.
Iteration 3600000 is now done.
Iteration 3650000 is now done.
Iteration 3700000 is now done.
Iteration 3750000 is now done.
Iteration 3800000 is now done.
Iteration 3850000 is now done.
Iteration 3900000 is now done.
Iteration 3950000 is now done.
Iteration 4000000 is now done.
Iteration 4050000 is now done.
Iteration 4100000 is now done.
Iteration 4150000 is now done.

In [66]:
# calculate the actual medians for each user
umedians = {user:np.median(plays) for user, plays in users_median.iteritems()}

In [57]:
#Make predictions for training set
preds = [np.NaN]*len(train)
for row in train.iterrows():
    if row[0] % 100000 == 0:
        print row[0]
    user, artist = row[1].user, row[1].artist
    preds[row[0]] = np.percentile(user_media[user], np.dot(W[user_indices[user]], H.T[artist_indices[artist]]))


0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000

In [58]:
#Score on training set
np.mean(abs(np.array(preds) - train.plays))


Out[58]:
252.92273341392888

In [73]:
#Make actual predictions
real_preds = [np.NaN]*len(train)
for row in test.iterrows():
    if row[0] % 100000 == 0:
        print row[0]
    user, artist = row[1].user, row[1].artist
    real_preds[row[0]] = np.dot(W[user_indices[user]], H.T[artist_indices[artist]])

In [72]:
#Write out actual predictions
import csv
soln_file = #name your own output file!
with open(soln_file, 'w') as soln_fh:
    soln_csv = csv.writer(soln_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id','plays'])
    for i in xrange(len(test)):
        soln_csv.writerow([i + 1, real_preds[i]])

In [ ]:
# mapping from similarities to respective index in sorted list (one below actual value)
for user in user_plays.iterkeys():
    l = list(user_plays[key]['list'])
    l.sort()
    p = {el: float(i+1) / len(l) for i,el in percentiles}
    percentiles[key] = p
    if j % 50000 == 0:
        print "Finished with the {}-th".format(j)
    j+=1