In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import scipy.sparse
%matplotlib inline
import matplotlib.pyplot as plt
import pickle
from pyechonest import artist
from pyechonest import config
config.ECHO_NEST_API_KEY='EIVX1I4WCCD7FQRFV'
In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
artists = pd.read_csv("artists.csv")
profiles = pd.read_csv("profiles.csv")

# artist "hash" -> human-readable name, to help interpret groups.
# (The original built this identical dict twice, as `bands` and
# `artist_names`; keep both names for backward compatibility but build once.)
artist_names = dict(zip(artists.artist, artists.name))
bands = artist_names.copy()
# inverse mapping: artist name -> artist "hash"
artist_ids = dict(zip(artists.name, artists.artist))
names_array = np.array(artists.name)

# artist id -> positional row index in `artists` (column index in the
# plays matrix built below); read_csv's default RangeIndex guarantees
# this matches the original iterrows() row labels.
artist_indices = {a: i for i, a in enumerate(artists.artist)}
# user id -> positional row index in `profiles` (row index in the plays matrix)
user_indices = {u: i for i, u in enumerate(profiles.user)}
In [3]:
# translate each training row's user/artist hash into its matrix
# row/column position (raises KeyError on an unknown id, by design)
user_positions = [user_indices[u] for u in train.user]
artist_positions = [artist_indices[a] for a in train.artist]
user_nums = np.array(user_positions)
artist_nums = np.array(artist_positions)
In [4]:
train.plays = np.log(train.plays)
In [5]:
# calculate the user meadian rating for all
users_median = {}
artist_support = {}
for i,row in train.iterrows():
user = row['user']
try:
artist_support[user].append(row['artist'])
users_median[user].append(row['plays'])
except KeyError:
artist_support[user] = [row['artist']]
users_median[user] = [row['plays']]
if i % 100000 == 0:
print "Iteration {} is now done.".format(i)
In [7]:
# calculate the actual medians for each user
# collapse each user's list of logged play counts into a single median value
umedians = dict((user, np.median(plays))
                for user, plays in users_median.iteritems())
In [16]:
# Give every test row a placeholder "plays" value equal to that user's
# median logged play count (these fill the sparse matrix built below).
# Iterate the column directly instead of iterrows(): only `user` is read,
# and this matches the comprehension style of the two lines that follow.
testpredictions = np.array([umedians[user] for user in test.user])
test_user_nums = np.array([user_indices[user] for user in test.user])
test_artist_nums = np.array([artist_indices[artist] for artist in test.artist])
# stack train + test coordinates so one sparse matrix holds both
iindexes = np.concatenate((user_nums, test_user_nums))
jindexes = np.concatenate((artist_nums, test_artist_nums))
# sanity check: the two value sources should jointly match len(iindexes)
np.array(train.plays).shape, testpredictions.shape
Out[16]:
In [75]:
# `data` was defined in a cell that no longer exists (hidden notebook
# state); reconstruct the obvious intent: observed logged plays for the
# train rows followed by the median placeholders for the test rows, in
# the same order as iindexes/jindexes. TODO confirm against the original
# deleted cell.
data = np.concatenate((np.array(train.plays), testpredictions))
len(iindexes), len(jindexes), len(data)
Out[75]:
In [77]:
# Build the (num_users x num_artists) sparse plays matrix.
# `data` originally came from a deleted cell (hidden state); rebuild it
# here so this cell is runnable top-to-bottom: train plays followed by
# the test-row median placeholders, aligned with iindexes/jindexes.
data = np.concatenate((np.array(train.plays), testpredictions))
A = csr_matrix((data, (iindexes, jindexes)), shape=(len(profiles), len(artists)))
In [48]:
#Do NOT RUN THIS WILL CRASH YOUR COMP
#import nimfa
#nmf = nimfa.Nmf(A, seed="nndsvd", rank=10, max_iter=12, update='euclidean',objective='fro')
#nmf_fit = nmf()
def dump_data(data):
    '''
    Pickle each object to its own file.

    Input: data - a dictionary of {filename: object} items; each object
    is written to "<filename>.p" in the current working directory.
    '''
    # `with` guarantees the handle is closed even if pickling raises
    # (the original opened the file inline and never closed it).
    for fname, obj in data.items():
        with open(fname + '.p', 'wb') as fh:
            pickle.dump(obj, fh)
# persist the sparse plays matrix so the expensive build above can be skipped
dump_data({'pmatrix' : A})
In [80]:
import sklearn.decomposition
In [81]:
# rank-100 non-negative factorization of the plays matrix; max_iter raised
# above the default to give the solver a chance to converge
model = sklearn.decomposition.NMF(n_components=100, max_iter=500)
In [ ]:
# W: user-factor matrix, presumably (num_users x 100) -- this is the slow step
W = model.fit_transform(A)
In [52]:
# H: artist-factor matrix, presumably (100 x num_artists)
H = model.components_
In [56]:
# sanity check: expect (n_components, num_artists)
H.shape
Out[56]:
In [26]:
# `components` was never defined in this notebook (stale cell from an
# earlier session); inspect the fitted model's factor matrix directly.
model.components_.shape
Out[26]:
In [27]:
# sanity check: expect (num_users, n_components)
W.shape
Out[27]:
In [57]:
#Make predictions for training set
preds = [np.NaN]*len(train)
for row in train.iterrows():
if row[0] % 100000 == 0:
print row[0]
user, artist = row[1].user, row[1].artist
preds[row[0]] = np.percentile(user_media[user], np.dot(W[user_indices[user]], H.T[artist_indices[artist]]))
In [58]:
#Score on training set: mean absolute error in log-space (lower is better)
train_mae = np.mean(np.abs(np.array(preds) - train.plays))
train_mae
Out[58]:
In [73]:
#Make actual predictions
real_preds = [np.NaN]*len(train)
for row in test.iterrows():
if row[0] % 100000 == 0:
print row[0]
user, artist = row[1].user, row[1].artist
real_preds[row[0]] = np.dot(W[user_indices[user]], H.T[artist_indices[artist]])
In [72]:
#Write out actual predictions
import csv
# BUG FIX: the original line read `soln_file = ` followed only by a
# comment, i.e. a SyntaxError. Provide a concrete default; rename at will.
soln_file = 'predictions.csv'
# Python 2's csv module requires binary mode ('wb') to avoid spurious
# blank lines on Windows.
with open(soln_file, 'wb') as soln_fh:
    soln_csv = csv.writer(soln_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id','plays'])
    # Kaggle-style ids are 1-based, in test-file order
    for i in xrange(len(test)):
        soln_csv.writerow([i + 1, real_preds[i]])
In [ ]:
# mapping from similarities to respective index in sorted list (one below actual value)
for user in user_plays.iterkeys():
l = list(user_plays[key]['list'])
l.sort()
p = {el: float(i+1) / len(l) for i,el in percentiles}
percentiles[key] = p
if j % 50000 == 0:
print "Finished with the {}-th".format(j)
j+=1