In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
# set to False to suppress the exploratory output in the cells below
display = False
In [2]:
train = pd.read_csv("train.csv")
In [3]:
test = pd.read_csv("test.csv")
In [4]:
if display:
    train.head()
In [5]:
artists = pd.read_csv("artists.csv")
In [6]:
if display:
    artists.head()
In [7]:
profiles = pd.read_csv("profiles.csv")
In [8]:
# Encode sex numerically: f -> 1, m -> 0, missing/other -> 0.5.
# Use .loc rather than chained indexing, which can silently assign to a copy.
profiles.loc[profiles.sex == 'f', 'sex'] = 1
profiles.loc[profiles.sex == 'm', 'sex'] = 0
profiles.loc[(profiles.sex != 0) & (profiles.sex != 1), 'sex'] = 0.5
# Impute missing age with the mean age (again via .loc so the write-back
# actually modifies profiles rather than a temporary copy)
profiles.loc[pd.isnull(profiles.age), 'age'] = np.mean(profiles.age)
# Dictionary from artist "hash" to name, to help interpret groups
bands = {artists.artist[i]: artists.name[i] for i in xrange(len(artists))}
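A small sanity check, added here for illustration (it is not part of the original run): confirm the encoding and imputation took effect, which the chained assignments above would not have guaranteed.
In [ ]:
# Sanity check (added sketch): every sex value is one of {0, 0.5, 1}
# and no age is missing after imputation.
assert profiles.sex.isin([0, 0.5, 1]).all()
assert profiles.age.notnull().all()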
In [9]:
if display: profiles.sex
In [10]:
if display: len(np.unique(profiles.country))
In [11]:
if display: len(profiles)
In [12]:
user0 = profiles.user[0]
In [13]:
if display: train[train.user == user0]
In [14]:
if display: test[test.user == user0]
In [15]:
if display: sum(profiles.sex == 0.5)
In [16]:
def similar_users(user, plus_minus):
    '''Boolean mask over profiles: rows with the same sex as `user`
    (a row index into profiles) and age within plus_minus years.'''
    age = profiles.age[user]
    gender = profiles.sex[user]
    return (profiles.sex == gender) & (profiles.age >= age - plus_minus) & (profiles.age <= age + plus_minus)
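A quick hedged example of how the mask is meant to be used (the index 0 and the +/- 3 year window are arbitrary here):
In [ ]:
# Sketch: similar_users returns a boolean Series aligned with profiles,
# so it can be summed for a count or used directly as a row index.
mask = similar_users(0, 3)
print mask.sum(), "profiles match user 0 on sex and age +/- 3 years"
if display: profiles[mask].head()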
In [17]:
test.head()
In [18]:
# silly - can likely ignore this (unfinished helper; returns the match mask)
def predict_row(index):
    user = test.user[index]
    artist = test.artist[index]
    user_idx = list(profiles.user).index(user)
    matches = similar_users(user_idx, 3)
    return matches
In [19]:
user = test.user[0]
artist = test.artist[0]
# finding index of 1st test user in profiles
user_idx = list(profiles.user).index(user)
matches = similar_users(user_idx, 3)
In [20]:
if display: list(profiles.user).index(user)
In [21]:
if display: profiles.user[matches]
In [22]:
if display: len(matches)
In [23]:
if display: len(train.user)
In [24]:
subset = train[train.user == user]
In [25]:
if display: subset
In [26]:
#to help interpret
for row in subset.iterrows():
    if display: print bands[row[1].artist], row[1].plays
In [27]:
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
In [28]:
U = len(profiles)
D = len(artists)
In [29]:
# Make dictionary of user to user_id
user_id = {profiles.user[i]:i for i in xrange(len(profiles))}
id_user = {value: key for key,value in user_id.iteritems()}
# Make dictionary of artist to artist_id
artist_id = {artists.artist[i]:i for i in xrange(len(artists))}
id_artist = {value: key for key,value in artist_id.iteritems()}
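A one-line round-trip check (an added sketch) that the forward and inverse maps agree:
In [ ]:
# The id maps should be mutual inverses: hash -> id -> hash is the identity.
# Spot-check the first 100 entries of each.
assert all(id_user[user_id[u]] == u for u in profiles.user[:100])
assert all(id_artist[artist_id[a]] == a for a in artists.artist[:100])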
In [30]:
import pickle
# Load pickled objects by filename; if the R matrix is missing, rebuild it
# from the train data (user, artist, plays) as a sparse U x D matrix.
def load_data(filenames):
    d = {}
    for f in filenames:
        try:
            d[f] = pickle.load(open(f + '.p', 'rb'))
        except Exception:
            print "Could not load {} into data.".format(f)
            if 'matrix' in f:
                print "Assuming matrix. Attempting to recreate with available data."
                d[f] = lil_matrix((U, D))
                for i in xrange(len(train)):
                    if i % 100000 == 0:
                        print i
                    d[f][user_id[train.user[i]], artist_id[train.artist[i]]] = train.plays[i]
    return d

def dump_data(data):
    '''
    Input: data - a dictionary of filename: object items to be dumped using pickle.
    '''
    for f, o in data.iteritems():
        # write to f + '.p' so load_data can find the file again
        pickle.dump(o, open(f + '.p', 'wb'))
In [31]:
datafiles = ['artist_hotness', 'could_not_find', 'R_matrix']
saved_data = load_data(datafiles)
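Since load_data may have just rebuilt R_matrix from scratch, it is worth persisting the result so the next run loads it instantly. A hedged sketch reusing the dump_data helper above:
In [ ]:
# Persist whatever load_data returned; dump_data writes each object to
# <name>.p, the same path load_data reads from on the next run.
dump_data(saved_data)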
In [32]:
# List of (i,j) tuples in train data
train_entries = []
for i in xrange(len(train)):
    if i % 500000 == 0:
        print i
    train_entries.append((user_id[train.user[i]], artist_id[train.artist[i]]))
In [33]:
# Initialize P (users x K) and Q (artists x K) -- start with K = 2 latent dimensions
K = 2
P = np.random.rand(U, K)
Q = np.random.rand(D, K)
In [38]:
# Matrix factorization model (SGD with L2 regularization).
# Note: a full run would take on the order of 1000 steps, and each step over
# the ~4-million-entry train set takes roughly 3 minutes, so we cap at 100.
def matrix_factorization(R, P, Q, K, steps=100, alpha=0.01, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        # SGD pass over the (i,j) entries present in the train data
        for (idx, (i, j)) in enumerate(train_entries):
            if idx % 500000 == 0:
                print idx
            # prediction error for this entry
            eij = R[i, j] - np.dot(P[i, :], Q[:, j])
            for k in xrange(K):
                # learning rule
                P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        print 'learning rule done'
        # regularized squared error over the train entries
        e = 0
        for (idx, (i, j)) in enumerate(train_entries):
            if idx % 500000 == 0:
                print idx
            e += (R[i, j] - np.dot(P[i, :], Q[:, j])) ** 2
            for k in range(K):
                e += (beta / 2) * (P[i][k] ** 2 + Q[k][j] ** 2)
        print "Successfully completed step {} with error: {}".format(step, e)
        if e < 0.1:
            break
    return P, Q.T
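A quick toy check of the learning rule (an added sketch, not part of the original pipeline): factor a tiny 2x2 matrix and eyeball the reconstruction. Because matrix_factorization reads the global train_entries, the sketch swaps it out temporarily; the run is verbose since the function prints progress.
In [ ]:
# Hypothetical toy check: factor a 2x2 matrix and compare the reconstruction.
_saved_entries = train_entries
train_entries = [(0, 0), (0, 1), (1, 0), (1, 1)]
toy_R = lil_matrix(np.array([[5.0, 3.0], [4.0, 1.0]]))
toy_P, toy_Q = matrix_factorization(toy_R, np.random.rand(2, 2), np.random.rand(2, 2), 2, steps=50)
print np.dot(toy_P, toy_Q.T)  # should be close to [[5, 3], [4, 1]]
train_entries = _saved_entries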
In [ ]:
R = saved_data['R_matrix']
P, Q = matrix_factorization(R, P, Q, K)
In [35]:
def write_predictions(P, Q, f="one"):
    '''
    Given P, Q from the factorized prediction matrix, writes out results.
    '''
    Rhat = np.dot(P, Q.T)
    test_entries = [(test.Id[i], user_id[test.user[i]], artist_id[test.artist[i]]) for i in xrange(len(test))]
    # For (Id,i,j) in test_entries, look up the predicted plays in Rhat
    predictions = {Id: Rhat[i, j] for (Id, i, j) in test_entries}
    pdrs = pd.DataFrame(predictions.items(), columns=['Id', 'plays'])
    pdrs.to_csv(f + ".csv", index=False)
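A hedged usage note: once the factorization above finishes, the submission file can be written in one call (the filename "mf_predictions" is just an example):
In [ ]:
# Write Id,plays predictions for the test set to mf_predictions.csv
write_predictions(P, Q, f="mf_predictions")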