In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# turn off to avoid displaying test output
display = False

In [2]:
train = pd.read_csv("train.csv")

In [3]:
test = pd.read_csv("test.csv")

In [4]:
if display: 
    train.head()

In [5]:
artists = pd.read_csv("artists.csv")

In [6]:
if display: 
    artists.head()

In [7]:
profiles = pd.read_csv("profiles.csv")

In [8]:
#Turn sex data numerical
profiles.sex[profiles.sex == 'f'] = 1
profiles.sex[profiles.sex == 'm'] = 0
profiles.sex[(profiles.sex != 0) & (profiles.sex != 1)] = 0.5

#impute missing age with mean age
profiles[pd.isnull(profiles.age)].age = np.mean(profiles.age)

#dictionary of artists "hash" to name to help interpret groups
bands = {artists.artist[i]:artists.name[i] for i in xrange(len(artists))}


C:\Anaconda\lib\site-packages\pandas\core\generic.py:1858: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  self[name] = value

In [9]:
if display: profiles.sex

In [10]:
if display: len(np.unique(profiles.country))

In [11]:
if display: len(profiles)

In [12]:
user0 = profiles.user[0]

In [13]:
if display: train[train.user == user0]

In [14]:
if display: test[test.user == user0]

In [15]:
if display: sum(profiles.sex == 0.5)

In [16]:
def similar_users(user, plus_minus):
    age = profiles.age[user]
    gender = profiles.sex[user]
    return (profiles.sex == gender)  & (profiles.age >= age - plus_minus)  & (profiles.age <= age + plus_minus)

In [17]:
test.head()


Out[17]:
Id user artist
0 1 306e19cce2522fa2d39ff5dfc870992100ec22d2 4ac4e32b-bd18-402e-adad-ae00e72f8d85
1 2 9450d351278df4938bdea4ed86aec940a4e927ac 1f574ab1-a46d-4586-9331-f0ded23e0411
2 3 801909d6955f59033c88595d3d7f8a6a5dcd53cc 3eb72791-6322-466b-87d3-24d74901eb2d
3 4 e3ed47445c127fbeff47fb58f6bbf2f3b4535d82 61604b45-8a91-4e33-a1b6-45d7b1fec4e5
4 5 a73f46652103f3a5f7429159310f6928f79644aa 5dfdca28-9ddc-4853-933c-8bc97d87beec

In [18]:
#silly-can likely ignore this
def predict_row(index):
    user = test.user[index]
    artist = test.artist[index]
    user_idx = list(profiles.user).index(user)
    matches = similar_users(user_idx, 3)

In [19]:
user = test.user[0]
artist = test.artist[0]
# finding index of 1st test user in profiles
user_idx = list(profiles.user).index(user)
matches = similar_users(user_idx, 3)

In [20]:
if display: list(profiles.user).index(user)

In [21]:
if display: profiles.user[matches]

In [22]:
if display: len(matches)

In [23]:
if display: len(train.user)

In [24]:
subset = train[train.user == user]

In [25]:
if display: subset

In [26]:
#to help interpret
for row in subset.iterrows():
    if display: print bands[row[1].artist], row[1].plays

In [27]:
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix

In [28]:
U = len(profiles)
D = len(artists)

In [29]:
# Make dictionary of user to user_id
user_id = {profiles.user[i]:i for i in xrange(len(profiles))}
id_user = {value: key for key,value in user_id.iteritems()}
# Make dictionary of artist to artist_id
artist_id = {artists.artist[i]:i for i in xrange(len(artists))}
id_artist = {value: key for key,value in artist_id.iteritems()}

In [30]:
import pickle 

# Iterate through train data (user, artist, plays) and fill in sparse matrix R
def load_data(filenames):
    d = {}
    for f in filenames:
        try:
            d[f] = pickle.load(open(f + '.p','rb'))
        except _:
            print "Could not load {} into data.".format(f)
            if 'matrix' in f:
                print "Assuming matrix. Attempting to recreate with available data."
                d[f] = lil_matrix((U,D)) 
                for i in xrange(len(train)):
                    if i % 100000 == 0:
                        print i
                    d[f][user_id[train.user[i]], artist_id[train.artist[i]]] = train.plays[i]

    return d

def dump_data(data):
    '''
    Input: data - a dictionary of filename: object items to be dumped using pickle.
    '''
    for f,o in data.iteritems():
        pickle.dump(o,open(f,'wb'))

In [31]:
datafiles = ['artist_hotness', 'could_not_find', 'R_matrix']
saved_data = load_data(datafiles)

In [32]:
# List of (i,j) tuples in train data
train_entries = []
for i in xrange(len(train)):
    if i % 500000 == 0:
        print i
    train_entries.append((user_id[train.user[i]], artist_id[train.artist[i]]))


0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000

In [33]:
# Initialize P and Q -- will use 2 dimensions to start
P = np.random.rand(U,15)
Q = np.random.rand(D,15)

In [38]:
# Matrix factorization model
# Except that EVERYTHING SHOULD BE IN A FOR LOOP THAT LOOPS 1000 TIMES AND EACH ITERATION TAKES ROUGHLY 3MIN I THINK
# We could just loop 100 times
def matrix_factorization(R,P,Q,K,steps=100,alpha = 0.01,beta = 0.02):
    Q = Q.T
    # for those (i,j) entries that belong to train data; train_entries is 4 million long
    for step in xrange(steps):
        for (idx, (i,j)) in enumerate(train_entries):
            if idx % 500000 == 0:
                print idx
            # computes the prediction error
            eij = R[i,j] - np.dot(P[i,:],Q[:,j])
            for k in xrange(K):
                # learning rule
                P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        print 'learning rule done'
        # new R and error e
        eR = np.dot(P,Q)
        e = 0
        for (idx, (i,j)) in enumerate(train_entries):
            if idx % 500000 == 0:
                print idx
            e += (R[i,j] - np.dot(P[i,:],Q[:,j]))**2
            for k in range(K):
                e = e + (beta/2) * P[i][k]**2 + Q[k][j]**2
        if e < 0.1:
            break
        print "Successfully completed step {} with error: {}".format(step, e)
        
    return P,Q

In [ ]:
R = saved_data['R_matrix']
P,Q = matrix_factorization(R,P,Q,2)


0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 0 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 1 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 2 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 3 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 4 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 5 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 6 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 7 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 8 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 9 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 10 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 11 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 12 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 13 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 14 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 15 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
Successfully completed step 16 with error: nan
0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
2500000
3000000
2500000
3000000
3500000
4000000
learning rule done
0
500000
1000000
1500000
2000000
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-39-13b9ba505841> in <module>()
      1 R = saved_data['R_matrix']
----> 2 P,Q = matrix_factorization(R,P,Q,2)

<ipython-input-38-044f2e9e0714> in matrix_factorization(R, P, Q, K, steps, alpha, beta)
     22             if idx % 500000 == 0:
     23                 print idx
---> 24             e += (R[i,j] - np.dot(P[i,:],Q[:,j]))**2
     25             for k in range(K):
     26                 e = e + (beta/2) * P[i][k]**2 + Q[k][j]**2

C:\Anaconda\lib\site-packages\scipy\sparse\lil.pyc in __getitem__(self, index)
    233 
    234         # Scalar fast path first
--> 235         if isinstance(index, tuple) and len(index) == 2:
    236             i, j = index
    237             # Use isinstance checks for common index types; this is

KeyboardInterrupt: 

In [35]:
def write_predictons(P,Q,f="one"):
    '''
    Given a P,Q of factorized prediction matrix, writes out results.
    '''
    Rhat = np.dot(P,Q)
    test_entries = [(test.Id[i], user_id[test.user[i]], artist_id[test.artist[i]]) for i in xrange(len(test))]
    
    # For (id,i,j) in test_entries, find predicted song plays in Rhat
    predictions = {Id: R[i][j] for (Id, i,j) in test_entries}
    
    pdrs = pd.DataFrame(predictions. columns=['Id','plays'])
    
    pdrs.to_csv(f + ".csv")


  File "<ipython-input-35-bb2abe5f19de>", line 11
    pdrs = pd.DataFrame(predictions. columns=['Id','plays'])
SyntaxError: keyword can't be an expression

In [ ]: