In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
# set to False to suppress the exploratory output in the cells below
display = False
In [2]:
train = pd.read_csv("train.csv")
In [3]:
test = pd.read_csv("test.csv")
In [4]:
if display:
    train.head()
In [5]:
artists = pd.read_csv("artists.csv")
In [6]:
if display:
    artists.head()
In [7]:
profiles = pd.read_csv("profiles.csv")
In [8]:
# Encode sex numerically: f -> 1, m -> 0, missing/other -> 0.5.
# Use .loc rather than chained indexing, which can silently assign to a copy.
profiles.loc[profiles.sex == 'f', 'sex'] = 1
profiles.loc[profiles.sex == 'm', 'sex'] = 0
profiles.loc[(profiles.sex != 0) & (profiles.sex != 1), 'sex'] = 0.5
# Impute missing age with the mean age (again via .loc so the write-back
# actually modifies profiles rather than a temporary copy)
profiles.loc[pd.isnull(profiles.age), 'age'] = np.mean(profiles.age)
# Dictionary from artist "hash" to name, to help interpret groups
bands = {artists.artist[i]: artists.name[i] for i in xrange(len(artists))}
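A small sanity check, added here for illustration (it is not part of the original run): confirm the encoding and imputation took effect, which the chained assignments above would not have guaranteed.
In [ ]:
# Sanity check (added sketch): every sex value is one of {0, 0.5, 1}
# and no age is missing after imputation.
assert profiles.sex.isin([0, 0.5, 1]).all()
assert profiles.age.notnull().all()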
In [9]:
if display: profiles.sex
In [10]:
if display: len(np.unique(profiles.country))
In [11]:
if display: len(profiles)
In [12]:
user0 = profiles.user[0]
In [13]:
if display: train[train.user == user0]
In [14]:
if display: test[test.user == user0]
In [15]:
if display: sum(profiles.sex == 0.5)
In [16]:
def similar_users(user, plus_minus):
    '''Boolean mask over profiles: rows with the same sex as `user`
    (a row index into profiles) and age within plus_minus years.'''
    age = profiles.age[user]
    gender = profiles.sex[user]
    return (profiles.sex == gender) & (profiles.age >= age - plus_minus) & (profiles.age <= age + plus_minus)
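A quick hedged example of how the mask is meant to be used (the index 0 and the +/- 3 year window are arbitrary here):
In [ ]:
# Sketch: similar_users returns a boolean Series aligned with profiles,
# so it can be summed for a count or used directly as a row index.
mask = similar_users(0, 3)
print mask.sum(), "profiles match user 0 on sex and age +/- 3 years"
if display: profiles[mask].head()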
In [17]:
test.head()
In [18]:
# silly - can likely ignore this (unfinished helper; returns the match mask)
def predict_row(index):
    user = test.user[index]
    artist = test.artist[index]
    user_idx = list(profiles.user).index(user)
    matches = similar_users(user_idx, 3)
    return matches
In [19]:
user = test.user[0]
artist = test.artist[0]
# finding index of 1st test user in profiles
user_idx = list(profiles.user).index(user)
matches = similar_users(user_idx, 3)
In [20]:
if display: list(profiles.user).index(user)
In [21]:
if display: profiles.user[matches]
In [22]:
if display: len(matches)
In [23]:
if display: len(train.user)
In [24]:
subset = train[train.user == user]
In [25]:
if display: subset
In [26]:
#to help interpret
for row in subset.iterrows():
    if display: print bands[row[1].artist], row[1].plays
In [27]:
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
In [28]:
U = len(profiles)
D = len(artists)
In [29]:
# Make dictionary of user to user_id
user_id = {profiles.user[i]:i for i in xrange(len(profiles))}
id_user = {value: key for key,value in user_id.iteritems()}
# Make dictionary of artist to artist_id
artist_id = {artists.artist[i]:i for i in xrange(len(artists))}
id_artist = {value: key for key,value in artist_id.iteritems()}
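A one-line round-trip check (an added sketch) that the forward and inverse maps agree:
In [ ]:
# The id maps should be mutual inverses: hash -> id -> hash is the identity.
# Spot-check the first 100 entries of each.
assert all(id_user[user_id[u]] == u for u in profiles.user[:100])
assert all(id_artist[artist_id[a]] == a for a in artists.artist[:100])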
In [30]:
import pickle
# Load pickled objects by filename; if the R matrix is missing, rebuild it
# from the train data (user, artist, plays) as a sparse U x D matrix.
def load_data(filenames):
    d = {}
    for f in filenames:
        try:
            d[f] = pickle.load(open(f + '.p', 'rb'))
        except Exception:
            print "Could not load {} into data.".format(f)
            if 'matrix' in f:
                print "Assuming matrix. Attempting to recreate with available data."
                d[f] = lil_matrix((U, D))
                for i in xrange(len(train)):
                    if i % 100000 == 0:
                        print i
                    d[f][user_id[train.user[i]], artist_id[train.artist[i]]] = train.plays[i]
    return d

def dump_data(data):
    '''
    Input: data - a dictionary of filename: object items to be dumped using pickle.
    '''
    for f, o in data.iteritems():
        # write to f + '.p' so load_data can find the file again
        pickle.dump(o, open(f + '.p', 'wb'))
In [31]:
datafiles = ['artist_hotness', 'could_not_find', 'R_matrix']
saved_data = load_data(datafiles)
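Since load_data may have just rebuilt R_matrix from scratch, it is worth persisting the result so the next run loads it instantly. A hedged sketch reusing the dump_data helper above:
In [ ]:
# Persist whatever load_data returned; dump_data writes each object to
# <name>.p, the same path load_data reads from on the next run.
dump_data(saved_data)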
In [32]:
# List of (i,j) tuples in train data
train_entries = []
for i in xrange(len(train)):
    if i % 500000 == 0:
        print i
    train_entries.append((user_id[train.user[i]], artist_id[train.artist[i]]))
In [33]:
# Initialize P (users x K) and Q (artists x K) -- start with K = 2 latent dimensions
K = 2
P = np.random.rand(U, K)
Q = np.random.rand(D, K)
In [38]:
# Matrix factorization model (SGD with L2 regularization).
# Note: a full run would take on the order of 1000 steps, and each step over
# the ~4-million-entry train set takes roughly 3 minutes, so we cap at 100.
def matrix_factorization(R, P, Q, K, steps=100, alpha=0.01, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        # SGD pass over the (i,j) entries present in the train data
        for (idx, (i, j)) in enumerate(train_entries):
            if idx % 500000 == 0:
                print idx
            # prediction error for this entry
            eij = R[i, j] - np.dot(P[i, :], Q[:, j])
            for k in xrange(K):
                # learning rule
                P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        print 'learning rule done'
        # regularized squared error over the train entries
        e = 0
        for (idx, (i, j)) in enumerate(train_entries):
            if idx % 500000 == 0:
                print idx
            e += (R[i, j] - np.dot(P[i, :], Q[:, j])) ** 2
            for k in range(K):
                e += (beta / 2) * (P[i][k] ** 2 + Q[k][j] ** 2)
        print "Successfully completed step {} with error: {}".format(step, e)
        if e < 0.1:
            break
    return P, Q.T
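A quick toy check of the learning rule (an added sketch, not part of the original pipeline): factor a tiny 2x2 matrix and eyeball the reconstruction. Because matrix_factorization reads the global train_entries, the sketch swaps it out temporarily; the run is verbose since the function prints progress.
In [ ]:
# Hypothetical toy check: factor a 2x2 matrix and compare the reconstruction.
_saved_entries = train_entries
train_entries = [(0, 0), (0, 1), (1, 0), (1, 1)]
toy_R = lil_matrix(np.array([[5.0, 3.0], [4.0, 1.0]]))
toy_P, toy_Q = matrix_factorization(toy_R, np.random.rand(2, 2), np.random.rand(2, 2), 2, steps=50)
print np.dot(toy_P, toy_Q.T)  # should be close to [[5, 3], [4, 1]]
train_entries = _saved_entries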
In [ ]:
R = saved_data['R_matrix']
P, Q = matrix_factorization(R, P, Q, K)
In [35]:
def write_predictions(P, Q, f="one"):
    '''
    Given P, Q from the factorized prediction matrix, writes out results.
    '''
    Rhat = np.dot(P, Q.T)
    test_entries = [(test.Id[i], user_id[test.user[i]], artist_id[test.artist[i]]) for i in xrange(len(test))]
    # For (Id,i,j) in test_entries, look up the predicted plays in Rhat
    predictions = {Id: Rhat[i, j] for (Id, i, j) in test_entries}
    pdrs = pd.DataFrame(predictions.items(), columns=['Id', 'plays'])
    pdrs.to_csv(f + ".csv", index=False)
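A hedged usage note: once the factorization above finishes, the submission file can be written in one call (the filename "mf_predictions" is just an example):
In [ ]:
# Write Id,plays predictions for the test set to mf_predictions.csv
write_predictions(P, Q, f="mf_predictions")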