notebook.community

Edit and run



In [30]:

    
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans



In [31]:

    
load_test = True
load_train = True



In [32]:

    
# load training data
train = pd.read_csv("train.csv") if load_test else None



In [22]:

    
test = pd.read_csv("test.csv") if load_train else None



In [10]:

    
artists = pd.read_csv("artists.csv")



In [11]:

    
artists.head()









    Out[11]:






  
    
      
      artist
      name
    
  
  
    
      0
       03098741-08b3-4dd7-b3f6-1b0bfa2c879c
                         Liars
    
    
      1
       69c4cc43-8163-41c5-ac81-30946d27bb69
               CunninLynguists
    
    
      2
       7a2e6b55-f149-4e74-be6a-30a1b1a387bb
           The Desert Sessions
    
    
      3
       7002bf88-1269-4965-a772-4ba1e7a91eaa
                   Glenn Gould
    
    
      4
       dbf7c761-e332-467b-b4d9-aafe06bbcf8f
       G. Love & Special Sauce



In [12]:

    
profiles = pd.read_csv("profiles.csv")



In [18]:

    
#Turn sex data numerical
profiles.sex[profiles.sex == 'f'] = 1
profiles.sex[profiles.sex == 'm'] = 0
profiles.sex[(profiles.sex != 0) & (profiles.sex != 1)] = 0.5

#impute missing age with mean age
profiles[pd.isnull(profiles.age)] = np.mean(profiles.age)

#dictionary of artists "hash" to name to help interpret groups
bands = {artists.artist[i]:artists.name[i] for i in xrange(len(artists))}



In [33]:

    
profiles.sex









    Out[33]:





0       1
1       0
2       0
3       0
4       0
5       0
6     0.5
7     0.5
8     0.5
9     0.5
10      0
11      0
12      0
13      0
14      1
...
233271      0
233272      1
233273      1
233274      0
233275    0.5
233276      0
233277      0
233278      1
233279      1
233280      0
233281      0
233282      1
233283    0.5
233284      0
233285      0
Name: sex, Length: 233286, dtype: object



In [23]:

    
len(np.unique(profiles.country))









    Out[23]:





239



In [24]:

    
len(profiles)









    Out[24]:





233286



In [34]:

    
sum(profiles.sex == 0.5)









    Out[34]:





55432



In [39]:

    
test.head()









    Out[39]:






  
    
      
      Id
      user
      artist
    
  
  
    
      0
       1
       306e19cce2522fa2d39ff5dfc870992100ec22d2
       4ac4e32b-bd18-402e-adad-ae00e72f8d85
    
    
      1
       2
       9450d351278df4938bdea4ed86aec940a4e927ac
       1f574ab1-a46d-4586-9331-f0ded23e0411
    
    
      2
       3
       801909d6955f59033c88595d3d7f8a6a5dcd53cc
       3eb72791-6322-466b-87d3-24d74901eb2d
    
    
      3
       4
       e3ed47445c127fbeff47fb58f6bbf2f3b4535d82
       61604b45-8a91-4e33-a1b6-45d7b1fec4e5
    
    
      4
       5
       a73f46652103f3a5f7429159310f6928f79644aa
       5dfdca28-9ddc-4853-933c-8bc97d87beec



In [49]:

    
# load additional user artist similarity data from local file
import cPickle as pickle

# assumes key is the name of a file with .p extension in same directory
datafiles = ['similar_artists', 'artist_hotness', 'could_not_find']
artistInfo = {key: pickle.load(open(key + '.p')) for key in datafiles}



In [67]:

    
similar_artists = {key: artistInfo['similar_artists']['2Pac'] for key in artistInfo['similar_artists']}



In [ ]:

    
def index_mappings(data):
    for i,user in enumerate(data.user)

from scipy.sparse import csr_matrix

# we implement an algorithm to load our user/artist data into a sparce matrix
def create_sparse_data_matrix(data):
    userToRow, rowToUser = {},{}
    for i,user in enumerate(data.user):
        if user not in userToRow:
            userToRow[]

	artist	name
0	03098741-08b3-4dd7-b3f6-1b0bfa2c879c	Liars
1	69c4cc43-8163-41c5-ac81-30946d27bb69	CunninLynguists
2	7a2e6b55-f149-4e74-be6a-30a1b1a387bb	The Desert Sessions
3	7002bf88-1269-4965-a772-4ba1e7a91eaa	Glenn Gould
4	dbf7c761-e332-467b-b4d9-aafe06bbcf8f	G. Love & Special Sauce

	Id	user	artist
0	1	306e19cce2522fa2d39ff5dfc870992100ec22d2	4ac4e32b-bd18-402e-adad-ae00e72f8d85
1	2	9450d351278df4938bdea4ed86aec940a4e927ac	1f574ab1-a46d-4586-9331-f0ded23e0411
2	3	801909d6955f59033c88595d3d7f8a6a5dcd53cc	3eb72791-6322-466b-87d3-24d74901eb2d
3	4	e3ed47445c127fbeff47fb58f6bbf2f3b4535d82	61604b45-8a91-4e33-a1b6-45d7b1fec4e5
4	5	a73f46652103f3a5f7429159310f6928f79644aa	5dfdca28-9ddc-4853-933c-8bc97d87beec