In [30]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [31]:
load_test = True
load_train = True

In [32]:
# load training data
train = pd.read_csv("train.csv") if load_test else None

In [22]:
test = pd.read_csv("test.csv") if load_train else None

In [10]:
artists = pd.read_csv("artists.csv")

In [11]:
artists.head()


Out[11]:
artist name
0 03098741-08b3-4dd7-b3f6-1b0bfa2c879c Liars
1 69c4cc43-8163-41c5-ac81-30946d27bb69 CunninLynguists
2 7a2e6b55-f149-4e74-be6a-30a1b1a387bb The Desert Sessions
3 7002bf88-1269-4965-a772-4ba1e7a91eaa Glenn Gould
4 dbf7c761-e332-467b-b4d9-aafe06bbcf8f G. Love & Special Sauce

In [12]:
profiles = pd.read_csv("profiles.csv")

In [18]:
#Turn sex data numerical
profiles.sex[profiles.sex == 'f'] = 1
profiles.sex[profiles.sex == 'm'] = 0
profiles.sex[(profiles.sex != 0) & (profiles.sex != 1)] = 0.5

#impute missing age with mean age
profiles[pd.isnull(profiles.age)] = np.mean(profiles.age)

#dictionary of artists "hash" to name to help interpret groups
bands = {artists.artist[i]:artists.name[i] for i in xrange(len(artists))}

In [33]:
profiles.sex


Out[33]:
0       1
1       0
2       0
3       0
4       0
5       0
6     0.5
7     0.5
8     0.5
9     0.5
10      0
11      0
12      0
13      0
14      1
...
233271      0
233272      1
233273      1
233274      0
233275    0.5
233276      0
233277      0
233278      1
233279      1
233280      0
233281      0
233282      1
233283    0.5
233284      0
233285      0
Name: sex, Length: 233286, dtype: object

In [23]:
len(np.unique(profiles.country))


Out[23]:
239

In [24]:
len(profiles)


Out[24]:
233286

In [34]:
sum(profiles.sex == 0.5)


Out[34]:
55432

In [39]:
test.head()


Out[39]:
Id user artist
0 1 306e19cce2522fa2d39ff5dfc870992100ec22d2 4ac4e32b-bd18-402e-adad-ae00e72f8d85
1 2 9450d351278df4938bdea4ed86aec940a4e927ac 1f574ab1-a46d-4586-9331-f0ded23e0411
2 3 801909d6955f59033c88595d3d7f8a6a5dcd53cc 3eb72791-6322-466b-87d3-24d74901eb2d
3 4 e3ed47445c127fbeff47fb58f6bbf2f3b4535d82 61604b45-8a91-4e33-a1b6-45d7b1fec4e5
4 5 a73f46652103f3a5f7429159310f6928f79644aa 5dfdca28-9ddc-4853-933c-8bc97d87beec

In [49]:
# load additional user artist similarity data from local file
import cPickle as pickle

# assumes key is the name of a file with .p extension in same directory
datafiles = ['similar_artists', 'artist_hotness', 'could_not_find']
artistInfo = {key: pickle.load(open(key + '.p')) for key in datafiles}

In [67]:
similar_artists = {key: artistInfo['similar_artists']['2Pac'] for key in artistInfo['similar_artists']}

In [ ]:
def index_mappings(data):
    for i,user in enumerate(data.user)

from scipy.sparse import csr_matrix

# we implement an algorithm to load our user/artist data into a sparce matrix
def create_sparse_data_matrix(data):
    userToRow, rowToUser = {},{}
    for i,user in enumerate(data.user):
        if user not in userToRow:
            userToRow[]