In [30]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
In [31]:
# Flags consulted by the cells that read train.csv and test.csv.
load_test, load_train = True, True
In [32]:
# Load the training data. The read is gated on load_train (it was previously
# gated on load_test by mistake); train is None when the flag is off.
train = pd.read_csv("train.csv") if load_train else None
In [22]:
# Load the test data. The read is gated on load_test (it was previously
# gated on load_train by mistake); test is None when the flag is off.
test = pd.read_csv("test.csv") if load_test else None
In [10]:
# Read artists.csv; its `artist` and `name` columns are used later to build
# the `bands` lookup.
artists = pd.read_csv("artists.csv")
In [11]:
# Preview the first rows of the artists table.
artists.head()
Out[11]:
In [12]:
# Read profiles.csv; the cells below use its `sex`, `age` and `country` columns.
profiles = pd.read_csv("profiles.csv")
In [18]:
# Turn sex data numerical: 'f' -> 1.0, 'm' -> 0.0, anything else -> 0.5.
# The encoded column is built as a fresh Series and assigned back in one step.
# The previous chained form (profiles.sex[mask] = ...) can write to a temporary
# copy and trigger SettingWithCopyWarning.
# Already-encoded 1/0 values are kept, so re-running this cell is harmless.
sex_raw = profiles['sex']
sex_encoded = pd.Series(0.5, index=profiles.index)
sex_encoded[(sex_raw == 'f') | (sex_raw == 1)] = 1.0
sex_encoded[(sex_raw == 'm') | (sex_raw == 0)] = 0.0
profiles['sex'] = sex_encoded

# Impute missing age with the mean age. Only the `age` column is filled; the
# previous boolean-row assignment overwrote EVERY column of the affected rows
# (user, sex, country, ...) with the mean age.
profiles['age'] = profiles['age'].fillna(profiles['age'].mean())

# Dictionary of artists "hash" to name to help interpret groups.
# zip pairs the two columns positionally and works on both Python 2 and 3.
bands = dict(zip(artists.artist, artists.name))
In [33]:
# Display the sex column of profiles.
profiles.sex
Out[33]:
In [23]:
# Number of distinct values in the country column, as reported by np.unique.
len(np.unique(profiles.country))
Out[23]:
In [24]:
# Number of rows in profiles.
profiles.shape[0]
Out[24]:
In [34]:
# Count the profiles whose sex value equals 0.5.
int((profiles.sex == 0.5).sum())
Out[34]:
In [39]:
# Preview the first rows of the test table (test is None when its load flag
# is off, in which case this cell fails).
test.head()
Out[39]:
In [49]:
# load additional user artist similarity data from local file
import cPickle as pickle


def _load_pickle(name):
    """Unpickle and return the contents of '<name>.p' in the working directory.

    The file is opened in a `with` block so the handle is closed right after
    loading; it was previously left open.

    WARNING: pickle.load can execute arbitrary code, so only use this on
    files you created yourself or otherwise trust.
    """
    # NOTE(review): the default (text) open mode is kept as before; switch to
    # 'rb' if these pickles were written in binary mode.
    with open(name + '.p') as handle:
        return pickle.load(handle)


# assumes key is the name of a file with .p extension in same directory
datafiles = ['similar_artists', 'artist_hotness', 'could_not_find']
artistInfo = {key: _load_pickle(key) for key in datafiles}
In [67]:
# Build a dict with one entry per key of artistInfo['similar_artists'].
# NOTE(review): every key is mapped to the SAME value, the entry stored under
# '2Pac'. This looks like leftover debugging; presumably
# artistInfo['similar_artists'][key] was intended. Confirm before relying on it.
similar_artists = {key: artistInfo['similar_artists']['2Pac'] for key in artistInfo['similar_artists']}
In [ ]:
def index_mappings(data):
for i,user in enumerate(data.user)
from scipy.sparse import csr_matrix
# we implement an algorithm to load our user/artist data into a sparse matrix
def create_sparse_data_matrix(data):
userToRow, rowToUser = {},{}
for i,user in enumerate(data.user):
if user not in userToRow:
userToRow[]