In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import pickle

from pyechonest import artist

from pyechonest import config
config.ECHO_NEST_API_KEY='EIVX1I4WCCD7FQRFV'

In [3]:
similar_artists = pickle.load(open( "similar_artists.p", "rb" ))
artist_hotness = pickle.load(open( "artist_hotness.p", "rb" ))
no_matches = pickle.load(open( "could_not_find.p", "rb" ))

In [21]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [189]:
artists = pd.read_csv("artists.csv")
profiles = pd.read_csv("profiles.csv")

#dictionary of artists "hash" to name to help interpret groups
bands = {artists.artist[i]:artists.name[i] for i in xrange(len(artists))}

In [79]:
#dictionary of artists "hash" to name to help interpret groups
artist_names = {artists.artist[i]:artists.name[i] for i in xrange(len(artists))}
artist_ids = {artists.name[i]:artists.artist[i] for i in xrange(len(artists))}
names_array = np.array(artists.name)

In [14]:
similar_names = {}
for artist in similar_artists:
    similar = [match.name for match in similar_artists[artist]]
    similar_names[artist] = similar

In [26]:
artist_medians = {}
for artist in artists.artist:
    artist_medians[artist] = np.median(train[train.artist == artist].plays)

In [28]:
#pickle.dump(artist_medians, open( "artist_medians.p", "wb" ))

In [36]:
_, user, artist= test.ix[0]


Out[36]:
Id                                               1
user      306e19cce2522fa2d39ff5dfc870992100ec22d2
artist        4ac4e32b-bd18-402e-adad-ae00e72f8d85
Name: 0, dtype: object

In [90]:
def predict_row(row):
    _, user, artist= test.ix[row]
    artist_id = [artist_ids[name] for name in similar_names[artist_names[artist]] if name in artist_ids]
    
    
    
    
    
    return artist_id

In [139]:
user = test.ix[35000].user
artists = predict_row(35000)

In [142]:
print artist_names[artist]


Keane

In [ ]:
print

In [140]:
for artist in train[train.user == user].artist:
    print artist in artists


False
False
False
False
False
False
False
False
False
False
False
False
False

In [82]:
[similar_names['The Kooks'] in artist_ids]


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-82-6192761d3d41> in <module>()
----> 1 [similar_names['The Kooks'] in artist_ids]

TypeError: unhashable type: 'list'

In [49]:
artists[artists.name == 'Arctic Monkeys']


Out[49]:
artist name
434 ada7a83c-e3e1-40f1-93f9-3e73dbc9298a Arctic Monkeys

In [58]:
similar_names['The Kooks'][1] == artists.name[434]


Out[58]:
True

In [84]:
[(name in artist_ids) for name in similar_names[artist_names['ada7a83c-e3e1-40f1-93f9-3e73dbc9298a']]]


Out[84]:
[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True]

In [76]:
similar_names['The Kooks'][7]


Out[76]:
u'Max\xefmo Park'

In [469]:
artist_names;

In [86]:
8000000.0/450000000


Out[86]:
0.017777777777777778

In [164]:
artist_words = pickle.load(open( "artist_terms.p", "rb" ))

In [175]:
artist_words['Audioslave'][0]['name']


Out[175]:
u'rock'

In [158]:
terms


Out[158]:
{2}

In [177]:
artist_terms = {}
unique_terms = set([])
for artist in artist_words:
    terms = [match for match in artist_words[artist]]
    stuff = [(thing['name'], thing['weight']) for thing in terms]
    for thing in terms:
        unique_terms.add(thing['name'])
    artist_terms[artist] = stuff

In [1]:
artist_terms


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-7d02d93c460e> in <module>()
----> 1 artist_terms

NameError: name 'artist_terms' is not defined

In [180]:
len(unique_terms)


Out[180]:
808

In [201]:
for data in train[train.user == profiles.user[0]].iterrows():
    print bands[data[1].artist],  data[1].plays


 Andrew Lloyd Webber 60
The Beatles 51
Hello Saferide 156
The Beach Boys 32
Red Hot Chili Peppers 50
Anna Ternheim 83
Sufjan Stevens 136
Queen 240
Gavin DeGraw 54
Foo Fighters 84
Live 139
Mika 41
Blur 33
The Cardigans 72
Ane Brun 40
Counting Crows 237
José González 32
Nirvana 66
UB40 70
Avril Lavigne 47

In [211]:
sum(train[train.user == profiles.user[0]].plays)


Out[211]:
1723

In [215]:
no_matches


Out[215]:
['f1a95c6b-fb2a-41a6-bfcb-2453fee2a38c',
 'bb1f91d2-1b54-4ee5-b55a-626492a5904f',
 '9bf79f68-c064-44a1-8c2c-5764f1d7c016',
 '89ad4ac3-39f7-470e-963a-56509c546377',
 '4b179fe2-dfa5-40b1-b6db-b56dbc3b5f09',
 '10b7b68c-390d-469a-915b-40bac704f288',
 '5aca3051-afa2-4f5c-9974-cc9418482a58',
 'b5da400c-9a62-4686-b6fe-91518e57ce5d',
 'ae681605-2801-4120-9a48-e18752042306',
 '8f3f7fec-cabf-4366-9c31-06f204b402f5',
 'f26c72d3-e52c-467b-b651-679c73d8e1a7',
 '64b86e99-b6ec-4fb1-a5cd-f95482d3b57a',
 'e01646f2-2a04-450d-8bf2-0d993082e058',
 '5385c403-1c49-4f2f-9b98-7085b5c84371']

In [219]:
count = 0
for i in xrange(len(no_matches)):
    count = count + len(test[test.artist==no_matches[i]])

In [220]:



Out[220]:
11851

In [222]:
pickle.dump(artist_terms, open("clean_artist_terms.p", "wb"))

In [226]:
profiles.user


Out[226]:
0     fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d
1     5909125332c108365a26ccf0ee62636eee08215c
2     d1867cbda35e0d48e9a8390d9f5e079c9d99ea96
3     63268cce0d68127729890c1691f62d5be5abd87c
4     02871cd952d607ba69b64e2e107773012c708113
5     0938eb3d1b449b480c4e2431c457f6ead7063a34
6     e4c6b36e65db3d48474dd538fe74d2dbb5a2e79e
7     b97479f9a563a5c43b423a976f51fd509e1ec5ba
8     3bb020df0ff376dfdded4d5e63e2d35a50b3c535
9     f3fb86c0f024f640cae3fb479f3a27e0dd499891
10    ed3f59e940c08a34434ecb999cbb184bd4f3fe60
11    84b0024025a9b85ac11250419f2223cd36171be7
12    6af1f8a3748f2905913f1ef698d319fb39251200
13    6f6061ee85758d2f28dc813793d10fc28cc7ab3e
14    beb313d271e1b0aee0d43e2598ad606b97246151
...
233271    9c34997e5b77219785b9c34fc7733bc4612a53c8
233272    bcdb72cc8c5c0109eaaeed326ff6f2d8205b2283
233273    a1bca71ebc26a6374bee683d6bc6f50bda31c0cd
233274    7574ac82d225d8c09e0fa56efb805b4cc44ca4dc
233275    5a2f3cbe64c34f3941eb18e050a9fbef33b902bb
233276    b8916697b5fecb4de815c8a265c9e1a6ea5f5f7c
233277    8b583d8dac0c25313f1564c25b6f74693a0db2cc
233278    7b32f67a47cb8ff53c2d0f6297b2329182828751
233279    7e528097def0a178be1208993c4e9963b0a26c0c
233280    99dbcbb090e46947efc47be63a7dd654466f6f00
233281    1e0f096b96d437429f87bb69d4d9ec47846b2d9d
233282    5b35a58cb51f890507c30d086121ef66b92b35f1
233283    ebda8b6e7d5787c31bddbb4d4c08af80af2cdf3a
233284    bc2eaff609707b14bca23e06fb5979ccd4994af1
233285    118276dbaf61bc28a11bfc5abca1042de1990d83
Name: user, Length: 233286, dtype: object

In [231]:
small_train = train[:5]

In [245]:
#Get total playcounts for each user for training set

user_playcounts = {}
for row in train.iterrows():
    if row[0] % 100000 == 0:
        print row[0]
    user = row[1].user
    if user in user_playcounts:
        user_playcounts[user] = user_playcounts[user] + row[1].plays
    else:
         user_playcounts[user]  = row[1].plays + 0.0

In [248]:
#each "point" will be a 808 element array
#following map from term to index
i = 0
term_index = {}
for term in unique_terms:
    term_index[term] = i
    i = i + 1

In [257]:
#likewise for users
user_index = {}
for row in profiles.iterrows():
    user_index[row[1].user] = row[0]

In [296]:
#likewise for users
artist_index = {}
for row in artists.iterrows():
    artist_index[row[1].artist] = row[0]

In [263]:
user_matrix = np.zeros((len(profiles), len(unique_terms)))

In [412]:
subset = train[train.user == "ed3f59e940c08a34434ecb999cbb184bd4f3fe60"]
subset.head()


Out[412]:
user artist plays
10134 ed3f59e940c08a34434ecb999cbb184bd4f3fe60 67f66c07-6e61-4026-ade5-7e782fad3a5d 176
233503 ed3f59e940c08a34434ecb999cbb184bd4f3fe60 f4a31f0a-51dd-4fa7-986d-3095c40c5ed9 140
262683 ed3f59e940c08a34434ecb999cbb184bd4f3fe60 d13f0f47-36f9-4661-87fe-2de56f45c649 1253
749080 ed3f59e940c08a34434ecb999cbb184bd4f3fe60 4449ccf6-c948-4d33-aa97-b6ad98ce4b5b 210
910645 ed3f59e940c08a34434ecb999cbb184bd4f3fe60 4b585938-f271-45e2-b19a-91c634b5e396 347

In [421]:
a


Out[421]:
{2}

In [422]:
#fill the user matrix
flagged_artists = set([])
for row in train.iterrows():
    if row[0] % 200000 == 0:
        print row[0]
    artist = row[1].artist
    user = row[1].user
    plays = row[1].plays
    user_idx = user_index[user]
    try:
        for term, weight in artist_terms[bands[artist]]:
            user_matrix[user_idx][term_index[term]] +=  weight*plays
    except:
        flagged_artists.add(artist)


0
200000
400000
600000
800000
1000000
1200000
1400000
1600000
1800000
2000000
2200000
2400000
2600000
2800000
3000000
3200000
3400000
3600000
3800000
4000000

In [382]:
artist_matrix = np.zeros((len(artists), len(unique_terms)))
for artist in artists.artist:
    artist_idx = artist_index[artist]
    if bands[artist] in artist_terms:
        for term, weight in artist_terms[bands[artist]]:
            artist_matrix[artist_idx][term_index[term]] +=  weight
    else: 
        print "Could not add", artist


Could not add f1a95c6b-fb2a-41a6-bfcb-2453fee2a38c
Could not add 9bf79f68-c064-44a1-8c2c-5764f1d7c016
Could not add 89ad4ac3-39f7-470e-963a-56509c546377
Could not add 4b179fe2-dfa5-40b1-b6db-b56dbc3b5f09
Could not add 10b7b68c-390d-469a-915b-40bac704f288
Could not add 5aca3051-afa2-4f5c-9974-cc9418482a58
Could not add b5da400c-9a62-4686-b6fe-91518e57ce5d
Could not add ae681605-2801-4120-9a48-e18752042306
Could not add 8f3f7fec-cabf-4366-9c31-06f204b402f5
Could not add f26c72d3-e52c-467b-b651-679c73d8e1a7
Could not add 64b86e99-b6ec-4fb1-a5cd-f95482d3b57a
Could not add 5385c403-1c49-4f2f-9b98-7085b5c84371

In [424]:
len(flagged_artists)


Out[424]:
12

In [383]:
user_matrix[user_index[user]]


Out[383]:
array([  350.95277673,     0.        ,     0.        ,     0.        ,
         559.23645802,     0.        ,     0.        ,     0.        ,
           0.        ,   144.12817391,     0.        ,  1014.82236261,
        2066.42606827,     0.        ,     0.        ,     0.        ,
         328.78603122,     0.        ,     0.        ,     0.        ,
           0.        ,    26.95815574,     0.        ,    93.48616896,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   310.71566069,     0.        ,
           0.        ,   177.18302769,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,    62.66484707,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   997.52099884,    96.50268187,
        1341.7879714 ,     0.        ,  1438.98026332,     0.        ,
           0.        ,     0.        ,     0.        ,   187.08446955,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,    75.85795335,     0.        ,
           0.        ,     0.        ,     0.        ,    61.63228674,
         233.43368592,     0.        ,     0.        ,     0.        ,
         233.13785509,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   275.10921459,     0.        ,
           0.        ,     0.        ,     0.        ,    58.64988827,
           0.        ,     0.        ,     0.        ,     0.        ,
         745.63835696,     0.        ,     0.        ,   252.93401522,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,   824.45753802,     0.        ,     0.        ,
           0.        ,     0.        ,   584.28620363,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
        1575.34455438,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,    34.06530589,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
          78.42462416,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,  1008.02137335,     0.        ,
         540.13061528,     0.        ,     0.        ,     0.        ,
         392.72536138,   241.17501655,     0.        ,  1476.90132219,
          56.73346265,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,    56.52432185,     0.        ,   356.39663009,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,    80.47237019,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,  6753.72665997,
           0.        ,     0.        ,     0.        ,     0.        ,
         366.43249571,     0.        ,     0.        ,     0.        ,
         412.53053511,     0.        ,     0.        ,     0.        ,
        1518.54025515,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         144.20741265,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         498.48101183,     0.        ,   229.66915983,   178.99891881,
           0.        ,     0.        ,     0.        ,    80.52613834,
           0.        ,    57.6264674 ,     0.        ,     0.        ,
           0.        ,     0.        ,  1346.05668549,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,   139.79558913,     0.        ,     0.        ,
          60.28736166,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   600.68668531,  2374.00141843,
           0.        ,     0.        ,     0.        ,   197.50476685,
         176.09816726,     0.        ,     0.        ,   388.81271711,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,    46.01217837,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         153.22502118,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   351.593844  ,     0.        ,
          98.42696267,     0.        ,     0.        ,  1623.46102344,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,    56.32101879,     0.        ,     0.        ,
         819.92644794,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,   120.96487457,
         245.20806424,     0.        ,     0.        ,     0.        ,
         257.27634231,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         340.15837438,    78.86222873,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   232.5994756 ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,  1174.7296862 ,     0.        ,
           0.        ,     0.        ,   125.47500793,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         161.15528479,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,  2122.31545681,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   480.1808959 ,   136.73698551,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,   129.80920239,
           0.        ,     0.        ,   161.26752007,     0.        ,
           0.        ,     0.        ,   267.7763915 ,     0.        ,
           0.        ,     0.        ,   395.38045503,  1013.10656428,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         385.63764725,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   128.98311768,     0.        ,
           0.        ,    77.21692189,     0.        ,  1622.31293318,
           0.        ,     0.        ,     0.        ,  3914.45356773,
           0.        ,     0.        ,     0.        ,   150.97374626,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,   193.01540851,
           0.        ,   338.96833983,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,   232.22475338,     0.        ,     0.        ,
           0.        ,     0.        ,   627.86651143,     0.        ,
           0.        ,   978.52496214,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         105.35437353,     0.        ,  1825.83809188,   109.85245623,
         363.3696366 ,     0.        ,     0.        ,     0.        ,
           0.        ,   786.27304598,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,  2566.21568849,     0.        ,
         828.70647048,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,   745.14652994,
         137.45462602,   671.37510378,     0.        ,   452.65664697,
           0.        ,   900.81002301,     0.        ,   454.68495076,
           0.        ,     0.        ,   110.85849674,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         788.49635542,     0.        ,     0.        ,     0.        ,
           0.        ,   134.27829281,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,   636.30601359,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,   194.3417614 ,
           0.        ,   100.68275869,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,   383.9834865 ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,   258.96331175,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,    40.45800372,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,   864.60339781,
           0.        ,   239.02659214,     0.        ,     0.        ,
           0.        ,     0.        ,   577.73159849,     0.        ,
           0.        ,    51.61648607,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,   238.33635752,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,    43.38967835,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
         193.20205045,     0.        ,     0.        ,     0.        ,
           0.        ,  1432.63536948,     0.        ,   494.14171681,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,    33.82023175,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ])

In [384]:
np.dot(artist_matrix[artist_index[artist]], user_matrix[user_index[user]])


Out[384]:
10464.236680789982

In [321]:
artist = "fbcd7b29-455f-49e6-9c4f-8249d20a055e"

In [465]:
factor = []
error = []
plays_array = []
dot_prods = []
for row in subset.iterrows():
    user = row[1].user
    artist, plays = row[1].artist, row[1].plays
    print plays/np.dot(artist_matrix[artist_index[artist]], user_matrix[user_index[user]])
    plays_array.append(plays)
    dot_prods.append(np.dot(artist_matrix[artist_index[artist]], user_matrix[user_index[user]]))
    factor.append(plays/np.dot(artist_matrix[artist_index[artist]], user_matrix[user_index[user]]))
    error.append(abs(constant*np.dot(artist_matrix[artist_index[artist]], user_matrix[user_index[user]]) - plays))
print np.mean(error)


0.0116158314768
0.0100778845679
0.047497933679
0.00947974535071
0.0175858872538
0.00788411384253
0.00886000442058
0.00983086596861
0.0143106777637
0.0258666556721
0.00859597368749
0.0222875306125
0.0121694970112
0.0288088327391
0.0495989104044
0.0165371079065
0.0115766505037
163.251468794

In [428]:
constant = np.mean(factor)

In [464]:
constant = np.median(factor)

In [392]:
constant = np.median(subset.plays)

In [425]:
plays


Out[425]:
553

In [432]:
plays_array


Out[432]:
[176,
 140,
 1253,
 210,
 347,
 188,
 164,
 157,
 243,
 410,
 162,
 410,
 188,
 412,
 824,
 239,
 171]

In [433]:
dot_prods


Out[433]:
[15151.734970657461,
 13891.804282675892,
 26380.094941970441,
 22152.493788690379,
 19731.731188263228,
 23845.419251285217,
 18510.148778148017,
 15970.108889832205,
 16980.3278372149,
 15850.522201121734,
 18846.032560075811,
 18395.936594665218,
 15448.461002713093,
 14301.169496570648,
 16613.268180306648,
 14452.345679291107,
 14771.111898524146]

In [434]:
from sklearn import linear_model

In [454]:
model = linear_model.LinearRegression(fit_intercept="True")
model.fit(np.array(dot_prods), np.array(plays_array))


Out[454]:
LinearRegression(copy_X=True, fit_intercept='True', normalize=False)

In [457]:
b = model.intercept_
coef = model.coef_[0]

In [449]:
dot_prods = [[dot_prod] for dot_prod in dot_prods]

In [471]:
user_matrix


Out[471]:
array([[  81.83936095,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [ 182.03429176,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [  20.33675475,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       ..., 
       [  75.69919693,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [  72.79605711,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [ 127.42017054,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ]])

In [472]:
pickle.dump( user_matrix, open( "user_matrix_unscaled.p", "wb" ) )

In [473]:
pickle.dump( artist_matrix, open( "artist_matrix.p", "wb" ) )

In [ ]: