In [1]:
import getpass

from muslytics.DatabaseUtils import connect, Session
from muslytics.Utils import Track

import numpy as np
import pandas as pd

In [2]:
# Connect to the local muslytics MySQL database (password prompted via
# getpass, never hardcoded) and load every Track row into a DataFrame
# indexed by track id.
db = connect('root', getpass.getpass(), 'localhost', 'muslytics')
select = Session().query(Track)
# NOTE: removed an unused `results = db.execute(select.statement).fetchall()`
# line -- it performed a duplicate full-table fetch whose result was never read;
# read_sql below issues the same query and is the only consumer.
df_orig = pd.read_sql(select.statement, db, index_col='id')

print('Read {num} tracks'.format(num=len(df_orig)))


········
Read 1690 tracks

Predictions


In [35]:
# Hold out TEST_SIZE tracks for evaluation; the rest is training data.
TEST_SIZE = 50

# Drop identifier/text columns with no predictive signal, keep complete rows
# only, and coerce dtypes for modeling.
df = df_orig.drop(['spotify_id', 'name', 'artists'], 1)
df = df.dropna()
df['loved'] = df['loved'].astype('int')
df['genre'] = df['genre'].astype('category')

# Fixed seed so the train/test split is reproducible across runs.
test_data = df.sample(n=TEST_SIZE, random_state=42)
train_data = df.drop(test_data.index)

# One-hot encode genre (first level dropped as the baseline) and carve the
# dummy-encoded frame into the same train/test partition.
df_dummies = pd.get_dummies(df, prefix='genre', columns=['genre'], drop_first=True)
dummy_test = df_dummies.loc[test_data.index.values]
dummy_train = df_dummies.drop(test_data.index)

dependent = 'rating'
base_predictors = ['plays', 'loved', 'popularity',
                   'acousticness', 'danceability', 'duration_ms', 'energy',
                   'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                   'speechiness', 'tempo', 'time_signature', 'valence', 'year']

# Base predictors plus every genre dummy column produced by get_dummies above.
base_predictors_dummy = base_predictors + [
    'genre_christian & gospel', 'genre_comedy', 'genre_country',
    'genre_dance', 'genre_electronic', 'genre_hip hop/rap',
    'genre_hip-hop/rap', 'genre_holiday', 'genre_piano', 'genre_pop',
    'genre_pop latino', 'genre_r&b/soul', 'genre_rock',
    'genre_singer/songwriter', 'genre_soundtrack']

dummy_train_y = dummy_train.rating.values

test_data_X = dummy_test.as_matrix(columns=base_predictors_dummy)
test_data_y = dummy_test.rating.values

FMT_CORRECT = '{i}:\tPredicted: {p:.2f}\tActual: {a:.0f}\t\tCORRECT'
FMT_INCORRECT = '{i}:\tPredicted: {p:.2f}\tActual: {a:.0f}\tErr: {e:.2f}'
FMT_CORRECT_ROUND = '{i}:\tPredicted: {p:.0f}\tActual: {a:.0f}\t\tCORRECT'
FMT_INCORRECT_ROUND = '{i}:\tPredicted: {p:.0f}\tActual: {a:.0f}\tErr: {e:.0f}'

def print_accuracy(results, round=True):
    """Print one line per prediction plus total-correct and MSE summaries.

    Args:
        results: iterable of (predicted, actual, track_id) tuples.
        round: if True, format values as whole numbers (the caller is
            expected to have pre-rounded the predictions); otherwise show
            two decimals.  The name shadows the builtin `round` but is kept
            for backward compatibility with existing keyword callers; the
            builtin is not used inside this function.

    Returns:
        (total_correct, mse) tuple.  Previous callers that ignored the old
        None return are unaffected.
    """
    correct_fmt = FMT_CORRECT_ROUND if round else FMT_CORRECT
    incorrect_fmt = FMT_INCORRECT_ROUND if round else FMT_INCORRECT

    # BUG FIX: the summary lines previously used the global `dummy_test`
    # for the denominator, silently misreporting whenever `results` was not
    # exactly that test set.  Count the results actually processed instead.
    total = 0
    total_correct = 0
    total_err = 0
    for pred, act, i in results:
        err = (act - pred) ** 2
        if err == 0:
            total_correct += 1
            print(correct_fmt.format(p=pred, a=act, i=i))
        else:
            print(incorrect_fmt.format(p=pred, a=act, i=i, e=err))
        total_err += err
        total += 1

    print('\nTotal Correct: {c} of {t}'.format(c=total_correct, t=total))

    # Guard against an empty result set (previously a ZeroDivisionError).
    mse = total_err / float(total) if total else 0.0
    print('MSE: {e:.4f}'.format(e=mse))
    return total_correct, mse

Multiple Regression

Note that this probably won't be a great fit because we want to predict a discrete rating.


In [18]:
from sklearn.feature_selection import f_regression
ALPHA = 0.05

def get_least_stat_sig(predictors):
    """Return the worst (feature, p_value) failing significance, or False.

    Runs a univariate F-test of each predictor against the training target
    (note: these are per-feature p-values, not joint multiple-regression
    p-values) and returns the (feature, p_value) pair with the largest
    p-value >= ALPHA.  Returns False when every predictor is significant.

    Reads globals `dummy_train` and `dummy_train_y`.
    """
    # F statistics are unused; only the p-values drive elimination.
    F, p = f_regression(dummy_train.as_matrix(columns=predictors), dummy_train_y)
    not_stat_sig = [(feature, p_val)
                    for feature, p_val in zip(predictors, p)
                    if p_val >= ALPHA]

    if not_stat_sig:
        # max() replaces sort-then-take-first: O(n) instead of O(n log n)
        # and the same tie-breaking (first occurrence of the maximum wins).
        least = max(not_stat_sig, key=lambda x: x[1])
        print('{f} is not statistically significant at a={a:.2f} (p={p:.4f})'.format(f=least[0], a=ALPHA, p=least[1]))
        return least
    else:
        return False
    
# Backward elimination: repeatedly drop the least-significant predictor
# until every remaining one passes the significance test.
multiple_regression_predictors = base_predictors_dummy[:]

while True:
    to_remove = get_least_stat_sig(multiple_regression_predictors)
    if not to_remove:
        break
    multiple_regression_predictors.remove(to_remove[0])

print('\nStatistically significant: {f}'.format(f=', '.join(multiple_regression_predictors)))


tempo is not statistically significant at a=0.05 (p=0.7432)
danceability is not statistically significant at a=0.05 (p=0.7237)
valence is not statistically significant at a=0.05 (p=0.7068)
key is not statistically significant at a=0.05 (p=0.6981)
genre_christian & gospel is not statistically significant at a=0.05 (p=0.6828)
liveness is not statistically significant at a=0.05 (p=0.6517)
genre_electronic is not statistically significant at a=0.05 (p=0.5659)
mode is not statistically significant at a=0.05 (p=0.4944)
energy is not statistically significant at a=0.05 (p=0.4749)
genre_pop is not statistically significant at a=0.05 (p=0.4523)
genre_comedy is not statistically significant at a=0.05 (p=0.4323)
genre_rock is not statistically significant at a=0.05 (p=0.2980)
acousticness is not statistically significant at a=0.05 (p=0.2879)
genre_r&b/soul is not statistically significant at a=0.05 (p=0.2787)
genre_hip-hop/rap is not statistically significant at a=0.05 (p=0.2313)
popularity is not statistically significant at a=0.05 (p=0.2217)
time_signature is not statistically significant at a=0.05 (p=0.2185)
genre_piano is not statistically significant at a=0.05 (p=0.1554)
genre_dance is not statistically significant at a=0.05 (p=0.1089)
speechiness is not statistically significant at a=0.05 (p=0.0767)
year is not statistically significant at a=0.05 (p=0.0588)

Statistically significant: plays, loved, duration_ms, instrumentalness, loudness, genre_country, genre_hip hop/rap, genre_holiday, genre_pop latino, genre_singer/songwriter, genre_soundtrack

In [19]:
from sklearn import linear_model

# Ordinary least squares on the predictors that survived elimination.
dummy_train_X = dummy_train.as_matrix(columns=multiple_regression_predictors)
lin_reg = linear_model.LinearRegression().fit(dummy_train_X, dummy_train_y)
print('Linear Regression Model Score (training): {s:.4f}'.format(
    s=lin_reg.score(dummy_train_X, dummy_train_y)))


Linear Regression Model Score (training): 0.1988

In [20]:
# Render the fitted model as a human-readable equation.  FIX: the intercept
# is now formatted to 5 decimals to match the coefficients; previously it
# was printed at full float precision, inconsistent with the rest.
coefs = ['{name}*{coef:.5f}'.format(name=name.upper(), coef=coef)
         for (name, coef) in zip(multiple_regression_predictors, lin_reg.coef_)]
equation = 'rating = {int:.5f} + {coefs}'.format(int=lin_reg.intercept_, coefs=' + '.join(coefs))
print(equation)


rating = 3.33677305224 + PLAYS*0.00025 + LOVED*0.92902 + DURATION_MS*0.00000 + INSTRUMENTALNESS*-0.56153 + LOUDNESS*0.01658 + GENRE_COUNTRY*-0.12041 + GENRE_HIP HOP/RAP*-1.76554 + GENRE_HOLIDAY*-0.97176 + GENRE_POP LATINO*-0.28165 + GENRE_SINGER/SONGWRITER*0.12845 + GENRE_SOUNDTRACK*0.34749

In [36]:
# Evaluate the fitted linear model on the held-out test set.
mr_test_data_X = dummy_test.as_matrix(columns=multiple_regression_predictors)
mult_reg_predictions = zip(lin_reg.predict(mr_test_data_X),
                           test_data_y,
                           dummy_test.index.values)

print_accuracy(mult_reg_predictions, round=False)
# BUG FIX: the label previously said "Logistic Regression" but this scores
# lin_reg, the linear regression model.
print('Linear Regression Model Score (testing): {s:.4f}'.format(s=lin_reg.score(mr_test_data_X, test_data_y)))


5591:	Predicted: 3.53	Actual: 4	Err: 0.22
8633:	Predicted: 3.69	Actual: 3	Err: 0.47
4889:	Predicted: 3.55	Actual: 2	Err: 2.41
6857:	Predicted: 3.60	Actual: 2	Err: 2.55
4915:	Predicted: 3.51	Actual: 3	Err: 0.26
2825:	Predicted: 3.03	Actual: 3	Err: 0.00
8315:	Predicted: 3.43	Actual: 2	Err: 2.05
3659:	Predicted: 3.59	Actual: 4	Err: 0.17
5641:	Predicted: 3.58	Actual: 4	Err: 0.17
2801:	Predicted: 3.60	Actual: 4	Err: 0.16
3769:	Predicted: 3.73	Actual: 4	Err: 0.07
5119:	Predicted: 3.71	Actual: 4	Err: 0.08
2695:	Predicted: 3.65	Actual: 4	Err: 0.12
5967:	Predicted: 4.52	Actual: 5	Err: 0.23
2803:	Predicted: 3.54	Actual: 4	Err: 0.21
4727:	Predicted: 3.72	Actual: 3	Err: 0.52
2945:	Predicted: 3.72	Actual: 2	Err: 2.95
6167:	Predicted: 3.51	Actual: 4	Err: 0.24
6745:	Predicted: 2.58	Actual: 2	Err: 0.34
5313:	Predicted: 3.78	Actual: 4	Err: 0.05
4111:	Predicted: 4.01	Actual: 4	Err: 0.00
3633:	Predicted: 3.56	Actual: 4	Err: 0.19
4481:	Predicted: 3.50	Actual: 3	Err: 0.25
4643:	Predicted: 3.45	Actual: 4	Err: 0.30
8375:	Predicted: 4.96	Actual: 5	Err: 0.00
8637:	Predicted: 3.61	Actual: 2	Err: 2.59
5921:	Predicted: 3.72	Actual: 4	Err: 0.08
5403:	Predicted: 3.72	Actual: 5	Err: 1.64
4463:	Predicted: 3.82	Actual: 4	Err: 0.03
4205:	Predicted: 3.82	Actual: 4	Err: 0.03
5125:	Predicted: 3.60	Actual: 4	Err: 0.16
3299:	Predicted: 3.72	Actual: 4	Err: 0.08
5549:	Predicted: 3.68	Actual: 3	Err: 0.46
3359:	Predicted: 3.69	Actual: 3	Err: 0.48
6255:	Predicted: 3.66	Actual: 4	Err: 0.11
3409:	Predicted: 3.53	Actual: 4	Err: 0.22
6229:	Predicted: 3.54	Actual: 4	Err: 0.21
4359:	Predicted: 3.69	Actual: 1	Err: 7.23
2897:	Predicted: 4.11	Actual: 4	Err: 0.01
5307:	Predicted: 3.77	Actual: 4	Err: 0.05
2757:	Predicted: 3.71	Actual: 3	Err: 0.51
5121:	Predicted: 3.66	Actual: 4	Err: 0.11
4375:	Predicted: 3.67	Actual: 3	Err: 0.45
3685:	Predicted: 3.66	Actual: 4	Err: 0.12
8563:	Predicted: 3.47	Actual: 4	Err: 0.28
8263:	Predicted: 3.30	Actual: 4	Err: 0.49
5825:	Predicted: 3.73	Actual: 4	Err: 0.07
3567:	Predicted: 3.73	Actual: 4	Err: 0.07
3145:	Predicted: 3.63	Actual: 4	Err: 0.14
8701:	Predicted: 3.57	Actual: 4	Err: 0.19

Total Correct: 0 of 50
MSE: 0.5970
Linear Regression Model Score (testing): 0.1749

In [37]:
# Round the continuous predictions to the nearest integer rating.
# FIX: reuse mr_test_data_X and test_data_y built in the previous cell
# instead of redundantly recomputing the design matrix and target vector.
round_mult_reg_predictions = zip([int(round(x)) for x in lin_reg.predict(mr_test_data_X)],
                                 test_data_y, dummy_test.index.values)

print_accuracy(round_mult_reg_predictions)


5591:	Predicted: 4	Actual: 4		CORRECT
8633:	Predicted: 4	Actual: 3	Err: 1
4889:	Predicted: 4	Actual: 2	Err: 4
6857:	Predicted: 4	Actual: 2	Err: 4
4915:	Predicted: 4	Actual: 3	Err: 1
2825:	Predicted: 3	Actual: 3		CORRECT
8315:	Predicted: 3	Actual: 2	Err: 1
3659:	Predicted: 4	Actual: 4		CORRECT
5641:	Predicted: 4	Actual: 4		CORRECT
2801:	Predicted: 4	Actual: 4		CORRECT
3769:	Predicted: 4	Actual: 4		CORRECT
5119:	Predicted: 4	Actual: 4		CORRECT
2695:	Predicted: 4	Actual: 4		CORRECT
5967:	Predicted: 5	Actual: 5		CORRECT
2803:	Predicted: 4	Actual: 4		CORRECT
4727:	Predicted: 4	Actual: 3	Err: 1
2945:	Predicted: 4	Actual: 2	Err: 4
6167:	Predicted: 4	Actual: 4		CORRECT
6745:	Predicted: 3	Actual: 2	Err: 1
5313:	Predicted: 4	Actual: 4		CORRECT
4111:	Predicted: 4	Actual: 4		CORRECT
3633:	Predicted: 4	Actual: 4		CORRECT
4481:	Predicted: 3	Actual: 3		CORRECT
4643:	Predicted: 3	Actual: 4	Err: 1
8375:	Predicted: 5	Actual: 5		CORRECT
8637:	Predicted: 4	Actual: 2	Err: 4
5921:	Predicted: 4	Actual: 4		CORRECT
5403:	Predicted: 4	Actual: 5	Err: 1
4463:	Predicted: 4	Actual: 4		CORRECT
4205:	Predicted: 4	Actual: 4		CORRECT
5125:	Predicted: 4	Actual: 4		CORRECT
3299:	Predicted: 4	Actual: 4		CORRECT
5549:	Predicted: 4	Actual: 3	Err: 1
3359:	Predicted: 4	Actual: 3	Err: 1
6255:	Predicted: 4	Actual: 4		CORRECT
3409:	Predicted: 4	Actual: 4		CORRECT
6229:	Predicted: 4	Actual: 4		CORRECT
4359:	Predicted: 4	Actual: 1	Err: 9
2897:	Predicted: 4	Actual: 4		CORRECT
5307:	Predicted: 4	Actual: 4		CORRECT
2757:	Predicted: 4	Actual: 3	Err: 1
5121:	Predicted: 4	Actual: 4		CORRECT
4375:	Predicted: 4	Actual: 3	Err: 1
3685:	Predicted: 4	Actual: 4		CORRECT
8563:	Predicted: 3	Actual: 4	Err: 1
8263:	Predicted: 3	Actual: 4	Err: 1
5825:	Predicted: 4	Actual: 4		CORRECT
3567:	Predicted: 4	Actual: 4		CORRECT
3145:	Predicted: 4	Actual: 4		CORRECT
8701:	Predicted: 4	Actual: 4		CORRECT

Total Correct: 32 of 50
MSE: 0.7600

Logistic Regression


In [38]:
# Multinomial logistic regression over the full dummy-encoded feature set.
train_data_X = dummy_train.as_matrix(columns=base_predictors_dummy)
train_data_y = dummy_train.rating.values
log_reg = linear_model.LogisticRegression(
    solver='newton-cg', multi_class='multinomial').fit(train_data_X, train_data_y)
print('Logistic Regression Model Score (training): {s:.4f}'.format(
    s=log_reg.score(train_data_X, train_data_y)))
# http://www.dataschool.io/logistic-regression-in-python-using-scikit-learn/
# http://nbviewer.jupyter.org/gist/justmarkham/6d5c061ca5aee67c4316471f8c2ae976


Logistic Regression Model Score (training): 0.6077

In [13]:
# Pair each predictor with its per-class coefficients (one column per rating
# class).  FIX: removed a bare `log_reg.coef_` statement that preceded this
# line -- only a cell's last expression is displayed, so it had no effect.
pd.DataFrame(zip(base_predictors_dummy, *(log_reg.coef_)))


Out[13]:
0 1 2 3 4 5
0 plays -0.024199 -0.026011 0.011629 0.018828 0.019753
1 loved -0.001931 0.000149 -0.159436 -0.772364 0.933582
2 popularity 0.006677 -0.016757 0.010621 -0.004465 0.003925
3 acousticness 0.017378 0.030967 -0.062454 0.079344 -0.065234
4 danceability -0.004669 -0.009688 0.027307 0.021216 -0.034167
5 duration_ms -0.000006 -0.000004 0.000002 0.000003 0.000005
6 energy 0.010961 -0.037920 0.038173 -0.041850 0.030636
7 instrumentalness 0.035971 0.051345 0.096214 -0.115088 -0.068442
8 key -0.046486 0.007254 0.019992 -0.005750 0.024990
9 liveness 0.026039 -0.026599 0.005765 -0.030104 0.024898
10 loudness -0.071114 -0.061302 0.038025 0.060209 0.034181
11 mode 0.015357 0.078486 0.084226 -0.087345 -0.090725
12 speechiness -0.007621 -0.005057 -0.021848 -0.088545 0.123071
13 tempo 0.005548 -0.002493 -0.001101 0.001033 -0.002986
14 time_signature 0.013703 0.173334 0.015695 -0.151886 -0.050846
15 valence 0.029680 0.015450 -0.038639 0.043681 -0.050172
16 year -0.000802 0.000730 -0.000061 0.000872 -0.000739
17 genre_christian & gospel -0.000117 -0.000742 -0.006548 0.008833 -0.001425
18 genre_comedy -0.000372 -0.002285 0.018885 -0.014411 -0.001818
19 genre_country -0.118472 -0.156657 0.361132 0.076941 -0.162944
20 genre_dance -0.000271 -0.001925 -0.007571 -0.015086 0.024852
21 genre_electronic 0.024779 -0.028136 0.017166 -0.035223 0.021414
22 genre_hip hop/rap -0.000332 0.022494 -0.008458 -0.010689 -0.003015
23 genre_hip-hop/rap -0.021920 -0.023609 0.044793 -0.019851 0.020588
24 genre_holiday -0.019237 0.197550 -0.022160 -0.126384 -0.029769
25 genre_piano -0.000723 -0.012259 -0.072373 0.115204 -0.029849
26 genre_pop 0.207269 0.229626 0.002660 -0.195852 -0.243703
27 genre_pop latino -0.031315 0.216509 -0.027634 -0.302175 0.144615
28 genre_r&b/soul -0.005113 -0.023283 -0.027963 0.043869 0.012490
29 genre_rock -0.009281 0.097841 -0.089361 0.006563 -0.005762
30 genre_singer/songwriter -0.010193 -0.105306 0.146869 0.020920 -0.052290
31 genre_soundtrack -0.028594 -0.153490 -0.082112 -0.227129 0.491326

In [42]:
# Held-out accuracy for the multinomial logistic regression.
log_reg_predictions = zip(log_reg.predict(test_data_X), test_data_y,
                          dummy_test.index.values)
print_accuracy(log_reg_predictions)
print('Logistic Regression Model Score (testing): {s:.4f}'.format(
    s=log_reg.score(test_data_X, test_data_y)))


5591:	Predicted: 4	Actual: 4		CORRECT
8633:	Predicted: 4	Actual: 3	Err: 1
4889:	Predicted: 4	Actual: 2	Err: 4
6857:	Predicted: 4	Actual: 2	Err: 4
4915:	Predicted: 4	Actual: 3	Err: 1
2825:	Predicted: 4	Actual: 3	Err: 1
8315:	Predicted: 4	Actual: 2	Err: 4
3659:	Predicted: 4	Actual: 4		CORRECT
5641:	Predicted: 4	Actual: 4		CORRECT
2801:	Predicted: 4	Actual: 4		CORRECT
3769:	Predicted: 4	Actual: 4		CORRECT
5119:	Predicted: 4	Actual: 4		CORRECT
2695:	Predicted: 4	Actual: 4		CORRECT
5967:	Predicted: 4	Actual: 5	Err: 1
2803:	Predicted: 4	Actual: 4		CORRECT
4727:	Predicted: 4	Actual: 3	Err: 1
2945:	Predicted: 4	Actual: 2	Err: 4
6167:	Predicted: 4	Actual: 4		CORRECT
6745:	Predicted: 4	Actual: 2	Err: 4
5313:	Predicted: 4	Actual: 4		CORRECT
4111:	Predicted: 4	Actual: 4		CORRECT
3633:	Predicted: 4	Actual: 4		CORRECT
4481:	Predicted: 4	Actual: 3	Err: 1
4643:	Predicted: 4	Actual: 4		CORRECT
8375:	Predicted: 5	Actual: 5		CORRECT
8637:	Predicted: 4	Actual: 2	Err: 4
5921:	Predicted: 4	Actual: 4		CORRECT
5403:	Predicted: 4	Actual: 5	Err: 1
4463:	Predicted: 4	Actual: 4		CORRECT
4205:	Predicted: 4	Actual: 4		CORRECT
5125:	Predicted: 4	Actual: 4		CORRECT
3299:	Predicted: 4	Actual: 4		CORRECT
5549:	Predicted: 4	Actual: 3	Err: 1
3359:	Predicted: 4	Actual: 3	Err: 1
6255:	Predicted: 4	Actual: 4		CORRECT
3409:	Predicted: 4	Actual: 4		CORRECT
6229:	Predicted: 4	Actual: 4		CORRECT
4359:	Predicted: 4	Actual: 1	Err: 9
2897:	Predicted: 4	Actual: 4		CORRECT
5307:	Predicted: 4	Actual: 4		CORRECT
2757:	Predicted: 4	Actual: 3	Err: 1
5121:	Predicted: 4	Actual: 4		CORRECT
4375:	Predicted: 4	Actual: 3	Err: 1
3685:	Predicted: 4	Actual: 4		CORRECT
8563:	Predicted: 3	Actual: 4	Err: 1
8263:	Predicted: 4	Actual: 4		CORRECT
5825:	Predicted: 4	Actual: 4		CORRECT
3567:	Predicted: 4	Actual: 4		CORRECT
3145:	Predicted: 4	Actual: 4		CORRECT
8701:	Predicted: 4	Actual: 4		CORRECT

Total Correct: 31 of 50
MSE: 0.9000
Logistic Regression Model Score (testing): 0.6200

Random Forest Classifier


In [15]:
from sklearn.ensemble import RandomForestClassifier
# FIX: seed the forest (random_state=42, matching the train/test split seed)
# so the ensemble -- and every score reported below -- is reproducible across
# Restart & Run All.  Previously the forest was unseeded.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
train_data_X = dummy_train.as_matrix(columns=base_predictors_dummy)
train_data_y = dummy_train.rating.values
forest.fit(train_data_X, train_data_y)
print('Random Forest Model Score (training): {s:.4f}'.format(s=forest.score(train_data_X, train_data_y)))


Random Forest Model Score (training): 1.0000

In [41]:
# Held-out accuracy for the random forest classifier.
forest_predictions = zip(forest.predict(test_data_X), test_data_y,
                         dummy_test.index.values)
print_accuracy(forest_predictions)
print('Random Forest Model Score (testing): {s:.4f}'.format(
    s=forest.score(test_data_X, test_data_y)))


5591:	Predicted: 4	Actual: 4		CORRECT
8633:	Predicted: 4	Actual: 3	Err: 1
4889:	Predicted: 2	Actual: 2		CORRECT
6857:	Predicted: 4	Actual: 2	Err: 4
4915:	Predicted: 4	Actual: 3	Err: 1
2825:	Predicted: 4	Actual: 3	Err: 1
8315:	Predicted: 4	Actual: 2	Err: 4
3659:	Predicted: 4	Actual: 4		CORRECT
5641:	Predicted: 4	Actual: 4		CORRECT
2801:	Predicted: 4	Actual: 4		CORRECT
3769:	Predicted: 4	Actual: 4		CORRECT
5119:	Predicted: 4	Actual: 4		CORRECT
2695:	Predicted: 4	Actual: 4		CORRECT
5967:	Predicted: 5	Actual: 5		CORRECT
2803:	Predicted: 4	Actual: 4		CORRECT
4727:	Predicted: 4	Actual: 3	Err: 1
2945:	Predicted: 2	Actual: 2		CORRECT
6167:	Predicted: 3	Actual: 4	Err: 1
6745:	Predicted: 2	Actual: 2		CORRECT
5313:	Predicted: 4	Actual: 4		CORRECT
4111:	Predicted: 4	Actual: 4		CORRECT
3633:	Predicted: 4	Actual: 4		CORRECT
4481:	Predicted: 4	Actual: 3	Err: 1
4643:	Predicted: 4	Actual: 4		CORRECT
8375:	Predicted: 5	Actual: 5		CORRECT
8637:	Predicted: 3	Actual: 2	Err: 1
5921:	Predicted: 4	Actual: 4		CORRECT
5403:	Predicted: 4	Actual: 5	Err: 1
4463:	Predicted: 4	Actual: 4		CORRECT
4205:	Predicted: 4	Actual: 4		CORRECT
5125:	Predicted: 4	Actual: 4		CORRECT
3299:	Predicted: 4	Actual: 4		CORRECT
5549:	Predicted: 4	Actual: 3	Err: 1
3359:	Predicted: 4	Actual: 3	Err: 1
6255:	Predicted: 4	Actual: 4		CORRECT
3409:	Predicted: 4	Actual: 4		CORRECT
6229:	Predicted: 4	Actual: 4		CORRECT
4359:	Predicted: 4	Actual: 1	Err: 9
2897:	Predicted: 4	Actual: 4		CORRECT
5307:	Predicted: 4	Actual: 4		CORRECT
2757:	Predicted: 4	Actual: 3	Err: 1
5121:	Predicted: 4	Actual: 4		CORRECT
4375:	Predicted: 3	Actual: 3		CORRECT
3685:	Predicted: 4	Actual: 4		CORRECT
8563:	Predicted: 4	Actual: 4		CORRECT
8263:	Predicted: 4	Actual: 4		CORRECT
5825:	Predicted: 4	Actual: 4		CORRECT
3567:	Predicted: 4	Actual: 4		CORRECT
3145:	Predicted: 4	Actual: 4		CORRECT
8701:	Predicted: 4	Actual: 4		CORRECT

Total Correct: 36 of 50
MSE: 0.5600
Random Forest Model Score (testing): 0.7200

Naive Bayes


In [43]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes over the full dummy-encoded feature set.
train_data_X = dummy_train.as_matrix(columns=base_predictors_dummy)
train_data_y = dummy_train.rating.values
naive_bayes = GaussianNB().fit(train_data_X, train_data_y)
print('Naive Bayes Model Score (training): {s:.4f}'.format(
    s=naive_bayes.score(train_data_X, train_data_y)))
# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html


Naive Bayes Model Score (training): 0.3466

In [45]:
# Held-out accuracy for the Gaussian Naive Bayes classifier.
nb_predictions = zip(naive_bayes.predict(test_data_X), test_data_y,
                     dummy_test.index.values)
print_accuracy(nb_predictions)
print('Naive Bayes Model Score (testing): {s:.4f}'.format(
    s=naive_bayes.score(test_data_X, test_data_y)))


5591:	Predicted: 3	Actual: 4	Err: 1
8633:	Predicted: 3	Actual: 3		CORRECT
4889:	Predicted: 2	Actual: 2		CORRECT
6857:	Predicted: 2	Actual: 2		CORRECT
4915:	Predicted: 2	Actual: 3	Err: 1
2825:	Predicted: 2	Actual: 3	Err: 1
8315:	Predicted: 2	Actual: 2		CORRECT
3659:	Predicted: 3	Actual: 4	Err: 1
5641:	Predicted: 2	Actual: 4	Err: 4
2801:	Predicted: 4	Actual: 4		CORRECT
3769:	Predicted: 4	Actual: 4		CORRECT
5119:	Predicted: 2	Actual: 4	Err: 4
2695:	Predicted: 2	Actual: 4	Err: 4
5967:	Predicted: 4	Actual: 5	Err: 1
2803:	Predicted: 4	Actual: 4		CORRECT
4727:	Predicted: 3	Actual: 3		CORRECT
2945:	Predicted: 2	Actual: 2		CORRECT
6167:	Predicted: 3	Actual: 4	Err: 1
6745:	Predicted: 2	Actual: 2		CORRECT
5313:	Predicted: 4	Actual: 4		CORRECT
4111:	Predicted: 4	Actual: 4		CORRECT
3633:	Predicted: 3	Actual: 4	Err: 1
4481:	Predicted: 3	Actual: 3		CORRECT
4643:	Predicted: 3	Actual: 4	Err: 1
8375:	Predicted: 3	Actual: 5	Err: 4
8637:	Predicted: 3	Actual: 2	Err: 1
5921:	Predicted: 4	Actual: 4		CORRECT
5403:	Predicted: 4	Actual: 5	Err: 1
4463:	Predicted: 4	Actual: 4		CORRECT
4205:	Predicted: 3	Actual: 4	Err: 1
5125:	Predicted: 4	Actual: 4		CORRECT
3299:	Predicted: 4	Actual: 4		CORRECT
5549:	Predicted: 2	Actual: 3	Err: 1
3359:	Predicted: 3	Actual: 3		CORRECT
6255:	Predicted: 2	Actual: 4	Err: 4
3409:	Predicted: 4	Actual: 4		CORRECT
6229:	Predicted: 2	Actual: 4	Err: 4
4359:	Predicted: 3	Actual: 1	Err: 4
2897:	Predicted: 5	Actual: 4	Err: 1
5307:	Predicted: 4	Actual: 4		CORRECT
2757:	Predicted: 4	Actual: 3	Err: 1
5121:	Predicted: 2	Actual: 4	Err: 4
4375:	Predicted: 2	Actual: 3	Err: 1
3685:	Predicted: 2	Actual: 4	Err: 4
8563:	Predicted: 3	Actual: 4	Err: 1
8263:	Predicted: 3	Actual: 4	Err: 1
5825:	Predicted: 4	Actual: 4		CORRECT
3567:	Predicted: 4	Actual: 4		CORRECT
3145:	Predicted: 4	Actual: 4		CORRECT
8701:	Predicted: 2	Actual: 4	Err: 4

Total Correct: 23 of 50
MSE: 1.1400
Naive Bayes Model Score (testing): 0.4600

Linear Discriminant


In [48]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Linear discriminant analysis as a classifier over all predictors.
train_data_X = dummy_train.as_matrix(columns=base_predictors_dummy)
train_data_y = dummy_train.rating.values
lda = LDA().fit(train_data_X, train_data_y)
print('LDA Model Score (training): {s:.4f}'.format(
    s=lda.score(train_data_X, train_data_y)))
# http://stackoverflow.com/questions/31107945/how-to-perform-prediction-with-lda-linear-discriminant-in-scikit-learn
# http://stackoverflow.com/questions/31107945/how-to-perform-prediction-with-lda-linear-discriminant-in-scikit-learn


LDA Model Score (training): 0.6089

In [49]:
# Held-out accuracy for the LDA classifier.
lda_predictions = zip(lda.predict(test_data_X), test_data_y,
                      dummy_test.index.values)
print_accuracy(lda_predictions)
print('LDA Model Score (testing): {s:.4f}'.format(
    s=lda.score(test_data_X, test_data_y)))


5591:	Predicted: 3	Actual: 4	Err: 1
8633:	Predicted: 4	Actual: 3	Err: 1
4889:	Predicted: 4	Actual: 2	Err: 4
6857:	Predicted: 4	Actual: 2	Err: 4
4915:	Predicted: 4	Actual: 3	Err: 1
2825:	Predicted: 1	Actual: 3	Err: 4
8315:	Predicted: 4	Actual: 2	Err: 4
3659:	Predicted: 4	Actual: 4		CORRECT
5641:	Predicted: 4	Actual: 4		CORRECT
2801:	Predicted: 4	Actual: 4		CORRECT
3769:	Predicted: 4	Actual: 4		CORRECT
5119:	Predicted: 4	Actual: 4		CORRECT
2695:	Predicted: 4	Actual: 4		CORRECT
5967:	Predicted: 5	Actual: 5		CORRECT
2803:	Predicted: 4	Actual: 4		CORRECT
4727:	Predicted: 4	Actual: 3	Err: 1
2945:	Predicted: 4	Actual: 2	Err: 4
6167:	Predicted: 4	Actual: 4		CORRECT
6745:	Predicted: 2	Actual: 2		CORRECT
5313:	Predicted: 4	Actual: 4		CORRECT
4111:	Predicted: 4	Actual: 4		CORRECT
3633:	Predicted: 4	Actual: 4		CORRECT
4481:	Predicted: 4	Actual: 3	Err: 1
4643:	Predicted: 4	Actual: 4		CORRECT
8375:	Predicted: 5	Actual: 5		CORRECT
8637:	Predicted: 3	Actual: 2	Err: 1
5921:	Predicted: 4	Actual: 4		CORRECT
5403:	Predicted: 4	Actual: 5	Err: 1
4463:	Predicted: 4	Actual: 4		CORRECT
4205:	Predicted: 4	Actual: 4		CORRECT
5125:	Predicted: 4	Actual: 4		CORRECT
3299:	Predicted: 4	Actual: 4		CORRECT
5549:	Predicted: 4	Actual: 3	Err: 1
3359:	Predicted: 4	Actual: 3	Err: 1
6255:	Predicted: 4	Actual: 4		CORRECT
3409:	Predicted: 4	Actual: 4		CORRECT
6229:	Predicted: 4	Actual: 4		CORRECT
4359:	Predicted: 4	Actual: 1	Err: 9
2897:	Predicted: 4	Actual: 4		CORRECT
5307:	Predicted: 4	Actual: 4		CORRECT
2757:	Predicted: 4	Actual: 3	Err: 1
5121:	Predicted: 4	Actual: 4		CORRECT
4375:	Predicted: 4	Actual: 3	Err: 1
3685:	Predicted: 4	Actual: 4		CORRECT
8563:	Predicted: 3	Actual: 4	Err: 1
8263:	Predicted: 2	Actual: 4	Err: 4
5825:	Predicted: 4	Actual: 4		CORRECT
3567:	Predicted: 4	Actual: 4		CORRECT
3145:	Predicted: 4	Actual: 4		CORRECT
8701:	Predicted: 4	Actual: 4		CORRECT

Total Correct: 31 of 50
MSE: 0.9000
LDA Model Score (testing): 0.6200

In [ ]: