In [1]:
import getpass
from muslytics.DatabaseUtils import connect, Session
from muslytics.Utils import Track
import numpy as np
import pandas as pd
In [2]:
# Connect to the muslytics MySQL database; the password is prompted for
# interactively via getpass rather than hardcoded in the notebook.
db = connect('root', getpass.getpass(), 'localhost', 'muslytics')
select = Session().query(Track)
# Read every track straight into a DataFrame keyed by the table's primary key.
# (A previous db.execute(select.statement).fetchall() here duplicated this
# exact query and its result was never used, so it has been removed.)
df_orig = pd.read_sql(select.statement, db, index_col='id')
print('Read {num} tracks'.format(num=len(df_orig)))
In [35]:
TEST_SIZE = 50

# Keep only modelable columns: drop identifiers and free-text fields.
# (axis=1 keyword instead of the deprecated positional axis argument.)
df = df_orig.drop(['spotify_id', 'name', 'artists'], axis=1)
df = df.dropna()
df['loved'] = df['loved'].astype('int')
df['genre'] = df['genre'].astype('category')

# Hold out a fixed-seed test set so the split is reproducible across runs.
test_data = df.sample(n=TEST_SIZE, random_state=42)
train_data = df.drop(test_data.index)  # raw (non-dummied) split; models below use the dummied frames

# One-hot encode genre; drop_first avoids perfect collinearity among dummies.
df_dummies = pd.get_dummies(df, prefix='genre', columns=['genre'], drop_first=True)
dummy_test = df_dummies.loc[test_data.index.values]
dummy_train = df_dummies.drop(test_data.index)

dependent = 'rating'
base_predictors = ['plays', 'loved', 'popularity',
                   'acousticness', 'danceability', 'duration_ms', 'energy',
                   'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                   'speechiness', 'tempo', 'time_signature', 'valence', 'year']
# NOTE(review): these dummy-column names are hardwired to the genres present
# in this dataset (one dropped by drop_first) -- regenerate if the data changes.
base_predictors_dummy = base_predictors[:]
base_predictors_dummy.extend(['genre_christian & gospel', 'genre_comedy', 'genre_country',
                              'genre_dance', 'genre_electronic', 'genre_hip hop/rap',
                              'genre_hip-hop/rap', 'genre_holiday', 'genre_piano', 'genre_pop',
                              'genre_pop latino', 'genre_r&b/soul', 'genre_rock',
                              'genre_singer/songwriter', 'genre_soundtrack'])

dummy_train_y = dummy_train.rating.values
# Column-select + .values replaces as_matrix(), which is deprecated and was
# removed in pandas 1.0.
test_data_X = dummy_test[base_predictors_dummy].values
test_data_y = dummy_test.rating.values
FMT_CORRECT = '{i}:\tPredicted: {p:.2f}\tActual: {a:.0f}\t\tCORRECT'
FMT_INCORRECT = '{i}:\tPredicted: {p:.2f}\tActual: {a:.0f}\tErr: {e:.2f}'
FMT_CORRECT_ROUND = '{i}:\tPredicted: {p:.0f}\tActual: {a:.0f}\t\tCORRECT'
FMT_INCORRECT_ROUND = '{i}:\tPredicted: {p:.0f}\tActual: {a:.0f}\tErr: {e:.0f}'

def print_accuracy(results, round=True):
    """Print one line per prediction plus a correct-count and MSE summary.

    results -- iterable of (predicted, actual, index) triples
    round   -- when True, use the integer display formats (parameter name
               kept for caller compatibility; note it shadows builtin round)

    Returns (total_correct, mse) so the summary is also usable
    programmatically.

    Fixes: the totals were previously divided by len(dummy_test), a module
    global, which was only correct when `results` happened to have the same
    length; they are now derived from `results` itself. An empty result set
    no longer raises ZeroDivisionError.
    """
    correct_fmt = FMT_CORRECT_ROUND if round else FMT_CORRECT
    incorrect_fmt = FMT_INCORRECT_ROUND if round else FMT_INCORRECT
    results = list(results)  # materialize in case an iterator/generator was passed
    total_correct = 0
    total_err = 0
    for pred, act, i in results:
        err = (act - pred) ** 2  # squared error; exactly 0 counts as correct
        if err == 0:
            total_correct += 1
            print(correct_fmt.format(p=pred, a=act, i=i))
        else:
            print(incorrect_fmt.format(p=pred, a=act, i=i, e=err))
            total_err += err
    mse = total_err / float(len(results)) if results else 0.0
    print('\nTotal Correct: {c} of {t}'.format(c=total_correct, t=len(results)))
    print('MSE: {e:.4f}'.format(e=mse))
    return total_correct, mse
In [18]:
from sklearn.feature_selection import f_regression

ALPHA = 0.05

def get_least_stat_sig(predictors):
    """Return the least significant (feature, p-value) pair, or False.

    Runs F-tests of each predictor against the training target (module-level
    dummy_train / dummy_train_y) and returns the feature with the largest
    p-value that fails significance at ALPHA; False when all pass.

    NOTE(review): f_regression tests each feature *marginally* (univariate),
    not jointly, so this elimination is not true stepwise multiple-regression
    selection -- confirm that is acceptable for this analysis.
    """
    # as_matrix() is deprecated (removed in pandas 1.0); select + .values is equivalent.
    F, p = f_regression(dummy_train[predictors].values, dummy_train_y)
    not_stat_sig = [(feature, p_val) for feature, p_val in zip(predictors, p)
                    if p_val >= ALPHA]
    if not not_stat_sig:
        return False
    least = max(not_stat_sig, key=lambda pair: pair[1])  # largest p-value = weakest
    print('{f} is not statistically significant at a={a:.2f} (p={p:.4f})'.format(
        f=least[0], a=ALPHA, p=least[1]))
    return least

# Backward elimination: repeatedly drop the weakest predictor until every
# remaining one is significant at ALPHA.
multiple_regression_predictors = base_predictors_dummy[:]
while True:
    to_remove = get_least_stat_sig(multiple_regression_predictors)
    if not to_remove:
        break
    multiple_regression_predictors.remove(to_remove[0])
print('\nStatistically significant: {f}'.format(f=', '.join(multiple_regression_predictors)))
In [19]:
from sklearn import linear_model

# Ordinary least squares on the predictors that survived backward elimination.
lin_reg = linear_model.LinearRegression()
# as_matrix() is deprecated (removed in pandas 1.0); select + .values is equivalent.
dummy_train_X = dummy_train[multiple_regression_predictors].values
lin_reg.fit(dummy_train_X, dummy_train_y)
# R^2 on the training data (optimistic; the held-out score comes later).
print('Linear Regression Model Score (training): {s:.4f}'.format(s=lin_reg.score(dummy_train_X, dummy_train_y)))
In [20]:
# Render the fitted model as a human-readable regression equation.
term_pairs = zip(multiple_regression_predictors, lin_reg.coef_)
coefs = ['{name}*{coef:.5f}'.format(name=feature.upper(), coef=weight)
         for feature, weight in term_pairs]
equation = 'rating = {int} + {coefs}'.format(int=lin_reg.intercept_, coefs=' + '.join(coefs))
print(equation)
In [36]:
# Evaluate the linear regression on the held-out tracks.
# as_matrix() is deprecated (removed in pandas 1.0); select + .values is equivalent.
mr_test_data_X = dummy_test[multiple_regression_predictors].values
mult_reg_predictions = zip(lin_reg.predict(mr_test_data_X),
                           test_data_y,
                           dummy_test.index.values)
print_accuracy(mult_reg_predictions, round=False)
# Label fixed: this scores the *linear* regression model (it was mislabeled
# "Logistic Regression", a copy-paste error from the later cells).
print('Linear Regression Model Score (testing): {s:.4f}'.format(s=lin_reg.score(mr_test_data_X, test_data_y)))
In [37]:
# Round the continuous regression outputs to the nearest integer rating class.
# Reuses mr_test_data_X and test_data_y computed above instead of rebuilding
# the same matrices (the duplicated call also used the deprecated as_matrix()).
round_mult_reg_predictions = zip([int(round(x)) for x in lin_reg.predict(mr_test_data_X)],
                                 test_data_y,
                                 dummy_test.index.values)
print_accuracy(round_mult_reg_predictions)
In [38]:
# Multinomial logistic regression: treat each rating value as a class.
log_reg = linear_model.LogisticRegression(solver='newton-cg', multi_class='multinomial')
# as_matrix() is deprecated (removed in pandas 1.0); select + .values is equivalent.
train_data_X = dummy_train[base_predictors_dummy].values
train_data_y = dummy_train.rating.values
log_reg.fit(train_data_X, train_data_y)
# Mean classification accuracy on the training data.
print('Logistic Regression Model Score (training): {s:.4f}'.format(s=log_reg.score(train_data_X, train_data_y)))
# http://www.dataschool.io/logistic-regression-in-python-using-scikit-learn/
# http://nbviewer.jupyter.org/gist/justmarkham/6d5c061ca5aee67c4316471f8c2ae976
In [13]:
# One row per predictor, one coefficient column per rating class.
# (The bare `log_reg.coef_` expression that preceded this was dead code --
# only a cell's last expression is displayed -- so it was removed.)
pd.DataFrame(list(zip(base_predictors_dummy, *log_reg.coef_)))
Out[13]:
In [42]:
# Evaluate the fitted logistic regression on the held-out tracks.
predicted_classes = log_reg.predict(test_data_X)
log_reg_predictions = zip(predicted_classes, test_data_y, dummy_test.index.values)
print_accuracy(log_reg_predictions)
print('Logistic Regression Model Score (testing): {s:.4f}'.format(s=log_reg.score(test_data_X, test_data_y)))
In [15]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's bootstrap/feature sampling so the scores
# below are reproducible across notebook runs.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# as_matrix() is deprecated (removed in pandas 1.0); select + .values is equivalent.
train_data_X = dummy_train[base_predictors_dummy].values
train_data_y = dummy_train.rating.values
forest.fit(train_data_X, train_data_y)
print('Random Forest Model Score (training): {s:.4f}'.format(s=forest.score(train_data_X, train_data_y)))
In [41]:
# Evaluate the random forest on the held-out tracks.
forest_predicted = forest.predict(test_data_X)
forest_predictions = zip(forest_predicted, test_data_y, dummy_test.index.values)
print_accuracy(forest_predictions)
print('Random Forest Model Score (testing): {s:.4f}'.format(s=forest.score(test_data_X, test_data_y)))
In [43]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline over the same dummied feature set.
naive_bayes = GaussianNB()
# as_matrix() is deprecated (removed in pandas 1.0); select + .values is equivalent.
train_data_X = dummy_train[base_predictors_dummy].values
train_data_y = dummy_train.rating.values
naive_bayes.fit(train_data_X, train_data_y)
print('Naive Bayes Model Score (training): {s:.4f}'.format(s=naive_bayes.score(train_data_X, train_data_y)))
# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
In [45]:
# Evaluate naive Bayes on the held-out tracks.
nb_predicted = naive_bayes.predict(test_data_X)
nb_predictions = zip(nb_predicted, test_data_y, dummy_test.index.values)
print_accuracy(nb_predictions)
print('Naive Bayes Model Score (testing): {s:.4f}'.format(s=naive_bayes.score(test_data_X, test_data_y)))
In [48]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Linear discriminant analysis used directly as a classifier
# (no dimensionality-reducing .transform() step is needed for scoring).
lda = LDA()
# as_matrix() is deprecated (removed in pandas 1.0); select + .values is equivalent.
train_data_X = dummy_train[base_predictors_dummy].values
train_data_y = dummy_train.rating.values
lda.fit(train_data_X, train_data_y)
print('LDA Model Score (training): {s:.4f}'.format(s=lda.score(train_data_X, train_data_y)))
# http://stackoverflow.com/questions/31107945/how-to-perform-prediction-with-lda-linear-discriminant-in-scikit-learn
In [49]:
# Evaluate LDA on the held-out tracks.
lda_predicted = lda.predict(test_data_X)
lda_predictions = zip(lda_predicted, test_data_y, dummy_test.index.values)
print_accuracy(lda_predictions)
print('LDA Model Score (testing): {s:.4f}'.format(s=lda.score(test_data_X, test_data_y)))
In [ ]: