In [18]:
import getpass
from muslytics.DatabaseUtils import connect, Session
from muslytics.Utils import Track
from bokeh.charts import BoxPlot, Histogram
from bokeh.io import output_notebook, push_notebook, show
from bokeh.layouts import row, column
from bokeh.models import CategoricalColorMapper, ColumnDataSource, HoverTool
from bokeh.models.widgets import Tabs, Panel
from bokeh.palettes import Spectral
from bokeh.plotting import figure
from ipywidgets import interact
import numpy as np
import pandas as pd
output_notebook()
In [2]:
db = connect('root', getpass.getpass(), 'localhost', 'muslytics')
select = Session().query(Track)
results = db.execute(select.statement).fetchall()
df = pd.read_sql(select.statement, db, index_col='id')
print('Read {num} tracks'.format(num=len(df)))
In [3]:
df.info()
In [4]:
df.loc[df.isnull().any(axis=1)]
Out[4]:
In [5]:
df = df.dropna()
df.describe()
Out[5]:
In [6]:
def get_bounds(df):
col_vals = df.plays
s3 = 3*np.std(col_vals)
col_mean = np.mean(col_vals)
lower = col_mean - s3
upper = col_mean + s3
print("\tThe mean plays is {mean}. Outliers are < {x} and > {y}".format(mean=col_mean,
x=lower,
y=upper))
return (col_mean, lower, upper)
def find_outliers(df, lower, upper):
outliers = df.loc[df['plays'] > upper]
print("\tThere are {x} outliers by play count".format(x=len(outliers)))
return outliers
def drop_outliers(df, outliers):
dropped = df.drop(outliers.index)
print("Removing {x} outliers. {y} tracks left.".format(x=len(outliers), y=len(dropped)))
return dropped
In [7]:
print("Starting with all {x} songs ...".format(x=len(df)))
mean, lower, upper = get_bounds(df)
outliers = find_outliers(df, lower, upper)
no_outliers = []
current_df = df
while len(outliers) > 0:
no_outlier_df = drop_outliers(current_df, outliers)
no_outliers.append(no_outlier_df)
mean, lower, upper = get_bounds(no_outlier_df)
outliers = find_outliers(no_outlier_df, lower, upper)
current_df = no_outlier_df
no_outlier_df = no_outliers[-1]
In [8]:
no_outlier_df.describe()
Out[8]:
In [9]:
def make_plays_rating_hist(df, title):
plays = Histogram(df, 'plays', bins=10, title='Play Count Distribution ({x})'.format(x=title))
ratings = Histogram(df, 'rating', bins=5, title='Rating Distribution ({x})'.format(x=title))
return (plays, ratings)
def make_plays_boxplot(df, title):
return BoxPlot(df, values='plays', label='genre', color='genre', title='Plays by Genre ({x})'.format(x=title))
In [10]:
all_row = row(*make_plays_rating_hist(df, 'All'))
no_outlier_row = row(*make_plays_rating_hist(no_outlier_df, 'Removed Outliers'))
layout = column(all_row, no_outlier_row)
show(layout)
In [11]:
all_row = make_plays_boxplot(df, 'All')
no_outlier_row = make_plays_boxplot(no_outlier_df, 'Removed Outliers')
layout = column(all_row, no_outlier_row)
show(layout)
In [12]:
genres = no_outlier_df.genre.unique()
panels = []
for genre in genres:
no_outlier_genre_df = no_outlier_df.loc[no_outlier_df.genre == genre]
plays, ratings = make_plays_rating_hist(no_outlier_genre_df, '{x}: Removed Outliers'.format(x=genre.title()))
play_box = make_plays_boxplot(no_outlier_genre_df, '{x}: Removed Outliers'.format(x=genre.title()))
panels.append(Panel(child=column(row(plays, play_box), ratings), title="{g} ({x})".format(g=genre.title(),
x=len(no_outlier_genre_df))))
tabs = Tabs(tabs=panels)
show(tabs)
In [13]:
spectral11 = Spectral[10]
color_map = { genre : spectral11[i % len(spectral11)] for i, genre in enumerate(df.genre.unique())}
mapper = CategoricalColorMapper(factors=color_map.keys(), palette=color_map.values())
hover = HoverTool(tooltips=[
('name', '@name'),
('plays', '@plays'),
('rating', '@rating'),
('genre', '@genre')
])
source = ColumnDataSource(data={'x': no_outlier_df['valence'],
'y': no_outlier_df['acousticness'],
'name': no_outlier_df['name'],
'genre': no_outlier_df['genre'],
'rating': no_outlier_df['rating'],
'plays': no_outlier_df['plays']})
def update(genre='all', x='valence', y='acousticness'):
if genre == 'all':
data = no_outlier_df
else:
data = no_outlier_df.loc[no_outlier_df['genre'] == genre]
source.data['x'] = data[x]
source.data['y'] = data[y]
plot.title.text = '{x} by {y} ({g})'.format(x=x.title(), y=y.title(), g=genre.title())
plot.xaxis.axis_label = x.title()
plot.yaxis.axis_label = y.title()
push_notebook(handle=interhandle)
plot = figure(x_axis_label='valence',
y_axis_label='acousticness',
tools=[hover, 'pan', 'wheel_zoom', 'save'],
title='Valence vs. Acousticness (All)', plot_height=400, plot_width=400)
plot.scatter('x', 'y',
source=source,
fill_alpha=0.6,
size=5
)
options = ['popularity', 'acousticness', 'danceability', 'duration_ms',
'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
'tempo', 'valence', 'rating', 'plays']
genres = ['all']
genres.extend(df.genre.unique())
In [19]:
interhandle = show(plot, notebook_handle=True)
In [21]:
_ = interact(update, genre=genres, x=options, y=options)
In [ ]: