Muslytics EDA

Setup

Run the contents of the following cells to load the relevant packages and data.

The second cell will read the tracks table in the muslytics database into dataframe df. You will be prompted for a password.


In [18]:
import getpass

from muslytics.DatabaseUtils import connect, Session
from muslytics.Utils import Track

from bokeh.charts import BoxPlot, Histogram
from bokeh.io import output_notebook, push_notebook, show
from bokeh.layouts import row, column
from bokeh.models import CategoricalColorMapper, ColumnDataSource, HoverTool
from bokeh.models.widgets import Tabs, Panel
from bokeh.palettes import Spectral
from bokeh.plotting import figure
from ipywidgets import interact
import numpy as np
import pandas as pd

output_notebook()


Loading BokehJS ...

In [2]:
db = connect('root', getpass.getpass(), 'localhost', 'muslytics')
select = Session().query(Track)
results = db.execute(select.statement).fetchall()
df = pd.read_sql(select.statement, db, index_col='id')

print('Read {num} tracks'.format(num=len(df)))


········
Read 1690 tracks

Summary Stats

Run the following cell to see the summary statistics for the tracks table.


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690 entries, 2545 to 8803
Data columns (total 22 columns):
spotify_id          1690 non-null object
name                1690 non-null object
artists             1690 non-null object
genre               1690 non-null object
plays               1690 non-null int64
rating              1690 non-null float64
loved               1690 non-null bool
popularity          1690 non-null int64
acousticness        1690 non-null float64
danceability        1689 non-null float64
duration_ms         1690 non-null int64
energy              1690 non-null float64
instrumentalness    1690 non-null float64
key                 1690 non-null int64
liveness            1690 non-null float64
loudness            1690 non-null float64
mode                1690 non-null int64
speechiness         1689 non-null float64
tempo               1690 non-null float64
time_signature      1689 non-null float64
valence             1689 non-null float64
year                1690 non-null int64
dtypes: bool(1), float64(11), int64(6), object(4)
memory usage: 292.1+ KB

In [4]:
df.loc[df.isnull().any(axis=1)]


Out[4]:
spotify_id name artists genre plays rating loved popularity acousticness danceability ... instrumentalness key liveness loudness mode speechiness tempo time_signature valence year
id
3375 46O6QtxuzX3iZn9hMXoeqo Horrible Theme Instrumental soundtrack 8 3.0 False 0 0.568 NaN ... 0.977 9 0.237 -10.925 0 NaN 0.0 NaN NaN 2008

1 rows × 22 columns


In [5]:
df = df.dropna()
df.describe()


Out[5]:
plays rating popularity acousticness danceability duration_ms energy instrumentalness key liveness loudness mode speechiness tempo time_signature valence year
count 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000 1689.000000
mean 201.725873 3.655417 47.391356 0.216711 0.551033 217442.838958 0.685700 0.057329 5.103612 0.173724 -6.197137 0.798697 0.060370 125.261895 3.932504 0.484611 2011.815275
std 633.806641 0.838023 15.520555 0.263215 0.128896 48045.657635 0.203086 0.207385 3.480895 0.129666 2.844253 0.401092 0.072681 29.164364 0.302346 0.230956 3.915455
min 0.000000 1.000000 1.000000 0.000022 0.097200 31733.000000 0.047300 0.000000 0.000000 0.021000 -28.663000 0.000000 0.022500 51.316000 1.000000 0.027600 1975.000000
25% 12.000000 3.000000 37.000000 0.018900 0.478000 193535.000000 0.560000 0.000000 2.000000 0.093500 -7.270000 1.000000 0.032000 102.031000 4.000000 0.303000 2010.000000
50% 35.000000 4.000000 48.000000 0.092900 0.558000 215533.000000 0.733000 0.000000 5.000000 0.121000 -5.576000 1.000000 0.039700 123.970000 4.000000 0.485000 2013.000000
75% 103.000000 4.000000 60.000000 0.329000 0.636000 239907.000000 0.844000 0.000021 8.000000 0.213000 -4.371000 1.000000 0.055200 145.993000 4.000000 0.660000 2015.000000
max 12012.000000 5.000000 85.000000 0.986000 0.965000 536080.000000 0.992000 0.990000 11.000000 0.952000 -0.521000 1.000000 0.852000 207.914000 5.000000 0.965000 2016.000000

Play Count Analysis

Some basic EDA by play count.


In [6]:
def get_bounds(df):
    col_vals = df.plays
    s3 = 3*np.std(col_vals)
    col_mean = np.mean(col_vals)
    lower = col_mean - s3
    upper = col_mean + s3
    print("\tThe mean plays is {mean}. Outliers are < {x} and > {y}".format(mean=col_mean,
                                                                            x=lower,
                                                                            y=upper))
    return (col_mean, lower, upper)

def find_outliers(df, lower, upper):
    outliers = df.loc[df['plays'] > upper]
    print("\tThere are {x} outliers by play count".format(x=len(outliers)))
    return outliers

def drop_outliers(df, outliers):
    dropped = df.drop(outliers.index)
    print("Removing {x} outliers. {y} tracks left.".format(x=len(outliers), y=len(dropped)))
    return dropped

In [7]:
print("Starting with all {x} songs ...".format(x=len(df)))
mean, lower, upper = get_bounds(df)
outliers = find_outliers(df, lower, upper)
no_outliers = []
current_df = df

while len(outliers) > 0:
    no_outlier_df = drop_outliers(current_df, outliers)
    no_outliers.append(no_outlier_df)
    mean, lower, upper = get_bounds(no_outlier_df)
    outliers = find_outliers(no_outlier_df, lower, upper)
    current_df = no_outlier_df

no_outlier_df = no_outliers[-1]


Starting with all 1689 songs ...
	The mean plays is 201.725873298. Outliers are < -1699.13108229 and > 2102.58282889
	There are 34 outliers by play count
Removing 34 outliers. 1655 tracks left.
	The mean plays is 130.868277946. Outliers are < -774.639747433 and > 1036.37630332
	There are 55 outliers by play count
Removing 55 outliers. 1600 tracks left.
	The mean plays is 82.375. Outliers are < -343.534534849 and > 508.284534849
	There are 47 outliers by play count
Removing 47 outliers. 1553 tracks left.
	The mean plays is 63.2962009015. Outliers are < -197.587788099 and > 324.180189902
	There are 47 outliers by play count
Removing 47 outliers. 1506 tracks left.
	The mean plays is 52.7563081009. Outliers are < -137.881751873 and > 243.394368075
	There are 37 outliers by play count
Removing 37 outliers. 1469 tracks left.
	The mean plays is 46.9482641253. Outliers are < -110.506427755 and > 204.402956005
	There are 31 outliers by play count
Removing 31 outliers. 1438 tracks left.
	The mean plays is 43.1154381085. Outliers are < -94.8594077563 and > 181.090283973
	There are 26 outliers by play count
Removing 26 outliers. 1412 tracks left.
	The mean plays is 40.3661473088. Outliers are < -84.5984803112 and > 165.330774929
	There are 20 outliers by play count
Removing 20 outliers. 1392 tracks left.
	The mean plays is 38.4489942529. Outliers are < -77.7504212875 and > 154.648409793
	There are 23 outliers by play count
Removing 23 outliers. 1369 tracks left.
	The mean plays is 36.4200146092. Outliers are < -70.7516779032 and > 143.591707122
	There are 19 outliers by play count
Removing 19 outliers. 1350 tracks left.
	The mean plays is 34.8377777778. Outliers are < -65.2748745473 and > 134.950430103
	There are 9 outliers by play count
Removing 9 outliers. 1341 tracks left.
	The mean plays is 34.1364653244. Outliers are < -62.9492584668 and > 131.222189116
	There are 4 outliers by play count
Removing 4 outliers. 1337 tracks left.
	The mean plays is 33.8414360509. Outliers are < -62.0293300073 and > 129.712202109
	There are 3 outliers by play count
Removing 3 outliers. 1334 tracks left.
	The mean plays is 33.6236881559. Outliers are < -61.3588945727 and > 128.606270885
	There are 1 outliers by play count
Removing 1 outliers. 1333 tracks left.
	The mean plays is 33.5521380345. Outliers are < -61.1420815729 and > 128.246357642
	There are 0 outliers by play count

In [8]:
no_outlier_df.describe()


Out[8]:
plays rating popularity acousticness danceability duration_ms energy instrumentalness key liveness loudness mode speechiness tempo time_signature valence year
count 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.000000 1333.00000 1333.000000 1333.000000 1333.000000
mean 33.552138 3.496624 48.395349 0.215481 0.551779 216098.885221 0.689294 0.060511 5.148537 0.171861 -6.184607 0.794449 0.062115 125.86609 3.929482 0.484980 2012.185296
std 31.576586 0.830732 15.727805 0.261571 0.126618 48377.780655 0.200752 0.211965 3.482107 0.127658 2.769119 0.404256 0.078870 29.71124 0.318795 0.227169 3.845675
min 0.000000 1.000000 1.000000 0.000022 0.124000 31733.000000 0.051600 0.000000 0.000000 0.021000 -25.889000 0.000000 0.022500 51.31600 1.000000 0.029300 1975.000000
25% 8.000000 3.000000 37.000000 0.018900 0.479000 192920.000000 0.563000 0.000000 2.000000 0.093500 -7.276000 1.000000 0.031900 102.04500 4.000000 0.308000 2011.000000
50% 25.000000 4.000000 49.000000 0.092100 0.556000 213813.000000 0.737000 0.000000 5.000000 0.121000 -5.575000 1.000000 0.039800 124.94500 4.000000 0.489000 2013.000000
75% 50.000000 4.000000 61.000000 0.322000 0.631000 238040.000000 0.846000 0.000024 8.000000 0.208000 -4.388000 1.000000 0.054800 147.96600 4.000000 0.659000 2015.000000
max 126.000000 5.000000 85.000000 0.986000 0.965000 536080.000000 0.992000 0.990000 11.000000 0.952000 -0.521000 1.000000 0.852000 207.91400 5.000000 0.965000 2016.000000

In [9]:
def make_plays_rating_hist(df, title):
    plays = Histogram(df, 'plays', bins=10, title='Play Count Distribution ({x})'.format(x=title))
    ratings = Histogram(df, 'rating', bins=5, title='Rating Distribution ({x})'.format(x=title))
    return (plays, ratings)

def make_plays_boxplot(df, title):
    return BoxPlot(df, values='plays', label='genre', color='genre', title='Plays by Genre ({x})'.format(x=title))

Play Count and Rating Histograms


In [10]:
all_row = row(*make_plays_rating_hist(df, 'All'))
no_outlier_row = row(*make_plays_rating_hist(no_outlier_df, 'Removed Outliers'))
layout = column(all_row, no_outlier_row)
show(layout)


Play Count Boxplots


In [11]:
all_row = make_plays_boxplot(df, 'All')
no_outlier_row = make_plays_boxplot(no_outlier_df, 'Removed Outliers')
layout = column(all_row, no_outlier_row)
show(layout)


Play Counts by Genre


In [12]:
genres = no_outlier_df.genre.unique()
panels = []
for genre in genres:
    no_outlier_genre_df = no_outlier_df.loc[no_outlier_df.genre == genre]
    plays, ratings = make_plays_rating_hist(no_outlier_genre_df, '{x}: Removed Outliers'.format(x=genre.title()))
    play_box = make_plays_boxplot(no_outlier_genre_df, '{x}: Removed Outliers'.format(x=genre.title()))
    panels.append(Panel(child=column(row(plays, play_box), ratings), title="{g} ({x})".format(g=genre.title(),
                                                                                             x=len(no_outlier_genre_df))))
    
tabs = Tabs(tabs=panels)
show(tabs)


Feature Correlation

  • Run the first cell to perform necessary setup.
  • The second cell outputs a scatter plot showing the correlation between two features.
  • The third cell gives dropdowns to select which data the plot should show.

In [13]:
spectral11 = Spectral[10]
color_map = { genre : spectral11[i % len(spectral11)] for i, genre in enumerate(df.genre.unique())}
mapper = CategoricalColorMapper(factors=color_map.keys(), palette=color_map.values())

hover = HoverTool(tooltips=[
        ('name', '@name'),
        ('plays', '@plays'),
        ('rating', '@rating'),
        ('genre', '@genre')
    ])

source = ColumnDataSource(data={'x': no_outlier_df['valence'],
                                'y': no_outlier_df['acousticness'],
                                'name': no_outlier_df['name'],
                                'genre': no_outlier_df['genre'],
                                'rating': no_outlier_df['rating'],
                                'plays': no_outlier_df['plays']})

def update(genre='all', x='valence', y='acousticness'):
    if genre == 'all':
        data = no_outlier_df
    else:
        data = no_outlier_df.loc[no_outlier_df['genre'] == genre]
        
    source.data['x'] = data[x]
    source.data['y'] = data[y]
    plot.title.text = '{x} by {y} ({g})'.format(x=x.title(), y=y.title(), g=genre.title())
    plot.xaxis.axis_label = x.title()
    plot.yaxis.axis_label = y.title()
    push_notebook(handle=interhandle)

plot = figure(x_axis_label='valence',
              y_axis_label='acousticness',
              tools=[hover, 'pan', 'wheel_zoom', 'save'],
              title='Valence vs. Acousticness (All)', plot_height=400, plot_width=400)
plot.scatter('x', 'y',
          source=source,
          fill_alpha=0.6,
          size=5
         )
options = ['popularity', 'acousticness', 'danceability', 'duration_ms',
           'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
           'tempo', 'valence', 'rating', 'plays']
genres = ['all']
genres.extend(df.genre.unique())

In [19]:
interhandle = show(plot, notebook_handle=True)



In [21]:
_ = interact(update, genre=genres, x=options, y=options)

In [ ]: