Muslytics EDA

Setup

Run the contents of the following cells to load the relevant packages and data.

The second cell will read the tracks table in the muslytics database into dataframe df. You will be prompted for a password.



In [18]:

    
import getpass

from muslytics.DatabaseUtils import connect, Session
from muslytics.Utils import Track

from bokeh.charts import BoxPlot, Histogram
from bokeh.io import output_notebook, push_notebook, show
from bokeh.layouts import row, column
from bokeh.models import CategoricalColorMapper, ColumnDataSource, HoverTool
from bokeh.models.widgets import Tabs, Panel
from bokeh.palettes import Spectral
from bokeh.plotting import figure
from ipywidgets import interact
import numpy as np
import pandas as pd

output_notebook()









    





    
        
        Loading BokehJS ...



In [2]:

    
db = connect('root', getpass.getpass(), 'localhost', 'muslytics')
select = Session().query(Track)
results = db.execute(select.statement).fetchall()
df = pd.read_sql(select.statement, db, index_col='id')

print('Read {num} tracks'.format(num=len(df)))









    



········
Read 1690 tracks

Summary Stats

Run the following cell to see the summary statistics for the tracks table.



In [3]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690 entries, 2545 to 8803
Data columns (total 22 columns):
spotify_id          1690 non-null object
name                1690 non-null object
artists             1690 non-null object
genre               1690 non-null object
plays               1690 non-null int64
rating              1690 non-null float64
loved               1690 non-null bool
popularity          1690 non-null int64
acousticness        1690 non-null float64
danceability        1689 non-null float64
duration_ms         1690 non-null int64
energy              1690 non-null float64
instrumentalness    1690 non-null float64
key                 1690 non-null int64
liveness            1690 non-null float64
loudness            1690 non-null float64
mode                1690 non-null int64
speechiness         1689 non-null float64
tempo               1690 non-null float64
time_signature      1689 non-null float64
valence             1689 non-null float64
year                1690 non-null int64
dtypes: bool(1), float64(11), int64(6), object(4)
memory usage: 292.1+ KB



In [4]:

    
df.loc[df.isnull().any(axis=1)]









    Out[4]:






  
    
      
      spotify_id
      name
      artists
      genre
      plays
      rating
      loved
      popularity
      acousticness
      danceability
      ...
      instrumentalness
      key
      liveness
      loudness
      mode
      speechiness
      tempo
      time_signature
      valence
      year
    
    
      id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      3375
      46O6QtxuzX3iZn9hMXoeqo
      Horrible Theme
      Instrumental
      soundtrack
      8
      3.0
      False
      0
      0.568
      NaN
      ...
      0.977
      9
      0.237
      -10.925
      0
      NaN
      0.0
      NaN
      NaN
      2008
    
  

1 rows × 22 columns



In [5]:

    
df = df.dropna()
df.describe()









    Out[5]:






  
    
      
      plays
      rating
      popularity
      acousticness
      danceability
      duration_ms
      energy
      instrumentalness
      key
      liveness
      loudness
      mode
      speechiness
      tempo
      time_signature
      valence
      year
    
  
  
    
      count
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
      1689.000000
    
    
      mean
      201.725873
      3.655417
      47.391356
      0.216711
      0.551033
      217442.838958
      0.685700
      0.057329
      5.103612
      0.173724
      -6.197137
      0.798697
      0.060370
      125.261895
      3.932504
      0.484611
      2011.815275
    
    
      std
      633.806641
      0.838023
      15.520555
      0.263215
      0.128896
      48045.657635
      0.203086
      0.207385
      3.480895
      0.129666
      2.844253
      0.401092
      0.072681
      29.164364
      0.302346
      0.230956
      3.915455
    
    
      min
      0.000000
      1.000000
      1.000000
      0.000022
      0.097200
      31733.000000
      0.047300
      0.000000
      0.000000
      0.021000
      -28.663000
      0.000000
      0.022500
      51.316000
      1.000000
      0.027600
      1975.000000
    
    
      25%
      12.000000
      3.000000
      37.000000
      0.018900
      0.478000
      193535.000000
      0.560000
      0.000000
      2.000000
      0.093500
      -7.270000
      1.000000
      0.032000
      102.031000
      4.000000
      0.303000
      2010.000000
    
    
      50%
      35.000000
      4.000000
      48.000000
      0.092900
      0.558000
      215533.000000
      0.733000
      0.000000
      5.000000
      0.121000
      -5.576000
      1.000000
      0.039700
      123.970000
      4.000000
      0.485000
      2013.000000
    
    
      75%
      103.000000
      4.000000
      60.000000
      0.329000
      0.636000
      239907.000000
      0.844000
      0.000021
      8.000000
      0.213000
      -4.371000
      1.000000
      0.055200
      145.993000
      4.000000
      0.660000
      2015.000000
    
    
      max
      12012.000000
      5.000000
      85.000000
      0.986000
      0.965000
      536080.000000
      0.992000
      0.990000
      11.000000
      0.952000
      -0.521000
      1.000000
      0.852000
      207.914000
      5.000000
      0.965000
      2016.000000

Play Count Analysis

Some basic EDA by play count.



In [6]:

    
def get_bounds(df):
    col_vals = df.plays
    s3 = 3*np.std(col_vals)
    col_mean = np.mean(col_vals)
    lower = col_mean - s3
    upper = col_mean + s3
    print("\tThe mean plays is {mean}. Outliers are < {x} and > {y}".format(mean=col_mean,
                                                                            x=lower,
                                                                            y=upper))
    return (col_mean, lower, upper)

def find_outliers(df, lower, upper):
    outliers = df.loc[df['plays'] > upper]
    print("\tThere are {x} outliers by play count".format(x=len(outliers)))
    return outliers

def drop_outliers(df, outliers):
    dropped = df.drop(outliers.index)
    print("Removing {x} outliers. {y} tracks left.".format(x=len(outliers), y=len(dropped)))
    return dropped



In [7]:

    
print("Starting with all {x} songs ...".format(x=len(df)))
mean, lower, upper = get_bounds(df)
outliers = find_outliers(df, lower, upper)
no_outliers = []
current_df = df

while len(outliers) > 0:
    no_outlier_df = drop_outliers(current_df, outliers)
    no_outliers.append(no_outlier_df)
    mean, lower, upper = get_bounds(no_outlier_df)
    outliers = find_outliers(no_outlier_df, lower, upper)
    current_df = no_outlier_df

no_outlier_df = no_outliers[-1]









    



Starting with all 1689 songs ...
	The mean plays is 201.725873298. Outliers are < -1699.13108229 and > 2102.58282889
	There are 34 outliers by play count
Removing 34 outliers. 1655 tracks left.
	The mean plays is 130.868277946. Outliers are < -774.639747433 and > 1036.37630332
	There are 55 outliers by play count
Removing 55 outliers. 1600 tracks left.
	The mean plays is 82.375. Outliers are < -343.534534849 and > 508.284534849
	There are 47 outliers by play count
Removing 47 outliers. 1553 tracks left.
	The mean plays is 63.2962009015. Outliers are < -197.587788099 and > 324.180189902
	There are 47 outliers by play count
Removing 47 outliers. 1506 tracks left.
	The mean plays is 52.7563081009. Outliers are < -137.881751873 and > 243.394368075
	There are 37 outliers by play count
Removing 37 outliers. 1469 tracks left.
	The mean plays is 46.9482641253. Outliers are < -110.506427755 and > 204.402956005
	There are 31 outliers by play count
Removing 31 outliers. 1438 tracks left.
	The mean plays is 43.1154381085. Outliers are < -94.8594077563 and > 181.090283973
	There are 26 outliers by play count
Removing 26 outliers. 1412 tracks left.
	The mean plays is 40.3661473088. Outliers are < -84.5984803112 and > 165.330774929
	There are 20 outliers by play count
Removing 20 outliers. 1392 tracks left.
	The mean plays is 38.4489942529. Outliers are < -77.7504212875 and > 154.648409793
	There are 23 outliers by play count
Removing 23 outliers. 1369 tracks left.
	The mean plays is 36.4200146092. Outliers are < -70.7516779032 and > 143.591707122
	There are 19 outliers by play count
Removing 19 outliers. 1350 tracks left.
	The mean plays is 34.8377777778. Outliers are < -65.2748745473 and > 134.950430103
	There are 9 outliers by play count
Removing 9 outliers. 1341 tracks left.
	The mean plays is 34.1364653244. Outliers are < -62.9492584668 and > 131.222189116
	There are 4 outliers by play count
Removing 4 outliers. 1337 tracks left.
	The mean plays is 33.8414360509. Outliers are < -62.0293300073 and > 129.712202109
	There are 3 outliers by play count
Removing 3 outliers. 1334 tracks left.
	The mean plays is 33.6236881559. Outliers are < -61.3588945727 and > 128.606270885
	There are 1 outliers by play count
Removing 1 outliers. 1333 tracks left.
	The mean plays is 33.5521380345. Outliers are < -61.1420815729 and > 128.246357642
	There are 0 outliers by play count



In [8]:

    
no_outlier_df.describe()









    Out[8]:






  
    
      
      plays
      rating
      popularity
      acousticness
      danceability
      duration_ms
      energy
      instrumentalness
      key
      liveness
      loudness
      mode
      speechiness
      tempo
      time_signature
      valence
      year
    
  
  
    
      count
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.000000
      1333.00000
      1333.000000
      1333.000000
      1333.000000
    
    
      mean
      33.552138
      3.496624
      48.395349
      0.215481
      0.551779
      216098.885221
      0.689294
      0.060511
      5.148537
      0.171861
      -6.184607
      0.794449
      0.062115
      125.86609
      3.929482
      0.484980
      2012.185296
    
    
      std
      31.576586
      0.830732
      15.727805
      0.261571
      0.126618
      48377.780655
      0.200752
      0.211965
      3.482107
      0.127658
      2.769119
      0.404256
      0.078870
      29.71124
      0.318795
      0.227169
      3.845675
    
    
      min
      0.000000
      1.000000
      1.000000
      0.000022
      0.124000
      31733.000000
      0.051600
      0.000000
      0.000000
      0.021000
      -25.889000
      0.000000
      0.022500
      51.31600
      1.000000
      0.029300
      1975.000000
    
    
      25%
      8.000000
      3.000000
      37.000000
      0.018900
      0.479000
      192920.000000
      0.563000
      0.000000
      2.000000
      0.093500
      -7.276000
      1.000000
      0.031900
      102.04500
      4.000000
      0.308000
      2011.000000
    
    
      50%
      25.000000
      4.000000
      49.000000
      0.092100
      0.556000
      213813.000000
      0.737000
      0.000000
      5.000000
      0.121000
      -5.575000
      1.000000
      0.039800
      124.94500
      4.000000
      0.489000
      2013.000000
    
    
      75%
      50.000000
      4.000000
      61.000000
      0.322000
      0.631000
      238040.000000
      0.846000
      0.000024
      8.000000
      0.208000
      -4.388000
      1.000000
      0.054800
      147.96600
      4.000000
      0.659000
      2015.000000
    
    
      max
      126.000000
      5.000000
      85.000000
      0.986000
      0.965000
      536080.000000
      0.992000
      0.990000
      11.000000
      0.952000
      -0.521000
      1.000000
      0.852000
      207.91400
      5.000000
      0.965000
      2016.000000



In [9]:

    
def make_plays_rating_hist(df, title):
    plays = Histogram(df, 'plays', bins=10, title='Play Count Distribution ({x})'.format(x=title))
    ratings = Histogram(df, 'rating', bins=5, title='Rating Distribution ({x})'.format(x=title))
    return (plays, ratings)

def make_plays_boxplot(df, title):
    return BoxPlot(df, values='plays', label='genre', color='genre', title='Plays by Genre ({x})'.format(x=title))

Play Count and Rating Histograms



In [10]:

    
all_row = row(*make_plays_rating_hist(df, 'All'))
no_outlier_row = row(*make_plays_rating_hist(no_outlier_df, 'Removed Outliers'))
layout = column(all_row, no_outlier_row)
show(layout)

Play Count Boxplots



In [11]:

    
all_row = make_plays_boxplot(df, 'All')
no_outlier_row = make_plays_boxplot(no_outlier_df, 'Removed Outliers')
layout = column(all_row, no_outlier_row)
show(layout)

Play Counts by Genre



In [12]:

    
genres = no_outlier_df.genre.unique()
panels = []
for genre in genres:
    no_outlier_genre_df = no_outlier_df.loc[no_outlier_df.genre == genre]
    plays, ratings = make_plays_rating_hist(no_outlier_genre_df, '{x}: Removed Outliers'.format(x=genre.title()))
    play_box = make_plays_boxplot(no_outlier_genre_df, '{x}: Removed Outliers'.format(x=genre.title()))
    panels.append(Panel(child=column(row(plays, play_box), ratings), title="{g} ({x})".format(g=genre.title(),
                                                                                             x=len(no_outlier_genre_df))))
    
tabs = Tabs(tabs=panels)
show(tabs)

Feature Correlation

Run the first cell to perform necessary setup.
The second cell outputs a scatter plot showing the correlation between two features.
The third cell gives dropdowns to select which data the plot should show.



In [13]:

    
spectral11 = Spectral[10]
color_map = { genre : spectral11[i % len(spectral11)] for i, genre in enumerate(df.genre.unique())}
mapper = CategoricalColorMapper(factors=color_map.keys(), palette=color_map.values())

hover = HoverTool(tooltips=[
        ('name', '@name'),
        ('plays', '@plays'),
        ('rating', '@rating'),
        ('genre', '@genre')
    ])

source = ColumnDataSource(data={'x': no_outlier_df['valence'],
                                'y': no_outlier_df['acousticness'],
                                'name': no_outlier_df['name'],
                                'genre': no_outlier_df['genre'],
                                'rating': no_outlier_df['rating'],
                                'plays': no_outlier_df['plays']})

def update(genre='all', x='valence', y='acousticness'):
    if genre == 'all':
        data = no_outlier_df
    else:
        data = no_outlier_df.loc[no_outlier_df['genre'] == genre]
        
    source.data['x'] = data[x]
    source.data['y'] = data[y]
    plot.title.text = '{x} by {y} ({g})'.format(x=x.title(), y=y.title(), g=genre.title())
    plot.xaxis.axis_label = x.title()
    plot.yaxis.axis_label = y.title()
    push_notebook(handle=interhandle)

plot = figure(x_axis_label='valence',
              y_axis_label='acousticness',
              tools=[hover, 'pan', 'wheel_zoom', 'save'],
              title='Valence vs. Acousticness (All)', plot_height=400, plot_width=400)
plot.scatter('x', 'y',
          source=source,
          fill_alpha=0.6,
          size=5
         )
options = ['popularity', 'acousticness', 'danceability', 'duration_ms',
           'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
           'tempo', 'valence', 'rating', 'plays']
genres = ['all']
genres.extend(df.genre.unique())



In [19]:

    
interhandle = show(plot, notebook_handle=True)



In [21]:

    
_ = interact(update, genre=genres, x=options, y=options)



In [ ]:

	spotify_id	name	artists	genre	plays	rating	loved	popularity	acousticness	danceability	...	instrumentalness	key	liveness	loudness	mode	speechiness	tempo	time_signature	valence	year
id
3375	46O6QtxuzX3iZn9hMXoeqo	Horrible Theme	Instrumental	soundtrack	8	3.0	False	0	0.568	NaN	...	0.977	9	0.237	-10.925	0	NaN	0.0	NaN	NaN	2008

	plays	rating	popularity	acousticness	danceability	duration_ms	energy	instrumentalness	key	liveness	loudness	mode	speechiness	tempo	time_signature	valence	year
count	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000	1689.000000
mean	201.725873	3.655417	47.391356	0.216711	0.551033	217442.838958	0.685700	0.057329	5.103612	0.173724	-6.197137	0.798697	0.060370	125.261895	3.932504	0.484611	2011.815275
std	633.806641	0.838023	15.520555	0.263215	0.128896	48045.657635	0.203086	0.207385	3.480895	0.129666	2.844253	0.401092	0.072681	29.164364	0.302346	0.230956	3.915455
min	0.000000	1.000000	1.000000	0.000022	0.097200	31733.000000	0.047300	0.000000	0.000000	0.021000	-28.663000	0.000000	0.022500	51.316000	1.000000	0.027600	1975.000000
25%	12.000000	3.000000	37.000000	0.018900	0.478000	193535.000000	0.560000	0.000000	2.000000	0.093500	-7.270000	1.000000	0.032000	102.031000	4.000000	0.303000	2010.000000
50%	35.000000	4.000000	48.000000	0.092900	0.558000	215533.000000	0.733000	0.000000	5.000000	0.121000	-5.576000	1.000000	0.039700	123.970000	4.000000	0.485000	2013.000000
75%	103.000000	4.000000	60.000000	0.329000	0.636000	239907.000000	0.844000	0.000021	8.000000	0.213000	-4.371000	1.000000	0.055200	145.993000	4.000000	0.660000	2015.000000
max	12012.000000	5.000000	85.000000	0.986000	0.965000	536080.000000	0.992000	0.990000	11.000000	0.952000	-0.521000	1.000000	0.852000	207.914000	5.000000	0.965000	2016.000000

	plays	rating	popularity	acousticness	danceability	duration_ms	energy	instrumentalness	key	liveness	loudness	mode	speechiness	tempo	time_signature	valence	year
count	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.000000	1333.00000	1333.000000	1333.000000	1333.000000
mean	33.552138	3.496624	48.395349	0.215481	0.551779	216098.885221	0.689294	0.060511	5.148537	0.171861	-6.184607	0.794449	0.062115	125.86609	3.929482	0.484980	2012.185296
std	31.576586	0.830732	15.727805	0.261571	0.126618	48377.780655	0.200752	0.211965	3.482107	0.127658	2.769119	0.404256	0.078870	29.71124	0.318795	0.227169	3.845675
min	0.000000	1.000000	1.000000	0.000022	0.124000	31733.000000	0.051600	0.000000	0.000000	0.021000	-25.889000	0.000000	0.022500	51.31600	1.000000	0.029300	1975.000000
25%	8.000000	3.000000	37.000000	0.018900	0.479000	192920.000000	0.563000	0.000000	2.000000	0.093500	-7.276000	1.000000	0.031900	102.04500	4.000000	0.308000	2011.000000
50%	25.000000	4.000000	49.000000	0.092100	0.556000	213813.000000	0.737000	0.000000	5.000000	0.121000	-5.575000	1.000000	0.039800	124.94500	4.000000	0.489000	2013.000000
75%	50.000000	4.000000	61.000000	0.322000	0.631000	238040.000000	0.846000	0.000024	8.000000	0.208000	-4.388000	1.000000	0.054800	147.96600	4.000000	0.659000	2015.000000
max	126.000000	5.000000	85.000000	0.986000	0.965000	536080.000000	0.992000	0.990000	11.000000	0.952000	-0.521000	1.000000	0.852000	207.91400	5.000000	0.965000	2016.000000