In [1]:
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import h5py
import sys
import seaborn as sns
%matplotlib inline
additionalFiles = '../MillionSongSubset/AdditionalFiles/'
data = '../MillionSongSubset/data/'
In [2]:
!ls '../MillionSongSubset/AdditionalFiles/'
In [3]:
# Create engine: engine
engine = create_engine('sqlite:///../MillionSongSubset/AdditionalFiles/subset_track_metadata.db')
# Save the table names to a list: table_names
table_names = engine.table_names()
print(table_names)
with engine.connect() as con:
# Perform query: rs
rs = con.execute('SELECT * from songs')
# Save results of the query to DataFrame: df
df_tm = pd.DataFrame(rs.fetchall())
df_tm.columns = rs.keys()
In [4]:
df_tm.head()
Out[4]:
In [5]:
plt.hist(df_tm.year[df_tm.year>0])
Out[5]:
In [6]:
g = sns.jointplot(x=df_tm.artist_familiarity[df_tm.artist_familiarity>0], y=df_tm.artist_hotttnesss[df_tm.artist_familiarity>0], kind="kde", color="m")
# g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
# g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("artist_familiarity", "artist_hotttnesss");
In [7]:
sns.jointplot(x = df_tm.duration[df_tm.year>0],y = df_tm.year[df_tm.year>0],kind = 'kde')
Out[7]:
In [8]:
# Create engine: engine
engine = create_engine('sqlite:///../MillionSongSubset/AdditionalFiles/subset_artist_term.db')
# Save the table names to a list: table_names
table_names = engine.table_names()
print(table_names)
with engine.connect() as con:
# Perform query: rs
rs = con.execute('SELECT m.artist_id,m.mbtag,t.term FROM artist_mbtag m JOIN artist_term t ON m.artist_id=t.artist_id ')
#mb tag = musicbrainz tag
# Save results of the query to DataFrame: df
df_at = pd.DataFrame(rs.fetchall())
df_at.columns = rs.keys()
In [9]:
df_at.head()
Out[9]:
In [10]:
df_at.term.describe()
Out[10]:
In [11]:
df_at.mbtag.describe()
Out[11]:
In [12]:
# Create engine: engine
engine = create_engine('sqlite:///../MillionSongSubset/AdditionalFiles/subset_artist_similarity.db')
# Save the table names to a list: table_names
table_names = engine.table_names()
print(table_names)
with engine.connect() as con:
# Perform query: rs
rs = con.execute('SELECT * FROM similarity')
#mb tag = musicbrainz tag
# Save results of the query to DataFrame: df
df_as = pd.DataFrame(rs.fetchall())
df_as.columns = rs.keys()
In [13]:
df_as.head()
Out[13]:
In [14]:
with pd.HDFStore('../MillionSongSubset/AdditionalFiles/subset_msd_summary_file.h5') as store:
print(store)
analysis_summary = store.select('analysis/songs')
metadata_summary = store.select('metadata/songs')
musicbrainz_summary = store.select('musicbrainz/songs')
In [15]:
analysis_summary.head()
Out[15]:
In [49]:
metadata_summary.describe()
Out[49]:
In [50]:
metadata_plusGenre = pd.merge(metadata_summary,df_at.drop_duplicates('artist_id'), how='left', on=['artist_id'])
# metadata_plusGenre = metadata_plusGenre.drop_duplicates(subset=['song_id'])
metadata_plusGenre.describe()
Out[50]:
In [51]:
musicbrainz_summary.describe()
Out[51]:
In [52]:
analysis_summary.to_csv('Track_analysis.csv')
metadata_plusGenre.to_csv('Track_metadata.csv')
musicbrainz_summary.to_csv('Track_musicbrainz.csv')
analysis_summary.to_excel('Track_analysis.xlsx')
In [ ]:
import string,os
for j in ['A','B']:
for k in string.ascii_uppercase:
for l in string.ascii_uppercase:
try:
loc = '../MillionSongSubset/data/'+j+'/'+k+'/'+l+'/'
fn_list = os.listdir(loc)
for fn in fn_list:
try:
with pd.HDFStore(loc+fn) as store:
if fn == 'TRAAAAW128F429D538.h5':
analysis = store.select('analysis/songs')
metadata = store.select('metadata/songs')
musicbrainz = store.select('musicbrainz/songs')
else:
a = store.select('analysis/songs')
me = store.select('metadata/songs')
mu = store.select('musicbrainz/songs')
analysis = pd.concat([analysis,a],ignore_index=True)
metadata = pd.concat([metadata,me],ignore_index=True)
musicbrainz = pd.concat([musicbrainz,mu],ignore_index=True)
except:
with pd.HDFStore(loc+fn) as store:
print(store)
except:
pass
In [ ]:
analysis.describe()
In [ ]:
metadata.describe()
In [ ]:
musicbrainz.describe()
In [ ]:
plt.figure(figsize=(6,6))
corr = analysis.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True)
In [ ]: