In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.cm as cm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [2]:
# Load the three Million Song Subset extracts. index_col=0 makes the first
# column (presumably the track id -- confirm against the CSVs) the index so
# the three frames align row-wise.
analysis = pd.read_csv('Track_analysis.csv',index_col=0)
metadata = pd.read_csv('Track_metadata.csv',index_col=0)
musicbrainz = pd.read_csv('Track_musicbrainz.csv',index_col=0)
analysis.head()
Out[2]:
In [3]:
# Summary statistics of the audio-analysis features.
analysis.describe()
Out[3]:
In [4]:
# Peek at the track metadata (artist / song popularity fields, genre tag).
metadata.head()
Out[4]:
In [5]:
# Peek at the musicbrainz extract.
musicbrainz.head()
Out[5]:
In [6]:
# Select the continuous features used for clustering.
# Leaving out duration, end_of_fade_in, start_of_fade_out and year because
# that is unlikely to be relevant to any metric; key and time_signature are
# discrete so can't use these.
# (Removed a dead `parameters = pd.DataFrame()` assignment that was
# immediately overwritten by the selection below.)
parameters = analysis[['loudness','tempo']]
parameters = pd.concat([parameters,metadata[['artist_familiarity','artist_hotttnesss','song_hotttnesss']]],axis=1)
# parameters['year'] = musicbrainz['year']
In [7]:
# Rows where any feature is exactly zero are treated as missing and removed,
# then remaining NaNs are dropped.
nonzero_rows = (parameters != 0).all(axis=1)
parameters = parameters.loc[nonzero_rows]
parameters = parameters.dropna()
parameters_description = parameters.describe()
parameters_description
Out[7]:
In [8]:
# Z-score standardise each column so clustering distances treat all
# features on a common scale.
col_means = parameters.mean()
col_stds = parameters.std()
parameters_standard = parameters.sub(col_means).div(col_stds)
parameters_standard.describe()
Out[8]:
In [9]:
# Final (unstandardised) feature table.
parameters.head()
Out[9]:
In [265]:
# Pairwise view of the selected features: KDE contours below the diagonal,
# raw scatter above it, and per-feature KDEs on the diagonal.
g = sns.PairGrid(parameters, diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter, s = 0.1)
g.map_diag(sns.kdeplot, lw=3)
# NOTE(review): relative output path -- confirm ../MillionSongSubset exists.
plt.savefig('../MillionSongSubset/parameter_pairgrid.png')
http://scikit-learn.org/stable/modules/clustering.html#k-means http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
The KMeans algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. This algorithm requires the number of clusters to be specified. It scales well to large number of samples and has been used across a large range of application areas in many different fields.
In [11]:
# Elbow method: fit k-means for a range of k and record the inertia
# (within-cluster sum of squares) so a knee can be read off the curve.
# (Removed a `clf.predict(...)` inside the loop whose result was overwritten
# every iteration and never used -- it doubled the per-k cost for nothing.)
parameters_standard = np.array(parameters_standard)
inertia = []
k_vals = range(2, 30)
for k in k_vals:
    # NOTE(review): n_jobs was removed from KMeans in scikit-learn 1.0 --
    # drop it if the environment is upgraded.
    clf = KMeans(n_clusters=k, n_jobs=-1)
    clf.fit(parameters_standard)
    inertia.append(clf.inertia_)
plt.plot(k_vals, inertia)
plt.xlabel('k (number of clusters)')
plt.ylabel('inertia')
Out[11]:
We move forward with the k-value that corresponds to the knee (elbow) of the inertia curve above.
In [12]:
# k = 7 chosen from the knee of the inertia curve above.
k = 7
# NOTE(review): n_jobs was removed from KMeans in scikit-learn 1.0.
clf = KMeans(n_clusters=k,n_jobs=-1)
clf.fit(parameters_standard)
cluster_labels = clf.predict(parameters_standard)
In [238]:
# 5x5 grid of pairwise scatter plots of the standardised features, coloured
# by k-means cluster label; 'x' markers are the k-means cluster centres.
labels = ['loudness', 'tempo', 'artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss']
fig, axes2d = plt.subplots(nrows=5, ncols=5,
                           sharex=True, sharey=True,
                           figsize=(20, 20))
for i, row in enumerate(axes2d):
    for j, cell in enumerate(row):
        x_col = parameters.columns.get_loc(labels[j])
        y_col = parameters.columns.get_loc(labels[i])
        cell.scatter(parameters_standard[:, x_col],
                     parameters_standard[:, y_col],
                     c=cluster_labels, s=20, label=cluster_labels)
        cell.scatter(clf.cluster_centers_[:, x_col],
                     clf.cluster_centers_[:, y_col],
                     marker='x', s=150, c=range(k))
        if i == len(axes2d) - 1:
            cell.set_xlabel(labels[j])
        if j == 0:
            cell.set_ylabel(labels[i])
plt.tight_layout()
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space.
In [14]:
from sklearn.decomposition import PCA
In [15]:
# Fit PCA on the standardised features and plot the scree curve
# (variance explained per component).
pca = PCA()
pca.fit(parameters_standard)
n_components = len(pca.explained_variance_ratio_)
plt.scatter(range(n_components), pca.explained_variance_ratio_)
plt.xlabel('PCA component')
plt.ylabel('explained_variance_ratio_')
Out[15]:
In [16]:
# Project the standardised features onto the principal components.
X_pca = pca.transform(parameters_standard)
X_pca.shape
Out[16]:
In [234]:
# Scatter the data in the first three principal components, coloured by
# k-means cluster; 'x' marks the cluster centres projected into PCA space.
# The three panels were copy-pasted and pca_centers was recomputed per
# panel; both are folded into one loop / one transform here.
plt.figure(figsize=(3, 6))
pca_centers = pca.transform(clf.cluster_centers_)
for panel, (a, b) in enumerate([(0, 1), (0, 2), (1, 2)], start=1):
    plt.subplot(3, 1, panel)
    plt.scatter(X_pca[:, a], X_pca[:, b], c=cluster_labels,
                s=10, label=cluster_labels)
    plt.scatter(pca_centers[:, a], pca_centers[:, b],
                marker='x', s=150, c=range(k))
    plt.xlabel("PC %d" % a)
    plt.ylabel("PC %d" % b)
    plt.grid()
plt.tight_layout()
In [ ]:
From http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
t-SNE [1] is a tool to visualize high-dimensional data. It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. t-SNE has a cost function that is not convex, i.e. with different initializations we can get different results.
In [18]:
from sklearn.manifold import TSNE
In [30]:
# 2-D t-SNE embedding for visualisation.
tsne = TSNE(n_components=2)
In [31]:
# NOTE: t-SNE's cost is non-convex; re-runs can give different embeddings.
X_tsne = tsne.fit_transform(parameters_standard)
In [32]:
# t-SNE embedding coloured by the k-means cluster labels.
# Bug fix: the axis labels were swapped -- component 0 is on the x axis
# and component 1 on the y axis.
plt.scatter(X_tsne[:,0],X_tsne[:,1], c=cluster_labels,
s=50,label=cluster_labels)
plt.xlabel("Component 0")
plt.ylabel("Component 1")
plt.grid()
From http://scikit-learn.org/stable/modules/generated/sklearn.manifold.Isomap.html
Non-linear dimensionality reduction through Isometric Mapping
In [22]:
# Isomap: non-linear dimensionality reduction through isometric mapping.
# Bug fix: the estimator was fitted twice (`fit` then `fit_transform`) and
# the first result bound to an unused variable `y`; a single fit_transform
# halves the cost with the same output.
from sklearn.manifold import Isomap
imap = Isomap(n_neighbors=5, n_components=2)
Y = imap.fit_transform(parameters_standard)
plt.scatter(Y[:, 0], Y[:, 1], c=cluster_labels)
Out[22]:
http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering http://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html
SpectralClustering performs a low-dimensional embedding of the affinity matrix between samples, followed by clustering of the components of the eigenvectors in the low-dimensional space. Here we apply it with a nearest-neighbours affinity as an alternative to the KMeans clustering above. (The text previously duplicated the KMeans description.)
In [239]:
# Spectral clustering (nearest-neighbour affinity graph) on the same
# standardised features, shown on the same 5x5 pairwise grid.
# NOTE(review): the 'x' markers are the *k-means* centres (clf), overlaid
# for comparison with the spectral colouring -- confirm this is intended.
from sklearn.cluster import SpectralClustering
spec = SpectralClustering(n_clusters=7, affinity='nearest_neighbors')
spec_labels = spec.fit_predict(parameters_standard)
labels = ['loudness', 'tempo', 'artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss']
fig, axes2d = plt.subplots(nrows=5, ncols=5,
                           sharex=True, sharey=True,
                           figsize=(20, 20))
for i, row in enumerate(axes2d):
    for j, cell in enumerate(row):
        x_col = parameters.columns.get_loc(labels[j])
        y_col = parameters.columns.get_loc(labels[i])
        cell.scatter(parameters_standard[:, x_col],
                     parameters_standard[:, y_col],
                     c=spec_labels, s=20, label=cluster_labels)
        cell.scatter(clf.cluster_centers_[:, x_col],
                     clf.cluster_centers_[:, y_col],
                     marker='x', s=150, c=range(k))
        if i == len(axes2d) - 1:
            cell.set_xlabel(labels[j])
        if j == 0:
            cell.set_ylabel(labels[i])
plt.tight_layout()
In [220]:
# Spectral labels in PCA space; 'x' markers are the k-means centres.
# Bug fix: axes were labelled "PC 1"/"PC0" although components 0 and 1 are
# plotted on x and y respectively.
plt.scatter(X_pca[:,0],X_pca[:,1], c=spec_labels,
s=10,label=cluster_labels)
pca_centers = pca.transform(clf.cluster_centers_)
plt.scatter(pca_centers[:,0],pca_centers[:,1],marker='x',s=150,c=range(k))
plt.xlabel("PC 0")
plt.ylabel("PC 1")
plt.grid()
In [25]:
from sklearn.cluster import AgglomerativeClustering
In [240]:
# Agglomerative clustering (cosine affinity, complete linkage), shown on
# the same 5x5 pairwise grid.
# NOTE(review): the 'x' markers are the *k-means* centres (clf), overlaid
# for comparison with the agglomerative colouring -- confirm intended.
agg = AgglomerativeClustering(n_clusters=7, affinity='cosine', linkage='complete')
agg_labels = agg.fit_predict(parameters_standard)
fig, axes2d = plt.subplots(nrows=5, ncols=5,
                           sharex=True, sharey=True,
                           figsize=(20, 20))
for i, row in enumerate(axes2d):
    for j, cell in enumerate(row):
        x_col = parameters.columns.get_loc(labels[j])
        y_col = parameters.columns.get_loc(labels[i])
        cell.scatter(parameters_standard[:, x_col],
                     parameters_standard[:, y_col],
                     c=agg_labels, s=20, label=cluster_labels)
        cell.scatter(clf.cluster_centers_[:, x_col],
                     clf.cluster_centers_[:, y_col],
                     marker='x', s=150, c=range(k))
        if i == len(axes2d) - 1:
            cell.set_xlabel(labels[j])
        if j == 0:
            cell.set_ylabel(labels[i])
plt.tight_layout()
In [182]:
# Agglomerative labels in PCA space; 'x' markers are the k-means centres.
# Bug fix: axes were labelled "PC 1"/"PC0" although components 0 and 1 are
# plotted on x and y respectively.
plt.scatter(X_pca[:,0],X_pca[:,1], c=agg_labels,
s=10,label=cluster_labels)
pca_centers = pca.transform(clf.cluster_centers_)
plt.scatter(pca_centers[:,0],pca_centers[:,1],marker='x',s=150,c=range(k))
plt.xlabel("PC 0")
plt.ylabel("PC 1")
plt.grid()
In [178]:
# Join each track's three cluster assignments (k-means / spectral /
# agglomerative) to its metadata, then ask how genres distribute over the
# clusters of each method. (A dead commented-out plotting block at the end
# of the cell was removed.)
cluster_to_genre = pd.DataFrame({'cluster_labels_kmeans': cluster_labels,
                                 'cluster_labels_spec': spec_labels,
                                 'cluster_labels_agg': agg_labels},
                                index=parameters.index)
cluster_to_genre = pd.merge(cluster_to_genre, metadata,
                            left_index=True, right_index=True, how='left')
unique_genres = list(cluster_to_genre.mbtag.dropna().unique())

def num_per_cluster(label):
    """Group `cluster_to_genre` by the clustering column `label`.

    Returns (unique-genre count per cluster, genre list per cluster,
    the groupby object).

    NOTE(review): assumes cluster labels are exactly 0..n_clusters-1 with
    every cluster non-empty -- holds for the clusterings fitted above.
    """
    grouped = cluster_to_genre.groupby(label)
    num_unique_genres = []
    unique_genres_in_cluster = []
    for i in range(len(grouped)):
        genres = grouped.get_group(i).mbtag.dropna()
        num_unique_genres.append(genres.nunique())
        unique_genres_in_cluster.append(list(genres.unique()))
    return num_unique_genres, unique_genres_in_cluster, grouped

def genreTable(ugc, label, grouped):
    """Genre x cluster incidence matrix: entry is 1 where the genre occurs
    in that cluster. `label` is unused; kept for call-site compatibility."""
    overlaps = np.zeros((len(unique_genres), len(grouped)))
    for i in range(len(grouped)):
        locs = [unique_genres.index(g) for g in ugc[i]]
        overlaps[locs, i] += 1
    return overlaps

nug_k, ugc_k, g_k = num_per_cluster('cluster_labels_kmeans')
nug_s, ugc_s, g_s = num_per_cluster('cluster_labels_spec')
nug_a, ugc_a, g_a = num_per_cluster('cluster_labels_agg')
overlaps_k = genreTable(ugc_k, cluster_labels, g_k)
overlaps_s = genreTable(ugc_s, spec_labels, g_s)
overlaps_a = genreTable(ugc_a, agg_labels, g_a)

# Unique genres per cluster, per method.
plt.figure()
plt.plot(range(len(set(cluster_labels))), nug_k, label='kmeans')
plt.plot(range(len(set(spec_labels))), nug_s, label='spectral')
plt.plot(range(len(set(agg_labels))), nug_a, label='agglomerative')
plt.xlabel("cluster")
plt.ylabel("number of unique genres in cluster")
plt.legend()

# Heatmaps of the genre/cluster incidence for each method.
for overlap in (overlaps_k, overlaps_s, overlaps_a):
    fig = plt.figure(figsize=(5, 30))
    ax1 = fig.add_subplot(1, 1, 1)
    sns.heatmap(overlap, ax=ax1)
    a = ax1.set_yticklabels(unique_genres)
In [29]:
#One Hot Encoding
cat_cols = ["mbtag"]
df_continuous = pd.get_dummies(metadata.mbtag,columns=cat_cols)
df_continuous.describe()
df_continuous.to_csv('one_hot_genres.csv')
df_continuous.describe().to_csv('one_hot_genres_stats.csv')
In [103]:
macro_genres = {}
unique_genres2 = set(unique_genres)
tags = ['rock','elec','classic','jazz','punk','blue','house','fusion','rap',
'rnb','60','alternative','indie','folk',
'country','dub','easy','metal',
['irish','irish','celtic'],
['soulreggae','soul','reggae'],
['pop','pop','taylor-swift'],
['singers','singer','group'],
['hip-hop','hip','hop'],
['easy','downtempo'],
['instrumental','instrumental','saxaphone','orchestral',
'conductor','guitarist'],
['city','new','wave','grindcore','grunge','american underground',
'hard','core'],
['musical','musical','soundtrack','star academy', 'american idol',
'film', 'production music','singer-songwriter','video game','producer'],
['vocal','soprano','whistle register','a cappella'],
['christian','christian','gospel']]
for tag in tags:
if isinstance(tag,list):
macro_genres[tag[0]] = []
for t in tag[1:]:
tagList = [x for x in unique_genres2 if t in x]
macro_genres[tag[0]] += tagList
unique_genres2 = unique_genres2.difference(set(tagList))
else:
tagList = [x for x in unique_genres2 if tag in x]
macro_genres[tag] = tagList
unique_genres2 = unique_genres2.difference(set(tagList))
macro_genres['other'] = unique_genres2
print(unique_genres2)
In [104]:
# Sanity check: macro-genre names and how many raw tags each absorbed.
print(macro_genres.keys())
print([len(x) for x in macro_genres.values()])
In [242]:
def minor_to_macro_genre(tag):
    """Return the macro-genre whose member collection contains `tag`.

    Implicitly returns None when the tag matches no macro genre (e.g. a
    missing/NaN tag), which pandas .apply then stores as a missing value.
    """
    for macro, members in macro_genres.items():
        if tag in members:
            return macro
In [ ]:
In [243]:
# Repeat the genre-per-cluster analysis using the collapsed macro genres.
# num_per_cluster and genreTable are already defined earlier in the
# notebook and read the module-level cluster_to_genre / unique_genres at
# call time, so re-binding those globals here is sufficient -- the
# byte-identical duplicate function definitions this cell carried were
# removed (later defs silently shadowed the earlier ones).
cluster_to_genre = pd.DataFrame({'cluster_labels_kmeans': cluster_labels,
                                 'cluster_labels_spec': spec_labels,
                                 'cluster_labels_agg': agg_labels},
                                index=parameters.index)
cluster_to_genre['mbtag'] = metadata.mbtag.apply(minor_to_macro_genre).loc[parameters.index]
unique_genres = list(cluster_to_genre.mbtag.dropna().unique())

nug_k, ugc_k, g_k = num_per_cluster('cluster_labels_kmeans')
nug_s, ugc_s, g_s = num_per_cluster('cluster_labels_spec')
nug_a, ugc_a, g_a = num_per_cluster('cluster_labels_agg')
overlaps_k = genreTable(ugc_k, cluster_labels, g_k)
overlaps_s = genreTable(ugc_s, spec_labels, g_s)
overlaps_a = genreTable(ugc_a, agg_labels, g_a)

# Unique macro genres per cluster, per method.
plt.figure()
plt.plot(range(len(set(cluster_labels))), nug_k, label='kmeans')
plt.plot(range(len(set(spec_labels))), nug_s, label='spectral')
plt.plot(range(len(set(agg_labels))), nug_a, label='agglomerative')
plt.xlabel("cluster")
plt.ylabel("number of unique genres in cluster")
plt.legend()

# Heatmaps of the macro-genre/cluster incidence for each method.
for overlap in (overlaps_k, overlaps_s, overlaps_a):
    fig = plt.figure(figsize=(5, 30))
    ax1 = fig.add_subplot(1, 1, 1)
    sns.heatmap(overlap, ax=ax1)
    a = ax1.set_yticklabels(unique_genres)
In [212]:
# For each clustering method, one point per macro genre: the number of
# clusters that genre is spread across.
# Bug fix: a new figure was opened inside the loop every iteration while
# subplot(3,1,n+1) was used, leaving three mostly-empty figures; the figure
# is now created once with the three stacked subplots.
overlaps = [overlaps_k, overlaps_s, overlaps_a]
plt.figure(figsize=(10, 9))
for n in range(3):
    noise_in_genres = pd.DataFrame(
        {'num_clusters': np.count_nonzero(overlaps[n], axis=1)},
        index=unique_genres)
    ax = plt.subplot(3, 1, n + 1)
    ax.scatter(range(len(unique_genres)), noise_in_genres.num_clusters)
    ax.set_xticks(range(len(unique_genres)))
    # NOTE(review): tick labels come from macro_genres.keys() while the data
    # is ordered by unique_genres -- the orders may not match; confirm.
    a = ax.set_xticklabels(macro_genres.keys(), rotation=70)
In [188]:
# Bug fix: `metadata_macro_genres = metadata` created an alias, so the
# attribute assignment below silently overwrote `metadata.mbtag` itself --
# on a re-run the already-collapsed tags would be mapped again and come
# back as all-missing. Work on an explicit copy instead.
metadata_macro_genres = metadata.copy()
metadata_macro_genres.mbtag = metadata.mbtag.apply(minor_to_macro_genre)
In [190]:
# Metadata with mbtag replaced by the macro genre.
metadata_macro_genres.head()
Out[190]:
In [205]:
# Distribution of artist familiarity within each macro genre.
plt.figure(figsize=(15,5))
sns.violinplot(x="mbtag", y="artist_familiarity", data=metadata_macro_genres)
plt.xticks(rotation=70);
In [244]:
# Distribution of artist hotttnesss within each macro genre.
plt.figure(figsize=(15,5))
sns.violinplot( x="mbtag",y="artist_hotttnesss", data=metadata_macro_genres)
plt.xticks(rotation=70);
In [209]:
# Persist the macro-genre-tagged metadata for downstream notebooks.
metadata_macro_genres.to_csv('../MillionSongSubset/metadata_macro_genres.csv')
In [263]:
# 5x5 pairwise grid of the standardised features coloured by macro genre
# (RdYlGn colour map); 'x' markers are the k-means centres.
# Perf fix: the original rebuilt list(set(...)) and called .index() for
# every row (quadratic); the genre -> integer map is now computed once.
labels = ['loudness', 'tempo', 'artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss']
genre_order = list(set(metadata_macro_genres.mbtag))
genre_index = {g: i for i, g in enumerate(genre_order)}
genrelabels = [genre_index[g] for g in metadata_macro_genres.mbtag]
from matplotlib import colors
import matplotlib.cm as cmx
cm = plt.get_cmap('RdYlGn')
# NOTE(review): max label is len(...)-1, so vmax=len(...) leaves the top
# colour unused -- confirm whether that off-by-one is intentional.
cNorm = colors.Normalize(vmin=0, vmax=len(set(genrelabels)))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
# NOTE(review): genrelabels covers every row of metadata while
# parameters_standard holds only the filtered rows -- lengths may differ;
# confirm this renders as intended.
fig, axes2d = plt.subplots(nrows=5, ncols=5,
                           sharex=True, sharey=True,
                           figsize=(20, 20))
for i, row in enumerate(axes2d):
    for j, cell in enumerate(row):
        x_col = parameters.columns.get_loc(labels[j])
        y_col = parameters.columns.get_loc(labels[i])
        cell.scatter(parameters_standard[:, x_col],
                     parameters_standard[:, y_col],
                     c=scalarMap.to_rgba(genrelabels),
                     s=20, label=cluster_labels)
        cell.scatter(clf.cluster_centers_[:, x_col],
                     clf.cluster_centers_[:, y_col],
                     marker='x', s=150, c=range(k))
        if i == len(axes2d) - 1:
            cell.set_xlabel(labels[j])
        if j == 0:
            cell.set_ylabel(labels[i])
plt.tight_layout()
In [ ]: