In [39]:
# Import libraries
import csv
import json
import sys

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
In [40]:
def read_data(filename):
    """Read a CSV file and return its records as a list of rows.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    list of list of str
        One inner list per CSV record, in file order; every field is
        returned as a string exactly as ``csv.reader`` yields it.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    # The csv module expects a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3.  The former mode string 'rB' is not
    # a valid mode at all: Python 3 rejects it with ValueError.
    if sys.version_info[0] >= 3:
        csvf = open(filename, 'r', newline='')
    else:
        csvf = open(filename, 'rb')
    with csvf:
        return list(csv.reader(csvf))
def get_cluster(title):
    """Return the entry stored under ``title`` in ``clusterIdOfSong``.

    NOTE(review): ``clusterIdOfSong`` is a module-level name that is not
    defined in the visible part of this notebook; it has to exist in the
    kernel before this function is called.  Whatever error the lookup raises
    for an unknown ``title`` is propagated unchanged.
    """
    cluster_id = clusterIdOfSong[title]
    return cluster_id
In [41]:
# Read files
# The first field of every row becomes that row's label in Y (with single and
# double quote characters removed); the remaining fields are kept in X.
raw_rows = read_data("songbysongtransprob.csv")
Y = [row[0].replace('"', '').replace("'", '') for row in raw_rows]
X = [row[1:] for row in raw_rows]
In [42]:
# Convert the string fields collected in X into a float array.
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and later removed, so the builtin is used directly.
X_mat = np.array(X, dtype=float)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(X_mat.shape)
In [43]:
"""6 Cluster K-Means Analysis"""
K = 6  # number of clusters
# NOTE(review): n_init=10000 asks for ten thousand K-Means restarts, which
# makes this cell slow to re-run; random_state pins the result.
kmeans_options = dict(n_clusters=K, n_init=10000, random_state=42)
km = KMeans(**kmeans_options)
km.fit(X_mat)
Out[43]:
In [44]:
# Try t-sne
# Embed the rows of X_mat in two dimensions, draw them as a scatter plot
# coloured by K-Means label, and write each entry of Y next to its point.
from sklearn import manifold

tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
X_tsne = tsne.fit_transform(X_mat)

plt.figure(figsize=(40, 35))
plt.scatter(
    X_tsne[:, 0],
    X_tsne[:, 1],
    c=km.labels_,
    s=200,
    cmap=plt.cm.Spectral,
)
ax = plt.gca()
for idx in range(len(Y)):
    point = (X_tsne[idx, 0], X_tsne[idx, 1])
    ax.annotate(Y[idx], point)
In [46]:
# Add center clusters to X, Y so that we can visualize them on the graph
n_songs = X_mat.shape[0]
# One name per fitted centre instead of a hard-coded list of six.
center_names = ['CLUSTER%d' % k for k in range(len(km.cluster_centers_))]
# Slice Y back to the songs before appending, so that re-running this cell
# does not stack a second copy of the CLUSTERn names onto Y.
Y = list(Y[:n_songs]) + center_names
X = np.concatenate((X_mat, km.cluster_centers_), axis=0)

# Define dictionary of song names and song indices
# song_dict:  name -> row of X (feature vector)
# song_index: name -> position of that name in Y / row number in X.
# The cluster centres are ordinary entries of Y here, so they receive the
# positions n_songs .. n_songs + K - 1.  (The earlier version re-assigned them
# to i+1 .. i+6 after the loop, i.e. past the last valid position.)
song_dict = dict(zip(Y, X))
song_index = dict((name, position) for position, name in enumerate(Y))
In [47]:
# Compute distances between every two songs
import itertools
from scipy.spatial import distance

# Both dictionaries are keyed by the (name_a, name_b) tuple produced by
# itertools.combinations over Y.
pairs_cosine = dict()
pairs_euclidean = dict()
counter = 0
for song_pair in itertools.combinations(Y, 2):
    # Look the two feature vectors up once and reuse them for both metrics.
    vec_a = song_dict[song_pair[0]]
    vec_b = song_dict[song_pair[1]]
    pairs_cosine[song_pair] = distance.cosine(vec_a, vec_b)
    pairs_euclidean[song_pair] = distance.euclidean(vec_a, vec_b)
    counter += 1
# Single-argument print(...) works on both Python 2 and Python 3; the bare
# print statement is a syntax error on Python 3.
print('Number of pairs: ' + str(counter))
In [48]:
#Normalize euclidean distances to 0-1 range
# The bounds are taken over the non-NaN distances only.  max()/min() rely on
# comparisons that are always False for NaN, so a single NaN entry could end
# up as a bound (or hide the real one) depending on iteration order.
valid_euclidean = [d for d in pairs_euclidean.values() if not np.isnan(d)]
max_dist = max(valid_euclidean)
min_dist = min(valid_euclidean)
def min_max_scaler(x, min_, max_):
    """Linearly rescale ``x`` so that ``min_`` maps to 0 and ``max_`` maps to 1.

    Parameters
    ----------
    x : number
        Value to rescale.  Values outside ``[min_, max_]`` are not clipped
        and therefore map outside ``[0, 1]``.
    min_, max_ : number
        Lower and upper bound of the original range.

    Returns
    -------
    float
        ``(x - min_) / (max_ - min_)``, or ``0.0`` when the range is empty
        (``max_ == min_``) instead of dividing by zero.
    """
    # float() forces true division even for integer arguments on Python 2.
    span = float(max_) - float(min_)
    if span == 0:
        return 0.0
    return (x - min_) / span
In [49]:
# Filter out NaN distances and scale to range 0-1 (only needed with Euclidean distance)
import math

# Each graph_* list holds ((name_a, name_b), distance) tuples.
graph_cosine = [
    (pair, dist)
    for pair, dist in pairs_cosine.items()
    if not math.isnan(dist)
]
graph_euclidean = [
    (pair, min_max_scaler(dist, min_dist, max_dist))
    for pair, dist in pairs_euclidean.items()
    if not math.isnan(dist)
]
In [50]:
# Plot distribution of distances
# One figure per metric; both share size, axis labels, bin count and colour.
histograms = [
    ('Histogram of cosine distance pairs', graph_cosine),
    ('Histogram of euclidean distance pairs', graph_euclidean),
]
for hist_title, graph in histograms:
    distances = [dist for _pair, dist in graph]
    plt.figure(figsize=(10, 5))
    plt.title(hist_title)
    plt.xlabel('Distance')
    plt.ylabel('Number of distance pairs')
    plt.hist(distances, bins=50, color='green')
    plt.show()
In [51]:
# Manually create JSON file
# json_nodes is the '"nodes": [...]' fragment that the link cells splice into
# the final document; node i of the list corresponds to Y[i].
n_labeled = len(km.labels_)
node_groups = [int(label) for label in km.labels_]
# Entries of Y beyond the labelled songs get consecutive group numbers
# starting at 0, so they are not silently dropped by zip().
# NOTE(review): this assumes those extra entries are the appended CLUSTERn
# centres in cluster order -- confirm against the cell that extends Y.
node_groups.extend(range(len(Y) - n_labeled))
# json.dumps escapes quotes/backslashes in titles; joining the rows avoids
# the trailing-comma slice that broke the output when there were no nodes.
node_rows = [
    '{"id": %s, "group": %d}' % (json.dumps(title), group)
    for title, group in zip(Y, node_groups)
]
json_nodes = '"nodes": [\n' + ',\n'.join(node_rows) + '\n]'
In [52]:
# Links using cosine distance
# Only pairs whose distance is below 0.9 become links; source/target are the
# positions stored in song_index.
link_rows = [
    '\t{"source": %s, "target": %s, "value": %s}'
    % (song_index[pair[0]], song_index[pair[1]], dist)
    for pair, dist in graph_cosine
    if dist < 0.9
]
# Joining the rows yields a valid (possibly empty) JSON array; the previous
# "strip the last two characters" approach removed the opening bracket when
# no pair passed the threshold.
json_links = '"links": [\n' + ',\n'.join(link_rows) + '\n]'
json_string = '{' + json_nodes + ',' + json_links + '}'
# The context manager closes the file even if the write fails.
with open("graph_cosine.json", "w") as text_file:
    text_file.write(json_string)
In [53]:
# Links using euclidean distance
# Only pairs whose scaled distance is below 0.9 become links; source/target
# are the positions stored in song_index.
link_rows = [
    '\t{"source": %s, "target": %s, "value": %s}'
    % (song_index[pair[0]], song_index[pair[1]], dist)
    for pair, dist in graph_euclidean
    if dist < 0.9
]
# Joining the rows yields a valid (possibly empty) JSON array; the previous
# "strip the last two characters" approach removed the opening bracket when
# no pair passed the threshold.
json_links = '"links": [\n' + ',\n'.join(link_rows) + '\n]'
json_string = '{' + json_nodes + ',' + json_links + '}'
# The context manager closes the file even if the write fails.
with open("graph_euclidean.json", "w") as text_file:
    text_file.write(json_string)
In [45]:
# Hierarchical clustering with Scipy -- This method is not using K-means results!!
from scipy.cluster.hierarchy import dendrogram, linkage

# generate the linkage matrix
Z = linkage(X_mat, 'ward')
# Z has one leaf per row of X_mat.  When the notebook is run top to bottom, Y
# has already been extended with the CLUSTERn names by the time this cell
# runs, so only its first X_mat.shape[0] entries are used as leaf labels.
leaf_labels = Y[:X_mat.shape[0]]
# calculate full dendrogram
plt.figure(figsize=(100, 45))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Song')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90., leaf_font_size=9., labels=leaf_labels)
plt.show()