In [674]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.cluster as cluster
import time
import hdbscan
from sklearn.metrics import silhouette_score
%matplotlib inline
sns.set_context('poster')
sns.set_color_codes()
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}
In [675]:
# Column indices of the data matrix drawn on the x and y axes.
plot_x = 1
plot_y = 3
# Cluster count referenced by the (currently commented-out) agglomerative run.
n_clusters = 3
markers = list()
In [676]:
# Load ../local_data/cluster_<s_id>.csv into a float matrix `data`.
# The open/read/close steps live in one cell inside a `with` block so the
# file handle is released even when parsing fails part-way through.
s_id = 7104
input_path = r'../local_data/cluster_%d.csv' % s_id
# NOTE(review): binary mode 'rb' is the Python 2 csv convention; on Python 3
# csv.reader needs a text-mode file (open(path, newline='')) — confirm which
# interpreter this notebook targets before changing it.
with open(input_path, 'rb') as input_file:
    reader = csv.reader(input_file)
    data_array = list(reader)
# Every field is parsed as a float; a header row or non-numeric cell raises here.
data = np.array(data_array).astype('float')
# Fail early with a readable message instead of an IndexError in the plots below.
assert data.ndim == 2 and data.size > 0, \
    'expected a non-empty rectangular CSV: %s' % input_path
In [681]:
# Unclustered view of the data: column plot_x against column plot_y, all blue.
plt.scatter(data[:, plot_x], data[:, plot_y], c='b', **plot_kwds)
frame = plt.gca()
# Hide both axes (ticks and labels) for this overview plot.
frame.xaxis.set_visible(False)
frame.yaxis.set_visible(False)
In [682]:
def plot_clusters(data, all_data, algorithm, args, kwds):
    """Cluster ``data`` and scatter-plot ``all_data`` coloured by cluster label.

    Parameters
    ----------
    data : array-like
        Samples handed to ``fit_predict`` (may be a column subset of
        ``all_data``).
    all_data : numpy.ndarray
        Array whose columns ``plot_x`` and ``plot_y`` (notebook-level globals)
        are drawn. It is coloured with one entry per label, so it is expected
        to have the same number of rows as ``data``.
    algorithm : callable
        Estimator class; instantiated as ``algorithm(*args, **kwds)`` and must
        provide ``fit_predict``.
    args : tuple
        Positional arguments for the estimator constructor.
    kwds : dict
        Keyword arguments for the estimator constructor.

    Returns
    -------
    labels
        Whatever ``fit_predict`` returned: one cluster label per row of
        ``data``.

    Notes
    -----
    Draws on the current matplotlib axes using the global ``plot_kwds``.
    The timing and silhouette score that used to be computed here were never
    displayed or returned, so they are no longer evaluated; call
    ``silhouette_score(data, labels)`` on the result if the score is needed.
    """
    labels = algorithm(*args, **kwds).fit_predict(data)

    # One palette entry per non-negative label; negative labels (such as the
    # -1 noise label used by DBSCAN-style algorithms) are drawn in black.
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

    plt.scatter(all_data.T[plot_x], all_data.T[plot_y], c=colors, **plot_kwds)
    plt.xlabel('Assignment performance')
    plt.ylabel('Past assignment performance')
    return labels
In [683]:
labels_k2 = plot_clusters(data[:,1:], data, cluster.KMeans, (), {'n_clusters': 2})
In [684]:
labels_k2 = plot_clusters(data[:,:], data, cluster.KMeans, (), {'n_clusters': 2})
In [685]:
labels_k2 = plot_clusters(data[:,1:], data, cluster.KMeans, (), {'n_clusters': 3})
In [686]:
labels_k2 = plot_clusters(data[:,:], data, cluster.KMeans, (), {'n_clusters': 3})
In [687]:
labels_k2 = plot_clusters(data[:,1:], data, cluster.KMeans, (), {'n_clusters': 4})
In [688]:
labels_k2 = plot_clusters(data[:,:], data, cluster.KMeans, (), {'n_clusters': 4})
In [689]:
labels_k2 = plot_clusters(data[:,1:], data, cluster.KMeans, (), {'n_clusters': 5})
In [690]:
labels_k2 = plot_clusters(data[:,:], data, cluster.KMeans, (), {'n_clusters': 5})
In [691]:
labels_k2 = plot_clusters(data[:,1:], data, cluster.KMeans, (), {'n_clusters': 6})
In [692]:
labels_k2 = plot_clusters(data[:,:], data, cluster.KMeans, (), {'n_clusters': 6})
In [693]:
# plot_clusters(data[:,1:], data, cluster.AffinityPropagation, (), {'preference':-5.0, 'damping':0.95})
In [694]:
# labels_s1 = plot_clusters(data[:,1:], data, cluster.SpectralClustering, (), {'n_clusters': 3})
In [695]:
# Spectral clustering into three groups, using every column as a feature.
labels_s2 = plot_clusters(data, data, cluster.SpectralClustering, (), dict(n_clusters=3))
In [696]:
# plot_clusters(data[:,1:], data, cluster.AgglomerativeClustering, (), {'n_clusters':n_clusters, 'linkage':'ward'})
In [697]:
# plot_clusters(data[:,1:], data, cluster.DBSCAN, (), {'eps':0.025})
In [698]:
# plot_clusters(data[:,1:], data, hdbscan.HDBSCAN, (), {'min_cluster_size':15})
In [699]:
# Export the two K-means labelings side by side: header row 'l1','l2', then
# one row per sample. Open/write/close live in one cell inside a `with` block
# so the file is flushed and closed even if writing fails.
# NOTE(review): this writes under 'local_data/' while the input was read from
# '../local_data/' — confirm the relative path is intended.
output_path = r'local_data/label_%d.csv' % s_id
# Validate before opening so a missing or mismatched label array cannot
# truncate an existing output file.
assert len(labels_k1) == len(labels_k2), \
    'labels_k1 and labels_k2 must cover the same samples'
# NOTE(review): binary mode 'wb' is the Python 2 csv convention; on Python 3
# csv.writer needs a text-mode file (open(path, 'w', newline='')) — confirm
# which interpreter this notebook targets before changing it.
with open(output_path, 'wb') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['l1', 'l2'])
    writer.writerows(zip(labels_k1, labels_k2))