In [674]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.cluster as cluster
import time
import hdbscan

from sklearn.metrics import silhouette_score

%matplotlib inline
# Global plot styling: seaborn's large 'poster' scaling, with matplotlib's
# single-letter colour codes ('b', 'g', ...) remapped to the seaborn palette.
sns.set_context('poster')
sns.set_color_codes()
# Shared keyword arguments for every scatter call in this notebook:
# translucent markers of size 80 with no outline.
plot_kwds = dict(alpha=0.25, s=80, linewidths=0)

In [675]:
# Column indices of the data array drawn on the horizontal / vertical axis.
plot_x = 1
plot_y = 3
# Cluster count referenced by the (currently commented-out) agglomerative run.
n_clusters = 3
markers = list()

In [676]:
# Load the CSV for the selected id into a float array, one array row per CSV row.
s_id = 7104
input_path = r'../local_data/cluster_%d.csv' % s_id

# The context manager closes the handle even if parsing fails part-way.
# NOTE: 'rb' is the mode the Python 2 csv module expects; under Python 3 the
# csv module needs a text-mode handle instead (open(input_path, newline='')).
with open(input_path, 'rb') as input_file:
    reader = csv.reader(input_file)
    data_array = list(reader)

# Every field is parsed as a float; a non-numeric cell raises ValueError here.
data = np.array(data_array).astype('float')

In [681]:
# Unclustered preview: column `plot_x` against column `plot_y`, all points blue.
plt.scatter(data[:, plot_x], data[:, plot_y], c='b', **plot_kwds)
# Hide both axes so only the point cloud is shown.
frame = plt.gca()
frame.get_xaxis().set_visible(False)
frame.get_yaxis().set_visible(False)



In [682]:
def plot_clusters(data, all_data, algorithm, args, kwds):
    """Cluster ``data`` and scatter-plot ``all_data`` coloured by cluster label.

    The points are drawn on the current matplotlib axes using the
    notebook-level settings ``plot_x``, ``plot_y`` (column indices of
    ``all_data``) and ``plot_kwds`` (scatter keyword arguments).

    Parameters
    ----------
    data : array-like
        Feature matrix passed to the estimator's ``fit_predict``.
    all_data : numpy.ndarray
        Array whose columns ``plot_x`` and ``plot_y`` are plotted. It needs
        one row per row of ``data``, because one colour is produced per label.
    algorithm : callable
        Estimator factory (e.g. ``cluster.KMeans``), invoked as
        ``algorithm(*args, **kwds)``; the result must offer ``fit_predict``.
    args : tuple
        Positional arguments for ``algorithm``.
    kwds : dict
        Keyword arguments for ``algorithm``.

    Returns
    -------
    labels
        Whatever ``fit_predict`` returned: one cluster label per row of ``data``.

    Notes
    -----
    No silhouette score is computed here. Call ``silhouette_score(data, labels)``
    on the returned labels when it is needed; it is not defined for a
    single-cluster result, which is why it must not block returning the labels.
    """
    labels = algorithm(*args, **kwds).fit_predict(data)

    # One palette entry per non-negative label id. Negative labels (for
    # example the -1 "noise" label some density-based estimators emit) are
    # drawn in black instead of indexing the palette from the end.
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

    plt.scatter(all_data.T[plot_x], all_data.T[plot_y], c=colors, **plot_kwds)
    plt.xlabel('Assignment performance')
    plt.ylabel('Past assignment performance')
    return labels

In [683]:
# K-Means sweep for k = 2..6 on two feature sets:
#   labels_k1 - every column of `data` except the first
#   labels_k2 - all columns of `data`
# Each run gets a fresh figure so the scatter plots do not pile up on one set
# of axes. The runs happen in the same order as before (for each k: reduced
# feature set first, then the full one).
for k in range(2, 7):
    plt.figure()
    labels_k1 = plot_clusters(data[:, 1:], data, cluster.KMeans, (), {'n_clusters': k})
    plt.figure()
    labels_k2 = plot_clusters(data[:, :], data, cluster.KMeans, (), {'n_clusters': k})
# After the loop both names hold the k = 6 assignments, which is what the
# label export at the end of the notebook writes out.



In [693]:
# plot_clusters(data, cluster.AffinityPropagation, (), {'preference':-5.0, 'damping':0.95})

In [694]:
# labels_s1 = plot_clusters(data[:,1:], data, cluster.SpectralClustering, (), {'n_clusters': 3})

In [695]:
# Spectral clustering into three groups, fitted on every column of `data`.
labels_s2 = plot_clusters(
    data[:, :],
    data,
    cluster.SpectralClustering,
    tuple(),
    dict(n_clusters=3),
)



In [696]:
# plot_clusters(data[:,1:], data, cluster.AgglomerativeClustering, (), {'n_clusters':n_clusters, 'linkage':'ward'})

In [697]:
# plot_clusters(data[:,1:], data, cluster.DBSCAN, (), {'eps':0.025})

In [698]:
# plot_clusters(data[:,1:], data, hdbscan.HDBSCAN, (), {'min_cluster_size':15})

In [699]:
# Export the two K-Means label vectors side by side: a header row
# ('l1', 'l2') followed by one row per sample.
output_path = r'local_data/label_%d.csv' % s_id

# Pair the labels up before touching the file, so a missing or mismatched
# label vector fails here instead of leaving a header-only CSV behind.
if len(labels_k1) != len(labels_k2):
    raise ValueError('labels_k1 and labels_k2 differ in length: %d vs %d'
                     % (len(labels_k1), len(labels_k2)))
label_rows = list(zip(labels_k1, labels_k2))

# The context manager flushes and closes the file even if a write fails.
# NOTE: 'wb' is the mode the Python 2 csv module expects; under Python 3 the
# csv module needs a text-mode handle instead (open(output_path, 'w', newline='')).
with open(output_path, 'wb') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['l1', 'l2'])
    writer.writerows(label_rows)