notebook.community

Edit and run



In [ ]:

    
# Record which interpreter produced the outputs below.
import sys
print(sys.version)



In [ ]:

    
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import scipy as sp
import time

import pandas as pd
import seaborn as sns



In [ ]:

    
from scipy.spatial.distance import cdist



In [ ]:

    
import sys
# Make the project's local modules importable. The entry is a relative path,
# so it resolves against the kernel's working directory
# (assumes the notebook is launched from its own folder — TODO confirm).
sys.path.append('../code/')

# Project-local KMeans implementation exercised in the cells below.
from k_means import KMeans



In [ ]:

    
# Toy data set: a single point at (1, 0) labelled 0, followed by five copies
# of (0, 1) labelled 1.
pts = np.array([[1.0, 0.0]] + [[0.0, 1.0]] * 5)
pts_y = np.array([0] + [1] * 5)

# Every point must have exactly one label.
assert len(pts) == len(pts_y)



In [ ]:

    
# Two candidate centers, both placed at the origin (2 centers x 2 coordinates).
centers = np.zeros((2, 2))



In [ ]:

    
# Pairwise distances (cdist's default Euclidean metric): one row per point,
# one column per center.
distances = cdist(pts, centers)
distances



In [ ]:

    
# Index of the closest center for each point (argmin across the center axis;
# ties resolve to the lowest index).
# Kept under its own name instead of rebinding `centers`: overwriting the
# coordinate array with an index array made the distance computation fail
# when its cell was re-run after this one.
nearest_center = distances.argmin(axis=1)
nearest_center



In [ ]:

    
# Build the project's KMeans on the toy points (no PCA object supplied) and run it.
# NOTE(review): k=3 exceeds the two distinct locations present in `pts` —
# confirm this is intentional.
km_toy = KMeans(
    k=3,
    train_X=pts,
    train_y=pts_y,
    pca_obj=None,
)
km_toy.run()



In [ ]:

    
# Show the toy model's `assignments` attribute
# (presumably the cluster index chosen for each training point — confirm in k_means).
km_toy.assignments



In [ ]:

    
# Show the toy model's `predictions` attribute
# (presumably the label predicted for each training point — confirm in k_means).
km_toy.predictions



In [ ]:

    
# Call `loss_01()` on the toy model and display whatever it returns
# (presumably a 0/1 misclassification loss — confirm in k_means).
km_toy.loss_01()



In [ ]:

    
# Display the toy model's `results_df` attribute.
km_toy.results_df



In [ ]:

    
# Sanity check: evaluates to True if `results_df` is still None at this point.
km_toy.results_df is None



In [ ]:

    
# Call the toy model's squared-reconstruction-error plotting method.
km_toy.plot_squared_reconstruction_error()

Train on synthetic classification data generated with scikit-learn



In [ ]:

    
from sklearn.datasets import make_classification

# Synthetic two-class problem: 100 samples with 5 features (all informative),
# one cluster per class, and widely separated classes (class_sep=2.0).
# random_state is pinned so that Restart & Run All regenerates the same data;
# with random_state=None every run produced a different data set, which made
# the results below impossible to reproduce.
X, y = make_classification(n_samples=100, n_features=5,
                           n_informative=5, n_redundant=0, n_repeated=0,
                           n_classes=2, n_clusters_per_class=1,
                           weights=None, flip_y=0.001, class_sep=2.0,
                           hypercube=True, shift=0.0, scale=1.0,
                           shuffle=True, random_state=0)



In [ ]:

    
# Train on the complete generated data set; no test split is passed here.
train_X, train_y = X, y
km_sklearn = KMeans(k=3, train_X=train_X, train_y=train_y, pca_obj=None)



In [ ]:

    
# Run the model built on the scikit-learn data.
km_sklearn.run()



In [ ]:

    
# Call the squared-reconstruction-error plotting method; binding the return
# value to `p` keeps the cell from echoing it as output.
p = km_sklearn.plot_squared_reconstruction_error()



In [ ]:

    
# Call the 0/1-loss plotting method; `p` is rebound to this call's return value.
p = km_sklearn.plot_0_1_loss()



In [ ]:

    
# NOTE(review): same plotting call as an earlier cell, here without capturing
# the return value — possibly a leftover duplicate.
km_sklearn.plot_squared_reconstruction_error()



In [ ]:

    
# Call the model's method for plotting the number of assignments per center.
km_sklearn.plot_num_assignments_for_each_center()



In [ ]:

    
# Cluster-assignment counts recorded by the model, re-indexed by iteration.
# set_index returns a new frame, so the model's own DataFrame is left untouched
# without needing an explicit copy or an in-place mutation.
# (The former mid-cell `df.head()` was dropped: its value was discarded.)
df = km_sklearn.results_df_cluster_assignment_counts.set_index('iteration')

# Draw on an explicitly created Axes rather than relying on pyplot's implicit
# "current figure" state.
f, ax = plt.subplots(figsize=(4, 3))
df.plot(ax=ax)
ax.set_title('Num points per cluster', color='black')
# Anchor the legend just outside the right edge so it does not cover the lines.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

Train on more realistic synthetic data



In [ ]:

    
# `X_test` / `y_test` are not defined by any earlier cell, so referencing them
# raised NameError on a fresh kernel. Build the split here instead: hold out
# the last 20% of the samples (make_classification was called with
# shuffle=True, so a tail slice is fine) and train on the remainder, so the
# test points are unseen during fitting.
n_train = int(0.8 * len(X))
X_train, y_train = X[:n_train], y[:n_train]
X_test, y_test = X[n_train:], y[n_train:]

km = KMeans(k=2, train_X=X_train, train_y=y_train,
            test_X=X_test, test_y=y_test,
            pca_obj=None)



In [ ]:

    
# Run the two-cluster model.
km.run()



In [ ]:

    
# Peek at the first ten entries of `assignments` rather than dumping them all.
km.assignments[0:10]



In [ ]:

    
#from scipy.stats.mstats import mode
from statistics import mode



In [ ]:

    
# Quick check of statistics.mode: the most common value here is 1.
mode([0, 1, 1])



In [ ]:

    
from collections import Counter

# Tally how often each label occurs in an array of three 0s followed by two 1s.
c = Counter(np.repeat([0, 1], [3, 2]))



In [ ]:

    
# Counts only, without the labels they belong to.
c.values()



In [ ]:

    
# (label, count) pairs ordered from most to least frequent.
c.most_common()



In [ ]:

    
# NumPy alternative to Counter: position i holds the number of occurrences of
# the integer i, so this gives [3, 2].
np.bincount(np.array([0, 0, 0, 1, 1]))



In [ ]:

    
# Call `loss_01()` on the two-cluster model and display whatever it returns
# (presumably a 0/1 misclassification loss — confirm in k_means).
km.loss_01()



In [ ]:

    
# Call `loss_01_normalized()` and display its result
# (presumably the 0/1 loss scaled by the number of points — confirm in k_means).
km.loss_01_normalized()



In [ ]:

    
# Display the two-cluster model's `results_df` attribute.
km.results_df



In [ ]:

    
# Call the two-cluster model's squared-reconstruction-error plotting method.
km.plot_squared_reconstruction_error()



In [ ]: