In [ ]:
import sys
# Record the interpreter version the notebook was executed with.
print(sys.version)

In [ ]:
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import scipy as sp
import time

import pandas as pd
import seaborn as sns

In [ ]:
from scipy.spatial.distance import cdist

In [ ]:
import sys
# Make the project's source directory importable. The path is relative, so it
# resolves against the notebook's working directory at run time.
sys.path.append('../code/')

# Project-local KMeans implementation (k_means.py is not part of this notebook).
from k_means import KMeans

In [ ]:
# Toy data set: one point at (1, 0) labelled 0 and five identical points at
# (0, 1) labelled 1.
pts = np.array([[1.0, 0.0]] + [[0.0, 1.0]] * 5)
pts_y = np.array([0] + [1] * 5)

# Every point needs exactly one label.
assert len(pts) == len(pts_y)

In [ ]:
# Two candidate centres in 2-D, both placed at the origin (float array).
centers = np.zeros((2, 2))

In [ ]:
# Pairwise distances (cdist's default metric is Euclidean): row i holds the
# distance from pts[i] to each row of `centers`.
distances = cdist(pts, centers)
distances

In [ ]:
# Index of the closest centre for every point (row-wise argmin over the
# distance matrix; ties resolve to the first, i.e. lowest, index).
#
# The result is kept under its own name rather than rebinding `centers`:
# overwriting the centre coordinates with a 1-D index array meant the cdist
# cell could no longer be re-run, because cdist expects 2-D inputs.
nearest_center = np.argmin(distances, axis=1)
nearest_center

In [ ]:
# Build the project's KMeans on the toy points with three clusters and no PCA
# object, then execute it. (What run() does internally lives in k_means.py.)
km_toy = KMeans(
    k=3,
    train_X=pts,
    train_y=pts_y,
    pca_obj=None,
)
km_toy.run()

In [ ]:
# Display the `assignments` attribute of the toy model
# (presumably the cluster index per point -- confirm in k_means.py).
km_toy.assignments

In [ ]:
# Display the `predictions` attribute of the toy model
# (presumably a predicted label per point -- confirm in k_means.py).
km_toy.predictions

In [ ]:
# Presumably the 0-1 (misclassification) loss of the toy model -- confirm in k_means.py.
km_toy.loss_01()

In [ ]:
# Show the results table kept on the toy model (its columns are defined in k_means.py).
km_toy.results_df

In [ ]:
# Quick check: evaluates to True if `results_df` is still None.
km_toy.results_df is None

In [ ]:
# Plot produced by the KMeans helper for the toy model; the method name suggests
# squared reconstruction error -- confirm axes/contents in k_means.py.
km_toy.plot_squared_reconstruction_error()

Train on sklearn classification data


In [ ]:
from sklearn.datasets import make_classification

# Synthetic two-class problem: 100 samples with 5 features, all informative,
# one cluster per class, and well-separated classes (class_sep=2.0).
#
# random_state is fixed so the generated data -- and therefore every result
# and plot derived from X, y -- is identical on each "Restart & Run All".
# With random_state=None a fresh data set was drawn on every execution.
X, y = make_classification(
    n_samples=100,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.001,
    class_sep=2.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=0,
)
# NOTE: no train/test split is made in this cell. The split that used to be
# sketched here in comments sliced at index 120, which cannot work with
# n_samples=100 (the test slice would be empty).

In [ ]:
# Train on the full generated sample; no test set is supplied to this model.
train_X, train_y = X, y
km_sklearn = KMeans(k=3, train_X=train_X, train_y=train_y, pca_obj=None)

In [ ]:
# Execute the model on the generated data (what run() does is defined in k_means.py).
km_sklearn.run()

In [ ]:
# Bind the return value to `p` so it is not echoed as the cell's output.
p = km_sklearn.plot_squared_reconstruction_error()

In [ ]:
# Plot from the KMeans helper (name suggests 0-1 loss -- confirm in k_means.py);
# the return value is bound to `p` so it is not echoed as the cell's output.
p = km_sklearn.plot_0_1_loss()

In [ ]:
# Same plotting call as two cells earlier, but here the return value is left
# as the cell's output instead of being captured.
km_sklearn.plot_squared_reconstruction_error()

In [ ]:
# Plot from the KMeans helper; the name suggests the number of points assigned
# to each centre -- confirm in k_means.py.
km_sklearn.plot_num_assignments_for_each_center()

In [ ]:
# Cluster-assignment counts recorded by the model, re-indexed by iteration so
# that every remaining column is drawn as one line. set_index returns a new
# frame, so the table stored on km_sklearn is left untouched.
df = km_sklearn.results_df_cluster_assignment_counts.set_index('iteration')

# Explicit figure/axes instead of the pyplot state machine; the size matches
# the (4, 3) previously passed to DataFrame.plot.
fig, ax = plt.subplots(figsize=(4, 3))
df.plot(ax=ax)
ax.set_title('Num points per cluster', color='black')
# Place the legend outside the axes, to the right, so it cannot cover the lines.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

Train on some more legit fake data


In [ ]:
# X_test / y_test were referenced here but never defined anywhere earlier in
# the notebook (they only appeared in commented-out code), so this cell raised
# NameError on a fresh kernel. Create the hold-out split explicitly.
#
# make_classification was called with shuffle=True, so a plain slice is
# sufficient: the first 80% of the rows train the model and the rest are held out.
n_train = int(0.8 * len(X))
X_train, y_train = X[:n_train], y[:n_train]
X_test, y_test = X[n_train:], y[n_train:]

km = KMeans(k=2, train_X=X_train, train_y=y_train,
            test_X=X_test, test_y=y_test,
            pca_obj=None)

In [ ]:
# Execute the two-cluster model (what run() does is defined in k_means.py).
km.run()

In [ ]:
# Peek at the first ten entries of `assignments` rather than dumping all of them.
km.assignments[0:10]

In [ ]:
#from scipy.stats.mstats import mode
from statistics import mode

In [ ]:
# Sanity check of statistics.mode: the most common value in [0, 1, 1] is 1.
mode([0, 1, 1])

In [ ]:
from collections import Counter

# Tally how often each label occurs. Iterating a NumPy array yields NumPy
# integer scalars, which become the Counter's keys.
sample_labels = np.array([0, 0, 0, 1, 1])
c = Counter(sample_labels)

In [ ]:
# The counts only, without the labels they belong to.
c.values()

In [ ]:
# (label, count) pairs ordered from most to least frequent.
c.most_common()

In [ ]:
# NumPy alternative to Counter for non-negative integer labels: position i of
# the result holds the number of occurrences of i (here 3 zeros and 2 ones).
np.bincount(np.array([0, 0, 0, 1, 1]))

In [ ]:
# Presumably the 0-1 (misclassification) loss of the two-cluster model -- confirm in k_means.py.
km.loss_01()

In [ ]:
# Normalized variant of the 0-1 loss; the normalization used is defined in
# k_means.py (presumably division by the number of points -- confirm).
km.loss_01_normalized()

In [ ]:
# Show the results table kept on the two-cluster model (columns defined in k_means.py).
km.results_df

In [ ]:
# Plot produced by the KMeans helper for the two-cluster model; the method name
# suggests squared reconstruction error -- confirm in k_means.py.
km.plot_squared_reconstruction_error()

In [ ]: