In [ ]:
import sys
# Record which Python interpreter this notebook was executed with.
print(sys.version)
In [ ]:
import numpy as np
import matplotlib as mpl
# Select the non-interactive Agg backend; this happens before pyplot is imported.
mpl.use('Agg')
import matplotlib.pyplot as plt
# IPython magic (not plain Python): asks the notebook to render figures inline.
# NOTE(review): the inline magic presumably replaces the Agg backend chosen
# above -- confirm whether the mpl.use('Agg') call is still needed.
%matplotlib inline
import scipy as sp
import time
import pandas as pd
import seaborn as sns
In [ ]:
from scipy.spatial.distance import cdist
In [ ]:
import sys
# Add the relative ../code/ directory to the module search path; presumably
# that is where the local k_means module imported below lives.
sys.path.append('../code/')
from k_means import KMeans
In [ ]:
# Toy data set: one point at (1, 0) with label 0, followed by five identical
# points at (0, 1) with label 1.
pts = np.array([[1.0, 0.0]] + [[0.0, 1.0]] * 5)
pts_y = np.array([0] + [1] * 5)
# Every point needs exactly one label.
assert len(pts) == len(pts_y)
In [ ]:
# Two candidate cluster centers, both placed at the origin (2 centers x 2 dims).
centers = np.zeros((2, 2))
In [ ]:
# Pairwise distances between every point and every center:
# distances[i, j] is the distance from pts[i] to centers[j]
# (cdist uses the Euclidean metric when none is given).
# Call the explicitly imported cdist instead of reaching through
# sp.spatial.distance, which only resolves if the scipy.spatial submodule
# happens to have been loaded already.
distances = cdist(pts, centers)
distances
In [ ]:
# For each point (row of `distances`), the column index of its closest center.
# Kept under its own name rather than rebinding `centers`: overwriting the
# center coordinates with an index vector makes the distance cell fail when it
# is re-run afterwards.
nearest_center = np.argmin(distances, axis=1)
nearest_center
In [ ]:
# Build a KMeans instance with three clusters on the toy points (no PCA object
# supplied) and invoke its run() method.
km_toy = KMeans(
    k=3,
    train_X=pts,
    train_y=pts_y,
    pca_obj=None,
)
km_toy.run()
In [ ]:
# The cells below inspect km_toy after run() has been called.
# Display the `assignments` attribute.
km_toy.assignments
In [ ]:
# Display the `predictions` attribute.
km_toy.predictions
In [ ]:
# Call loss_01(); presumably the 0/1 loss of the predictions -- confirm in k_means.py.
km_toy.loss_01()
In [ ]:
# Display the `results_df` attribute.
km_toy.results_df
In [ ]:
# Sanity check: shows True if results_df was never populated.
km_toy.results_df is None
In [ ]:
# Call the model's squared-reconstruction-error plotting helper.
km_toy.plot_squared_reconstruction_error()
In [ ]:
from sklearn.datasets import make_classification

# Synthetic two-class data set: 100 samples with 5 features, all informative,
# one cluster per class. random_state=None, so no fixed seed is supplied.
X, y = make_classification(
    # size and feature layout
    n_samples=100,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    n_repeated=0,
    # class structure
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.001,
    class_sep=2.0,
    # geometry
    hypercube=True,
    shift=0.0,
    scale=1.0,
    # sampling
    shuffle=True,
    random_state=None,
)
# Disabled train/test split, kept for reference.
# NOTE(review): with n_samples=100 the [120:] slices below would be empty.
#X_train, y_train = X[0:120], y[0:120]
#X_test, y_test = X[120:], y[120:]
In [ ]:
# The whole synthetic data set is used for training; no test split is passed.
train_X, train_y = X, y
km_sklearn = KMeans(
    k=3,
    train_X=train_X,
    train_y=train_y,
    #test_X=X_test, test_y=y_test,
    pca_obj=None,
)
In [ ]:
# Call run() on the model built from the synthetic data.
km_sklearn.run()
In [ ]:
# Squared-reconstruction-error plot; the return value is kept in `p`.
p = km_sklearn.plot_squared_reconstruction_error()
In [ ]:
# 0/1-loss plot; rebinds `p` to this helper's return value.
p = km_sklearn.plot_0_1_loss()
In [ ]:
# Same plotting helper as two cells above, this time without capturing the
# return value, so the notebook displays whatever it returns.
km_sklearn.plot_squared_reconstruction_error()
In [ ]:
# Call the model's helper that plots the number of assignments per center.
km_sklearn.plot_num_assignments_for_each_center()
In [ ]:
# Plot the model's cluster-assignment-count table against iteration.
# Work on a copy re-indexed by 'iteration' so the model's own frame is left
# untouched. (A bare `df.head()` used to sit here; mid-cell its result was
# never displayed, so it has been removed.)
df = km_sklearn.results_df_cluster_assignment_counts.copy().set_index('iteration')
f = plt.figure()
# Use one explicit axes object for title, lines and legend instead of relying
# on pyplot's implicit "current axes".
ax = f.gca()
ax.set_title('Num points per cluster', color='black')
df.plot(figsize=(4, 3), ax=ax)
# Anchor the legend's centre-left corner at the right edge of the axes so it
# sits outside the plotting area.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()
In [ ]:
# Two-cluster model on the full synthetic data set.
# X_test / y_test are not defined anywhere in this notebook, so passing them
# raised NameError. The test arguments are omitted here, as in the other
# KMeans(...) calls in this notebook.
km = KMeans(
    k=2,
    train_X=X,
    train_y=y,
    pca_obj=None,
)
In [ ]:
# Call run() on the two-cluster model.
km.run()
In [ ]:
# Show the first ten entries of the `assignments` attribute.
km.assignments[0:10]
In [ ]:
#from scipy.stats.mstats import mode
from statistics import mode
In [ ]:
# Scratch check: statistics.mode returns the most common value (1 for this input).
mode([0, 1, 1])
In [ ]:
from collections import Counter

# Scratch check: tally how often each value occurs in a small label array
# (three 0s and two 1s).
c = Counter()
c.update(np.array([0, 0, 0, 1, 1]))
In [ ]:
# The counts stored in the Counter, without the values they belong to.
c.values()
In [ ]:
# (value, count) pairs ordered from most to least frequent.
c.most_common()
In [ ]:
# NumPy alternative: occurrences of each non-negative integer, indexed by value
# (position 0 holds the count of 0s, position 1 the count of 1s).
np.bincount(np.array([0, 0, 0, 1, 1]))
In [ ]:
# The cells below inspect the two-cluster model `km`.
# Call loss_01(); presumably the 0/1 loss -- confirm in k_means.py.
km.loss_01()
In [ ]:
# Call loss_01_normalized(); presumably the same loss scaled by the number of
# points -- confirm in k_means.py.
km.loss_01_normalized()
In [ ]:
# Display the `results_df` attribute.
km.results_df
In [ ]:
# Call the model's squared-reconstruction-error plotting helper.
km.plot_squared_reconstruction_error()
In [ ]: