In [1]:
%matplotlib inline
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
In [2]:
# Seeded generator so the synthetic data (and every figure built from it) is
# reproducible after a kernel restart.
rng = np.random.RandomState(0)

# Large strip: 1000 points uniform on [0, 0.2) x [0, 1).
rect1 = pd.DataFrame(rng.rand(1000, 2)*np.asarray([0.2, 1]), columns=['a', 'b'])
# Small patch: 100 points uniform on [0.5, 0.7) x [0.5, 0.6).
rect2 = pd.DataFrame(rng.rand(100, 2)*np.asarray([0.2, 0.1]) + 0.5, columns=['a', 'b'])
# ignore_index yields a unique 0..1099 index; a plain concat would repeat
# labels 0..99, making any label-based alignment or lookup ambiguous.
rect = pd.concat([rect1, rect2], ignore_index=True)
In [26]:
# Scatter the raw data set on a single frameless axes and save it to disk.
fig, ax = plt.subplots()
rect.plot.scatter(x='a', y='b', ax=ax)
ax.axis('off')  # hide ticks and frame; only the point cloud matters here
fig.savefig('original_problem.png')
In [4]:
# Baseline: plain two-cluster k-means. random_state pins the centroid
# initialisation so the labels (and the saved figure) are the same on every run.
kmeans = KMeans(n_clusters=2, random_state=0)
In [5]:
# Fit the baseline k-means model on the raw (unscaled) 'a'/'b' coordinates.
kmeans.fit(rect)
Out[5]:
In [6]:
# Cluster label for every row of rect, in row order.
labels = kmeans.predict(rect)
In [7]:
# Display the predicted label array.
labels
Out[7]:
In [8]:
# Peek at the first rows of the input frame.
rect.head()
Out[8]:
In [9]:
# Overlay the two baseline k-means clusters on one frameless axes:
# label 0 in the default colour, label 1 in red, then save the figure.
fig, ax = plt.subplots()
is_label_0 = labels == 0
is_label_1 = labels == 1
rect.loc[is_label_0].plot.scatter(x='a', y='b', ax=ax)
rect.loc[is_label_1].plot.scatter(x='a', y='b', color='r', ax=ax)
ax.axis('off')
fig.savefig('bad_kmeans.png')
In [10]:
from clusteror import Clusteror
In [11]:
# Wrap the data set in a Clusteror instance.
# NOTE(review): if Clusteror keeps a reference to rect rather than a copy, a
# later add_cluster() may add a 'cluster' column to rect itself — confirm, and
# pass rect.copy() if so.
clusteror = Clusteror(rect)
In [12]:
# Centre each column on its median, then squash with tanh so every value
# lies in (-1, 1) before it is handed to the dimension reducer.
centered = rect - rect.median()
clusteror.cleaned_data = np.tanh(centered)
In [13]:
# Train the dimension reducer with a single hidden-layer size (5) and a single
# corruption level (0.1).
# NOTE(review): "sda" presumably means stacked denoising autoencoder — confirm
# against the clusteror documentation.
clusteror.train_sda_dim_reducer(hidden_layers_sizes=[5], corruption_levels=[0.1])
In [14]:
# Project the cleaned data to one dimension with the trained reducer
# (result is read back later as clusteror.one_dim_data).
clusteror.reduce_to_one_dim()
In [15]:
# Train a two-cluster k-means inside Clusteror.
# NOTE(review): presumably operates on the one-dimensional representation
# produced by reduce_to_one_dim() — confirm.
clusteror.train_kmeans(n_clusters=2)
In [16]:
# Attach the cluster assignment to the data; the cells below read it as
# clusteror.raw_data.cluster.
clusteror.add_cluster()
In [17]:
# Peek at the first rows of the data held by Clusteror after clustering.
clusteror.raw_data.head()
Out[17]:
In [20]:
# Overlay the two Clusteror k-means clusters on one frameless axes:
# cluster 1 in the default colour, cluster 0 in red, then save the figure.
fig, ax = plt.subplots()
clustered = clusteror.raw_data
# .values turns the boolean Series into a positional mask, so selection does
# not depend on index alignment.
in_cluster_1 = (clustered.cluster == 1).values
in_cluster_0 = (clustered.cluster == 0).values
clustered.loc[in_cluster_1].plot.scatter(x='a', y='b', ax=ax)
clustered.loc[in_cluster_0].plot.scatter(x='a', y='b', ax=ax, color='r')
ax.axis('off')
fig.savefig('leveraged_kmeans.png')
In [22]:
# Number of rows assigned to each cluster.
clusteror.raw_data.cluster.value_counts()
Out[22]:
In [23]:
# Import only the helper this notebook uses; a star import hides where names
# come from and can silently shadow existing notebook variables.
from clusteror.plot import hist_plot_one_dim_group_data
In [25]:
# Histogram of the one-dimensional representation, grouped by cluster;
# written to hist.png (show=False) rather than displayed.
hist_plot_one_dim_group_data(clusteror.one_dim_data, clusteror.raw_data.cluster, show=False, filepath='hist.png')
In [23]:
# Second Clusteror instance on the same data, used for the valley method below.
# NOTE(review): rect may already carry a 'cluster' column from the first
# Clusteror's add_cluster() if raw_data shares memory with rect — confirm.
valley_clusteror = Clusteror(rect)
In [24]:
# Use only the first two columns ('a' and 'b' as the frame was built), centre
# them on their medians, and squash with tanh into (-1, 1).
# NOTE(review): the explicit column slice suggests rect may have gained an
# extra column by this point — confirm.
features = rect.iloc[:, :2]
valley_clusteror.cleaned_data = np.tanh(features - features.median())
In [25]:
# Train the dimension reducer with the library's default settings.
valley_clusteror.train_sda_dim_reducer()
In [26]:
# Peek at the first rows of the tanh-scaled input.
valley_clusteror.cleaned_data.head()
Out[26]:
In [27]:
# Project the cleaned data to one dimension with the trained reducer
# (result is read back later as valley_clusteror.one_dim_data).
valley_clusteror.reduce_to_one_dim()
In [32]:
# Train the valley-based clustering with 200 bins and a contrast of 0.1.
# NOTE(review): presumably splits clusters at low-density valleys of the
# one-dimensional histogram; confirm the meaning of `contrast` in the
# clusteror documentation.
valley_clusteror.train_valley(bins=200, contrast=0.1)
In [33]:
# Attach the valley cluster assignment; the cells below read it as
# valley_clusteror.raw_data.cluster.
valley_clusteror.add_cluster()
In [34]:
# Number of rows assigned to each valley cluster.
valley_clusteror.raw_data.cluster.value_counts()
Out[34]:
In [35]:
# Histogram (100 bins) of the one-dimensional representation, grouped by
# valley cluster.
hist_plot_one_dim_group_data(valley_clusteror.one_dim_data, valley_clusteror.raw_data.cluster, bins=100)
In [36]:
# Overlay valley clusters on one axes: cluster 1 in the default colour,
# cluster 2 in red. The first scatter call creates the axes that the second
# one reuses.
valley_points = valley_clusteror.raw_data
# .values turns the boolean Series into a positional mask, so selection does
# not depend on index alignment.
ax = valley_points.loc[(valley_points.cluster == 1).values].plot.scatter(x='a', y='b')
valley_points.loc[(valley_points.cluster == 2).values].plot.scatter(x='a', y='b', ax=ax, color='r')
Out[36]:
In [238]:
# Scatter valley cluster 1 on its own, on a fresh axes.
valley_points = valley_clusteror.raw_data
in_cluster_1 = (valley_points.cluster == 1).values
ax = valley_points.loc[in_cluster_1].plot.scatter(x='a', y='b')
In [ ]: