In [1]:
%matplotlib inline
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
rect1 = pd.DataFrame(np.random.rand(1000, 2)*np.asarray([0.2, 1]), columns=['a', 'b'])
rect2 = pd.DataFrame(np.random.rand(100, 2)*np.asarray([0.2, 0.1]) + 0.5, columns=['a', 'b'])
rect = pd.concat([rect1, rect2])

In [26]:
plt.figure()
ax = plt.subplot(111)
rect.plot.scatter(x='a', y='b', ax=ax)
plt.axis('off')
plt.savefig('original_problem.png')


/Users/feizhan/miniconda/envs/py3_env/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [4]:
kmeans = KMeans(n_clusters=2)

In [5]:
kmeans.fit(rect)


Out[5]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [6]:
labels = kmeans.predict(rect)

In [7]:
labels


Out[7]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [8]:
rect.head()


Out[8]:
a b
0 0.025567 0.711467
1 0.023250 0.921974
2 0.113821 0.752094
3 0.181033 0.317396
4 0.178371 0.565227

In [9]:
plt.figure()
ax = plt.subplot(111)
rect.loc[labels== 0].plot.scatter(x='a', y='b', ax=ax)
rect.loc[labels== 1].plot.scatter(x='a', y='b', color='r', ax=ax)
plt.axis('off')
plt.savefig('bad_kmeans.png')


/Users/feizhan/miniconda/envs/py3_env/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [10]:
from clusteror import Clusteror


Using gpu device 0: GeForce GT 650M (CNMeM is disabled, cuDNN 5005)

In [11]:
clusteror = Clusteror(rect)

In [12]:
clusteror.cleaned_data = np.tanh(rect - rect.median())

In [13]:
clusteror.train_sda_dim_reducer(hidden_layers_sizes=[5], corruption_levels=[0.1])

In [14]:
clusteror.reduce_to_one_dim()

In [15]:
clusteror.train_kmeans(n_clusters=2)

In [16]:
clusteror.add_cluster()

In [17]:
clusteror.raw_data.head()


Out[17]:
a b cluster
0 0.025567 0.711467 0
1 0.023250 0.921974 0
2 0.113821 0.752094 0
3 0.181033 0.317396 0
4 0.178371 0.565227 0

In [20]:
plt.figure()
ax = plt.subplot(111)
clusteror.raw_data.loc[(clusteror.raw_data.cluster == 1).values].plot.scatter(x='a', y='b', ax=ax)
clusteror.raw_data.loc[(clusteror.raw_data.cluster == 0).values].plot.scatter(x='a', y='b', ax=ax, color='r')
plt.axis('off')
plt.savefig('leveraged_kmeans.png')


/Users/feizhan/miniconda/envs/py3_env/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [22]:
clusteror.raw_data.cluster.value_counts()


Out[22]:
0    1000
1     100
Name: cluster, dtype: int64

In [23]:
from clusteror.plot import *

In [25]:
hist_plot_one_dim_group_data(clusteror.one_dim_data, clusteror.raw_data.cluster, show=False, filepath='hist.png')



In [23]:
valley_clusteror = Clusteror(rect)

In [24]:
valley_clusteror.cleaned_data = np.tanh(rect.iloc[:, :2] - rect.iloc[:, :2].median())

In [25]:
valley_clusteror.train_sda_dim_reducer()

In [26]:
valley_clusteror.cleaned_data.head()


Out[26]:
a b
0 -0.090516 -0.467712
1 0.023934 -0.119174
2 -0.011268 0.229945
3 -0.031149 -0.160487
4 -0.068324 0.258716

In [27]:
valley_clusteror.reduce_to_one_dim()

In [32]:
valley_clusteror.train_valley(bins=200, contrast=0.1)

In [33]:
valley_clusteror.add_cluster()

In [34]:
valley_clusteror.raw_data.cluster.value_counts()


Out[34]:
1    1033
2      67
Name: cluster, dtype: int64

In [35]:
hist_plot_one_dim_group_data(valley_clusteror.one_dim_data, valley_clusteror.raw_data.cluster, bins=100)



In [36]:
ax = valley_clusteror.raw_data.loc[(valley_clusteror.raw_data.cluster == 1).values].plot.scatter(x='a', y='b')
valley_clusteror.raw_data.loc[(valley_clusteror.raw_data.cluster == 2).values].plot.scatter(x='a', y='b', ax=ax, color='r')
#valley_clusteror.raw_data.loc[(valley_clusteror.raw_data.cluster == 3).values].plot.scatter(x='a', y='b', ax=ax, color='k')


Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1e4de28a58>

In [238]:
ax = valley_clusteror.raw_data.loc[(valley_clusteror.raw_data.cluster == 1).values].plot.scatter(x='a', y='b')



In [ ]: