notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans



In [2]:

    
rect1 = pd.DataFrame(np.random.rand(1000, 2)*np.asarray([0.2, 1]), columns=['a', 'b'])
rect2 = pd.DataFrame(np.random.rand(100, 2)*np.asarray([0.2, 0.1]) + 0.5, columns=['a', 'b'])
rect = pd.concat([rect1, rect2])



In [26]:

    
plt.figure()
ax = plt.subplot(111)
rect.plot.scatter(x='a', y='b', ax=ax)
plt.axis('off')
plt.savefig('original_problem.png')









    



/Users/feizhan/miniconda/envs/py3_env/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [4]:

    
kmeans = KMeans(n_clusters=2)



In [5]:

    
kmeans.fit(rect)









    Out[5]:





KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)



In [6]:

    
labels = kmeans.predict(rect)



In [7]:

    
labels









    Out[7]:





array([0, 0, 0, ..., 0, 0, 0], dtype=int32)



In [8]:

    
rect.head()



In [9]:

    
plt.figure()
ax = plt.subplot(111)
rect.loc[labels== 0].plot.scatter(x='a', y='b', ax=ax)
rect.loc[labels== 1].plot.scatter(x='a', y='b', color='r', ax=ax)
plt.axis('off')
plt.savefig('bad_kmeans.png')









    



/Users/feizhan/miniconda/envs/py3_env/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [10]:

    
from clusteror import Clusteror









    



Using gpu device 0: GeForce GT 650M (CNMeM is disabled, cuDNN 5005)



In [11]:

    
clusteror = Clusteror(rect)



In [12]:

    
clusteror.cleaned_data = np.tanh(rect - rect.median())



In [13]:

    
clusteror.train_sda_dim_reducer(hidden_layers_sizes=[5], corruption_levels=[0.1])



In [14]:

    
clusteror.reduce_to_one_dim()



In [15]:

    
clusteror.train_kmeans(n_clusters=2)



In [16]:

    
clusteror.add_cluster()



In [17]:

    
clusteror.raw_data.head()



In [20]:

    
plt.figure()
ax = plt.subplot(111)
clusteror.raw_data.loc[(clusteror.raw_data.cluster == 1).values].plot.scatter(x='a', y='b', ax=ax)
clusteror.raw_data.loc[(clusteror.raw_data.cluster == 0).values].plot.scatter(x='a', y='b', ax=ax, color='r')
plt.axis('off')
plt.savefig('leveraged_kmeans.png')









    



/Users/feizhan/miniconda/envs/py3_env/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [22]:

    
clusteror.raw_data.cluster.value_counts()









    Out[22]:





0    1000
1     100
Name: cluster, dtype: int64



In [23]:

    
from clusteror.plot import *



In [25]:

    
hist_plot_one_dim_group_data(clusteror.one_dim_data, clusteror.raw_data.cluster, show=False, filepath='hist.png')



In [23]:

    
valley_clusteror = Clusteror(rect)



In [24]:

    
valley_clusteror.cleaned_data = np.tanh(rect.iloc[:, :2] - rect.iloc[:, :2].median())



In [25]:

    
valley_clusteror.train_sda_dim_reducer()



In [26]:

    
valley_clusteror.cleaned_data.head()



In [27]:

    
valley_clusteror.reduce_to_one_dim()



In [32]:

    
valley_clusteror.train_valley(bins=200, contrast=0.1)



In [33]:

    
valley_clusteror.add_cluster()



In [34]:

    
valley_clusteror.raw_data.cluster.value_counts()









    Out[34]:





1    1033
2      67
Name: cluster, dtype: int64



In [35]:

    
hist_plot_one_dim_group_data(valley_clusteror.one_dim_data, valley_clusteror.raw_data.cluster, bins=100)



In [36]:

    
ax = valley_clusteror.raw_data.loc[(valley_clusteror.raw_data.cluster == 1).values].plot.scatter(x='a', y='b')
valley_clusteror.raw_data.loc[(valley_clusteror.raw_data.cluster == 2).values].plot.scatter(x='a', y='b', ax=ax, color='r')
#valley_clusteror.raw_data.loc[(valley_clusteror.raw_data.cluster == 3).values].plot.scatter(x='a', y='b', ax=ax, color='k')









    Out[36]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f1e4de28a58>



In [238]:

    
ax = valley_clusteror.raw_data.loc[(valley_clusteror.raw_data.cluster == 1).values].plot.scatter(x='a', y='b')



In [ ]:

	a	b
0	0.025567	0.711467
1	0.023250	0.921974
2	0.113821	0.752094
3	0.181033	0.317396
4	0.178371	0.565227

	a	b
0	0.025567	0.711467
1	0.023250	0.921974
2	0.113821	0.752094
3	0.181033	0.317396
4	0.178371	0.565227

	a	b
0	-0.090516	-0.467712
1	0.023934	-0.119174
2	-0.011268	0.229945
3	-0.031149	-0.160487
4	-0.068324	0.258716