In [1]:
%load_ext autoreload
%autoreload 
import sys
sys.path.append("./..")

In [2]:
%autoreload 
from clustering.equal_groups import EqualGroupsKMeans

In [3]:
%matplotlib inline
# %matplotlib notebook
import sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [17]:
# Build X by stacking one copy of the 2-D cluster data per time group,
# each copy tagged with its group letter in a TIMEGROUP column.

# The file content does not depend on the group, so read it once instead of
# once per loop iteration.
base_df = pd.read_csv('data/2dcluster.csv')

all_dfs = []
for timegroup in "ABCDEFG":
    # assign() returns a new frame, leaving base_df untouched for the next group.
    df = base_df.assign(TIMEGROUP=timegroup)
    all_dfs.append(df)

# ignore_index=True yields a fresh 0..n-1 index, equivalent to
# concat(...) followed by reset_index(drop=True).
X = pd.concat(all_dfs, ignore_index=True)

In [20]:
# Draw a 500-row subsample of the two coordinate columns to cluster on.
# random_state pins the sample so a fresh "Restart & Run All" reproduces the
# same rows; .copy() makes the result an independent frame that can safely
# receive new columns later.
X_features = X[['X', 'Y']].sample(n=500, random_state=0).copy()

In [ ]:
%autoreload 
clf = EqualGroupsKMeans(n_clusters=5, random_state=0)
%timeit clf.fit(X_features)


6.88 s ± 154 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [ ]:
# Fit the estimator used for the labels and plot below (7 clusters).
clf = EqualGroupsKMeans(n_clusters=7, random_state=0)
# Fit on the coordinate columns only, so this cell stays correct when re-run
# after the `labels` column has been attached to X_features.
clf.fit(X_features[['X', 'Y']])

In [ ]:
# Attach the fitted cluster label of each sampled row as a `labels` column.
# assign() builds a new frame rather than writing into a frame derived from
# .sample(), which avoids pandas' SettingWithCopyWarning.
X_features = X_features.assign(labels=clf.labels_)

In [ ]:
# Scatter the sampled points coloured by cluster label.
# An explicit colormap is passed because pandas falls back to "Greys" for an
# integer colour column, which renders the lowest-numbered cluster almost
# invisible on a white background. The trailing `;` hides the Axes repr.
X_features.plot.scatter(
    x='X',
    y='Y',
    c='labels',
    colormap='viridis',
    title='EqualGroupsKMeans cluster labels',
);

In [ ]:


In [ ]: