In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
# 

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df = pd.read_csv('report_nopoker.csv',sep='\t')
print(df['endpoint_type'].unique())
df['endpoint_type'] = le.fit_transform(df['endpoint_type'].values)
print('df[:10]',df[:10])
print(df['endpoint_type'].unique())


['binary' 'integer']
df[:10]                                              Dataset  #instances  #features  \
0  GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_ED...        1600       1000   
1    GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1.csv        1600         20   
2    GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.csv        1600         20   
3    GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1.csv        1600         20   
4  GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...        1600         20   
5  GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...        1600         20   
6                         Hill_Valley_with_noise.csv        1212        100   
7                      Hill_Valley_without_noise.csv        1212        100   
8                                          adult.csv       48842         14   
9                               agaricus-lepiota.csv        8145         22   

   #binary_features  #integer_features  #float_features  endpoint_type  \
0                31                969                0              0   
1                 0                 20                0              0   
2                 2                 18                0              0   
3                 0                 20                0              0   
4                 1                 19                0              0   
5                 1                 19                0              0   
6                 0                  0              100              0   
7                 0                  0              100              0   
8                 1                  7                6              0   
9                 6                 16                0              0   

   #classes  Imbalance_metric  
0         2          0.000000  
1         2          0.000000  
2         2          0.000000  
3         2          0.000000  
4         2          0.000000  
5         2          0.000000  
6         2          0.000000  
7         2          0.000098  
8         2          0.271896  
9         2          0.001458  
[0 1]

In [4]:
# histogram plot of features
import seaborn as sns
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter



nice_cols = {
    '#instances': 'Instances',
    '#features': 'Features',
    '#binary_features': 'Binary Features',
    '#integer_features': 'Categorical and Ordinal Features',
    '#float_features': 'Continuous Features',
    'endpoint_type': 'Endpoint Type',
    '#classes': 'Classes',
    'Imbalance_metric': 'Class Imbalance'
}
plt.rcParams['xtick.labelsize'] = 18
plt.rcParams['ytick.labelsize'] = 18

h = plt.figure()#figsize=(1,1)
plt.rcParams['figure.figsize'] = (20.0, 10.0)
sns.set_style("whitegrid")
for i,col in enumerate(df.columns[1:]):

    ax=plt.subplot(4,2,i+1)   
#     ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))

    if col =='endpoint_type':
        sns.distplot(df[col],kde=False,bins=2,hist_kws={"rwidth":0.25})
        plt.xlim(0,1)
        plt.gca().set_xticks((0.25,0.75))
        plt.gca().set_xticklabels(('Binary','Multiclass'),size=16)
        plt.ylim(1,100)
    elif col == '#classes':
        sns.distplot(df[col],kde=False,bins=50,hist_kws={"rwidth":2})
        plt.gca().set_xticks((2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25))
    else:
        sns.distplot(df[col],kde=False,bins=100,hist_kws={"rwidth":1})
    
        
    plt.xlabel(nice_cols[col],size=20)   
    plt.yscale('log', nonposy='clip')
    plt.yticks((1,10,100))
    ax.yaxis.set_major_formatter(ScalarFormatter())
    ax.yaxis.set_major_formatter(FormatStrFormatter('%i'))

#     yticks = plt.gca().get_yticks()
#     plt.set(ax,dpi=300)
# h=plt.gcf()
# 

h.tight_layout()
h.savefig('figs/data_feature_hist.pdf')
h.savefig('figs/data_feature_hist.png')
plt.show()



In [38]:
# x, y = np.random.multivariate_normal(mean, cov, 1000).T
# with sns.axes_style("white"):
sns.jointplot(x='#instances', y='#features', data=df)#,kind="hex", color="k");
# plt.text(df['#instances'].values+1,df['#features'].values,df['Dataset'].values)


Out[38]:
<matplotlib.text.Text at 0x7f91490b1da0>
Error in callback <function install_repl_displayhook.<locals>.post_execute at 0x7f9161077d90> (for post_execute):
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/pyplot.py in post_execute()
    147             def post_execute():
    148                 if matplotlib.is_interactive():
--> 149                     draw_all()
    150 
    151             # IPython >= 2

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/_pylab_helpers.py in draw_all(cls, force)
    148         for f_mgr in cls.get_all_fig_managers():
    149             if force or f_mgr.canvas.figure.stale:
--> 150                 f_mgr.canvas.draw_idle()
    151 
    152 atexit.register(Gcf.destroy_all)

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/backend_bases.py in draw_idle(self, *args, **kwargs)
   2030         if not self._is_idle_drawing:
   2031             with self._idle_draw_cntx():
-> 2032                 self.draw(*args, **kwargs)
   2033 
   2034     def draw_cursor(self, event):

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/backends/backend_agg.py in draw(self)
    462 
    463         try:
--> 464             self.figure.draw(self.renderer)
    465         finally:
    466             RendererAgg.lock.release()

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     61     def draw_wrapper(artist, renderer, *args, **kwargs):
     62         before(artist, renderer)
---> 63         draw(artist, renderer, *args, **kwargs)
     64         after(artist, renderer)
     65 

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/figure.py in draw(self, renderer)
   1141 
   1142             mimage._draw_list_compositing_images(
-> 1143                 renderer, self, dsu, self.suppressComposite)
   1144 
   1145             renderer.close_group('figure')

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, dsu, suppress_composite)
    137     if not_composite or not has_images:
    138         for zorder, a in dsu:
--> 139             a.draw(renderer)
    140     else:
    141         # Composite any adjacent images together

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     61     def draw_wrapper(artist, renderer, *args, **kwargs):
     62         before(artist, renderer)
---> 63         draw(artist, renderer, *args, **kwargs)
     64         after(artist, renderer)
     65 

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/axes/_base.py in draw(self, renderer, inframe)
   2407             renderer.stop_rasterizing()
   2408 
-> 2409         mimage._draw_list_compositing_images(renderer, self, dsu)
   2410 
   2411         renderer.close_group('axes')

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, dsu, suppress_composite)
    137     if not_composite or not has_images:
    138         for zorder, a in dsu:
--> 139             a.draw(renderer)
    140     else:
    141         # Composite any adjacent images together

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     61     def draw_wrapper(artist, renderer, *args, **kwargs):
     62         before(artist, renderer)
---> 63         draw(artist, renderer, *args, **kwargs)
     64         after(artist, renderer)
     65 

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/text.py in draw(self, renderer)
    752 
    753         with _wrap_text(self) as textobj:
--> 754             bbox, info, descent = textobj._get_layout(renderer)
    755             trans = textobj.get_transform()
    756 

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/text.py in _get_layout(self, renderer)
    333         of a rotated text when necessary.
    334         """
--> 335         key = self.get_prop_tup(renderer=renderer)
    336         if key in self._cached:
    337             return self._cached[key]

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/text.py in get_prop_tup(self, renderer)
    907         need to know if the text has changed.
    908         """
--> 909         x, y = self.get_unitless_position()
    910         return (x, y, self.get_text(), self._color,
    911                 self._verticalalignment, self._horizontalalignment,

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/text.py in get_unitless_position(self)
    889         # This will get the position with all unit information stripped away.
    890         # This is here for convienience since it is done in several locations.
--> 891         x = float(self.convert_xunits(self._x))
    892         y = float(self.convert_yunits(self._y))
    893         return x, y

TypeError: only length-1 arrays can be converted to Python scalars
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/home/bill/anaconda3/lib/python3.5/site-packages/IPython/core/formatters.py in __call__(self, obj)
    337                 pass
    338             else:
--> 339                 return printer(obj)
    340             # Finally look for special method names
    341             method = _safe_get_formatter_method(obj, self.print_method)

/home/bill/anaconda3/lib/python3.5/site-packages/IPython/core/pylabtools.py in <lambda>(fig)
    226 
    227     if 'png' in formats:
--> 228         png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))
    229     if 'retina' in formats or 'png2x' in formats:
    230         png_formatter.for_type(Figure, lambda fig: retina_figure(fig, **kwargs))

/home/bill/anaconda3/lib/python3.5/site-packages/IPython/core/pylabtools.py in print_figure(fig, fmt, bbox_inches, **kwargs)
    117 
    118     bytes_io = BytesIO()
--> 119     fig.canvas.print_figure(bytes_io, **kw)
    120     data = bytes_io.getvalue()
    121     if fmt == 'svg':

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, **kwargs)
   2190                     orientation=orientation,
   2191                     dryrun=True,
-> 2192                     **kwargs)
   2193                 renderer = self.figure._cachedRenderer
   2194                 bbox_inches = self.figure.get_tightbbox(renderer)

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/backends/backend_agg.py in print_png(self, filename_or_obj, *args, **kwargs)
    543 
    544     def print_png(self, filename_or_obj, *args, **kwargs):
--> 545         FigureCanvasAgg.draw(self)
    546         renderer = self.get_renderer()
    547         original_dpi = renderer.dpi

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/backends/backend_agg.py in draw(self)
    462 
    463         try:
--> 464             self.figure.draw(self.renderer)
    465         finally:
    466             RendererAgg.lock.release()

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     61     def draw_wrapper(artist, renderer, *args, **kwargs):
     62         before(artist, renderer)
---> 63         draw(artist, renderer, *args, **kwargs)
     64         after(artist, renderer)
     65 

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/figure.py in draw(self, renderer)
   1141 
   1142             mimage._draw_list_compositing_images(
-> 1143                 renderer, self, dsu, self.suppressComposite)
   1144 
   1145             renderer.close_group('figure')

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, dsu, suppress_composite)
    137     if not_composite or not has_images:
    138         for zorder, a in dsu:
--> 139             a.draw(renderer)
    140     else:
    141         # Composite any adjacent images together

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     61     def draw_wrapper(artist, renderer, *args, **kwargs):
     62         before(artist, renderer)
---> 63         draw(artist, renderer, *args, **kwargs)
     64         after(artist, renderer)
     65 

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/axes/_base.py in draw(self, renderer, inframe)
   2407             renderer.stop_rasterizing()
   2408 
-> 2409         mimage._draw_list_compositing_images(renderer, self, dsu)
   2410 
   2411         renderer.close_group('axes')

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, dsu, suppress_composite)
    137     if not_composite or not has_images:
    138         for zorder, a in dsu:
--> 139             a.draw(renderer)
    140     else:
    141         # Composite any adjacent images together

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     61     def draw_wrapper(artist, renderer, *args, **kwargs):
     62         before(artist, renderer)
---> 63         draw(artist, renderer, *args, **kwargs)
     64         after(artist, renderer)
     65 

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/text.py in draw(self, renderer)
    752 
    753         with _wrap_text(self) as textobj:
--> 754             bbox, info, descent = textobj._get_layout(renderer)
    755             trans = textobj.get_transform()
    756 

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/text.py in _get_layout(self, renderer)
    333         of a rotated text when necessary.
    334         """
--> 335         key = self.get_prop_tup(renderer=renderer)
    336         if key in self._cached:
    337             return self._cached[key]

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/text.py in get_prop_tup(self, renderer)
    907         need to know if the text has changed.
    908         """
--> 909         x, y = self.get_unitless_position()
    910         return (x, y, self.get_text(), self._color,
    911                 self._verticalalignment, self._horizontalalignment,

/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/text.py in get_unitless_position(self)
    889         # This will get the position with all unit information stripped away.
    890         # This is here for convienience since it is done in several locations.
--> 891         x = float(self.convert_xunits(self._x))
    892         y = float(self.convert_yunits(self._y))
    893         return x, y

TypeError: only length-1 arrays can be converted to Python scalars
<matplotlib.figure.Figure at 0x7f91490b17b8>

In [ ]:


In [30]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

from sklearn.preprocessing import StandardScaler
import numpy
import pdb
from tqdm import tqdm
#==========
# optimal K via elbow method with silhouette score which produces a better elbow.
#==========
ss = StandardScaler()
X = ss.fit_transform(df.drop('Dataset',axis=1).values)
# pdb.set_trace()
Ks = np.arange(2,20,1)
Inertias = []
Silhouettes = []
np.random.seed(2)
# loop through k values
for K in tqdm(Ks):
	km = KMeans(n_clusters=K, init='k-means++',copy_x=False).fit(X)
	labels = km.labels_
	centers = km.cluster_centers_
	inertia = km.inertia_
	Silhouettes.append(silhouette_score(X,labels))
	# Inertias[K-1] = km.inertia_
	Inertias.append(km.inertia_)

# line plot of K versus Silhouette score with best value marked with x 
plt.figure(1)
plt.plot(Ks,Silhouettes,label='silhouette')
plt.plot(Ks[np.argmax(Silhouettes)],Silhouettes[np.argmax(Silhouettes)],marker = 'o',color='r',markersize=7)
plt.text(Ks[np.argmax(Silhouettes)]-2,Silhouettes[np.argmax(Silhouettes)],"K = "+repr(Ks[np.argmax(Silhouettes)]))
plt.ylim(0.95*np.min(Silhouettes),1.05*np.max(Silhouettes),'x')
plt.ylabel("Average silhouette score") #Y-axis label
plt.xlabel("K") #X-axis label
plt.title("Choice of K") #Plot title
plt.tight_layout()
# plt.savefig("k_silhouette_data-features.pdf")

plt.figure(2)
plt.plot(Ks,Inertias,label='inertia')
plt.plot(Ks[np.argmin(Inertias)],Inertias[np.argmin(Inertias)],marker = 'o',color='r',markersize=7)
plt.text(Ks[np.argmin(Inertias)]-2,Inertias[np.argmin(Inertias)],"K = "+repr(Ks[np.argmin(Inertias)]))
plt.ylim(0.95*np.min(Inertias),1.05*np.max(Inertias))
plt.ylabel("Inertias") #Y-axis label
plt.xlabel("K") #X-axis label
plt.title("Choice of K") #Plot title
plt.tight_layout()
# plt.savefig("k_inertia_data-features.pdf")


100%|██████████| 18/18 [00:00<00:00, 31.44it/s]

In [5]:
# =====
# plot cluster centers on 2 principal component axes
# =====
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt 
import pdb
%matplotlib inline 
plt.rcParams['figure.figsize'] = (20.0, 10.0)

marker =( 'o','s','^','<','v','>','+', 'x','*') 
h = plt.figure()

ss = StandardScaler()
X = ss.fit_transform(df.drop('Dataset',axis=1).values)

pca = PCA()
X_pca = pca.fit_transform(X)
print("explained variance:",np.cumsum(pca.explained_variance_ratio_) )

nc = 5
np.random.seed(42)
km = KMeans(n_clusters=nc, init='k-means++',copy_x=False,max_iter=1000).fit(X)
labels = km.labels_
centers = km.cluster_centers_

# centers_pca = centers
centers_pca = pca.transform(centers)

colors = plt.cm.Vega10((0,1,2,3,4,5))

for k, col in zip(np.unique(labels), colors):
    print('k:',k)
    print('col:',col)
    print('k%nc:',k%nc)
    label_mask = (k==labels)
    xy = X_pca[label_mask]
    coverage = np.sum(k==labels)
    if coverage==1:
        label='cluster '+str(k)+' ('+str(coverage)+' dataset)'
    else:
        label='cluster '+str(k)+' ('+str(coverage)+' datasets)'
    plt.scatter(xy[:,0], xy[:, 1], marker=marker[k%nc], facecolor=col, s=500, edgecolor='k', alpha=1,label=label)

# plt.plot(X_pca[:,0], X_pca[:, 1], linestyle = '',marker='.', markerfacecolor='k', markersize=5, alpha=1)

for k, col in zip(np.unique(labels), colors): 
    print('k:',k)
    print('col:',col)
    print('k%nc:',k%nc)
    
    plt.plot(centers_pca[k,0],centers_pca[k,1], linestyle='', marker=marker[k%nc], markerfacecolor=col,markersize=100,alpha=0.3)
    plt.text(centers_pca[k,0]+0.75,centers_pca[k,1]-1,str(k),fontsize=35)

print('labels:',labels)
print('centers:',centers_pca,len(centers_pca))

plt.xlim(min(X_pca[:,0])*1.1,max(X_pca[:,0])*1.1)
plt.ylim(min(X_pca[:,1])*1.1,max(X_pca[:,1])*1.1)
ax = plt.gca()
ax.set_axis_bgcolor('white')        
plt.xlabel('Principal Component 1',fontsize=22)
plt.ylabel('Principal Component 2',fontsize=22)
plt.legend(fontsize=25)
plt.tight_layout()
h.savefig('figs/k_means_PCA_data_by_features.pdf')
h.savefig('figs/k_means_PCA_data_by_features.png')


# show kmeans cluster center values for different features. 
print('features:',df.columns[1:])
for k in np.unique(labels):
    print('label',k)
    label_mask = (k==labels)
    xk_mean = np.mean(ss.inverse_transform(X[label_mask]),axis=0)
    for i,f in enumerate(df.columns[1:]):
        print('\t{}:\t{}'.format(f,np.round(xk_mean[i],3)))
        
        
# bar plots of cluster feature values
nice_cols = {
    '#instances': 'Instances',
    '#features': 'Features',
    '#binary_features': 'Binary Features',
    '#integer_features': 'Categorical and\n Ordinal Features',
    '#float_features': 'Continuous Features',
    'endpoint_type': 'Endpoint Type',
    '#classes': 'Classes',
    'Imbalance_metric': 'Class Imbalance'
}
h2 = plt.figure()
features = df.columns[1:].values
for k,col in zip(np.unique(labels),colors):
    label_mask = (k==labels)
    coverage = np.sum(label_mask)
    xk_mean = np.mean(ss.inverse_transform(X[label_mask]),axis=0)
    offset = k*0.1-np.mean(np.unique(labels))*0.1
    if coverage==1:
        label='cluster '+str(k)+' ('+str(coverage)+' dataset)'
    else:
        label='cluster '+str(k)+' ('+str(coverage)+' datasets)'
    plt.bar(np.arange(len(features))+offset,xk_mean,align='center',width=0.1,color=col,log=True,label=label)
# pdb.set_trace()
plt.gca().set_xticks(np.arange(len(features)))
plt.gca().set_xticklabels([nice_cols[f] for f in features],fontsize=22,rotation=60)
plt.legend(fontsize=25)
plt.tight_layout()
h2.savefig('figs/cluster_features.pdf')
h2.savefig('figs/cluster_features.png')

plt.show()

print('data sets in cluster 0:',df['Dataset'][labels==0])
print('data sets in cluster 1:',df['Dataset'][labels==1])
print('data sets in cluster 2:',df['Dataset'][labels==2])
print('data sets in cluster 3:',df['Dataset'][labels==3])


explained variance: [ 0.274563    0.49895352  0.65848273  0.77250202  0.88139719  0.96460656
  1.          1.        ]
k: 0
col: [ 0.12156863  0.46666667  0.70588235  1.        ]
k%nc: 0
k: 1
col: [ 1.          0.49803922  0.05490196  1.        ]
k%nc: 1
k: 2
col: [ 0.17254902  0.62745098  0.17254902  1.        ]
k%nc: 2
k: 3
col: [ 0.83921569  0.15294118  0.15686275  1.        ]
k%nc: 3
k: 4
col: [ 0.58039216  0.40392157  0.74117647  1.        ]
k%nc: 4
k: 0
col: [ 0.12156863  0.46666667  0.70588235  1.        ]
k%nc: 0
k: 1
col: [ 1.          0.49803922  0.05490196  1.        ]
k%nc: 1
k: 2
col: [ 0.17254902  0.62745098  0.17254902  1.        ]
k%nc: 2
k: 3
col: [ 0.83921569  0.15294118  0.15686275  1.        ]
k%nc: 3
k: 4
col: [ 0.58039216  0.40392157  0.74117647  1.        ]
k%nc: 4
labels: [2 1 1 1 1 1 1 1 1 1 3 3 3 3 1 1 0 1 1 1 1 1 1 0 1 0 0 1 3 3 1 1 0 3 0 1 1
 1 1 1 1 1 1 0 3 0 0 0 1 1 1 1 1 0 0 0 0 3 1 0 0 0 0 1 1 1 1 0 1 3 3 0 0 0
 1 1 0 1 1 0 1 1 1 1 1 1 1 3 1 0 1 4 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 2 1 1
 1 1 1 0 1 1 0 0 0 3 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 3 0 0 0 1 0 1 1 1 0 0
 0 1 1 1 1 1 0 1 0 0 0 1 0 0 0 1 0]
centers: [[  3.39799654e-01   1.15444374e+00   2.87335814e-01  -1.95810625e-01
   -5.86165032e-01  -3.47885923e-01  -1.87501878e-01   8.01401243e-17]
 [ -5.79893088e-01  -7.30180782e-01   2.32440417e-01   2.79065670e-01
    2.31921584e-01   2.61704663e-01   1.03617631e-01   1.04147525e-16]
 [  1.13078657e+01  -4.90539056e+00   2.99229148e-01   1.49136143e+00
    5.61719292e-02  -1.73020436e+00   8.61853977e-03  -3.10730696e-15]
 [  3.32573099e-01  -9.35045926e-02  -2.46811584e+00  -1.58236319e+00
    6.22349447e-01  -1.99018698e-01   1.99442297e-01  -2.16772462e-16]
 [  4.29055047e+00   7.99375413e+00  -3.68484661e+00   5.88634413e+00
    5.11747981e+00   3.48022492e+00  -9.68787584e-01   1.11022302e-15]] 5
/home/bill/anaconda3/lib/python3.5/site-packages/matplotlib/lines.py:1206: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if self._markerfacecolor != fc:
/home/bill/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:62: MatplotlibDeprecationWarning: The set_axis_bgcolor function was deprecated in version 2.0. Use set_facecolor instead.
features: Index(['#instances', '#features', '#binary_features', '#integer_features',
       '#float_features', 'endpoint_type', '#classes', 'Imbalance_metric'],
      dtype='object')
label 0
	#instances:	7160.186
	#features:	28.576
	#binary_features:	1.763
	#integer_features:	5.712
	#float_features:	21.102
	endpoint_type:	1.0
	#classes:	6.797
	Imbalance_metric:	0.069
label 1
	#instances:	1943.281
	#features:	20.843
	#binary_features:	2.573
	#integer_features:	5.978
	#float_features:	12.292
	endpoint_type:	0.0
	#classes:	2.0
	Imbalance_metric:	0.086
label 2
	#instances:	35800.0
	#features:	892.0
	#binary_features:	48.0
	#integer_features:	844.0
	#float_features:	0.0
	endpoint_type:	0.5
	#classes:	6.0
	Imbalance_metric:	0.0
label 3
	#instances:	7705.214
	#features:	38.0
	#binary_features:	26.143
	#integer_features:	9.429
	#float_features:	2.429
	endpoint_type:	0.643
	#classes:	3.286
	Imbalance_metric:	0.707
label 4
	#instances:	494020.0
	#features:	41.0
	#binary_features:	6.0
	#integer_features:	20.0
	#float_features:	15.0
	endpoint_type:	1.0
	#classes:	23.0
	Imbalance_metric:	0.382
data sets in cluster 0: 16     analcatdata_authorship.csv
23           analcatdata_dmft.csv
25      analcatdata_germangss.csv
26      analcatdata_happiness.csv
32                       auto.csv
34              balance-scale.csv
43                calendarDOW.csv
45                        car.csv
46                       cars.csv
47                      cars1.csv
53          cleveland-nominal.csv
54                  cleveland.csv
55                      cloud.csv
56                        cmc.csv
59                    collins.csv
60                 confidence.csv
61                  connect-4.csv
62              contraceptive.csv
67                dermatology.csv
71                      ecoli.csv
72                       fars.csv
73                      flags.csv
76                      glass.csv
79                 hayes-roth.csv
89                       iris.csv
93                     krkopt.csv
95                      led24.csv
96                       led7.csv
97                     letter.csv
100              lymphography.csv
102             mfeat-factors.csv
103             mfeat-fourier.csv
104            mfeat-karhunen.csv
105       mfeat-morphological.csv
106               mfeat-pixel.csv
107             mfeat-zernike.csv
114           movement_libras.csv
117               new-thyroid.csv
118                   nursery.csv
119                 optdigits.csv
123                 pendigits.csv
128               prnn_fglass.csv
134                  satimage.csv
136              segmentation.csv
138                     sleep.csv
139             solar-flare_1.csv
140             solar-flare_2.csv
142                   soybean.csv
146                    splice.csv
147                       tae.csv
148                   texture.csv
154                   vehicle.csv
156                     vowel.csv
157               waveform-21.csv
158               waveform-40.csv
160          wine-quality-red.csv
161        wine-quality-white.csv
162          wine-recognition.csv
164                     yeast.csv
Name: Dataset, dtype: object
data sets in cluster 1: 1        GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1.csv
2        GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.csv
3        GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1.csv
4      GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...
5      GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...
6                             Hill_Valley_with_noise.csv
7                          Hill_Valley_without_noise.csv
8                                              adult.csv
9                                   agaricus-lepiota.csv
14                                  analcatdata_aids.csv
15                              analcatdata_asbestos.csv
17                            analcatdata_bankruptcy.csv
18                               analcatdata_boxing1.csv
19                               analcatdata_boxing2.csv
20                           analcatdata_creditscore.csv
21                           analcatdata_cyyoung8092.csv
22                           analcatdata_cyyoung9302.csv
24                                 analcatdata_fraud.csv
27                          analcatdata_japansolvent.csv
30                                      appendicitis.csv
31                                        australian.csv
35                                            banana.csv
36                                            biomed.csv
37                           breast-cancer-wisconsin.csv
38                                     breast-cancer.csv
39                                          breast-w.csv
40                                            breast.csv
41                                          buggyCrx.csv
42                                              bupa.csv
48                                             chess.csv
                             ...                        
110                      molecular-biology_promoters.csv
111                                            monk1.csv
112                                            monk2.csv
113                                            monk3.csv
115                                         mushroom.csv
116                                             mux6.csv
121                                        parity5+5.csv
122                                          parity5.csv
124                                          phoneme.csv
125                                             pima.csv
126                       postoperative-patient-data.csv
127                                       prnn_crabs.csv
129                                       prnn_synth.csv
130                                            profb.csv
131                                        promoters.csv
132                                             ring.csv
133                                          saheart.csv
135                                           schizo.csv
141                                            sonar.csv
143                                         spambase.csv
144                                            spect.csv
145                                           spectf.csv
149                                         threeOf9.csv
150                                      tic-tac-toe.csv
151                                          titanic.csv
152                                           tokyo1.csv
153                                          twonorm.csv
155                                             vote.csv
159                                             wdbc.csv
163                                              xd6.csv
Name: Dataset, dtype: object
data sets in cluster 2: 0      GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_ED...
108                                            mnist.csv
Name: Dataset, dtype: object
data sets in cluster 3: 10                   allbp.csv
11                allhyper.csv
12                 allhypo.csv
13                  allrep.csv
28     analcatdata_lawsuit.csv
29             ann-thyroid.csv
33                backache.csv
44          car-evaluation.csv
57                coil2000.csv
69                     dis.csv
70                     dna.csv
87             hypothyroid.csv
120            page-blocks.csv
137                shuttle.csv
Name: Dataset, dtype: object

In [ ]:
#tsne manifold learning for visualizing the data
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

X = df.drop('Dataset',axis=1).values

tsne = TSNE(n_components=2,random_state=42)

X_tsne = tsne.fit_transform(X)
# get colors from KMeans labels
nc = 6
np.random.seed(42)
km = KMeans(n_clusters=nc, init='k-means++',copy_x=False,max_iter=1000).fit(X)
labels = km.labels_
colors = plt.cm.Spectral(np.linspace(0, 1, nc))
color=[]
for l in labels:
    color.append(colors[l])
    
h = plt.figure()
plt.scatter(X_tsne[:,0],X_tsne[:,1],c=color, cmap=plt.cm.Spectral)
h.savefig('tsne.pdf')
plt.show()

Cluster datasets using benchmark features + algorithm performance


In [39]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import pdb

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df = pd.read_csv('report.csv',sep=',')
df['endpoint_type'] = le.fit_transform(df['endpoint_type'].values)
df = df.rename(columns={'Dataset': 'dataset'})
print(df.columns)

data = pd.read_csv('sklearn-benchmark5-data-edited.tsv.gz', sep='\t', names=['dataset',
                                                                     'classifier',
                                                                     'parameters',
                                                                     'accuracy', 
                                                                     'macrof1',
                                                                     'bal_accuracy']).fillna('')

data = data.groupby(['dataset','classifier'])['bal_accuracy'].max().reset_index()
data = data[data['classifier']!='LinearSVC']
data['bal_accuracy'] = data['bal_accuracy'].apply(lambda x: round(x, 3))
data = data.sort('dataset')
df = df.sort('dataset')
# add each classifier as a feature to df
for clf, data_clf in data.groupby('classifier'):
#     pdb.set_trace()
    df[clf] = data_clf['bal_accuracy'].values


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/home/bill/anaconda3/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2133             try:
-> 2134                 return self._engine.get_loc(key)
   2135             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'endpoint_type'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-39-d9163001d660> in <module>()
      9 le = LabelEncoder()
     10 df = pd.read_csv('report.csv',sep=',')
---> 11 df['endpoint_type'] = le.fit_transform(df['endpoint_type'].values)
     12 df = df.rename(columns={'Dataset': 'dataset'})
     13 print(df.columns)

/home/bill/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/home/bill/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/home/bill/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/home/bill/anaconda3/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/bill/anaconda3/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'endpoint_type'

In [32]:
print(df.sort('#features',ascending=False))


                                               Dataset  #instances  #features  \
0    GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_ED...        1600       1000   
108                                          mnist.csv       70000        784   
106                                    mfeat-pixel.csv        2000        240   
102                                  mfeat-factors.csv        2000        216   
70                                             dna.csv        3186        180   
51                                          clean2.csv        6598        168   
50                                          clean1.csv         476        168   
6                           Hill_Valley_with_noise.csv        1212        100   
7                        Hill_Valley_without_noise.csv        1212        100   
114                                movement_libras.csv         360         90   
57                                        coil2000.csv        9822         85   
103                                  mfeat-fourier.csv        2000         76   
16                          analcatdata_authorship.csv         841         70   
104                                 mfeat-karhunen.csv        2000         64   
119                                      optdigits.csv        5620         64   
146                                         splice.csv        3188         60   
141                                          sonar.csv         208         60   
110                    molecular-biology_promoters.csv         106         58   
131                                      promoters.csv         106         58   
143                                       spambase.csv        4601         57   
107                                  mfeat-zernike.csv        2000         47   
152                                         tokyo1.csv         959         44   
145                                         spectf.csv         349         44   
73                                           flags.csv         178         43   
61                                       connect-4.csv       67557         42   
91                                          kddcup.csv      494020         41   
158                                    waveform-40.csv        5000         40   
148                                        texture.csv        5500         40   
48                                           chess.csv        3196         36   
92                                        kr-vs-kp.csv        3196         36   
..                                                 ...         ...        ...   
111                                          monk1.csv         556          6   
98                                  liver-disorder.csv         345          6   
105                            mfeat-morphological.csv        2000          6   
113                                          monk3.csv         554          6   
20                         analcatdata_creditscore.csv         100          6   
63                                          corral.csv         160          6   
45                                             car.csv        1728          6   
93                                          krkopt.csv       28056          6   
90                                           irish.csv         500          5   
25                           analcatdata_germangss.csv         400          5   
147                                            tae.csv         151          5   
124                                        phoneme.csv        5404          5   
122                                        parity5.csv          32          5   
117                                    new-thyroid.csv         215          5   
28                             analcatdata_lawsuit.csv         264          4   
79                                      hayes-roth.csv         160          4   
23                                analcatdata_dmft.csv         797          4   
34                                   balance-scale.csv         625          4   
14                                analcatdata_aids.csv          50          4   
89                                            iris.csv         150          4   
26                           analcatdata_happiness.csv          60          3   
60                                      confidence.csv          72          3   
15                            analcatdata_asbestos.csv          83          3   
19                             analcatdata_boxing2.csv         132          3   
18                             analcatdata_boxing1.csv         120          3   
78                                        haberman.csv         306          3   
151                                        titanic.csv        2201          3   
99                                           lupus.csv          87          3   
129                                     prnn_synth.csv         250          2   
35                                          banana.csv        5300          2   

     #binary_features  #integer_features  #float_features  endpoint_type  \
0                  31                969                0              0   
108                65                719                0              1   
106                 0                  0              240              1   
102                 0                  0              216              1   
70                180                  0                0              1   
51                  0                  2              166              0   
50                  0                  2              166              0   
6                   0                  0              100              0   
7                   0                  0              100              0   
114                 0                  0               90              1   
57                  5                 80                0              0   
103                 0                  0               76              1   
16                  0                  0               70              1   
104                 0                  0               64              1   
119                 0                  0               64              1   
146                 0                 60                0              1   
141                 0                  0               60              0   
110                 0                 58                0              0   
131                 0                 58                0              0   
143                 0                  2               55              0   
107                 0                  0               47              1   
152                 0                  0               44              0   
145                 0                  0               44              0   
73                 36                  7                0              1   
61                  0                 42                0              1   
91                  6                 20               15              1   
158                 0                  0               40              1   
148                 0                  0               40              1   
48                 35                  1                0              0   
92                 35                  1                0              0   
..                ...                ...              ...            ...   
111                 2                  4                0              0   
98                  0                  0                6              0   
105                 0                  0                6              1   
113                 2                  4                0              0   
20                  2                  1                3              0   
63                  6                  0                0              0   
45                  0                  6                0              1   
93                  0                  6                0              1   
90                  1                  3                1              0   
25                  1                  3                1              1   
147                 0                  2                3              1   
124                 0                  0                5              0   
122                 5                  0                0              0   
117                 0                  0                5              1   
28                  1                  0                3              0   
79                  0                  0                4              1   
23                  1                  3                0              1   
34                  0                  4                0              1   
14                  0                  2                2              0   
89                  0                  0                4              1   
26                  0                  2                1              1   
60                  0                  0                3              1   
15                  1                  1                1              0   
19                  1                  2                0              0   
18                  1                  2                0              0   
78                  0                  1                2              0   
151                 0                  0                3              0   
99                  0                  0                3              0   
129                 0                  0                2              0   
35                  0                  0                2              0   

     #classes  Imbalance_metric  
0           2          0.000000  
108        10          0.000325  
106        10          0.000000  
102        10          0.000000  
70          3          0.077685  
51          2          0.478483  
50          2          0.016966  
6           2          0.000000  
7           2          0.000098  
114        15          0.000000  
57          2          0.775590  
103        10          0.000000  
16          4          0.083397  
104        10          0.000000  
119        10          0.000015  
146         3          0.077676  
141         2          0.004530  
110         2          0.000000  
131         2          0.000000  
143         2          0.044906  
107        10          0.000000  
152         2          0.077515  
145         2          0.207560  
73          5          0.043918  
61          3          0.254663  
91         23          0.382099  
158         3          0.000058  
148        11          0.000000  
48          2          0.001974  
92          2          0.001974  
..        ...               ...  
111         2          0.000000  
98          2          0.025415  
105        10          0.000000  
113         2          0.001577  
20          2          0.211600  
63          2          0.015625  
45          4          0.390288  
93         18          0.051669  
90          2          0.012544  
25          4          0.000000  
147         3          0.000307  
124         2          0.170592  
122         2          0.000000  
117         3          0.299081  
28          2          0.732840  
79          3          0.043867  
23          6          0.001343  
34          3          0.146230  
14          2          0.000000  
89          3          0.000000  
26          3          0.000000  
60          6          0.000000  
15          2          0.011758  
19          2          0.005739  
18          2          0.090000  
78          2          0.221453  
151         2          0.125266  
99          2          0.038182  
129         2          0.000000  
35          2          0.010691  

[165 rows x 9 columns]
/home/bill/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':

In [34]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.scatter(df['#instances'],df['#features'])


Out[34]:
<matplotlib.collections.PathCollection at 0x7f9149298320>

In [46]:
df.iloc[df['Imbalance_metric'].argmax()]


Out[46]:
Dataset              dis.csv
#instances              3772
#features                 29
#binary_features          21
#integer_features          8
#float_features            0
endpoint_type              0
#classes                   2
Imbalance_metric     0.93944
Name: 69, dtype: object

In [51]:
from pmlb import fetch_data

d = fetch_data('dis')
d.groupby('class')['class'].count()


Out[51]:
class
0      58
1    3714
Name: class, dtype: int64

In [56]:
2*((58/3772-1/2)**2+(3714/3772-1/2)**2)


Out[56]:
0.9394399094067016

In [75]:
mf_small = [(d,i,f,c) for d,i,f,c in zip(df['Dataset'],df['#instances'],df['#features'],df['#classes']) if c>2 and i<1000]
inst = [m[1] for m in mf_small]
mf_small = [mf_small[i] for i in np.argsort(inst)]
print('name','instances','features','classes')
for m in mf_small: print(m)


name instances features classes
('analcatdata_happiness.csv', 60, 3, 3)
('confidence.csv', 72, 3, 6)
('cloud.csv', 108, 7, 4)
('lymphography.csv', 148, 18, 4)
('iris.csv', 150, 4, 3)
('tae.csv', 151, 5, 3)
('hayes-roth.csv', 160, 4, 3)
('flags.csv', 178, 43, 5)
('wine-recognition.csv', 178, 13, 3)
('auto.csv', 202, 25, 5)
('prnn_fglass.csv', 205, 9, 5)
('glass.csv', 205, 9, 5)
('new-thyroid.csv', 215, 5, 3)
('cleveland-nominal.csv', 303, 7, 5)
('cleveland.csv', 303, 13, 5)
('solar-flare_1.csv', 315, 12, 5)
('ecoli.csv', 327, 7, 5)
('movement_libras.csv', 360, 90, 15)
('dermatology.csv', 366, 34, 6)
('cars1.csv', 392, 7, 3)
('cars.csv', 392, 8, 3)
('calendarDOW.csv', 399, 32, 5)
('analcatdata_germangss.csv', 400, 5, 4)
('collins.csv', 485, 23, 13)
('balance-scale.csv', 625, 4, 3)
('soybean.csv', 675, 35, 18)
('analcatdata_dmft.csv', 797, 4, 6)
('analcatdata_authorship.csv', 841, 70, 4)
('vehicle.csv', 846, 18, 4)
('vowel.csv', 990, 13, 11)

In [74]:
mf_small[:][1]


Out[74]:
('analcatdata_dmft.csv', 797, 4, 6)

In [ ]:


In [ ]:


In [ ]: