In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import os.path
import pandas as pd
import seaborn as sns
from MyML.helper.plotting import save_fig
In [2]:
# Plot appearance shared by the figures of this notebook.
sns.set_style("whitegrid")

# Figure dimensions (units are not stated here; presumably inches - confirm).
fig_width, fig_height = 8, 6
In [182]:
# Location of the k-means benchmark CSVs. Both files live in the same
# directory, so the directory is spelled out once and can be changed in a
# single place when the notebook is run on another machine.
results_dir = '/home/chiroptera/QCThesis/results/kmeans/mariana'
results_path = os.path.join(results_dir, 'kmeans_results.csv')
results_big = os.path.join(results_dir, 'kmeans_results_very_big_1ppt.csv')
In [183]:
# Load the benchmark results and list the names of the available columns.
res = pd.read_csv(results_path)
for col in res.columns:
    # print() with a single argument behaves identically on Python 2 and 3,
    # whereas the `print col` statement is a SyntaxError on Python 3.
    print(col)
In [185]:
# Time not spent in either computing the labels or the centroids:
# whatever remains of the total once both measured phases are subtracted.
accounted_time = res['label time'] + res['centroid time']
res['useless time'] = res['total time'] - accounted_time
In [186]:
# Column layout of the summary table that keeps the timings of every mode in
# one row. The first column of the CSV is the saved index, so it is skipped.
new_cols = res.columns.tolist()[1:]

# The per-mode columns are replaced by one column per mode further down;
# for 'useless time' only summary statistics are stored.
for per_mode_col in ('total time', 'useless time', 'label time',
                     'centroid time', 'label mode', 'centroid mode'):
    new_cols.remove(per_mode_col)

new_cols += [
    'python label time', 'python centroid time',
    'numpy label time', 'numpy centroid time',
    'numba label time', 'numba centroid time',
    'cuda label time',
    'useless time mean', 'useless time std', 'useless time min', 'useless time max',
]

# Columns holding the CUDA event timings, five per event (the prefixes
# suggest count / cumulative / std / max / min - confirm against the CSV).
cuda_event_timings_keys = [
    'n_data_transfer', 'cum_data_transfer', 'std_data_transfer', 'max_data_transfer', 'min_data_transfer',
    'n_centroids_transfer', 'cum_centroids_transfer', 'std_centroids_transfer', 'max_centroids_transfer', 'min_centroids_transfer',
    'n_labels_transfer', 'cum_labels_transfer', 'std_labels_transfer', 'max_labels_transfer', 'min_labels_transfer',
    'n_dists_transfer', 'cum_dists_transfer', 'std_dists_transfer', 'max_dists_transfer', 'min_dists_transfer',
    'n_kernel', 'cum_kernel', 'std_kernel', 'max_kernel', 'min_kernel',
]
# The event timings are currently left out of the summary table:
#new_cols.extend(cuda_event_timings_keys)

# Empty frame that only fixes the column order of the summary table.
cres = pd.DataFrame(columns=new_cols)
In [187]:
# Collapse the per-mode rows of `res` into one summary row per
# (cardinality, dimensionality, number of clusters, number of iterations,
# round) combination and store the result in `cres`.
group_keys = ['cardinality', 'dimensionality', 'number of clusters',
              'number of iterations', 'round']
summary_cols = list(cres.columns)

# Rows are collected in a list and converted to a frame once; enlarging the
# frame with `cres.loc[idx] = row` re-allocates it on every insert.
records = []
for key, grp in res.groupby(by=group_keys):
    # Timings of modes that were not run for this group stay at 0.
    # NOTE(review): a 0 'cuda label time' later produces inf/NaN speedups;
    # NaN may be the better default - confirm before changing.
    new_row = dict.fromkeys(summary_cols, 0)
    new_row.update(zip(group_keys, key))

    # First row of every label mode that is present in this group.
    first_of_mode = {}
    for mode in ('python', 'numpy', 'numba', 'cuda'):
        mode_rows = grp[grp['label mode'] == mode]
        if len(mode_rows) > 0:
            first_of_mode[mode] = mode_rows.iloc[0]

    for mode in ('python', 'numpy'):
        if mode in first_of_mode:
            new_row[mode + ' label time'] = first_of_mode[mode]['label time']
            new_row[mode + ' centroid time'] = first_of_mode[mode]['centroid time']

    if 'numba' in first_of_mode:
        new_row['numba label time'] = first_of_mode['numba']['label time']

    if 'cuda' in first_of_mode:
        cuda_run = first_of_mode['cuda']
        new_row['cuda label time'] = cuda_run['label time']
        # Copy only the event timings that are columns of the summary table.
        # They are excluded while the `new_cols.extend(...)` line of the
        # previous cell is commented out; writing them anyway would hand
        # pandas a row that does not match the frame's columns.
        for cet_key in cuda_event_timings_keys:
            if cet_key in new_row:
                new_row[cet_key] = cuda_run[cet_key]

    # The centroid time measured in the numba and in the cuda label runs is
    # averaged into a single 'numba centroid time' (presumably both runs use
    # the same centroid implementation - confirm).
    centroid_times = [first_of_mode[mode]['centroid time']
                      for mode in ('numba', 'cuda') if mode in first_of_mode]
    if centroid_times:
        new_row['numba centroid time'] = sum(centroid_times) / float(len(centroid_times))

    # Summary statistics of the unaccounted time over all modes of the group.
    useless_time = grp['useless time']
    new_row['useless time mean'] = useless_time.mean()
    new_row['useless time std'] = useless_time.std()
    new_row['useless time min'] = useless_time.min()
    new_row['useless time max'] = useless_time.max()

    records.append(new_row)

cres = pd.DataFrame(records, columns=summary_cols)

# compute data, centroids, labels and dists sizes (in bytes; the factor 4 is
# presumably the element size - confirm).
# Item assignment is required here: `cres.data_size = ...` only sets a Python
# attribute on the frame and does not create a column.
# A later cell recomputes 'data_size' and 'centroids_size' in MiB.
cres['data_size'] = cres['cardinality'] * cres['dimensionality'] * 4
cres['centroids_size'] = cres['number of clusters'] * cres['dimensionality'] * 4
cres['labels_dists_size'] = cres['cardinality'] * 4
In [188]:
# Speedup of the CUDA label phase relative to the numba and numpy label
# phases; a value above 1 means the CUDA run took less time.
# NOTE(review): rows without a CUDA timing carry 0 in 'cuda label time', so
# the ratio is inf (or NaN) there and can dominate the maximum printed below.
cres['cuda numba label speedup'] = cres['numba label time'] / cres['cuda label time']
cres['cuda numpy label speedup'] = cres['numpy label time'] / cres['cuda label time']
# print() with a single argument works on both Python 2 and Python 3.
print(cres['cuda numba label speedup'].max())
In [191]:
# Sizes of the data set and of the centroids in MiB. The factor 4 is
# presumably the size of one element in bytes - confirm.
bytes_per_mib = 1024.0**2
cres['data_size'] = cres['cardinality'] * cres['dimensionality'] * 4 / bytes_per_mib
cres['centroids_size'] = cres['number of clusters'] * cres['dimensionality'] * 4 / bytes_per_mib
In [193]:
# Pairwise correlation between the problem dimensions, the two CUDA speedups
# and the memory sizes.
corr_cols = ['cardinality', 'dimensionality', 'number of clusters',
             'cuda numba label speedup', 'cuda numpy label speedup',
             'data_size', 'centroids_size']
cres[corr_cols].corr()
Out[193]:
In [176]:
# Speedups and sizes of every row whose cardinality is 500000.
idx = cres['cardinality'].eq(500000)
shown_cols = ['cardinality', 'dimensionality', 'number of clusters',
              'cuda numba label speedup', 'cuda numpy label speedup',
              'data_size', 'centroids_size']
cres.loc[idx, shown_cols]
Out[176]:
In [194]:
# Speedups of the rows with cardinality 500000.
# `cres.loc[500000, ...]` looks the value up in the row index, which raises a
# KeyError while the frame still has its default integer index. Flattening
# the index and filtering on the column works whether or not the key columns
# have been moved into the index.
cres_flat = cres.reset_index()
cres_flat.loc[cres_flat['cardinality'] == 500000,
              ['cuda numba label speedup', 'cuda numpy label speedup']]
In [126]:
# Index the summary table by problem configuration. The three key columns
# are moved into the index, so this cell fails if it is run a second time on
# the already indexed frame.
config_keys = ['cardinality', 'dimensionality', 'number of clusters']
cres = cres.set_index(config_keys)
In [114]:
# Rows in which the CUDA label phase was faster than numpy (speedup > 1).
# The key columns are selected by name, which fails once set_index has moved
# them into the index; flattening first makes the cell independent of that.
cres_flat = cres.reset_index()
faster_than_numpy = cres_flat['cuda numpy label speedup'] > 1
cres_flat.loc[faster_than_numpy,
              ['cardinality', 'dimensionality', 'number of clusters',
               'round', 'cuda numpy label speedup']]
Out[114]:
In [164]:
# Average every remaining column within each
# (cardinality, dimensionality, number of clusters) group.
config_keys = ['cardinality', 'dimensionality', 'number of clusters']
cres_mean = cres.groupby(by=config_keys).mean()
# The averaged 'round' column is dropped from the result.
cres_mean = cres_mean.drop('round', axis=1)
In [106]:
# Averaged results for one value of the first index level (cardinality).
selected_cardinality = 100000
cres_mean.loc[selected_cardinality]
Out[106]:
In [102]:
# Correlation between the problem dimensions and the averaged CUDA speedups.
# The group keys are turned back into ordinary columns first so that they
# take part in the correlation.
corr_df = cres_mean.reset_index()
mean_corr_cols = ['cardinality', 'dimensionality', 'number of clusters',
                  'cuda numba label speedup', 'cuda numpy label speedup']
corr_df[mean_corr_cols].corr()
Out[102]:
In [53]:
# Correlations restricted to the rows with cardinality above 10000.
# The mask defined in this cell is the one applied: `idx2` belongs to a
# different cell, is undefined on a fresh kernel, and selects a single
# cardinality, for which a correlation with 'cardinality' is meaningless.
# The index is flattened first so the key columns can be selected by name
# even after set_index has moved them into the index.
cres_flat = cres.reset_index()
idx1 = cres_flat['cardinality'] > 10000
idx = idx1
cres_flat.loc[idx, ['cardinality', 'dimensionality', 'number of clusters',
                    'round', 'cuda numba label speedup',
                    'cuda numpy label speedup']].corr()
Out[53]:
In [48]:
# Rows with cardinality 5000000 in which the CUDA label phase was faster than
# numba (speedup > 1).
# The index is flattened first so 'cardinality' and the other key columns
# can be used by name even after set_index has moved them into the index.
cres_flat = cres.reset_index()
idx1 = cres_flat['cuda numba label speedup'] > 1
idx2 = cres_flat['cardinality'] == 5000000
idx = idx1 & idx2
cres_flat.loc[idx, ['cardinality', 'dimensionality', 'number of clusters',
                    'round', 'cuda numba label speedup']]
Out[48]: