This notebook contains an analysis of the results from executing the different implementations (CUDA, NumPy, Python) of the K-Means algorithm.
The technical specifications of the machine (Mariana) are:
In [1]:
# necessary imports
%pylab inline
import seaborn as sns
import pandas as pd
In [2]:
# Location of the results CSV that the rest of the notebook reads.
# The active line points at a local checkout; the commented-out alternative is
# a raw GitHub URL ending in the same CUDA/tests/test1v2/results.csv path
# (presumably the same file -- confirm before swapping).
results_filename="/home/chiroptera/workspace/QCThesis/CUDA/tests/test1v2/results.csv" #local
#results_filename="https://raw.githubusercontent.com/Chiroptera/QCThesis/master/CUDA/tests/test1v2/results.csv" #git repo
In [3]:
# Load the results CSV into a DataFrame.
results = pd.read_csv(results_filename)
In [4]:
# Show the first rows so the reader can see the structure of the results table.
# print() with a single parenthesised string gives the same output on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print("Structure of the results")
results.head()
Out[4]:
In [5]:
# Tick labels used by the plots below:
#   N_labels -- cardinalities shown on the "Cardinality" axis
#   K_labels -- cluster counts shown on the "Number of clusters" axis
N_labels = [
    1e3, 5e3, 1e4, 5e4, 1e5,
    5e5, 1e6, 2e6, 4e6,
]
K_labels = [
    5, 10, 20, 30, 40,
    50, 100, 250, 500,
]
Some of the parameters don't change in these results, so we can delete them (natural number of clusters, dimensionality and number of iterations). Furthermore, we can delete the rounds column because it becomes useless after averaging the times.
In [6]:
# Remove the columns that are not used by the analysis below.
# drop() returns a new frame; rebinding the name replaces inplace=True, so the
# frame object that was loaded from the CSV is not mutated behind the scenes.
results = results.drop(['R', 'NATC', 'D', 'iters'], axis=1)
results.head()
Out[6]:
Below are some statistics about the timings for the rounds. The important thing to notice is that there is low variance in the data, which suggests that the results are consistent.
In [7]:
# Group the individual timing rows by implementation type, cardinality N and
# number of clusters K.
rounds = results.groupby(['type','N','K'],as_index = True)
# Mean of the remaining columns within each (type, N, K) group.
results_mean = rounds.mean()
# Per-group summary statistics, shown as the cell output.
rounds.describe()
Out[7]:
In [8]:
# Build one table with a mean-time column per implementation plus the pairwise
# speedups, starting from the rows of the CUDA implementation.
# .copy() makes `times` an independent frame, so the column assignments below
# never write into a slice of results_mean (avoids SettingWithCopyWarning).
times = results_mean.loc["cuda"].copy()
times['cuda'] = times['time']
# Select the 'time' column explicitly: assigning a whole DataFrame to a single
# column only works when that frame happens to have exactly one column.
times['numpy'] = results_mean.loc["numpy"]['time']
times['python'] = results_mean.loc["python"]['time']
# Speedups are ratios of mean times; a value above 1 means the implementation
# named first in the column (the denominator) was faster.
times['s_cuda_np'] = times['numpy'] / times['cuda']
times['s_cuda_py'] = times['python'] / times['cuda']
times['s_np_py'] = times['python'] / times['numpy']
times
Out[8]:
In [9]:
# Group the timing table by number of clusters (index level 'K').
a=times.groupby(level='K')
#a.get_group(20)['python'].plot(subplots=True,layout=(2,2))
# Times of the three implementations for K=20, with a log-scaled y axis.
p=a.get_group(20)[['python','numpy','cuda']].plot(title="Time evolution; 20 clusters",logy=True)
# Label the x positions 0..len(N_labels)-1 with the cardinalities in N_labels.
plt.xticks(range(len(N_labels)),N_labels)
plt.xlabel("Cardinality")
# Same plot for K=500.
a.get_group(500)[['python','numpy','cuda']].plot(title="Time evolution; 500 clusters",logy=True)
plt.xticks(range(len(N_labels)),N_labels)
plt.xlabel("Cardinality")
Out[9]:
In [10]:
# Group the timing table by cardinality (index level 'N').
b=times.groupby(level='N')
# Times of the three implementations for N=1e5, with a log-scaled y axis.
b.get_group(1e5)[['python','numpy','cuda']].plot(title="Time evolution by number of clusters; 1e5 datapoints",logy=True)
# Label the x positions 0..len(K_labels)-1 with the cluster counts in K_labels.
plt.xticks(range(len(K_labels)),K_labels)
plt.xlabel("Number of clusters")
Out[10]:
In [11]:
# NumPy and CUDA only (the Python column is left out), for N=1e5.
b.get_group(1e5)[['numpy','cuda']].plot(title="Time evolution by number of clusters; 1e5 datapoints",logy=True)
plt.xticks(range(len(K_labels)),K_labels)
plt.xlabel("Number of clusters")
# Same comparison for N=4e6.
b.get_group(4e6)[['numpy','cuda']].plot(title="Time evolution by number of clusters; 4e6 datapoints",logy=True)
plt.xticks(range(len(K_labels)),K_labels)
plt.xlabel("Number of clusters")
Out[11]:
In [12]:
# Ratio of the mean NumPy results to the mean CUDA results, aligned on the
# remaining index levels; values above 1 in 'time' mean CUDA was faster.
s_cuda_np = results_mean.loc['numpy'] / results_mean.loc['cuda']
#s_cuda_np['speedup']=s_cuda_np['time']
In [13]:
# Summary statistics of the NumPy/CUDA ratio for each number of clusters K.
s_cuda_np.groupby(level=['K']).describe()
Out[13]:
In [14]:
# Speedup of CUDA over NumPy (NumPy time / CUDA time): one curve per number of
# clusters K, drawn against the cardinality positions.
for key, grp in s_cuda_np.groupby(level=['K']):
    plt.plot(grp['time'], label=key)
plt.legend(loc='best')
plt.title("Speedup by cardinality")
# Slowdown/speedup threshold at 1. The line spans every tick position, derived
# from N_labels instead of a hard-coded 8.
plt.plot([0, len(N_labels) - 1], [1, 1], 'k-', lw=2)
plt.ylabel("Speedup")
plt.xlabel("Cardinality")
plt.xticks(range(len(N_labels)), N_labels)
Out[14]:
In [15]:
# Summary statistics of the NumPy/CUDA ratio for each cardinality N.
s_cuda_np.groupby(level=['N']).describe()
Out[15]:
In [16]:
# Speedup of CUDA over NumPy (NumPy time / CUDA time): one curve per
# cardinality N, drawn against the number-of-clusters positions.
for key, grp in s_cuda_np.groupby(level=['N']):
    plt.plot(grp['time'], label=key)
# Slowdown/speedup threshold at 1. The line spans every tick position, derived
# from K_labels instead of a hard-coded 8.
plt.plot([0, len(K_labels) - 1], [1, 1], 'k-', lw=2)
plt.legend(loc='best')
# The x axis is the number of clusters, so the title says so (the previous
# "Speedup by cardinality" was copied from the per-K plot).
plt.title("Speedup by number of clusters")
plt.ylabel("Speedup")
plt.xlabel("Number of clusters")
plt.xticks(range(len(K_labels)), K_labels)
Out[16]:
In [17]:
# Ratio of the mean Python results to the mean CUDA results, aligned on the
# remaining index levels; values above 1 in 'time' mean CUDA was faster.
s_cuda_py = results_mean.loc['python'] / results_mean.loc['cuda']
In [18]:
# Speedup of CUDA over pure Python (Python time / CUDA time): one curve per
# number of clusters K, drawn against the cardinality positions.
for key, grp in s_cuda_py.groupby(level=['K']):
    plt.plot(grp['time'], label=key)
# Slowdown/speedup threshold at 1. The line spans every tick position, derived
# from N_labels instead of a hard-coded 8.
plt.plot([0, len(N_labels) - 1], [1, 1], 'k-', lw=2)
plt.legend(loc='best')
plt.title("Speedup by cardinality")
plt.ylabel("Speedup")
plt.xlabel("Cardinality")
plt.xticks(range(len(N_labels)), N_labels)
Out[18]:
In [19]:
# Speedup of CUDA over pure Python (Python time / CUDA time): one curve per
# cardinality N, drawn against the number-of-clusters positions.
for key, grp in s_cuda_py.groupby(level=['N']):
    plt.plot(grp['time'], label=key)
# Slowdown/speedup threshold at 1. The line spans every tick position, derived
# from K_labels instead of a hard-coded 8.
plt.plot([0, len(K_labels) - 1], [1, 1], 'k-', lw=2)
plt.legend(loc='best')
# The x axis is the number of clusters, so the title says so (the previous
# "Speedup by cardinality" was copied from the per-K plot).
plt.title("Speedup by number of clusters")
plt.ylabel("Speedup")
plt.xlabel("Number of clusters")
plt.xticks(range(len(K_labels)), K_labels)
Out[19]: