import re
import pandas as pd
# Read the pipe-delimited text file into a DataFrame; cells keep whatever
# padding surrounds the "|" separators until they are cleaned up afterwards.
data = pd.read_csv("java_sorting_24_7_17.txt", sep="|")
def filter_data(data):
    """Trim padding from the column names and text cells of *data* in place.

    Column names and every string value in all columns except the first are
    stripped of leading/trailing whitespace; inner whitespace is preserved
    (``" merge sort "`` becomes ``"merge sort"``).  Afterwards the slash that
    directly precedes a number in the ``shuffle`` column is removed
    (``"/10"`` becomes ``"10"``).

    Args:
        data: DataFrame to clean.  It must contain a column whose stripped
            name is ``shuffle``.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: if there is no ``shuffle`` column after the names are
            stripped.
    """
    def _strip(value):
        # Numbers and NaN pass through untouched instead of raising.
        return value.strip() if isinstance(value, str) else value

    def _drop_slash(value):
        if isinstance(value, str):
            return re.sub(r'\/(\d+)', r'\1', value)
        return value

    data.columns = [_strip(col) for col in data.columns]

    # The first column (position 0) is intentionally left as it is.
    for pos in range(1, data.shape[1]):
        column = data.iloc[:, pos]
        if pd.api.types.is_numeric_dtype(column):
            continue  # nothing to strip in a numeric column
        data.iloc[:, pos] = column.map(_strip)

    data['shuffle'] = data['shuffle'].map(_drop_slash)
    return data
# Clean the freshly loaded frame; filter_data returns the frame it was given.
data = filter_data(data)
In [1]:
# Using strip to filter the values in the txt
import pandas as pd
import numpy as np
def read_stats(data_file):
    """Load a pipe-delimited stats file and trim the padding around its text.

    Args:
        data_file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame whose column names and string cells have no leading or
        trailing whitespace.  Numeric columns and missing values are left
        untouched.
    """
    data = pd.read_csv(data_file, sep="|")
    data.columns = [x.strip() for x in data.columns]

    # pandas never stores text under NumPy's ``str`` dtype (it uses ``object``
    # or a string extension dtype), so text columns are selected as
    # "everything that is not numeric" rather than by an exact dtype match.
    for pos in range(data.shape[1]):
        column = data.iloc[:, pos]
        if pd.api.types.is_numeric_dtype(column):
            continue
        data.iloc[:, pos] = column.map(
            lambda cell: cell.strip() if isinstance(cell, str) else cell
        )
    return data
# Rebind the module-level `data` to the frame produced by read_stats for this file.
data = read_stats("java_sorting_127.0.1.1_Di_1._Aug_07:39:03_UTC_2017.csv")
# data.to_csv("java_sorting_127.0.1.1_Di_1._Aug_07:39:03_UTC_2017.csv")
In [2]:
# Show every column name paired with its positional index.
list(enumerate(data.columns))
Out[2]:
In [3]:
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.graph_objs import *
#plotly.offline.init_notebook_mode()
def filter_by(data, name, value):
    """Return the index labels of the rows whose *name* column equals *value*.

    The labels are suitable for ``data.loc[labels, ...]``.  For a frame with
    the default ``RangeIndex`` they are the row positions ``0..len(data)-1``.

    Args:
        data: DataFrame to search.
        name: Label of the column to compare.
        value: Value a row must hold in that column to be selected.

    Returns:
        A list of index labels in row order; empty when nothing matches.
    """
    # One vectorised comparison instead of a scalar ``.loc`` lookup per row;
    # it also works when the index is not 0..n-1 (e.g. after slicing).
    matches = (data[name] == value).values
    return data.index[matches].tolist()
# using ~/.plotly/.credentials
# plotly.tools.set_credentials_file(username="", api_key="")
# Gather the distinct algorithm labels, then chart one of them.
algorithms = set(data['name'])
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data, 'name', alg)
X, Y = data.loc[idxs, 'elements'], data.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[3]:
In [4]:
# Chart the next algorithm still left in the set (duration against element count).
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data, 'name', alg)
X, Y = data.loc[idxs, 'elements'], data.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[4]:
In [5]:
# Chart the next algorithm still left in the set (duration against element count).
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data, 'name', alg)
X, Y = data.loc[idxs, 'elements'], data.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[5]:
In [6]:
# Chart the next algorithm still left in the set (duration against element count).
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data, 'name', alg)
X, Y = data.loc[idxs, 'elements'], data.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[6]:
In [7]:
# Chart the next algorithm still left in the set (duration against element count).
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data, 'name', alg)
X, Y = data.loc[idxs, 'elements'], data.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[7]:
In this run, the merge sort algorithm we developed scaled slightly better than O(n). The run did not reveal the expected worst-case performance of O(n log n).
The worst case of our single-threaded merge sort is better than the worst case of the Java platform's Arrays.sort. However, the measurements are not independent, because the runs were not isolated: we loop through all sorting algorithms, so the garbage collection caused by the previous algorithm might affect the performance of the next one. For example, the garbage collection of merge sort might change the measured performance of Arrays.sort.
In [8]:
# Load the second stats file, gather its algorithm labels and chart one of them.
data2 = read_stats("java_sorting_127.0.1.1_Fr_4._Aug_23:59:33_UTC_2017.txt")
algorithms = set(data2['name'])
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data2, 'name', alg)
X, Y = data2.loc[idxs, 'elements'], data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[8]:
In [9]:
# Chart the next algorithm still left in the set, using the second data set.
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data2, 'name', alg)
X, Y = data2.loc[idxs, 'elements'], data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[9]:
In [10]:
# Chart the next algorithm still left in the set, using the second data set.
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data2, 'name', alg)
X, Y = data2.loc[idxs, 'elements'], data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[10]:
In [11]:
# Chart the next algorithm still left in the set, using the second data set.
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data2, 'name', alg)
X, Y = data2.loc[idxs, 'elements'], data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[11]:
In [12]:
# Chart the next algorithm still left in the set, using the second data set.
alg = algorithms.pop()  # set.pop() removes and returns an arbitrary member
idxs = filter_by(data2, 'name', alg)
X, Y = data2.loc[idxs, 'elements'], data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x=X, y=Y, name=alg)]
layout = Layout(
    title=alg + ' performance (java) ',
    xaxis={'title': 'Elements'},
    yaxis={'title': 'Time'},
)
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[12]:
In [13]:
# Trim the whitespace around every algorithm label, rebuild the set of
# distinct labels from the cleaned column, and display it.
data2.loc[:, 'name'] = [label.strip() for label in data2['name']]
algorithms = set(data2['name'])
algorithms
Out[13]:
In [41]:
import plotly.graph_objs as go
# Take 'Linked Hashmap' out of the set before the bars are built from it.
# NOTE: set.remove raises KeyError when the element is absent, so this line
# fails if it runs again without `algorithms` being rebuilt first.
algorithms.remove('Linked Hashmap')
def get_bar(data, algorithm_name):
    """Build a bar trace of duration against element count for one algorithm.

    Args:
        data: DataFrame with ``name``, ``elements`` and ``duration_ms``
            columns.
        algorithm_name: Value of the ``name`` column selecting the rows to
            plot; also used as the trace name.

    Returns:
        A ``go.Bar`` with ``elements`` on x and ``duration_ms`` on y.
    """
    idxs = filter_by(data, 'name', algorithm_name)
    # Read the values from the frame that was passed in, not from a
    # module-level frame, so the trace always matches the rows selected above.
    return go.Bar(
        x=data.loc[idxs, 'elements'],
        y=data.loc[idxs, 'duration_ms'],
        name=algorithm_name,
    )
# One bar trace per remaining algorithm, drawn together in a single figure.
plot_data = [get_bar(data2, name) for name in algorithms]
layout = go.Layout(
    title='Performance comparison',
    xaxis={'title': 'Elements (32 bits / -2,147,483,648 to +2,147,483,647)'},
    yaxis={'title': 'Time (ms)'},
    barmode='stack',  # bars sharing an x value are stacked on top of each other
)
fig = go.Figure(data=plot_data, layout=layout)
py.iplot(fig)
Out[41]:
In [ ]: