Initial code to strip the fields with a regex

import re
import pandas as pd

data = pd.read_csv("java_sorting_24_7_17.txt", sep="|")
def filter_data(data):
    data.columns= [re.sub(r'\s+(\S+)\s+', r'\1', x) for x in data.columns]
    for i in range(1, len(data.columns)):
        try:
          data.iloc[:,i] = data.iloc[:,i].apply(lambda x: re.sub(r'\s+(\S+)\s+', r'\1', x))
        except Exception as e:
            print(e)
    data.loc[:, 'shuffle'] = data.loc[:, 'shuffle'].apply(lambda x: re.sub(r'\/(\d+)', r'\1',x))
    return data
data = filter_data(data)
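
As a quick sanity check (a minimal sketch, not part of the notebook run, on two made-up sample values): the substitution above works for single-word fields, but values containing internal spaces, such as the 'merge sort' name that shows up later, lose that space. That is why the next cell switches to plain strip().

import re

for sample in [' elements ', ' merge sort ']:
    print(repr(re.sub(r'\s+(\S+)\s+', r'\1', sample)))
# -> 'elements'
# -> 'mergesort '   (internal space swallowed, trailing space kept)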

In [1]:
# Using strip() to clean the values in the txt file
import pandas as pd
import numpy as np

def read_stats(data_file):
    data = pd.read_csv(data_file, sep="|")
    data.columns = [x.strip() for x in data.columns]

    # Positions of the non-integer (i.e. string) columns
    str_idxs = [i for i, dtype in enumerate(data.dtypes) if dtype != 'int64']

    # Strip the padded string fields (read_csv loads them with object dtype)
    for i in str_idxs:
        key = data.columns[i]
        if data[key].dtype == np.dtype(object):
            data.loc[:, key] = [x.strip() for x in data.loc[:, key]]
    return data

data = read_stats("java_sorting_127.0.1.1_Di_1._Aug_07:39:03_UTC_2017.csv")
# data.to_csv("java_sorting_127.0.1.1_Di_1._Aug_07:39:03_UTC_2017.csv")


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-1-d9e5f1df70d6> in <module>()
     16     return data
     17 
---> 18 data = read_stats("java_sorting_127.0.1.1_Di_1._Aug_07:39:03_UTC_2017.txt")
     19 # data.to_csv("java_sorting_127.0.1.1_Di_1._Aug_07:39:03_UTC_2017.csv")

<ipython-input-1-d9e5f1df70d6> in read_stats(data_file)
      3 import numpy as np
      4 def read_stats(data_file):
----> 5     data = pd.read_csv(data_file, sep="|")
      6     data.columns = [ x.strip() for x in data.columns]
      7 

~/.venv3/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

~/.venv3/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

~/.venv3/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

~/.venv3/lib/python3.5/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

~/.venv3/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()

FileNotFoundError: File b'java_sorting_127.0.1.1_Di_1._Aug_07:39:03_UTC_2017.txt' does not exist

In [2]:
list(enumerate(data.columns))


Out[2]:
[(0, 'name'),
 (1, 'shuffle'),
 (2, 'elements'),
 (3, 'duration_ms'),
 (4, 'p_duration_s'),
 (5, 'p_duration_ns'),
 (6, 'memory')]

In [3]:
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.graph_objs import *
#plotly.offline.init_notebook_mode() 
def filter_by(data, name, value):
    # Rows (default RangeIndex labels) where the given column equals `value`
    return [idx for idx in range(len(data)) if data.loc[idx, name] == value]

# using ~/.plotly/.credentials
# plotly.tools.set_credentials_file(username="", api_key="")

algorithms = set(data.loc[:, 'name'])
alg = algorithms.pop()
idxs = filter_by(data, 'name', alg)
X = data.loc[idxs, 'elements']
Y = data.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[3]:

In [4]:
alg = algorithms.pop()
idxs = filter_by(data, 'name', alg)
X = data.loc[idxs, 'elements']
Y = data.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[4]:

In [5]:
alg = algorithms.pop()
idxs = filter_by(data, 'name', alg)
X = data.loc[idxs, 'elements']
Y = data.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[5]:

In [6]:
alg = algorithms.pop()
idxs = filter_by(data, 'name', alg)
X = data.loc[idxs, 'elements']
Y = data.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[6]:

In [7]:
alg = algorithms.pop()
idxs = filter_by(data, 'name', alg)
X = data.loc[idxs, 'elements']
Y = data.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[7]:
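
The five plotting cells above repeat the same pattern. A small helper like the hypothetical plot_algorithm below (a sketch, not used in the runs above; it reuses filter_by and the plotly objects already imported) would avoid the duplication:

def plot_algorithm(frame, algorithm_name):
    # Bar chart of duration over input size for one algorithm
    idxs = filter_by(frame, 'name', algorithm_name)
    bar = Bar(x=frame.loc[idxs, 'elements'],
              y=frame.loc[idxs, 'duration_ms'],
              name=algorithm_name)
    layout = Layout(title=algorithm_name + ' performance (java)',
                    xaxis=dict(title='Elements'),
                    yaxis=dict(title='Time'))
    return py.iplot(Figure(data=[bar], layout=layout))

# e.g. plot_algorithm(data, alg) for each algorithm name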

The same stats as before, but with 9 million elements

In this run, the merge sort algorithm we developed grew a bit slower than O(n); we could not observe its worst-case O(n log n) behaviour here (a rough curve fit is sketched below).

The worst case of our (single-threaded) merge sort is better than the worst case of the Java platform's Arrays.sort. However, the measurements are not independent, because the runs were not isolated: we loop through all sorting algorithms in one process, so garbage collection triggered by the previous algorithm might affect the performance of the next one. For example, garbage left over from merge sort might change the timing of Arrays.sort.
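
A rough way to check the scaling claim (a minimal sketch, assuming the 9-million-element stats file loaded in the next cell is available, and looking only at the 'merge sort' rows): fit duration against n and against n·log n through the origin and compare the residuals; the model with the smaller residual describes the measured growth better. With only a handful of measurements per algorithm this is indicative at best.

import numpy as np

chk = read_stats("java_sorting_127.0.1.1_Fr_4._Aug_23:59:33_UTC_2017.txt")
chk.loc[:, 'name'] = [x.strip() for x in chk.loc[:, 'name']]  # ensure names are stripped
rows = chk['name'] == 'merge sort'
n = chk.loc[rows, 'elements'].values.astype(float)        # assumes all counts are > 0
t = chk.loc[rows, 'duration_ms'].values.astype(float)

for label, model in [('n', n), ('n log n', n * np.log(n))]:
    a = (model @ t) / (model @ model)        # least-squares slope through the origin
    resid = np.sum((t - a * model) ** 2)
    print('%-8s slope=%.3g  residual=%.3g' % (label, a, resid))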


In [8]:
data2 = read_stats("java_sorting_127.0.1.1_Fr_4._Aug_23:59:33_UTC_2017.txt")
algorithms = set(data2.loc[:, 'name'])
alg = algorithms.pop()
idxs = filter_by(data2, 'name', alg)
X = data2.loc[idxs, 'elements']
Y = data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[8]:

In [9]:
alg = algorithms.pop()
idxs = filter_by(data2, 'name', alg)
X = data2.loc[idxs, 'elements']
Y = data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[9]:

In [10]:
alg = algorithms.pop()
idxs = filter_by(data2, 'name', alg)
X = data2.loc[idxs, 'elements']
Y = data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[10]:

In [11]:
alg = algorithms.pop()
idxs = filter_by(data2, 'name', alg)
X = data2.loc[idxs, 'elements']
Y = data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[11]:

In [12]:
alg = algorithms.pop()
idxs = filter_by(data2, 'name', alg)
X = data2.loc[idxs, 'elements']
Y = data2.loc[idxs, 'duration_ms']
plot_data = [Bar(x = X, y = Y, name=alg)]
layout = Layout(title= alg + ' performance (java) ',
                xaxis=dict(title='Elements'),
                yaxis=dict(title='Time'))
fig = Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[12]:

Better visualization


In [13]:
data2.loc[:, 'name'] = [x.strip() for x in data2.loc[:, 'name']]
algorithms = set(data2.loc[:, 'name'])

algorithms


Out[13]:
{'Arrays.parallelSort',
 'Arrays.sort',
 'Linked Hashmap',
 'Stream + parallel + sort',
 'merge sort'}

In [41]:
import plotly.graph_objs as go

algorithms.remove('Linked Hashmap')

def get_bar(data, algorithm_name):
    # One bar trace per algorithm: input size on x, duration on y
    idxs = filter_by(data, 'name', algorithm_name)
    X1 = data.loc[idxs, 'elements']
    Y1 = data.loc[idxs, 'duration_ms']
    return go.Bar(x=X1, y=Y1, name=algorithm_name)

plot_data = [get_bar(data2, name) for name in algorithms]
layout = go.Layout(title='Performance comparison',
                   xaxis=dict(title='Elements (32 bits / -2,147,483,648 to +2,147,483,647)'),
                   yaxis=dict(title='Time (ms)'),
                   barmode='stack')
fig = go.Figure(data=plot_data, layout=layout)
py.iplot(fig)


Out[41]:

In [ ]: