analysis



In [1]:
import pandas as pd
import re

In [2]:
# Benchmark grid. Each (case, size, program) triple is joined into a
# 'log/<case>/<size>/<program>/' path by the loading cell further down.
cases = [
    'genome',
    'genome-48',
    'genome-98',
    'random',
]
# Powers of two: 1, 2, 4, 8, 16, 32 (unit of the size is not stated in this notebook).
sizes = [2 ** exponent for exponent in range(6)]
programs = [
    'beetl',
    'bwt-lcp-em',
    'egap',
    'gsa-is',
]

In [3]:
def _parse_time_value(text):
    """Convert a single report value to a float, or return None if it is not numeric.

    Accepted forms:
      * ``h:mm:ss``   -> total seconds
      * ``m:ss[.ff]`` -> total seconds
      * anything ``float()`` understands -> that number
    """
    try:
        hms = re.match(r'(\d+):(\d\d):([\d\.]+)', text)
        if hms is not None:
            return float(hms.group(3)) + int(hms.group(2)) * 60 + int(hms.group(1)) * 60 * 60
        ms = re.match(r'(\d+):([\d\.]+)', text)
        if ms is not None:
            return float(ms.group(2)) + int(ms.group(1)) * 60
        return float(text)
    except ValueError:
        # Non-numeric value (e.g. a quoted command name or '?'): let the caller skip it.
        return None


def get_data(dir):
    """Parse ``<dir>time.txt`` into a ``{label: float}`` dictionary.

    The file is expected to contain tab-indented ``label: value`` lines
    (the labels seen in this notebook look like the verbose report of
    ``/usr/bin/time -v`` -- TODO confirm against how the logs were produced).

    Parameters
    ----------
    dir : str
        Directory prefix, concatenated as-is with ``'time.txt'``; it must
        therefore already end with a path separator.

    Returns
    -------
    dict
        Maps each label to its value as a float. A trailing ``%`` is
        stripped, and ``h:mm:ss`` / ``m:ss`` durations are converted to
        seconds. Lines that do not match the ``label: value`` shape, or whose
        value is not numeric, are skipped instead of raising.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    parsed = {}
    with open(dir + 'time.txt') as datafile:
        for line in datafile:
            # Raw string: \s and \S are regex escapes, not Python string escapes.
            # The optional trailing '%' is left out of the captured value.
            m = re.match(r'^\t+(.*?):\s+(\S+?)%?$', line)
            if m is None:
                continue
            value = _parse_time_value(m.group(2))
            if value is not None:
                parsed[m.group(1)] = value
    return parsed

In [4]:
# Load every run of the benchmark grid into one DataFrame.
# One record (dict) is collected per (case, size, program) run and the frame
# is built once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame row by row copies it on every iteration.
records = []
for case in cases:
    for size in sizes:
        for program in programs:
            record = get_data('log/' + str(case) + '/' + str(size) + '/' + str(program) + '/')
            # Tag the parsed metrics with the run they belong to.
            record['case'] = case
            record['size'] = size
            record['program'] = program
            records.append(record)
# Index by run so a metric can later be sliced per case and unstacked per program.
df = pd.DataFrame(records).set_index(['case', 'size', 'program'])
df.head()


Out[4]:
Average resident set size (kbytes) Average shared text size (kbytes) Average stack size (kbytes) Average total size (kbytes) Average unshared data size (kbytes) Elapsed (wall clock) time (h:mm:ss or m:ss) Exit status File system inputs File system outputs Involuntary context switches ... Minor (reclaiming a frame) page faults Page size (bytes) Percent of CPU this job got Signals delivered Socket messages received Socket messages sent Swaps System time (seconds) User time (seconds) Voluntary context switches
case size program
genome 1 beetl 0.0 0.0 0.0 0.0 0.0 367.48 0.0 112128.0 107981320.0 6264.0 ... 72732.0 4096.0 83.0 0.0 0.0 0.0 0.0 41.91 266.52 9417.0
bwt-lcp-em 0.0 0.0 0.0 0.0 0.0 991.09 0.0 128.0 208999976.0 885620.0 ... 2802.0 4096.0 88.0 0.0 0.0 0.0 0.0 49.77 824.82 11119.0
egap 0.0 0.0 0.0 0.0 0.0 48.84 0.0 2192.0 291048.0 11382.0 ... 316116.0 4096.0 99.0 0.0 0.0 0.0 0.0 0.65 47.87 137.0
gsa-is 0.0 0.0 0.0 0.0 0.0 74.86 0.0 160.0 2910184.0 398.0 ... 399927.0 4096.0 90.0 0.0 0.0 0.0 0.0 1.05 66.42 511.0
2 beetl 0.0 0.0 0.0 0.0 0.0 1932.47 0.0 148251832.0 215955744.0 316399.0 ... 129486.0 4096.0 33.0 0.0 0.0 0.0 0.0 96.47 557.79 170157.0

5 rows × 22 columns

Plots


In [5]:
%matplotlib inline
import matplotlib.pyplot as plt

In [6]:
def show_case(field, case):
    """Print and plot one metric of one dataset, with one column/line per program.

    Reads the module-level ``df`` (indexed by case, size, program).

    Parameters
    ----------
    field : str
        Column of ``df`` to report.
    case : str
        Value of the ``case`` index level to select.

    The selected values are reshaped to a size x program table, which is
    printed and then drawn as a line plot with the table attached. The field
    name is part of the plot title and the y-axis label, so plots of
    different metrics for the same dataset can be told apart.
    """
    # Select the column, then the dataset; the remaining index is (size, program).
    df_plot = df[field][case].unstack('program')
    heading = 'Dataset: ' + case + ' ' + field
    print(heading)
    print(df_plot)
    print("")
    ax = df_plot.plot(figsize=(18,10), grid=True, title=heading, legend = True, table = True)
    ax.set_ylabel(field)

In [7]:
# Metrics to report; every metric is shown for every dataset, metric by metric.
fields = [
    'Elapsed (wall clock) time (h:mm:ss or m:ss)',
    'Maximum resident set size (kbytes)',
    'File system inputs',
    'File system outputs',
]
for field in fields:
    for case in cases:
        show_case(field, case)


Dataset: genome Elapsed (wall clock) time (h:mm:ss or m:ss)
program     beetl  bwt-lcp-em    egap  gsa-is
size                                         
1          367.48      991.09   48.84   74.86
2         1932.47     2165.02  100.76  339.71
4         4173.00     9268.00  247.66   16.03
8         7358.00    17484.00  749.33  521.85
16       15963.00    53102.00  240.33  222.06
32        1901.56    95574.00  188.55  138.28

Dataset: genome-48 Elapsed (wall clock) time (h:mm:ss or m:ss)
program    beetl  bwt-lcp-em    egap  gsa-is
size                                        
1          38.48      113.84   13.83   20.66
2          85.07      216.14   59.30   45.92
4         189.72      541.78   62.50   93.14
8         752.99      723.03  133.86  433.54
16       2320.96     6636.00  395.81   29.06
32       4722.00    14023.00   36.38   29.65

Dataset: genome-98 Elapsed (wall clock) time (h:mm:ss or m:ss)
program     beetl  bwt-lcp-em    egap   gsa-is
size                                          
1          169.63      436.85   29.98    47.25
2          503.53     1023.94   64.03    98.94
4          753.81     2205.33  132.97  1392.62
8         3752.00     6533.00  367.11    86.31
16        8752.00    29908.00   48.86    18.18
32       23030.00    53457.00   47.21    43.37

Dataset: random Elapsed (wall clock) time (h:mm:ss or m:ss)
program      beetl  bwt-lcp-em    egap  gsa-is
size                                          
1           525.40      193.80   50.11   79.31
2          2035.03      507.24  107.29  165.19
4          4326.00     1821.39  233.91   12.92
8          7745.00     4105.00   34.00   27.13
16        18469.00    10235.00   79.76   78.23
32       148138.00    32645.00  384.76  682.40

Dataset: genome Maximum resident set size (kbytes)
program      beetl  bwt-lcp-em       egap     gsa-is
size                                                
1         253560.0      6768.0   730088.0  1312188.0
2         482808.0     10736.0  1457636.0  2621304.0
4         848952.0     18648.0  2912632.0  2912748.0
8        1983544.0     34184.0  3857864.0  3478876.0
16       3195560.0     65432.0  3858976.0  4169944.0
32       1485940.0    127848.0  3858856.0  4041632.0

Dataset: genome-48 Maximum resident set size (kbytes)
program      beetl  bwt-lcp-em       egap     gsa-is
size                                                
1         130408.0      6276.0   241960.0   433452.0
2         267384.0     10212.0   481124.0   863960.0
4         448684.0     18128.0   959672.0  1725156.0
8        1023716.0     33416.0  1916632.0  3447896.0
16       1558904.0     64880.0  3830444.0  3830776.0
32       1474400.0    127468.0  3717420.0  4128488.0

Dataset: genome-98 Maximum resident set size (kbytes)
program      beetl  bwt-lcp-em       egap     gsa-is
size                                                
1         217796.0      6556.0   485884.0   872684.0
2         392920.0     10476.0   969248.0  1742788.0
4         667568.0     18508.0  1936192.0  3482772.0
8         954356.0     34008.0  3869736.0  3869892.0
16       2277884.0     65296.0  2264972.0  4174016.0
32       1738232.0    127688.0  2265012.0  4128872.0

Dataset: random Maximum resident set size (kbytes)
program      beetl  bwt-lcp-em       egap     gsa-is
size                                                
1         230876.0      6656.0   744840.0  1338652.0
2         419704.0     10760.0  1486952.0  2674356.0
4         843504.0     18576.0  2971436.0  2971404.0
8        1431536.0     34136.0  3799308.0  3502376.0
16       3504340.0     65324.0  3799264.0  4170140.0
32       3776144.0    128020.0  3799600.0  4041076.0

Dataset: genome File system inputs
program         beetl    bwt-lcp-em        egap     gsa-is
size                                                      
1        1.121280e+05  1.280000e+02      2192.0      160.0
2        1.482518e+08  2.800000e+02       392.0     3152.0
4        3.905123e+08  3.865877e+08      7360.0  1575224.0
8        6.106029e+08  1.730265e+09   2272896.0  3150648.0
16       1.676391e+09  3.816353e+09   2270384.0  5564088.0
32       1.443030e+08  7.645372e+09  14864352.0  4892496.0

Dataset: genome-48 File system inputs
program        beetl   bwt-lcp-em       egap     gsa-is
size                                                   
1               88.0         40.0     2008.0       56.0
2              768.0         96.0     6136.0      368.0
4             2160.0     182264.0     4784.0      536.0
8         73537968.0     363632.0     6328.0   234704.0
16       140334472.0  406780896.0  4178680.0  3064120.0
32       368783760.0  836320592.0  8765384.0  4377120.0

Dataset: genome-98 File system inputs
program         beetl    bwt-lcp-em        egap     gsa-is
size                                                      
1        5.280000e+02  9.600000e+01        16.0        0.0
2        1.921400e+07  3.569600e+04       344.0      384.0
4        2.296000e+03  2.992000e+05      2472.0   113448.0
8        3.455901e+08  4.122756e+08      7024.0  2368400.0
16       7.090176e+08  1.695283e+09   4739376.0  1599336.0
32       1.719832e+09  3.365730e+09  12041800.0  4876440.0

Dataset: random File system inputs
program         beetl    bwt-lcp-em        egap     gsa-is
size                                                      
1        7.537456e+06  1.280000e+02       392.0       56.0
2        1.725274e+08  1.949440e+05      1520.0      520.0
4        4.143089e+08  2.246207e+07      5680.0   807208.0
8        7.024717e+08  3.543768e+08    251432.0  1633520.0
16       1.783676e+09  8.128251e+08   1518752.0  3980336.0
32       4.205876e+09  1.793704e+09  12172888.0  4360688.0

Dataset: genome File system outputs
program         beetl    bwt-lcp-em       egap     gsa-is
size                                                     
1        1.079813e+08  2.090000e+08   291048.0  2910184.0
2        2.159557e+08  4.371230e+08   582064.0  5820344.0
4        4.319048e+08  8.748453e+08  1164096.0       16.0
8        8.638039e+08  1.749678e+09       16.0       24.0
16       1.727601e+09  3.497010e+09       16.0       24.0
32       2.477978e+08  6.994476e+09       16.0       24.0

Dataset: genome-48 File system outputs
program        beetl   bwt-lcp-em       egap     gsa-is
size                                                   
1         11589784.0   14460352.0    95728.0   957064.0
2         23177760.0   41891544.0   191440.0  1914104.0
4         46353536.0   95587104.0   382848.0  3828168.0
8         92705296.0  192636656.0   765664.0  7656296.0
16       185408528.0  385269104.0  1531288.0       24.0
32       370814656.0  770534976.0       16.0       24.0

Dataset: genome-98 File system outputs
program         beetl    bwt-lcp-em       egap     gsa-is
size                                                     
1        4.757837e+07  8.649896e+07   193384.0  1933624.0
2        9.515272e+07  1.932815e+08   386752.0  3867224.0
4        1.903010e+08  3.878250e+08   773472.0  7734408.0
8        3.805984e+08  7.758435e+08  1546912.0       24.0
16       7.611921e+08  1.551680e+09       16.0       24.0
32       1.522379e+09  3.100258e+09       16.0       24.0

Dataset: random File system outputs
program         beetl    bwt-lcp-em       egap     gsa-is
size                                                     
1        1.123819e+08  4.047779e+07   296912.0  2968792.0
2        2.247566e+08  9.198783e+07   593784.0  5937544.0
4        4.495066e+08  1.788574e+08  1187536.0       24.0
8        8.990071e+08  3.695054e+08       16.0       24.0
16       1.798007e+09  7.838186e+08       16.0       24.0
32       3.596007e+09  1.709749e+09       16.0       24.0