In [82]:
    
import os
from collections import defaultdict
# Folder scanned for per-revision result directories.
results_dir = "/home/ubuntu/test_matam/"
# Keep every sub-directory of results_dir except the listed non-revision folders.
revisions_path = [
    path
    for path in (os.path.join(results_dir, entry) for entry in os.listdir(results_dir))
    if os.path.isdir(path)
    and os.path.basename(path) not in ['matam', 'db', '.ipynb_checkpoints']
]
def list_logfiles(directory):
    """Return the sorted full paths of the regular files directly inside `directory`."""
    paths = []
    for name in os.listdir(directory):
        candidate = os.path.join(directory, name)
        # Sub-directories and other non-file entries are skipped.
        if os.path.isfile(candidate):
            paths.append(candidate)
    paths.sort()
    return paths
def get_stats(f):
    """
    Extract the one-line statistics from a matam log file.

    The line following the first line that contains 'One-line stats' is split
    on tabs, and its last 8 columns are converted to numbers. The columns are:
    ('seq_nb', 'seq_min_size', 'seq_max_size', 'seq_avg_size', 'seq_total_size',
     'error_rate', 'error_rate2', 'ref_coverage')

    Parameters
    ----------
    f : str
        Path of the log file to read.

    Returns
    -------
    list
        8 numbers. Any '%' sign is removed first; a value containing '.' becomes
        a float, every other value becomes an int.

    Raises
    ------
    IndexError
        If the file has no 'One-line stats' line followed by another line.
        This includes an empty file.
    ValueError
        If one of the 8 columns is not numeric.
    AssertionError
        If the stats line has fewer than 8 tab-separated columns.
    """
    # The context manager closes the file even when parsing fails below.
    with open(f, 'r') as res_handler:
        lines = res_handler.readlines()

    marker_idx = None
    for i, line in enumerate(lines):
        if 'One-line stats' in line:
            marker_idx = i
            break

    # Every "no stats line" case raises IndexError, the exception callers
    # already handle. An empty file used to fail with an unbound loop variable.
    if marker_idx is None or marker_idx + 1 >= len(lines):
        raise IndexError("no 'One-line stats' entry followed by a stats line in %s" % f)

    stats = lines[marker_idx + 1].strip().split('\t')[-8:]

    formated_stats = []
    for s in stats:
        s = s.strip().replace('%', '')
        formated_stats.append(float(s) if '.' in s else int(s))
    assert len(formated_stats) == 8
    return formated_stats
# Maps each revision name (its directory base name) to the list of stats rows
# parsed from that revision's log files.
revisions_stats = defaultdict(list)
for revision_path in revisions_path:
    revision_name = os.path.basename(revision_path)
    for logfile in list_logfiles(revision_path):
        try:
            stats = get_stats(logfile)
        except IndexError:
            # The log holds no stats line: report it below and move on.
            stats = None
        if stats is not None:
            revisions_stats[revision_name].append(stats)
        else:
            print('Ignoring:%s' % logfile)
#revisions_stats
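## Testing the testing code (uncomment to replace revisions_stats with synthetic data)
# t1 and t2 are identical; every t3 value is the matching t1 value + 20 (t1 = t2 = t3 - 20)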
#revisions_stats = {'t1' : [[23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
#    [23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
#    [25, 550, 1530, 1167.8, 29195, 0.09, 0.03, 92.18],
#    [23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
#    [20, 847, 1526, 1365.4, 27308, 0.02, 0.02, 90.69]],
# 't2' : [[23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
#    [23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
#    [25, 550, 1530, 1167.8, 29195, 0.09, 0.03, 92.18],
#    [23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
#    [20, 847, 1526, 1365.4, 27308, 0.02, 0.02, 90.69]],
# 't3' : [[43, 570, 1546, 1259.09, 28519, 20.04, 20.04, 111.96],
#    [43, 570, 1546, 1259.09, 28519, 20.04, 20.04, 111.96],
#    [45, 570, 1550, 1187.8, 29215, 20.09, 20.03, 112.18],
#    [43, 570, 1546, 1259.09, 28519, 20.04, 20.04, 111.96],
#    [40, 867, 1546, 1385.4, 27328, 20.02, 20.02, 110.69]]
#}
    
In [83]:
    
import numpy as np
import pandas as pd
# Column names for the 8 statistics, in the order returned by get_stats.
categories = ('seq_nb', 'seq_min_size', 'seq_max_size', 'seq_avg_size', 'seq_total_size', 'error_rate', 'error_rate2', 'ref_coverage')
# One DataFrame per revision: one row per parsed log file, one column per statistic.
revisions_df = {
    revision: pd.DataFrame(rows, columns=categories)
    for revision, rows in revisions_stats.items()
}
#revisions_df
    
In [84]:
    
%matplotlib inline
from ggplot import ggplot, aes, geom_boxplot
import datetime
#import matplotlib.pyplot as plt
# The output PDF is named after the current date and time.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
pdf = os.path.join(results_dir, "%s.pdf" % timestamp)
for i, c in enumerate(categories):
    print(c)
    # One column per revision, holding that revision's values for statistic c.
    df = pd.DataFrame({revision: frame[c] for revision, frame in revisions_df.items()})
    display(df.describe())
    # Long format: the revision label goes in a column named after c,
    # the measurements go in 'values'.
    melted = pd.melt(df, var_name=c, value_name='values')
    p = ggplot(melted, aes(x=c, y='values')) + geom_boxplot()
    p.show()
    # The category index prefixes the file name, e.g. '0_matam_boxplot.svg'.
    p.save('%s_matam_boxplot.svg' % i)
# Shell step: combine every file matching ?_matam_boxplot.svg in the working
# directory into a single PDF at the path held in `pdf`, then delete those SVG
# files. The `?` glob matches exactly one character, so only single-character
# prefixes are picked up. `convert` is an external command (presumably
# ImageMagick's -- confirm it is installed on the host).
! convert ?_matam_boxplot.svg {pdf}
! rm ?_matam_boxplot.svg