In [82]:
import os
from collections import defaultdict
results_dir = "/home/ubuntu/test_matam/"
# Every sub-directory of the results directory is treated as one revision,
# except the bookkeeping directories listed below. os.listdir() yields bare
# entry names, so the exclusion test can be applied to the name directly.
revisions_path = [
    os.path.join(results_dir, entry)
    for entry in os.listdir(results_dir)
    if os.path.isdir(os.path.join(results_dir, entry))
    and entry not in ['matam', 'db', '.ipynb_checkpoints']
]
def list_logfiles(directory):
    """Return the sorted paths of the regular files located directly in `directory`.

    Sub-directories are skipped; each returned path is `directory` joined with
    the file name.
    """
    candidates = (os.path.join(directory, name) for name in os.listdir(directory))
    return sorted(path for path in candidates if os.path.isfile(path))
def get_stats(f):
    """
    Extract the statistics from a matam log file, more precisely from the
    last 8 columns of the line that follows the 'One-line stats' marker:
    ('seq_nb', 'seq_min_size', 'seq_max_size', 'seq_avg_size', 'seq_total_size', 'error_rate', 'error_rate2', 'ref_coverage')

    Parameters:
        f: path of the log file to parse.

    Returns:
        A list of 8 numbers. Values containing a '.' are converted to float,
        the others to int; any '%' sign is removed before the conversion.

    Raises:
        IndexError: the file is empty, has no 'One-line stats' line, or that
            line is the last one of the file (no stats line after it).
        ValueError: one of the columns is not a valid number.
        AssertionError: the stats line has fewer than 8 tab-separated columns.
    """
    # The context manager closes the file even when parsing fails below.
    with open(f, 'r') as res_handler:
        lines = res_handler.readlines()

    # Index of the first line holding the marker, or None when it is absent.
    marker_idx = None
    for i, line in enumerate(lines):
        if 'One-line stats' in line:
            marker_idx = i
            break

    # Report every "no stats available" situation (empty file included) with
    # the same exception type, so callers only have to handle IndexError.
    if marker_idx is None or marker_idx + 1 >= len(lines):
        raise IndexError('no one-line stats found in %s' % f)

    stats = lines[marker_idx + 1].strip().split('\t')[-8:]

    formated_stats = []
    for s in stats:
        s = s.strip().replace('%', '')
        formated_stats.append(float(s) if '.' in s else int(s))

    assert len(formated_stats) == 8, \
        'expected 8 stats columns in %s, got %d' % (f, len(formated_stats))
    return formated_stats
# Parsed stats of every usable log file, grouped by revision directory name.
revisions_stats = defaultdict(list)
for revision_path in revisions_path:
    revision_name = os.path.basename(revision_path)
    for logfile in list_logfiles(revision_path):
        # A log from which no stats line can be extracted is reported and skipped.
        try:
            stats = get_stats(logfile)
        except IndexError:
            stats = None
        if stats is not None:
            revisions_stats[revision_name].append(stats)
        else:
            print('Ignoring:%s' % logfile)
#revisions_stats
## Testing the testing code
# Fixture below: t1 == t2, and every t3 value is the matching t1 value + 20 (t1 = t2 = t3 - 20)
#revisions_stats = {'t1' : [[23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
# [23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
# [25, 550, 1530, 1167.8, 29195, 0.09, 0.03, 92.18],
# [23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
# [20, 847, 1526, 1365.4, 27308, 0.02, 0.02, 90.69]],
# 't2' : [[23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
# [23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
# [25, 550, 1530, 1167.8, 29195, 0.09, 0.03, 92.18],
# [23, 550, 1526, 1239.09, 28499, 0.04, 0.04, 91.96],
# [20, 847, 1526, 1365.4, 27308, 0.02, 0.02, 90.69]],
# 't3' : [[43, 570, 1546, 1259.09, 28519, 20.04, 20.04, 111.96],
# [43, 570, 1546, 1259.09, 28519, 20.04, 20.04, 111.96],
# [45, 570, 1550, 1187.8, 29215, 20.09, 20.03, 112.18],
# [43, 570, 1546, 1259.09, 28519, 20.04, 20.04, 111.96],
# [40, 867, 1546, 1385.4, 27328, 20.02, 20.02, 110.69]]
#}
In [83]:
import numpy as np
import pandas as pd
# Column labels for the eight statistics collected from each log file.
categories = ('seq_nb', 'seq_min_size', 'seq_max_size', 'seq_avg_size', 'seq_total_size', 'error_rate', 'error_rate2', 'ref_coverage')
# One DataFrame per revision: one row per log file, one column per statistic.
revisions_df = {
    revision: pd.DataFrame(rows, columns=categories)
    for revision, rows in revisions_stats.items()
}
#revisions_df
In [84]:
%matplotlib inline
from ggplot import ggplot, aes, geom_boxplot
import datetime
#import matplotlib.pyplot as plt
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
pdf = os.path.join(results_dir, "%s.pdf" % timestamp)
for i, c in enumerate(categories):
print(c)
dict_to_plot = {}
for revision in revisions_df.keys():
dict_to_plot[revision] = revisions_df[revision][c]
df = pd.DataFrame(dict_to_plot)
display(df.describe())
p = ggplot(pd.melt(df, var_name=c, value_name='values'), aes( x = c, y = 'values')) + geom_boxplot()
p.show()
p.save('%s_matam_boxplot.svg' % i)
#convert all svg in one pdf file & remove the svg files
! convert ?_matam_boxplot.svg {pdf}
! rm ?_matam_boxplot.svg