In [1]:
%matplotlib inline
import eden
import matplotlib.pyplot as plt
from eden.util import configure_logging
import logging
In [2]:
from itertools import tee, chain, islice
import numpy as np
import random
from time import time
import datetime
from graphlearn.graphlearn import GraphLearnSampler
from eden.util import fit,estimate
from eden.graph import Vectorizer
# get data
from eden.converter.graph.gspan import gspan_to_eden
from itertools import islice
def get_graphs(dataset_fname, size=100):
    """Return a lazy iterator over the first `size` items of a graph file.

    Args:
        dataset_fname: value handed unchanged to `gspan_to_eden`.
        size: maximum number of items to take (default 100).

    Returns:
        An `itertools.islice` over whatever `gspan_to_eden(dataset_fname)`
        yields, truncated after `size` items.
    """
    graph_stream = gspan_to_eden(dataset_fname)
    return islice(graph_stream, size)
In [3]:
from scipy.optimize import curve_fit
import numpy as np
def func(x, a, b, c):
    """Evaluate the model curve a * (1 - 1 / exp(b * x + c)).

    This is the parametric form handed to `curve_fit` when fitting the
    mean scores in `plot_data`.

    Args:
        x: scalar or numpy array of positions at which to evaluate.
        a, b, c: curve parameters.

    Returns:
        The curve value(s) at `x`, with the same shape as `x`.
    """
    decay = 1 / np.exp(b * x + c)
    return a * (1 - decay)
def plot_data(data_x, full_data, label='', color=None):
    """Draw box plots, per-position means and a fitted trend curve on the current axes.

    Args:
        data_x: sequence of x positions, one per row of `full_data`.
        full_data: 2-D collection of scores; row i holds the values measured
            at `data_x[i]` (the mean is taken along axis 1).
        label: legend label attached to the mean markers.
        color: matplotlib color used for boxes, markers and the fitted curve.

    The curve fit is best-effort: if `curve_fit` fails, the failure is logged
    and only the box plots and mean markers are drawn.
    """
    data_y = np.mean(full_data, axis=1)
    ws = 0.02  # box width in x-axis units
    cd = {'color': color}
    plt.boxplot(full_data, positions=data_x, widths=ws, capprops=cd, medianprops=cd, boxprops=cd, whiskerprops=cd, flierprops=cd)
    plt.plot(data_x, data_y, color=color, linestyle='None', marker='o', markeredgewidth=1, markersize=7, markeredgecolor=color, markerfacecolor='w', label=label)
    try:
        popt, _ = curve_fit(func, data_x, data_y)
    except Exception as err:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still propagate, and record why
        # the trend line is missing instead of failing silently.
        logging.getLogger(__name__).warning('curve fit failed for %r, skipping trend line: %s', label, err)
        return
    # Evaluate the fitted model on a dense grid spanning the observed x range.
    fit_x = np.linspace(data_x[0], data_x[-1], num=100)
    fit_y = func(fit_x, *popt)
    plt.plot(fit_x, fit_y, color=color, lw=2)
def plot(dataset, percentages, original_sample_repetitions, original_repetitions, sample_repetitions):
    """Plot the three score series for one dataset and save the figure as a PDF.

    Args:
        dataset: dataset name; used for the title and the output file name.
        percentages: x positions (dataset size fractions) shared by all series.
        original_sample_repetitions: scores for the 'original+sample' series (green).
        original_repetitions: scores for the 'original' series (red).
        sample_repetitions: scores for the 'sample' series (blue).

    Side effects:
        Creates a new 18x8 figure and writes
        '<dataset>_plot_predictive_performance_of_samples.pdf'.
    """
    plt.figure(figsize=(18, 8))
    plt.grid()

    # Series are drawn in this fixed order: original+sample, original, sample.
    series = (
        (original_sample_repetitions, 'original+sample', 'g'),
        (original_repetitions, 'original', 'r'),
        (sample_repetitions, 'sample', 'b'),
    )
    for series_scores, series_label, series_color in series:
        plot_data(percentages, series_scores, label=series_label, color=series_color)

    x_margin = .05
    plt.xlim(percentages[0] - x_margin, percentages[-1] + x_margin)
    plt.title(dataset + '\n', fontsize=17)
    plt.legend(loc='lower right', fontsize=16)
    plt.ylabel('ROC AUC', fontsize=16)
    plt.xlabel('Dataset size (fraction)', fontsize=16)

    output_fname = '%s_plot_predictive_performance_of_samples.pdf' % dataset
    plt.savefig(output_fname)
In [4]:
def save_results(result_fname,percentages, original_repetitions,original_sample_repetitions,sample_repetitions):
    """Write the dataset sizes and the three score tables to a text file.

    File layout:
        'dataset sizes list:'
        <space-separated sizes>
        'AUC scores:'
        then, for original, original+sample and sample in that order:
            <number of rows>
            <one line of space-separated scores per row>

    Every value is followed by a single space, so each data line ends with
    a trailing space before the newline.
    """
    tables = (original_repetitions, original_sample_repetitions, sample_repetitions)
    with open(result_fname, 'w') as out:
        out.write('dataset sizes list:\n')
        out.write(''.join('%s ' % perc for perc in percentages))
        out.write('\n')
        out.write('AUC scores:\n')
        for table in tables:
            # Row-count prefix tells the reader how many lines belong to this table.
            out.write('%s\n' % len(table))
            for row in table:
                out.write(''.join('%s ' % auc for auc in row) + '\n')
def load_results(result_fname):
    """Read back a results file in the layout produced by `save_results`.

    Expected layout: a header line, one line of space-separated dataset
    sizes, a second header line, then three length-prefixed tables
    (original, original+sample, sample) of space-separated scores.

    Args:
        result_fname: path of the results file.

    Returns:
        Tuple `(percentages, original_repetitions,
        original_sample_repetitions, sample_repetitions)`; `percentages` is a
        list of floats and each table is a list of lists of floats.

    Raises:
        StopIteration: if the file ends before all expected lines were read.
        ValueError: if a count or a score cannot be parsed as a number.
    """
    with open(result_fname) as f:
        next(f)  # skip the 'dataset sizes list:' header
        percentages = [float(x) for x in next(f).split()]
        next(f)  # skip the 'AUC scores:' header
        # The three tables are stored back to back in this fixed order.
        original_repetitions = _read_repetition_table(f)
        original_sample_repetitions = _read_repetition_table(f)
        sample_repetitions = _read_repetition_table(f)
    return percentages, original_repetitions,original_sample_repetitions,sample_repetitions


def _read_repetition_table(lines):
    """Read one length-prefixed table: a row count, then that many lines of floats."""
    row_count = int(next(lines))
    return [[float(x) for x in next(lines).split()] for _ in range(row_count)]
In [5]:
import numpy as np
import math
def score(experiment):
    """Return the mean of the per-repetition means of an experiment.

    Args:
        experiment: iterable of repetitions, each a sequence of numbers.

    Returns:
        The average, over repetitions, of each repetition's own mean.
    """
    repetition_means = list(map(np.mean, experiment))
    return np.mean(repetition_means)
def scores(result_fname):
    """Return log10 of the original+sample score over the original score.

    Loads the results file with `load_results`, reduces the
    'original+sample' and 'original' tables with `score`, and returns the
    base-10 logarithm of their ratio. A positive value means the
    original+sample score is the higher of the two.

    Args:
        result_fname: path of a results file readable by `load_results`.
    """
    _, baseline_table, augmented_table, _ = load_results(result_fname)
    augmented = score(augmented_table)
    baseline = score(baseline_table)
    return math.log10(augmented / baseline)
In [6]:
# Setup: capture the output lines of `cat NCI60/names` using IPython's `!` shell
# syntax (this line is not plain Python and only runs inside IPython/Jupyter).
# The captured lines are iterated as dataset names further down.
dataset_names = !cat NCI60/names
In [7]:
import os.path
# Collect one (log-ratio score, dataset name) pair per dataset that has a results file.
performances = []
for dataset in sorted(dataset_names):
    result_fname = '%s_predictive_performance_of_samples.data' % dataset
    if os.path.exists(result_fname):
        performances.append((scores(result_fname), dataset))

# Plot and report the datasets from best to worst score.
# The loop variable is named `performance`, not `score`: binding `score` here
# would overwrite the module-level function `score()` that `scores()` calls,
# so re-running this cell would fail with "object is not callable".
for i, (performance, dataset) in enumerate(sorted(performances, reverse=True)):
    result_fname = '%s_predictive_performance_of_samples.data' % dataset
    if os.path.exists(result_fname):
        percentages_l, original_repetitions_l, original_sample_repetitions_l, sample_repetitions_l = load_results(result_fname)
        plot(dataset, percentages_l, original_sample_repetitions_l, original_repetitions_l, sample_repetitions_l)
        print('%2d %15s %+.4f' % (i, dataset, performance))
.