In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
from eden.util import configure_logging
import logging
In [2]:
from itertools import tee, islice
import numpy as np
import random
from time import time
import datetime
from graphlearn.graphlearn import GraphLearnSampler
from eden.util import fit,predict
from eden.graph import Vectorizer
# data loading
from eden.converter.graph.gspan import gspan_to_eden

def get_graphs(dataset_fname, size=100):
    # lazily yield at most `size` graphs from a gspan file
    return islice(gspan_to_eden(dataset_fname), size)
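As a quick sanity check, the loader can be peeked at without consuming the full stream. A hypothetical smoke test (the file name below is a placeholder; the objects are assumed to be networkx graphs as produced by gspan_to_eden):
In [ ]:
# hypothetical smoke test: peek at the first few graphs;
# the file name is a placeholder for any local gspan file
for i, g in enumerate(get_graphs('NCI60/some_dataset_orig_pos.gspan', size=3)):
    print(i, g.number_of_nodes(), g.number_of_edges())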
In [3]:
def fit_sample(graphs, random_state=42):
    # duplicate the stream: one pass fits the grammar, the other seeds the sampler
    graphs, graphs_ = tee(graphs)
    sampler = GraphLearnSampler(radius_list=[0, 1], thickness_list=[2, 3],
                                min_cip_count=2, min_interface_count=2,
                                vectorizer=Vectorizer(5), random_state=random_state)
    sampler.fit(graphs, nu=0.25, n_jobs=-1)
    logger.info('graph grammar stats:')
    dataset_size, interface_counts, core_counts, cip_counts = sampler.grammar().size()
    logger.info('#instances: %d #interfaces: %d #cores: %d #core-interface-pairs: %d' %
                (dataset_size, interface_counts, core_counts, cip_counts))
    graphs = sampler.sample(graphs_,
                            n_steps=5, n_samples=4,
                            target_orig_cip=False,
                            probabilistic_core_choice=False,
                            score_core_choice=False,
                            max_core_size_diff=3,
                            burnin=1,
                            omit_seed=True,
                            max_cycle_size=6,
                            improving_threshold=0.25,
                            accept_static_penalty=0,
                            n_jobs=-1,
                            select_cip_max_tries=200,
                            keep_duplicates=True,
                            generator_mode=True)
    return graphs
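fit_sample consumes its input twice, which is why the generator is duplicated with tee first. A minimal standalone illustration of that single-pass pitfall:
In [ ]:
# generators are single-pass: tee() yields two independent iterators over the
# same underlying stream, so fit() and sample() can each consume one copy
g = (x * x for x in range(3))
a, b = tee(g)
print(list(a))  # [0, 1, 4]
print(list(b))  # [0, 1, 4] -- second, independent pass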
In [4]:
def fit_and_evaluate(original, sampled, local_estimator):
    outputs = []
    for desc, train in [('original', original),
                        ('sample', sampled)]:
        train, train_ = tee(train)
        size = sum(1 for x in train_)
        logger.info("-" * 80)
        logger.info('working on %s' % desc)
        logger.info('training set size: %d' % size)
        if size == 0:
            logger.info('WARNING: empty dataset')
            outputs.append(0)
        else:
            start = time()
            predictions = predict(train,
                                  estimator=local_estimator,
                                  vectorizer=Vectorizer(4),
                                  mode='predict_proba',
                                  n_jobs=-1)
            # column 1 of the predict_proba output is the positive-class probability
            avg_score = np.mean(predictions[:, 1])
            logger.info('avg score: %.5f' % avg_score)
            outputs.append(avg_score)
            logger.info('elapsed: %.1f sec' % (time() - start))
    return outputs
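The avg score above relies on the predict_proba shape convention: an (n_samples, 2) array whose second column holds the positive-class probability. A tiny self-contained check of the slicing, using a toy stand-in for the predictions:
In [ ]:
# toy stand-in for predict() output in 'predict_proba' mode: column 1 is the
# positive-class probability, so its mean is the reported likelihood score
toy_predictions = np.array([[0.3, 0.7],
                            [0.1, 0.9],
                            [0.6, 0.4]])
print(np.mean(toy_predictions[:, 1]))  # ~0.667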
In [5]:
def evaluate(data_fname, size=None, percentages=None, n_repetitions=None, train_test_split=None):
    from eden.util import random_bipartition_iter
    from sklearn.linear_model import SGDClassifier
    # initializing
    graphs = get_graphs(data_fname, size=size)
    # train/test split
    train_global, test_global = random_bipartition_iter(graphs, train_test_split)
    original_repetitions = []
    sample_repetitions = []
    for percentage in percentages:
        originals = []
        samples = []
        for repetition in range(n_repetitions):
            # distinct, reproducible seed per (percentage, repetition) pair
            random_state = int(313379 * percentage + repetition)
            random.seed(random_state)
            # re-tee the global splits so each repetition gets a fresh pass
            train_global, train_global_ = tee(train_global)
            test_global, test_global_ = tee(test_global)
            estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True,
                                      loss='log', n_jobs=-1)
            # fit the reference estimator on the held-out split
            local_estimator = fit(test_global_,
                                  iterable_neg=None,
                                  vectorizer=Vectorizer(4),
                                  estimator=estimator, n_jobs=-1, n_iter_search=1)
            # take the current fraction of the training pool and sample from it
            train, train_reminder = random_bipartition_iter(train_global_, percentage)
            train, train_ = tee(train)
            sampled = fit_sample(train_, random_state=random_state)
            # evaluate original vs. sampled graphs under the reference estimator
            start = time()
            logger.info("=" * 80)
            logger.info('repetition: %d/%d' % (repetition + 1, n_repetitions))
            logger.info('training percentage: %s' % percentage)
            perf_orig, perf_samp = fit_and_evaluate(train, sampled, local_estimator)
            logger.info('Time elapsed: %.1f sec' % (time() - start))
            originals.append(perf_orig)
            samples.append(perf_samp)
        original_repetitions.append(originals)
        sample_repetitions.append(samples)
    return original_repetitions, sample_repetitions
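The per-run seed mixes the training fraction with the repetition index, so every (percentage, repetition) pair is reproducible yet distinct. A quick enumeration shows the resulting values:
In [ ]:
# reproducible, pairwise-distinct seeds for each (percentage, repetition) pair
for percentage in [.2, .4]:
    for repetition in range(3):
        print(percentage, repetition, int(313379 * percentage + repetition))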
In [6]:
def plot(dataset, percentages, original_repetitions, sample_repetitions):
    rc = {'color': 'r'}
    bc = {'color': 'b'}
    ws = 0.02
    # per-percentage means across repetitions
    o = np.mean(original_repetitions, axis=1)
    s = np.mean(sample_repetitions, axis=1)
    plt.figure(figsize=(18, 8))
    plt.grid()
    plt.boxplot(original_repetitions, positions=percentages, widths=ws, capprops=rc,
                medianprops=rc, boxprops=rc, whiskerprops=rc, flierprops=rc)
    plt.plot(percentages, o, color='r', marker='o', markeredgewidth=1, markersize=7,
             markeredgecolor='r', markerfacecolor='w', label='original')
    plt.boxplot(sample_repetitions, positions=percentages, widths=ws, capprops=bc,
                medianprops=bc, boxprops=bc, whiskerprops=bc, flierprops=bc)
    plt.plot(percentages, s, color='b', marker='o', markeredgewidth=1, markersize=7,
             markeredgecolor='b', markerfacecolor='w', label='sample')
    plt.xlim(percentages[0] - .05, percentages[-1] + .05)
    plt.title(dataset + '\n', fontsize=17)
    plt.legend(loc='upper right', fontsize=16)
    plt.ylabel('Likelihood', fontsize=16)
    plt.xlabel('Dataset size (fraction)', fontsize=16)
    plt.savefig('%s_plot_probability_of_samples.pdf' % dataset)
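plot expects score matrices of shape (len(percentages), n_repetitions). A hypothetical smoke test with synthetic scores exercises it without any dataset:
In [ ]:
# hypothetical smoke test for plot() on synthetic scores (no dataset needed);
# rows are percentages, columns are repetitions
percentages = [.2, .4, .6, .8, .95]
fake_orig = [[random.random() for _ in range(3)] for _ in percentages]
fake_samp = [[random.random() for _ in range(3)] for _ in percentages]
plot('synthetic', percentages, fake_orig, fake_samp)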
In [7]:
def save_results(result_fname, percentages, original_repetitions, sample_repetitions):
    # plain-text format: a commented list of dataset fractions, then, for each
    # of the two result sets, a line count followed by one line per percentage
    with open(result_fname, 'w') as f:
        f.write('dataset sizes list:\n')
        for perc in percentages:
            f.write('%s ' % perc)
        f.write('\n')
        f.write('avg probability scores:\n')
        for repetitions in original_repetitions, sample_repetitions:
            f.write('%s\n' % len(repetitions))
            for repetition in repetitions:
                for score in repetition:
                    f.write('%s ' % score)
                f.write('\n')

def load_results(result_fname):
    with open(result_fname) as f:
        comment = next(f)
        line = next(f)
        percentages = [float(x) for x in line.split()]
        comment = next(f)
        original_repetitions = []
        size = int(next(f))
        for i in range(size):
            line = next(f)
            repetition = [float(x) for x in line.split()]
            original_repetitions.append(repetition)
        sample_repetitions = []
        size = int(next(f))
        for i in range(size):
            line = next(f)
            repetition = [float(x) for x in line.split()]
            sample_repetitions.append(repetition)
        return percentages, original_repetitions, sample_repetitions
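Since the result format is hand-rolled plain text, a small round-trip check guards against drift between save_results and load_results (the values below are arbitrary):
In [ ]:
# round-trip sanity check for the hand-rolled result format
_perc = [.2, .4]
_orig = [[0.5, 0.6], [0.7, 0.8]]
_samp = [[0.4, 0.5], [0.6, 0.7]]
save_results('tmp_roundtrip.data', _perc, _orig, _samp)
_p, _o, _s = load_results('tmp_roundtrip.data')
assert _p == _perc and _o == _orig and _s == _samp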
In [ ]:
# setup: read the dataset names and shuffle the processing order
dataset_names = !cat NCI60/names
random.shuffle(dataset_names)
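The `!cat` shell magic above only works inside IPython; a portable equivalent, assuming the same NCI60/names file with one name per line, would be:
In [ ]:
# portable alternative to the shell magic, for non-IPython execution
with open('NCI60/names') as f:
    dataset_names = [line.strip() for line in f if line.strip()]
random.shuffle(dataset_names)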
In [ ]:
%%time
for dataset in dataset_names:
    # logging: set the toggle to False to log to the console instead of a file
    logger = logging.getLogger()
    if True:
        logger_fname = '%s_probability_of_samples.log' % dataset
    else:
        logger_fname = None
    configure_logging(logger, verbosity=1, filename=logger_fname)
    # main
    start = time()
    print('*' * 80)
    print('Working with dataset: %s' % dataset)
    logger.info('Working with dataset: %s' % dataset)
    dataset_fname = 'NCI60/' + dataset + '_orig_pos.gspan'
    percentages = [.2, .4, .6, .8, .95]
    original_repetitions, \
        sample_repetitions = evaluate(dataset_fname,
                                      size=400,
                                      percentages=percentages,
                                      n_repetitions=3,
                                      train_test_split=0.7)
    plot(dataset, percentages, original_repetitions, sample_repetitions)
    # save and display results
    result_fname = '%s_probability_of_samples.data' % dataset
    save_results(result_fname, percentages, original_repetitions, sample_repetitions)
    percentages_l, original_repetitions_l, sample_repetitions_l = load_results(result_fname)
    plot(dataset, percentages_l, original_repetitions_l, sample_repetitions_l)
    print('Time elapsed: %s' % datetime.timedelta(seconds=(time() - start)))
In [ ]:
# display the saved results for all datasets
for dataset in dataset_names:
    result_fname = '%s_probability_of_samples.data' % dataset
    percentages_l, original_repetitions_l, sample_repetitions_l = load_results(result_fname)
    plot(dataset, percentages_l, original_repetitions_l, sample_repetitions_l)