In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
from eden.util import configure_logging
import logging

In [2]:
from itertools import tee, chain, islice
import numpy as np
import random
from time import time
import datetime
from graphlearn.graphlearn import GraphLearnSampler
from eden.util import fit,predict
from eden.graph import Vectorizer
# get data
from eden.converter.graph.gspan import gspan_to_eden
from itertools import islice
def get_graphs(dataset_fname, size=100):
    """Return an iterator over at most `size` graphs parsed from a gSpan file."""
    graph_stream = gspan_to_eden(dataset_fname)
    return islice(graph_stream, size)

In [3]:
def fit_sample(graphs, random_state=42):
    """Fit a graph grammar on `graphs`, log its size, and generate new graphs.

    The input iterator is consumed twice (via tee): once to fit the sampler,
    once as seeds for sampling. Returns the iterator produced by
    `sampler.sample` in generator mode.
    """
    grammar_graphs, seed_graphs = tee(graphs)

    sampler = GraphLearnSampler(radius_list=[0, 1],
                                thickness_list=[2, 3],
                                min_cip_count=2,
                                min_interface_count=2,
                                vectorizer=Vectorizer(5),
                                random_state=random_state)
    sampler.fit(grammar_graphs, nu=0.25, n_jobs=-1)

    # Report how large the induced grammar is.
    logger.info('graph grammar stats:')
    n_instances, n_interfaces, n_cores, n_cips = sampler.grammar().size()
    logger.info('#instances:%d   #interfaces: %d   #cores: %d   #core-interface-pairs: %d'
                % (n_instances, n_interfaces, n_cores, n_cips))

    sampled_graphs = sampler.sample(seed_graphs,
                                    n_steps=5, n_samples=4,
                                    target_orig_cip=False,
                                    probabilistic_core_choice=False,
                                    score_core_choice=False,
                                    max_core_size_diff=3,
                                    burnin=1,
                                    omit_seed=True,
                                    max_cycle_size=6,
                                    improving_threshold=0.25,
                                    accept_static_penalty=0,
                                    n_jobs=-1,
                                    select_cip_max_tries=200,
                                    keep_duplicates=True,
                                    generator_mode=True)
    return sampled_graphs

In [4]:
def fit_and_evaluate(original, sampled, local_estimator):
    """Score the original and the sampled graph sets with a fitted estimator.

    Returns a two-element list [score_original, score_sampled], where each
    score is the mean positive-class probability over the set, or 0 for an
    empty set.
    """
    scores = []
    for label, graph_iter in (('original', original), ('sample', sampled)):
        # Count the set without losing the iterator.
        graph_iter, counter = tee(graph_iter)
        n_graphs = sum(1 for _ in counter)

        logger.info("-" * 80)
        logger.info('working on %s' % (label))
        logger.info('training set sizes: #: %d' % (n_graphs))

        if n_graphs == 0:
            logger.info('WARNING: empty dataset')
            scores.append(0)
            continue

        t0 = time()
        proba = predict(graph_iter,
                        estimator=local_estimator,
                        vectorizer=Vectorizer(4),
                        mode='predict_proba',
                        n_jobs=-1)
        mean_score = np.mean(proba[:, 1])
        logger.info('avg score: %.5f' % mean_score)
        scores.append(mean_score)
        logger.info('elapsed: %.1f sec' % (time() - t0))
    return scores

In [5]:
def evaluate(data_fname, size=None, percentages=None, n_repetitions=None, train_test_split=None):
    """Run the sampling experiment on one dataset.

    For each fraction in `percentages`, repeat `n_repetitions` times: fit a
    local estimator, sample new graphs from a training subset of that size,
    and score both the original subset and the sampled graphs.

    Returns (original_repetitions, sample_repetitions): one list per
    percentage, each holding `n_repetitions` scores.
    """
    # Imports hoisted out of the per-repetition loop: they are loop-invariant
    # and were previously re-executed on every iteration.
    from eden.util import random_bipartition_iter
    from sklearn.linear_model import SGDClassifier

    # initializing
    graphs = get_graphs(data_fname, size=size)

    # train/test split
    train_global, test_global = random_bipartition_iter(graphs, train_test_split)

    original_repetitions = []
    sample_repetitions = []

    for percentage in percentages:
        originals = []
        samples = []
        for repetition in range(n_repetitions):
            # Distinct, reproducible seed per (percentage, repetition) pair.
            random_state = int(313379 * percentage + repetition)
            random.seed(random_state)

            # Duplicate the split iterators so each repetition can consume them.
            train_global, train_global_ = tee(train_global)
            test_global, test_global_ = tee(test_global)

            estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True, loss='log', n_jobs=-1)
            # NOTE(review): the estimator is fit on the held-out split
            # (test_global_) with no negative examples — confirm this is the
            # intended one-class setup.
            local_estimator = fit(test_global_,
                                  iterable_neg=None,
                                  vectorizer=Vectorizer(4),
                                  estimator=estimator, n_jobs=-1, n_iter_search=1)

            # use shuffled list to create test and sample set
            train, train_reminder = random_bipartition_iter(train_global_, percentage)
            train, train_ = tee(train)
            sampled = fit_sample(train_, random_state=random_state)

            # evaluate the predictive performance on held out test set
            start = time()
            logger.info("=" * 80)
            logger.info('repetition: %d/%d' % (repetition + 1, n_repetitions))
            logger.info("training percentage:" + str(percentage))
            perf_orig, perf_samp = fit_and_evaluate(train, sampled, local_estimator)
            logger.info('Time elapsed: %.1f sec' % ((time() - start)))
            originals.append(perf_orig)
            samples.append(perf_samp)

        original_repetitions.append(originals)
        sample_repetitions.append(samples)

    return original_repetitions, sample_repetitions

In [6]:
def plot(dataset, percentages, original_repetitions, sample_repetitions):
    """Box-plot score distributions per training fraction — original (red)
    vs. sampled (blue) — overlay the per-fraction means, and save as PDF."""
    red = {'color': 'r'}
    blue = {'color': 'b'}
    box_width = 0.02
    mean_orig = np.mean(original_repetitions, axis=1)
    mean_samp = np.mean(sample_repetitions, axis=1)

    plt.figure(figsize=(18, 8))
    plt.grid()

    # Draw both series with the same box/line styling, one color each.
    for data, props, means, col, label in (
            (original_repetitions, red, mean_orig, 'r', 'original'),
            (sample_repetitions, blue, mean_samp, 'b', 'sample')):
        plt.boxplot(data, positions=percentages, widths=box_width,
                    capprops=props, medianprops=props, boxprops=props,
                    whiskerprops=props, flierprops=props)
        plt.plot(percentages, means, color=col, marker='o', markeredgewidth=1,
                 markersize=7, markeredgecolor=col, markerfacecolor='w', label=label)

    plt.xlim(percentages[0] - .05, percentages[-1] + .05)
    plt.title(dataset + '\n', fontsize=17)
    plt.legend(loc='upper right', fontsize=16)
    plt.ylabel('Likelihood', fontsize=16)
    plt.xlabel('Dataset size (fraction)', fontsize=16)
    plt.savefig('%s_plot_probability_of_samples.pdf' % dataset)

In [7]:
def save_results(result_fname, percentages, original_repetitions, sample_repetitions):
    """Persist percentages and both score tables to a small text file.

    Format: a header line, the space-separated percentages, a second header,
    then for each table its row count followed by one space-separated row of
    scores per repetition.
    """
    with open(result_fname, 'w') as f:
        f.write('dataset sizes list:\n')
        f.write(''.join('%s ' % p for p in percentages))
        f.write('\n')
        f.write('AUC scores:\n')
        for table in (original_repetitions, sample_repetitions):
            f.write('%s\n' % len(table))
            for row in table:
                f.write(''.join('%s ' % score for score in row))
                f.write('\n')
    
def _read_table(f):
    """Read one score table from the open file: a line holding the row
    count, then that many lines of space-separated floats."""
    n_rows = int(next(f))
    return [[float(x) for x in next(f).split()] for _ in range(n_rows)]


def load_results(result_fname):
    """Inverse of save_results: parse percentages and both score tables.

    Returns (percentages, original_repetitions, sample_repetitions).
    Raises StopIteration/ValueError on a truncated or malformed file.
    """
    with open(result_fname) as f:
        next(f)  # skip 'dataset sizes list:' header
        percentages = [float(x) for x in next(f).split()]
        next(f)  # skip 'AUC scores:' header

        # The two tables share an identical layout; read them with one helper
        # instead of the previous copy-pasted loops.
        original_repetitions = _read_table(f)
        sample_repetitions = _read_table(f)

    return percentages, original_repetitions, sample_repetitions

In [ ]:
#setup
dataset_names = !cat NCI60/names
random.shuffle(dataset_names)

In [ ]:
%%time
# Main driver: run the full experiment for every dataset listed in NCI60/names.
for dataset in dataset_names:
    #logging
    # Rebinds the module-global `logger` that fit_sample / fit_and_evaluate /
    # evaluate use; logs go to a per-dataset file when logger_fname is set.
    logger = logging.getLogger()
    if True:  # toggle to False to log to console only (dead `else` branch kept as the switch)
        logger_fname = '%s_probability_of_samples.log'%dataset
    else:
        logger_fname = None
    configure_logging(logger,verbosity=1, filename=logger_fname)
    
    #main 
    start=time()
    print('*'*80)
    print( 'Working with dataset: %s' % dataset )
    logger.info( 'Working with dataset: %s' % dataset )
    # Positive examples only; file layout: NCI60/<dataset>_orig_pos.gspan
    dataset_fname = 'NCI60/' + dataset + '_orig_pos.gspan'
    #pos_dataset_fname = 'bursi.pos.gspan'
    
    # Training-set fractions to evaluate.
    percentages=[.2,.4,.6,.8,.95]

    original_repetitions,\
    sample_repetitions = evaluate(dataset_fname,
                                  size=400,
                                  percentages=percentages,
                                  n_repetitions=3,
                                  train_test_split=0.7)
    plot(dataset, percentages, original_repetitions, sample_repetitions)
    
    #save and display results
    # Round-trip through disk (save then load) and plot again — presumably a
    # sanity check of the save/load format; verify this duplication is intended.
    result_fname='%s_probability_of_samples.data'%dataset
    save_results(result_fname,percentages, original_repetitions,sample_repetitions)    
    percentages_l, original_repetitions_l,sample_repetitions_l = load_results(result_fname)
    plot(dataset, percentages_l, original_repetitions_l, sample_repetitions_l)
    
    print('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))


********************************************************************************
Working with dataset: HCT_15_t
Working with dataset: HCT_15_t
graph grammar stats:
#instances:56   #interfaces: 96   #cores: 32   #core-interface-pairs: 235
================================================================================
repetition: 1/3
training percentage:0.2
--------------------------------------------------------------------------------
working on original
training set sizes: #: 56
avg score: 0.99622
elapsed: 0.7 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 112
avg score: 0.99699
elapsed: 1.3 sec
Time elapsed: 8.4 sec
graph grammar stats:
#instances:56   #interfaces: 96   #cores: 32   #core-interface-pairs: 235
================================================================================
repetition: 2/3
training percentage:0.2
--------------------------------------------------------------------------------
working on original
training set sizes: #: 56
avg score: 0.99586
elapsed: 0.6 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 110
avg score: 0.99692
elapsed: 1.3 sec
Time elapsed: 8.0 sec
graph grammar stats:
#instances:56   #interfaces: 96   #cores: 32   #core-interface-pairs: 235
================================================================================
repetition: 3/3
training percentage:0.2
--------------------------------------------------------------------------------
working on original
training set sizes: #: 56
avg score: 0.99576
elapsed: 0.6 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 112
avg score: 0.99688
elapsed: 1.3 sec
Time elapsed: 8.2 sec
graph grammar stats:
#instances:112   #interfaces: 157   #cores: 43   #core-interface-pairs: 400
================================================================================
repetition: 1/3
training percentage:0.4
--------------------------------------------------------------------------------
working on original
training set sizes: #: 112
avg score: 0.98841
elapsed: 1.1 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 212
avg score: 0.99597
elapsed: 2.0 sec
Time elapsed: 14.7 sec
graph grammar stats:
#instances:112   #interfaces: 157   #cores: 43   #core-interface-pairs: 400
================================================================================
repetition: 2/3
training percentage:0.4
--------------------------------------------------------------------------------
working on original
training set sizes: #: 112
avg score: 0.98852
elapsed: 1.0 sec

In [ ]:
#display
# Reload the results saved by the main loop above and regenerate the plots.
for dataset in dataset_names:
    # BUGFIX: was '%s_predictive_performance_of_samples.data', but the main
    # loop saves under '%s_probability_of_samples.data' — the old name made
    # load_results raise IOError (file not found).
    result_fname = '%s_probability_of_samples.data' % dataset
    percentages_l, original_repetitions_l, sample_repetitions_l = load_results(result_fname)
    plot(dataset, percentages_l, original_repetitions_l, sample_repetitions_l)

.