In [223]:
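# Setup: imports and helper functions for loading the pooled result CSVs, aggregating the
# per-vertex score lists, and balancing the number of random networks kept per biological network.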
%matplotlib inline
import os, sys
import seaborn
import pandas as pd
import numpy as np
import stochastic
from scipy import stats
from matplotlib import pyplot as plt
import graphs
plt.rcParams['figure.figsize'] = [10, 6]
import matplotlib.style as style
style.use('ggplot')
def load_data(res_dir):
    data = pd.DataFrame()
    for filename in os.listdir(res_dir):
        if ".csv" in filename:
            data = pd.concat([data, pd.read_csv(os.path.join(res_dir, filename))])
    # exclude three acyclic graphs that might be included by accident
    data = data[~(data.graph_name.str.contains("VEGF") |
                  data.graph_name.str.contains("Toll Pathway") |
                  data.graph_name.str.contains("Processing of Spz"))]
    return data
def extract_mean_scores_data(list_data):
    res_data = list_data.copy()
    score_cols = [col for col in res_data.columns if "scores" in col]
    # with axis=1 the outer apply iterates over rows, so name the argument accordingly
    res_data[score_cols] = res_data[score_cols].apply(lambda row: row.apply(
        lambda val_list: np.nanmean(eval(str(val_list).replace("nan", "np.nan")))), axis=1)
    return res_data
def extract_vertex_scores_data(list_data):
    res_data = []
    score_cols = [col for col in list_data.columns if "scores" in col]
    non_score_cols = [col for col in list_data if col not in score_cols]
    for index, row in list_data.iterrows():
        val_lists = [eval(str(val_list).replace("nan", "np.nan")) for val_list in row[score_cols]]
        for vals in zip(*val_lists):
            if not np.isnan(vals[0]):
                res_data.append(list(row[non_score_cols]) + list(vals))
    res_data = pd.DataFrame(res_data)
    res_data.columns = non_score_cols + score_cols
    return res_data
def treat_unequal_representation(data, min_threshold=0.8):
    # TODO: rewrite this messy code
    res_data = pd.DataFrame()
    n_rnds = []
    print "#original graphs={}".format(len(data.graph_name.unique()))
    for graph_name in data.graph_name.unique():
        n_bio = len(data.loc[(data.graph_name == graph_name) & (data.is_random == False)])
        n_rnd = len(data.loc[(data.graph_name == graph_name) & (data.is_random == True)])
        if n_bio > 1:
            print "Warning - too many bio for graph {}: #bio={}, #rnd={}".format(graph_name, n_bio, n_rnd)
        if n_bio == 1:
            res_data = pd.concat([res_data, data[data.graph_name == graph_name]])
            n_rnds.append(n_rnd)
    print "#original graphs with bio={}".format(len(res_data.graph_name.unique()))
    final_res_data = pd.DataFrame()
    actual_threshold = min([n for n in n_rnds if n >= (max(n_rnds) * min_threshold)])
    for graph_name in res_data.graph_name.unique():
        n_rnd = len(res_data.loc[(res_data.graph_name == graph_name) & (res_data.is_random == True)])
        if n_rnd >= actual_threshold:
            final_res_data = pd.concat([final_res_data, res_data.loc[(res_data.is_random == False) &
                                                                     (res_data.graph_name == graph_name)]])
            final_res_data = pd.concat([final_res_data, res_data.loc[(res_data.is_random == True) &
                                                                     (res_data.graph_name == graph_name)].sample(actual_threshold)])
    return final_res_data
def convert_model_addition_scores(data, K=30):
    # there are two ways to normalize this score, so allow converting here from 1/K to 1/ATT
    if "optimization_model_addition_impact_scores" in data.columns:
        data['optimization_model_addition_impact_scores'] = \
            data['optimization_model_addition_impact_scores'] * K / data['num_attractors']
    return data
In [225]:
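# Load the pooled results, balance the random-network counts per graph, and build two views
# of the scores: per-network means (mean_data) and per-vertex values (vertex_data).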
data = treat_unequal_representation(load_data("results/29_10_pooled"), min_threshold=0.6)
mean_data = extract_mean_scores_data(data)
vertex_data = extract_vertex_scores_data(data)
mean_data = convert_model_addition_scores(mean_data)
vertex_data = convert_model_addition_scores(vertex_data)
score_types = [col for col in data if "scores" in col]
print "number of graphs: {}".format(len(data.graph_name.unique()))
print "#bio={}, #rnd={}".format(len(data.loc[data.is_random == False]), len(data.loc[data.is_random == True]))
In [226]:
# state_scores = [s for s in score_types if "state" in s]
# fig, ax = plt.subplots(nrows=len(state_scores)/2, ncols=2, sharey=False, sharex=False, figsize=(15, 5))
# i = 0
# for score in state_scores:
# bins = np.linspace(min(mean_data[score]), max(mean_data[score]), 25)
# cur_ax = ax[i % 2]
# for is_random in [False, True]:
# bar_data = mean_data.loc[mean_data.is_random == is_random][score].values
# weights = np.ones_like(bar_data) / float(len(bar_data))
# cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
# cur_ax.legend(["biological", "random"])
# cur_ax.set_title(score.replace("_", " "))
# i += 1
# plt.savefig("state_impacts_histograms.png")
# plt.show()
In [228]:
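# Histograms of the mean model-impact scores (optimization and stochastic variants),
# comparing real (biological) networks with their randomized counterparts.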
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=False, sharex=False, figsize=(15, 5))
i = 0
for score in score_types:
    print score
    if "addition" in score or "model" not in score:
        continue
    bins = np.linspace(min(mean_data[score]), max(mean_data[score]), 25)
    cur_ax = ax[i % 2]
    for is_random in [False, True]:
        bar_data = mean_data.loc[mean_data.is_random == is_random][score].values
        weights = np.ones_like(bar_data) / float(len(bar_data))
        cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
    cur_ax.legend(["real", "random"])
    # cur_ax.set_title(score.replace("_", " "))
    cur_ax.set_title(['a', 'b'][i])
    i += 1
plt.savefig("vertex_score_histograms.png")
plt.show()
In [229]:
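# For each impact variant, overlay histograms of the optimization-based and stochastic-based
# mean scores, plotted separately for biological and random networks.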
fig, ax = plt.subplots(nrows=len(score_types)/2, ncols=2, sharey=False, sharex=False, figsize=(15, 15))
i = 0
score_pairs = [(s, s.replace("optimization", "stochastic")) for s in score_types if "optimization" in s]
for score_pair in score_pairs:
    for is_random in [False, True]:
        bins = np.linspace(min(min(mean_data[score_pair[0]]), min(mean_data[score_pair[1]])),
                           max(max(mean_data[score_pair[0]]), max(mean_data[score_pair[1]])), 25)
        cur_ax = ax[i / 2, i % 2]
        for score in score_pair:
            bar_data = mean_data.loc[mean_data.is_random == is_random][score].values
            weights = np.ones_like(bar_data) / float(len(bar_data))
            cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
        cur_ax.legend(["optimization", "stochastic"])
        cur_ax.set_title("{}, {}".format(score_pair[0].replace("optimization", " ").replace("_", " "),
                                         "random" if is_random else "biological"))
        i += 1
plt.savefig("mean_score_histograms_method.png")
plt.show()
In [230]:
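# Scatter plots of optimization vs. stochastic mean scores per network (random in blue,
# biological in red), printing the correlation over the random networks.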
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=False, sharex=False, figsize=(17, 5))
i = 0
score_pairs = [(s, s.replace("optimization", "stochastic")) for s in score_types if "optimization" in s]
for score_pair in score_pairs:
    cur_ax = ax[i]
    mean_data.loc[mean_data.is_random == True].plot.scatter(x=score_pair[0], y=score_pair[1], c="blue", ax=cur_ax)
    mean_data.loc[mean_data.is_random == False].plot.scatter(x=score_pair[0], y=score_pair[1], c="red", ax=cur_ax)
    cur_ax.set_title(score_pair[0].replace("optimization", " ").replace("_", " "))
    cur_ax.set_xlabel("optimization")
    cur_ax.set_ylabel("stochastic")
    i += 1
    print score_pair, np.corrcoef(mean_data.loc[mean_data.is_random == True][score_pair[0]],
                                  mean_data.loc[mean_data.is_random == True][score_pair[1]])[1, 0]
plt.savefig("mean_scatter_score_types.png")
plt.show()
In [231]:
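# Per-network means: compare biological vs. random with a Mann-Whitney U test for each
# impact variant and export the resulting table.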
mean_pvalue_data = []
for score in score_types:
    bio = mean_data.loc[mean_data.is_random == False][score]
    rnd = mean_data.loc[mean_data.is_random == True][score]
    row = (score.replace("_", " ").replace(" impact scores", ""),
           np.nanmean(bio), np.nanmean(rnd), stats.mannwhitneyu(bio, rnd)[1])
    mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "biological mean", "random mean", "p-value (MWW test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
mean_pvalue_data
Out[231]:
In [232]:
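# Report which numeric graph properties have correlation >= 0.3 with each mean impact score,
# for biological and random networks separately.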
bio = mean_data.loc[mean_data.is_random == False]
rnd = mean_data.loc[mean_data.is_random == True]
threshold = 0.3
for score in score_types:
    print score
    for field in [col for col in data.columns if ("scores" not in col) and ("time" not in col)
                  and data[col].dtype in [np.int, np.float, np.int64]]:
        if np.corrcoef(bio[score], bio[field])[1, 0] >= threshold or np.corrcoef(rnd[score], rnd[field])[1, 0] >= threshold:
            print "{}\t biological corr={:.2f}\t random corr={:.2f}".format(
                field, np.corrcoef(bio[score], bio[field])[1, 0], np.corrcoef(rnd[score], rnd[field])[1, 0])
    print ""
In [233]:
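# Per-graph distributions: boxplots of the random networks' mean scores for each graph,
# with the single biological value overlaid as a red point.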
fig, ax = plt.subplots(nrows=len(score_types)/2, ncols=2, sharey=False, sharex=True, figsize=(15, 15))
i = 0
for score in score_types:
    bins = np.linspace(min(mean_data[score]), max(mean_data[score]), 25)
    cur_ax = ax[i / 2, i % 2]
    g = seaborn.boxplot(ax=cur_ax, data=mean_data.loc[mean_data.is_random == True],
                        x="graph_name", y=score, hue="graph_name", dodge=False)
    bio = mean_data.loc[mean_data.is_random == False]
    seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
                      size=5, color="red", marker="o", linewidth=1)
    cur_ax.xaxis.set_tick_params(rotation=90)
    cur_ax.set_ylabel("")
    cur_ax.set_xlabel("")
    cur_ax.legend_.remove()
    cur_ax.set_title(score.replace("_", " ").replace(" scores", ""))
    i += 1
plt.savefig("mean_graph_scores.png")
plt.show()
In [234]:
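# Vertex-level histograms of each impact variant, biological vs. random.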
fig, ax = plt.subplots(nrows=len(score_types)/2, ncols=2, sharey=False, sharex=False, figsize=(15, 5))
i = 0
for score in score_types:
    bins = np.linspace(min(vertex_data[score]), max(vertex_data[score]), 25)
    cur_ax = ax[i / 2, i % 2]
    # cur_ax = ax[i % 2]
    for is_random in [False, True]:
        bar_data = vertex_data.loc[vertex_data.is_random == is_random][score].values
        weights = np.ones_like(bar_data) / float(len(bar_data))
        cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
    cur_ax.legend(["biological", "random"])
    cur_ax.set_title(score.replace("_", " "))
    i += 1
plt.savefig("vertex_score_histograms_israndom.png")
plt.show()
In [235]:
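# Vertex-level histograms comparing the optimization and stochastic variants of each score,
# plotted separately for biological and random networks.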
fig, ax = plt.subplots(nrows=len(score_types)/2, ncols=2, sharey=False, sharex=False, figsize=(15, 5))
i = 0
score_pairs = [(s, s.replace("optimization", "stochastic")) for s in score_types if "optimization" in s]
for score_pair in score_pairs:
    for is_random in [False, True]:
        bins = np.linspace(min(min(vertex_data[score_pair[0]]), min(vertex_data[score_pair[1]])),
                           max(max(vertex_data[score_pair[0]]), max(vertex_data[score_pair[1]])), 25)
        # index into the 2D axes grid (ax[i % 2] would select a whole row of axes)
        cur_ax = ax[i / 2, i % 2]
        for score in score_pair:
            bar_data = vertex_data.loc[vertex_data.is_random == is_random][score].values
            weights = np.ones_like(bar_data) / float(len(bar_data))
            cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
        cur_ax.legend(["optimization", "stochastic"])
        cur_ax.set_title("{}, {}".format(score_pair[0].replace("optimization", " ").replace("_", " "),
                                         "random" if is_random else "biological"))
        i += 1
plt.savefig("vertex_score_histograms_method.png")
plt.show()
In [ ]:
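# Vertex-level scatter of optimization vs. stochastic scores, with correlations printed
# separately for biological and random vertices.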
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=False, sharex=False, figsize=(17, 5))
i = 0
score_pairs = [(s, s.replace("optimization", "stochastic")) for s in score_types if "optimization" in s]
for score_pair in score_pairs:
    cur_ax = ax[i]
    vertex_data.loc[vertex_data.is_random == True].plot.scatter(x=score_pair[0], y=score_pair[1], c="blue", ax=cur_ax)
    vertex_data.loc[vertex_data.is_random == False].plot.scatter(x=score_pair[0], y=score_pair[1], c="red", ax=cur_ax)
    cur_ax.set_title(score_pair[0].replace("optimization", " ").replace("_", " "))
    cur_ax.set_xlabel("optimization")
    cur_ax.set_ylabel("stochastic")
    i += 1
    print score_pair
    print "biological - ", np.corrcoef(vertex_data.loc[vertex_data.is_random == False][score_pair[0]],
                                       vertex_data.loc[vertex_data.is_random == False][score_pair[1]])[1, 0]
    print "random - ", np.corrcoef(vertex_data.loc[vertex_data.is_random == True][score_pair[0]],
                                   vertex_data.loc[vertex_data.is_random == True][score_pair[1]])[1, 0]
plt.savefig("vertex_scatter_score_types.png")
plt.show()
In [ ]:
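# Vertex-level Mann-Whitney U tests of biological vs. random scores per impact variant.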
vertex_pvalue_data = []
for score in score_types:
    bio = vertex_data.loc[vertex_data.is_random == False][score]
    rnd = vertex_data.loc[vertex_data.is_random == True][score]
    row = (score.replace("_", " ").replace(" impact scores", ""),
           np.nanmean(bio), np.nanmean(rnd), stats.mannwhitneyu(bio, rnd)[1])
    vertex_pvalue_data.append(row)
vertex_pvalue_data = pd.DataFrame(vertex_pvalue_data)
vertex_pvalue_data.columns = ["impact variant", "biological mean", "random mean", "p-value (MWW test)"]
vertex_pvalue_data = vertex_pvalue_data.set_index(vertex_pvalue_data.columns[0])
vertex_pvalue_data.to_csv("vertex_pvalues.csv", float_format="%.3g")
vertex_pvalue_data
In [ ]:
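# Paired test: for each graph, pair the biological network's mean vertex score with the mean
# over its random counterparts, then apply a Wilcoxon signed-rank test across graphs.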
mean_pvalue_data = []
for score in score_types:
    bio = vertex_data.loc[vertex_data.is_random == False]
    rnd = vertex_data.loc[vertex_data.is_random == True]
    graph_names = vertex_data['graph_name'].unique()
    mean_pairs = []
    pairs = []
    # ranks = []
    # samples_per_graph = None
    for graph_name in graph_names:
        graph_bio = bio.loc[bio.graph_name == graph_name][score]
        graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
        # if samples_per_graph is None:
        #     samples_per_graph = len(graph_rnd) + 1
        mean_pairs.append((graph_bio.mean(), graph_rnd.mean()))
    # pair_pvalue = stats.wilcoxon(zip(*pairs)[0], zip(*pairs)[1])[1]
    mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
    # rank_counts = [len([r for r in ranks if r == i]) for i in range(samples_per_graph)]
    # print rank_counts
    # rank_pvalue = stats.chisquare(f_obs=rank_counts)[1]
    row = score.replace("_", " ").replace(" impact score", ""), mean_pair_pvalue  # , rank_pvalue
    mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "p-value (Wilcoxon mean test)"]
# "p-value (Wilcoxon test)"]
# "p-value (rank chisquared)"]
# "significant ratio (sorting test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
# mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
mean_pvalue_data
In [ ]:
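# Same correlation check as above, but against the vertex-level scores.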
bio = vertex_data.loc[vertex_data.is_random == False]
rnd = vertex_data.loc[vertex_data.is_random == True]
threshold = 0.3
for score in score_types:
    print score
    for field in [col for col in data.columns if ("scores" not in col) and ("time" not in col)
                  and data[col].dtype in [np.int, np.float, np.int64]]:
        if np.corrcoef(bio[score], bio[field])[1, 0] >= threshold or np.corrcoef(rnd[score], rnd[field])[1, 0] >= threshold:
            print "{}\t biological corr={:.2f}\t random corr={:.2f}".format(
                field, np.corrcoef(bio[score], bio[field])[1, 0], np.corrcoef(rnd[score], rnd[field])[1, 0])
    print ""
In [ ]:
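# Per-graph boxplots of the random networks' vertex-level scores.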
fig, ax = plt.subplots(nrows=len(score_types)/2, ncols=2, sharey=False, sharex=True, figsize=(15, 15))
i = 0
for score in score_types:
    bins = np.linspace(min(vertex_data[score]), max(vertex_data[score]), 25)
    cur_ax = ax[i / 2, i % 2]
    g = seaborn.boxplot(ax=cur_ax, data=vertex_data.loc[vertex_data.is_random == True],
                        x="graph_name", y=score, hue="graph_name", dodge=False)
    cur_ax.xaxis.set_tick_params(rotation=90)
    cur_ax.legend_.remove()
    cur_ax.set_ylabel("")
    cur_ax.set_xlabel("")
    cur_ax.set_title(score.replace("_", " ").replace(" scores", ""))
    i += 1
plt.savefig("vertex_graph_scores.png")
plt.show()
In [ ]:
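# Per-graph Mann-Whitney U tests on vertex-level scores; print the median p-value per impact variant.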
for score in score_types:
    vertex_pvalue_data = []
    for graph in vertex_data.graph_name.unique():
        bio = vertex_data.loc[(vertex_data.graph_name == graph) & (vertex_data.is_random == False)][score]
        rnd = vertex_data.loc[(vertex_data.graph_name == graph) & (vertex_data.is_random == True)][score]
        row = (score.replace("_", " ").replace(" impact scores", ""),
               np.nanmean(bio), np.nanmean(rnd), stats.mannwhitneyu(bio, rnd)[1])
        vertex_pvalue_data.append(row)
    vertex_pvalue_data = pd.DataFrame(vertex_pvalue_data)
    vertex_pvalue_data.columns = ["impact variant", "biological mean", "random mean", "p-value (MWW test)"]
    vertex_pvalue_data = vertex_pvalue_data.set_index(vertex_pvalue_data.columns[0])
    print score
    print vertex_pvalue_data[vertex_pvalue_data.columns[-1]].median()
    print vertex_pvalue_data
    # vertex_pvalue_data.to_csv("vertex_pvalues.csv", float_format="%.3g")
    # vertex_pvalue_data
In [237]:
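# Running-time summary (median / mean / max in seconds) for each impact variant; the
# commented-out code below explores the timing distributions in more detail.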
timing_types = [score.replace("impact_scores", "time") for score in score_types]
timing_data = []
for timing_type in timing_types:
    print timing_type
    row = [np.nanmedian(data[timing_type]), np.nanmean(data[timing_type]),
           np.nanmax(data[timing_type])]
    timing_data.append([timing_type.replace("_", " ").replace(" time", "")] + row)
timing_data = pd.DataFrame(timing_data)
timing_data = timing_data.set_index(timing_data.columns[0])
timing_data.columns = ["median (seconds)", "mean (seconds)", "max (seconds)"]
timing_data.index.name = "impact variant"
timing_data.to_csv("timings.csv", float_format="%.0f")
timing_data
# bins = np.linspace(0, 4000, 100)
# for timing_type in timing_types:
#     data[timing_type].hist(figsize=(20, 12), bins=bins, alpha=0.3)
# plt.legend(timing_types)
# plt.show()
# for timing_type in timing_types:
#     for graph_name in data.graph_name.unique():
#         data.loc[data.graph_name == graph_name][timing_type].hist(figsize=(20, 12), bins=bins, alpha=0.3)
#     plt.title(timing_type)
#     plt.legend(data.graph_name.unique())
#     plt.show()
# for timing_type in timing_types:
#     for field in [col for col in data.columns if ("scores" not in col) and ("time" not in col) and data[col].dtype in [np.int, np.float, np.int64]]:
#         data.plot.scatter(field, timing_type)
Out[237]:
In [238]:
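# Collect the non-score, non-timing columns as graph properties, split by biological/random,
# and export the biological table.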
bio_properties = data.loc[data.is_random==False][[col for col in data.columns if (col not in score_types) and (
col not in timing_types) and (col not in ["is_random", "maximal_change_bits"])]]
rnd_properties = data.loc[data.is_random==True][[col for col in data.columns if (col not in score_types) and (
col not in timing_types) and (col not in ["is_random", "maximal_change_bits"])]]
bio_properties.to_csv("graph_properties.csv", float_format='%.3g')
bio_properties
Out[238]:
In [ ]:
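# Summary statistics of attractor counts, lengths, and basin sizes for biological vs. random networks.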
print "median num attractors:\t bio={:.2f}, rnd={:.2f}".format(np.median(bio_properties['num_attractors']),
np.median(rnd_properties['num_attractors']))
print "mean num attractors:\t bio={:.2f}, rnd={:.2f}".format(np.mean(bio_properties['num_attractors']),
np.mean(rnd_properties['num_attractors']))
print "average average attractor length:\t bio={:.2f}, rnd={:.2f}".format(np.mean(bio_properties['mean_attractor_length']),
np.mean(rnd_properties['mean_attractor_length']))
print "average median attractor basin:\t bio={:.2f}, rnd={:.2f}".format(np.mean(bio_properties['median_attractor_basin']),
np.mean(rnd_properties['median_attractor_basin']))
print "average max attractor basin:\t bio={:.2f}, rnd={:.2f}".format(np.mean(bio_properties['max_attractor_basin']),
np.mean(rnd_properties['max_attractor_basin']))
In [332]:
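# Parse all cellcollective models from disk and compute simple structural attributes for each
# network that loads successfully (number of inputs, size, in-degrees from predecessor counts).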
biological_graphs = []
biological_graph_names = []
candidate_biological_graph_names = os.listdir("cellcollective_models")
for graph_dir in candidate_biological_graph_names:
    try:
        G = graphs.Network.parse_boolean_tables(os.path.join("cellcollective_models", graph_dir))
        biological_graphs.append(G)
        biological_graph_names.append(graph_dir)
    except ValueError as e:
        if e.message.startswith("Model export from cellcollective failed"):
            print "warning - did not load graph {}".format(graph_dir)
graph_name_to_attributes = dict()
for i, graph, name in zip(range(len(biological_graphs)), biological_graphs, biological_graph_names):
    n_inputs = len([v for v in graph.vertices if len(v.predecessors()) == 0])
    max_degree = max([len(v.predecessors()) for v in graph.vertices])
    size = len(graph.vertices)
    mean_degree = sum([len(v.predecessors()) for v in graph.vertices]) / float(size)
    normalized_n_inputs = n_inputs / float(size)
    graph_name_to_attributes[name] = dict(n_inputs=n_inputs, max_degree=max_degree,
                                          size=size, mean_degree=mean_degree,
                                          normalized_n_inputs=normalized_n_inputs, G=graph)
    print "#{}; {} input nodes for graph {} of size {} and max degree {}".format(i, n_inputs, name,
                                                                                 size, max_degree)
In [ ]:
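# Compare networks that appear in the results ("good") with those that do not ("bad"):
# size, edge count, maximal in-degree, and an estimated number of attractors
# (via stochastic.estimate_attractors).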
good_graphs = [name for name in graph_name_to_attributes.keys() if name in data["graph_name"].unique()]
bad_graphs = [name for name in graph_name_to_attributes.keys() if name not in good_graphs]
print len(good_graphs), len(bad_graphs)
good_sizes = [graph_name_to_attributes[name]['size'] for name in good_graphs]
bad_sizes = [graph_name_to_attributes[name]['size'] for name in bad_graphs]
bins = np.linspace(0, 100, 100)
plt.hist(good_sizes, alpha=0.5, bins=bins)
plt.hist(bad_sizes, alpha=0.5, bins=bins)
plt.show()
good_edges = [graph_name_to_attributes[name]['size'] * graph_name_to_attributes[name]['mean_degree'] for name in good_graphs]
bad_edges = [graph_name_to_attributes[name]['size'] * graph_name_to_attributes[name]['mean_degree'] for name in bad_graphs]
bins = np.linspace(0, 100, 100)
plt.hist(good_edges, alpha=0.5, bins=bins)
plt.hist(bad_edges, alpha=0.5, bins=bins)
plt.show()
good_max_degrees = [graph_name_to_attributes[name]['max_degree'] for name in good_graphs]
bad_max_degrees = [graph_name_to_attributes[name]['max_degree'] for name in bad_graphs]
bins = np.linspace(0, 15, 20)
plt.hist(good_max_degrees, alpha=0.5, bins=bins)
plt.hist(bad_max_degrees, alpha=0.5, bins=bins)
plt.show()
good_attractors = [len(stochastic.estimate_attractors(
G=graph_name_to_attributes[name]['G'], max_walk_len=50, n_walks=200, with_basins=False)) for name in good_graphs]
bad_attractors = [len(stochastic.estimate_attractors(
G=graph_name_to_attributes[name]['G'], max_walk_len=50, n_walks=200, with_basins=False)) for name in bad_graphs if graph_name_to_attributes[name]['size'] <= 10000]
bins = np.linspace(0, 100, 100)
plt.hist(good_attractors, alpha=0.5, bins=bins)
plt.hist(bad_attractors, alpha=0.5, bins=bins)
plt.show()