In [1]:
%matplotlib inline
import matplotlib.style as style
style.use('ggplot')
import os, sys
import seaborn
import pandas as pd
import numpy as np
import stochastic
from scipy import stats
from matplotlib import pyplot as plt
import graphs
plt.rcParams['figure.figsize'] = [10, 6]
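# Load every results CSV under res_dir into a single DataFrame.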
def load_data(res_dir):
data = pd.DataFrame()
for filename in os.listdir(res_dir):
if ".csv" in filename:
data = pd.concat([data, pd.read_csv(os.path.join(res_dir, filename))])
# exclude three acyclic graphs that might be included by accident
data = data[~(data.graph_name.str.contains("VEGF") |
data.graph_name.str.contains("Toll Pathway") | data.graph_name.str.contains("Processing of Spz"))]
return data
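# Balance representation across graphs: keep one biological row per graph (sampling if there are
# several, unless ignore_multiple_bio), drop graphs with too few random samples, and downsample
# the remaining random samples to a common count.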
def treat_unequal_representation(data, min_threshold=0.8, ignore_multiple_bio=False):
# TODO: rewrite this horror of a function
res_data = pd.DataFrame()
n_rnds = []
print "#original graphs={}".format(len(data.graph_name.unique()))
for graph_name in data.graph_name.unique():
n_bio = len(data.loc[(data.graph_name == graph_name) & (data.is_random == False)])
n_rnd = len(data.loc[(data.graph_name == graph_name) & (data.is_random == True)])
if n_bio > 1:
print "Warning - too many bio for graph {}: #bio={}, #rnd={}".format(graph_name, n_bio, n_rnd)
if ignore_multiple_bio:
res_data = pd.concat([res_data, data[data.graph_name == graph_name]])
else:
print "Sampling one bio"
res_data = pd.concat([res_data, data[(data.graph_name == graph_name) & (data.is_random == True)]])
res_data = pd.concat([res_data, data[(data.graph_name == graph_name) & (data.is_random == False)].
sample(n=1)])
elif n_bio == 1:
res_data = pd.concat([res_data, data[data.graph_name == graph_name]])
n_rnds.append(n_rnd)
print "#original graphs with bio={}".format(len(res_data.graph_name.unique()))
final_res_data = pd.DataFrame()
if min_threshold <= 1:
actual_threshold = min([n for n in n_rnds if n >= (max(n_rnds) * min_threshold)])
else:
actual_threshold = min_threshold
for graph_name in res_data.graph_name.unique():
n_rnd = len(res_data.loc[(res_data.graph_name == graph_name) & (res_data.is_random == True)])
if n_rnd >= actual_threshold:
final_res_data = pd.concat([final_res_data, res_data.loc[(res_data.is_random == False) & (
res_data.graph_name == graph_name)]])
final_res_data = pd.concat([final_res_data, res_data.loc[(res_data.is_random == True) & (
res_data.graph_name == graph_name)].sample(actual_threshold)])
return final_res_data
def convert_model_addition_scores(data, K=30):
# because there are two ways to normalize this score, and we want to be able to convert here from 1/K to 1/ATT
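# e.g. with K=30, a score of 0.5 on a graph with 10 attractors becomes 0.5 * 30 / 10 = 1.5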
data['optimization_model_addition_impact_score'] = data['optimization_model_addition_impact_score'] * K / data['num_attractors']
return data
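# Expand rows whose "*_scores" columns hold per-vertex lists into one row per vertex.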
def extract_vertex_scores_data(list_data):
res_data = []
score_cols = [col for col in list_data.columns if "scores" in col]
non_score_cols = [col for col in list_data if col not in score_cols]
for index, row in list_data.iterrows():
val_lists = [eval(str(val_list).
replace("nan", "np.nan")) for val_list in row[score_cols]]
for vals in zip(*val_lists):
if not np.isnan(vals[0]):
res_data.append(list(row[non_score_cols]) + list(vals))
res_data = pd.DataFrame(res_data)
res_data.columns = non_score_cols + score_cols
res_data = res_data.rename(columns={"optimization_model_impact_scores": "optimization_model_impact_score",
"stochastic_model_impact_scores": "stochastic_model_impact_score"})
return res_data
In [2]:
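# Load graph-level and vertex-level results, balance the random samples, and keep only graphs present in both.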
data = treat_unequal_representation(load_data("results/post_submission/1_dot_5_hours"), min_threshold=0.2)
mean_data = data # TODO: refactor nicely
score_types = [col for col in data if "score" in col]
print "number of graphs: {}".format(len(data.graph_name.unique()))
print "#bio={}, #rnd={}".format(len(data.loc[data.is_random == False]), len(data.loc[data.is_random == True]))
random_to_real_ratio = len(data.loc[data.is_random == True]) / len(data.loc[data.is_random == False])
vertex_data = treat_unequal_representation(load_data("results/post_submission/1_dot_5_hours/vertices"),
min_threshold=random_to_real_ratio)
print "number of graphs (vertex): {}".format(len(vertex_data.graph_name.unique()))
print "#bio={}, #rnd={} (vertex)".format(len(vertex_data.loc[vertex_data.is_random == False]), len(vertex_data.loc[vertex_data.is_random == True]))
if (len(vertex_data.loc[vertex_data.is_random == True]) / len(vertex_data.loc[vertex_data.is_random == False])) < random_to_real_ratio:
print "Warning: vertex data has smaller random-to-real ratio, consider taking this as the filtering ratio"
vertex_data = extract_vertex_scores_data(vertex_data)
intersection_graph_names = set(mean_data.graph_name.unique()).intersection(vertex_data.graph_name.unique())
print "number of graphs in mean-vertex intersection: {}".format(len(intersection_graph_names))
mean_data = mean_data[mean_data.graph_name.isin(intersection_graph_names)]
vertex_data = vertex_data[vertex_data.graph_name.isin(intersection_graph_names)]
In [5]:
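# Histograms of each impact score, biological vs. random (vertex-level data for the first two scores).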
fig, ax = plt.subplots(nrows=2, ncols=2, sharey=False, sharex=False, figsize=(15, 10))
# fig.delaxes(ax[1,2]) #The indexing is zero-based here
# fig.delaxes(ax[1,1]) #The indexing is zero-based here
# fig, ax = plt.subplots(nrows=1, ncols=3, sharey=False, sharex=False, figsize=(15, 4))
i = 0
for container, score in zip([vertex_data] * 2 + [mean_data] * 2, score_types[:2] + score_types[2:]):
bins = np.linspace(min(container[score]), max(container[score]), 25)
cur_ax = ax[i / 2, i % 2]
for is_random in [False, True]:
bar_data = container.loc[container.is_random == is_random][score].values
weights = np.ones_like(bar_data) / float(len(bar_data))
cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
cur_ax.legend(["biological", "random"])
cur_ax.set_title(['a', 'b', 'c', 'd', 'e'][i])
# cur_ax.set_title(score.replace("_", " "))
i += 1
plt.savefig("mean_score_histograms_israndom.png", dpi=300)
plt.show()
In [5]:
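# Histograms comparing the optimization and stochastic variants of each graph-level score, for biological and random graphs separately.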
fig, ax = plt.subplots(nrows=len(score_types)/2, ncols=2, sharey=False, sharex=False, figsize=(15, 10))
i = 0
score_pairs = [(s, s.replace("optimization", "stochastic")) for s in score_types if "optimization" in s]
for score_pair in score_pairs:
for is_random in [False, True]:
bins = np.linspace(min(min(mean_data[score_pair[0]]), min(mean_data[score_pair[1]])),
max(max(mean_data[score_pair[0]]), max(mean_data[score_pair[1]])), 25)
cur_ax = ax[i / 2, i % 2]
for score in score_pair:
bar_data = mean_data.loc[mean_data.is_random == is_random][score].values
weights = np.ones_like(bar_data) / float(len(bar_data))
cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
cur_ax.legend(["optimization", "stochastic"])
cur_ax.set_title(['a', 'b', 'c', 'd'][i])
# cur_ax.set_title("{}, {}".format(score_pair[0].replace("optimization", " ").replace("_", " "),
# "random" if is_random else "biological"))
i += 1
plt.savefig("mean_score_histograms_method.png", dpi=300)
plt.show()
In [6]:
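# Scatter of optimization vs. stochastic scores (random in blue, biological in red) with a y=x reference line.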
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=False, sharex=False, figsize=(12, 5))
i = 0
score_pairs = [(s, s.replace("optimization", "stochastic")) for s in score_types if "optimization" in s]
for score_pair in score_pairs:
cur_ax = ax[i]
container = vertex_data if "model" in score_pair[0] else mean_data
container.loc[container.is_random == True].plot.scatter(x=score_pair[0], y=score_pair[1], c="blue", ax=cur_ax)
container.loc[container.is_random == False].plot.scatter(x=score_pair[0], y=score_pair[1], c="red", ax=cur_ax)
type_description = score_pair[0].replace("optimization", " ").replace("_", " ")
correlation = np.corrcoef(container[score_pair[0]], container[score_pair[1]])[0, 1]
print "{}: {:.2f}".format(score_pair, correlation)
# cur_ax.set_title("{} (correlation: {:.2f})".format(type_description, correlation))
cur_ax.set_title(['a', 'b', 'c', 'd'][i])
cur_ax.set_xlabel("optimization")
cur_ax.set_ylabel("scochastic")
lims = [
np.min([cur_ax.get_xlim(), cur_ax.get_ylim()]), # min of both axes
np.max([cur_ax.get_xlim(), cur_ax.get_ylim()]), # max of both axes
]
# now plot both limits against each other
cur_ax.plot(lims, lims, '--', alpha=0.5, zorder=10)
cur_ax.set_aspect('equal')
cur_ax.set_xlim(lims)
cur_ax.set_ylim(lims)
i += 1
plt.savefig("mean_scatter_score_types.png", dpi=300)
plt.show()
In [7]:
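# Mean biological vs. mean random value for each impact-score variant.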
mean_pvalue_data = []
for container, score in zip([vertex_data] * 2 + [mean_data] * 3, score_types[:2] + score_types[1:]):
bio = container.loc[container.is_random == False]
rnd = container.loc[container.is_random == True]
n_passed = 0
graph_names = container['graph_name'].unique()
for graph_name in graph_names:
graph_bio = bio.loc[bio.graph_name == graph_name][score]
graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
# p_value = (np.argsort(list(graph_rnd) + list(graph_bio))[-1] + 1) / float(len(graph_rnd) + 1)
# print p_value
# if p_value < 0.05:
# n_passed += 1
# proportion_passed = n_passed / float(len(graph_names))
# row = score.replace("_", " ").replace(" impact score", ""), np.nanmean(bio[score]), np.nanmean(rnd[score]), stats.mannwhitneyu(bio[score], rnd[score])[1], proportion_passed
row = score.replace("_", " ").replace(" impact score", ""), np.nanmean(bio[score]), np.nanmean(rnd[score])
# stats.mannwhitneyu(bio[score], rnd[score])[1]
mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "real mean", "random mean"]
# "p-value (MWW test)"]
# "significant ratio (sorting test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
# mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
mean_pvalue_data
Out[7]:
In [8]:
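# Per-graph paired tests for the graph-level scores: Wilcoxon on (biological, random) pairs,
# Wilcoxon on (biological, mean of randoms) pairs, and a chi-square test on where the top score falls.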
mean_pvalue_data = []
for score in score_types:
bio = mean_data.loc[mean_data.is_random == False]
rnd = mean_data.loc[mean_data.is_random == True]
graph_names = mean_data['graph_name'].unique()
mean_pairs = []
pairs = []
ranks = []
samples_per_graph = None
for graph_name in graph_names:
graph_bio = bio.loc[bio.graph_name == graph_name][score].values[0]
graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
if samples_per_graph is None:
samples_per_graph = len(graph_rnd) + 1
mean_pairs.append((graph_bio, graph_rnd.mean()))
for rnd_point in graph_rnd:
pairs.append((graph_bio, rnd_point))
ranks.append(np.argsort(list(graph_rnd) + [graph_bio])[-1])
pair_pvalue = stats.wilcoxon(zip(*pairs)[0], zip(*pairs)[1])[1]
mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
rank_counts = [len([r for r in ranks if r == i]) for i in range(samples_per_graph)]
# print rank_counts
rank_pvalue = stats.chisquare(f_obs=rank_counts)[1]
row = score.replace("_", " ").replace(" impact score", ""), pair_pvalue, mean_pair_pvalue, rank_pvalue
mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "p-value (Wilcoxon mean test)", "p-value (Wilcoxon test)", "p-value (rank chisquared)"]
# "significant ratio (sorting test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
# mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
mean_pvalue_data
Out[8]:
In [9]:
# For vertex-based scores
mean_pvalue_data = []
for score in score_types[:2]:
bio = vertex_data.loc[vertex_data.is_random == False]
rnd = vertex_data.loc[vertex_data.is_random == True]
graph_names = vertex_data['graph_name'].unique()
mean_pairs = []
pairs = []
ranks = []
samples_per_graph = None
for graph_name in graph_names:
graph_bio = bio.loc[bio.graph_name == graph_name][score]
graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
mean_pairs.append((graph_bio.mean(), graph_rnd.mean()))
mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
row = score.replace("_", " ").replace(" impact score", ""), mean_pair_pvalue
mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "p-value (Wilcoxon test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
# mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
mean_pvalue_data
Out[9]:
In [49]:
# good cols = [col for col in data.columns if (col not in score_types) and (
# col not in timing_types) and (
# col not in ["is_random", "maximal_change_bits", "median_attractor_length", "max_attractor_length"]) and (
# "std" not in col)]
bio_rows = []
rnd_rows = []
property_columns = [col for col in data.columns if ("score" not in col) and ("time" not in col) and data[col].dtype in [np.int, np.float, np.int64]]
for score, container in zip(score_types[:2] + score_types[1:], [vertex_data]*2 + [mean_data]*3):
print score
bio = container.loc[container.is_random == False]
rnd = container.loc[container.is_random == True]
rnd_rows.append([None] * (len(property_columns) + 1))
bio_rows.append([None] * (len(property_columns) + 1))
for i, field in enumerate(property_columns):
# bio_corr = np.corrcoef(bio[score], bio[field])[1,0]
# rnd_corr = np.corrcoef(rnd[score], rnd[field])[1,0]
bio_corr_pvalue = stats.pearsonr(bio[score], bio[field])
rnd_corr_pvalue = stats.pearsonr(rnd[score], rnd[field])
bio_rows[-1][0] = score
rnd_rows[-1][0] = score
bio_rows[-1][i + 1] = "{:.2f} (p={:.2E})".format(*bio_corr_pvalue)
rnd_rows[-1][i + 1] = "{:.2f} (p={:.2E})".format(*rnd_corr_pvalue)
if field == score:
continue
if (bio_corr_pvalue[1] < 0.05) or (rnd_corr_pvalue[1] < 0.05):
print "{}\t biological corr={:.2f} (p={:.2E})\t random corr={:.2f} (p={:.2E})".format(
field, bio_corr_pvalue[0], bio_corr_pvalue[1], rnd_corr_pvalue[0], rnd_corr_pvalue[1])
print ""
bio_rows_df = pd.DataFrame(bio_rows)
bio_rows_df.columns = ["score"] + property_columns
bio_rows_df.index.name = "score variant"
bio_rows_df = bio_rows_df.set_index("score")
bio_rows_df.to_csv("bio_corr.csv", float_format="%.3g")
rnd_rows_df = pd.DataFrame(rnd_rows)
rnd_rows_df.columns = ["score"] + property_columns
rnd_rows_df.index.name = "score variant"
rnd_rows_df = rnd_rows_df.set_index("score")
rnd_rows_df.to_csv("rnd_corr.csv", float_format="%.3g")
# print rnd_rows_df
In [15]:
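# Per-graph box plots of the random scores with the biological value overlaid as a red swarm point
# (vertex-level scores are averaged per graph first).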
fig, ax = plt.subplots(nrows=2, ncols=2, sharey=False, sharex=True, figsize=(15, 18))
# fig.delaxes(ax[2,1]) #The indexing is zero-based here
i = 0
for score, container in zip(score_types[:2] + score_types[2:], [vertex_data]*2 + [mean_data]*2):
print score, np.nanmean(container[score])
bins = np.linspace(min(container[score]), max(container[score]), 25)
cur_ax = ax[i / 2, i % 2]
g = seaborn.boxplot(ax=cur_ax, data=container.loc[container.is_random == True],
x="graph_name", y=score, hue="graph_name", dodge=False)
bio = container.loc[container.is_random == False]
if i < 2: # vertex based
bio = bio.groupby("graph_name").mean().reset_index()
seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
size=5, color="red", marker="o", linewidth=1)
cur_ax.xaxis.set_tick_params(rotation=90)
cur_ax.set_ylabel("")
cur_ax.set_xlabel("")
cur_ax.legend_.remove()
# cur_ax.set_title(score.replace("_", " ").replace(" score", ""))
cur_ax.set_title(['a', 'b', 'c','d', 'e'][i])
i += 1
ax[1, 1].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)
# ax[1, 1].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)
plt.gcf().subplots_adjust(bottom=0.3)
plt.savefig("mean_graph_score.png", dpi=300)
plt.show()
In [12]:
# smaller version of last plot
fig, ax = plt.subplots(nrows=2, ncols=3, sharey=False, sharex=True, figsize=(15, 12))
fig.delaxes(ax[1,2]) #The indexing is zero-based here
fig.delaxes(ax[1,1]) #The indexing is zero-based here
i = 0
for score, container in zip(score_types[:2] + score_types[2:], [vertex_data]*2 + [mean_data]*2):
bins = np.linspace(min(container[score]), max(container[score]), 25)
cur_ax = ax[i / 3, i % 3]
g = seaborn.boxplot(ax=cur_ax, data=container.loc[container.is_random == True],
x="graph_name", y=score, hue="graph_name", dodge=False)
bio = container.loc[container.is_random == False]
if i < 2: # vertex based
bio = bio.groupby("graph_name").mean().reset_index()
seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
size=5, color="red", marker="o", linewidth=1)
# cur_ax.xaxis.set_tick_params(rotation=90)
cur_ax.axes.get_xaxis().set_visible(False)
cur_ax.set_ylabel("")
cur_ax.set_xlabel("")
cur_ax.legend_.remove()
# cur_ax.set_title(score.replace("_", " ").replace(" score", ""))
cur_ax.set_title(['a', 'b', 'c','d', 'e'][i])
i += 1
handles, labels = ax[i / 3, (i - 1) % 3].get_legend_handles_labels()
labels = [(l if len(l) <= 17 else l[:25] + "...") for l in labels]
leg = fig.legend(handles, labels, bbox_to_anchor=(0.180, 0.295, 0.37, 0.11), ncol=2)
leg.get_texts()[0].set_fontsize(6)
# ax[1, 1].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)
plt.gcf().subplots_adjust(bottom=0.3)
plt.savefig("mean_graph_score_compressed.png", bbox_extra_artists=(leg,), bbox_inches='tight', dpi=300)
plt.show()
In [61]:
# Just one stochastic and one ILP score, no labels, fewer graphs
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=False, sharex=True, figsize=(15, 6))
# fig.delaxes(ax[1,2]) #The indexing is zero-based here
i = 0
for score, container in zip(score_types[2:4], [mean_data]*2):
bins = np.linspace(min(container[score]), max(container[score]), 25)
cur_ax = ax[i % 2]
g = seaborn.boxplot(ax=cur_ax, data=container.loc[container.is_random == True],
x="graph_name", y=score, hue="graph_name", dodge=False)
bio = container.loc[container.is_random == False]
if i < 1: # vertex based
bio = bio.groupby("graph_name").mean().reset_index()
seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
size=5, color="red", marker="o", linewidth=1)
cur_ax.xaxis.set_tick_params(rotation=90)
cur_ax.axes.get_xaxis().set_visible(False)
cur_ax.set_ylabel("")
cur_ax.set_xlabel("")
cur_ax.legend_.remove()
# cur_ax.set_title(score.replace("_", " ").replace(" score", ""))
cur_ax.set_title(['State ILP', 'State Stochastic'][i], fontsize=20)
i += 1
# handles, labels = ax[i / 2, (i - 1) % 2].get_legend_handles_labels()
# labels = [(l if len(l) <= 17 else l[:25] + "...") for l in labels]
# leg = fig.legend(handles, labels, bbox_to_anchor=(0.440, 0.313, 0.37, 0.11), ncol=2)
# leg.get_texts()[0].set_fontsize(6)
# ax[1, 1].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)
plt.gcf().subplots_adjust(bottom=0.3)
plt.savefig("mean_graph_score_compressed.png", bbox_extra_artists=(leg,), bbox_inches='tight', dpi=300)
plt.show()
In [10]:
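# Per-graph box plots of the vertex-level scores, one panel per (score, biological/random) combination.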
fig, ax = plt.subplots(nrows=2, ncols=2, sharey=False, sharex=True, figsize=(15, 13))
i = 0
for score in score_types[:2]:
for is_random in [False, True]:
bins = np.linspace(min(vertex_data[score]), max(vertex_data[score]), 25)
cur_ax = ax[i / 2, i % 2]
g = seaborn.boxplot(ax=cur_ax, data=vertex_data.loc[vertex_data.is_random == is_random],
x="graph_name", y=score, hue="graph_name", dodge=False)
# bio = vertex_data.loc[vertex_data.is_random == False]
# seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
# size=5, color="red", marker="o", linewidth=1)
cur_ax.xaxis.set_tick_params(rotation=90)
cur_ax.set_ylabel("")
cur_ax.set_xlabel("")
cur_ax.legend_.remove()
# cur_ax.set_title(score.replace("_", " ").replace(" score", ""))
cur_ax.set_title(['a', 'b', 'c','d', 'e'][i])
i += 1
plt.gcf().subplots_adjust(bottom=0.3)
plt.savefig("vertex_graph_scores.png", dpi=300)
plt.show()
In [54]:
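# Load the results where perturbations may change more than one bit.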
more_bits_data = treat_unequal_representation(load_data("results/graph_impact_multiple_bits"), min_threshold=0.20,
ignore_multiple_bio=True)
print "number of graphs: {}".format(len(more_bits_data.graph_name.unique()))
print "#bio={}, #rnd={}".format(len(more_bits_data.loc[more_bits_data.is_random == False]), len(more_bits_data.loc[more_bits_data.is_random == True]))
In [55]:
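# Histograms of each score, one series per maximal_change_bits value, split by biological/random.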
fig, ax = plt.subplots(nrows=len(score_types), ncols=2, sharey=False, sharex=False, figsize=(15, 20))
i = 0
for score in score_types:
bins = np.linspace(min(mean_data[score]), max(mean_data[score]), 25)
for is_random in [False, True]:
cur_ax = ax[i / 2, i % 2]
for n_bits in more_bits_data.maximal_change_bits.unique():
bits = more_bits_data.loc[more_bits_data.maximal_change_bits == n_bits]
bar_data = bits.loc[bits.is_random == is_random][score].values
weights = np.ones_like(bar_data) / float(len(bar_data))
cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
cur_ax.legend(more_bits_data.maximal_change_bits.unique())
# cur_ax.set_title(['a', 'b', 'c', 'd'][i])
cur_ax.set_title("{}, random={}".format(score, is_random))
# cur_ax.set_title(score.replace("_", " "))
i += 1
plt.savefig("2_bit_score_histograms.png", dpi=300)
plt.show()
In [56]:
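# Mean score as a function of maximal_change_bits for real vs. random graphs, plus sample counts.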
# fig, ax = plt.subplots(nrows=len(score_types), ncols=2, sharey=False, sharex=False, figsize=(15, 20))
# i = 0
for score in score_types:
# bins = np.linspace(min(mean_data[score]), max(mean_data[score]), 25)
for is_random in [False, True]:
# cur_ax = ax[i / 2, i % 2]
more_bits_data.loc[more_bits_data.is_random == is_random].groupby('maximal_change_bits')[score].mean().plot()
plt.legend(["real", "random"])
plt.title("{}".format(score))
plt.show()
more_bits_data.loc[more_bits_data.is_random == is_random].groupby('maximal_change_bits')['graph_name'].count().plot()
plt.title("counts")
plt.show()
# for n_bits in more_bits_data.maximal_change_bits.unique():
# bits = more_bits_data.loc[more_bits_data.maximal_change_bits == n_bits]
# bar_data = bits.loc[bits.is_random == is_random][score].values
# weights = np.ones_like(bar_data) / float(len(bar_data))
# cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
# cur_ax.legend(more_bits_data.maximal_change_bits.unique())
# # cur_ax.set_title(['a', 'b', 'c', 'd'][i])
# cur_ax.set_title("{}, random={}".format(score, is_random))
# # cur_ax.set_title(score.replace("_", " "))
# i += 1
# plt.savefig("2_bit_score_histograms.png")
# plt.show()
In [57]:
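# Repeat the paired Wilcoxon and rank chi-square tests separately for each value of maximal_change_bits.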
for k in range(more_bits_data.maximal_change_bits.min(), more_bits_data.maximal_change_bits.max()):
mean_pvalue_data = []
for score in score_types:
bio = more_bits_data.loc[(more_bits_data.is_random == False) & (more_bits_data.maximal_change_bits == k)]
rnd = more_bits_data.loc[(more_bits_data.is_random == True) & (more_bits_data.maximal_change_bits == k)]
graph_names = more_bits_data['graph_name'].unique()
mean_pairs = []
pairs = []
ranks = []
samples_per_graph = None
for graph_name in graph_names:
graph_bio = bio.loc[bio.graph_name == graph_name][score].values[0]
graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
if samples_per_graph is None:
samples_per_graph = len(graph_rnd) + 1
mean_pairs.append((graph_bio, graph_rnd.mean()))
for rnd_point in graph_rnd:
pairs.append((graph_bio, rnd_point))
ranks.append(np.argsort(list(graph_rnd) + [graph_bio])[-1])
pair_pvalue = stats.wilcoxon(zip(*pairs)[0], zip(*pairs)[1])[1]
mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
rank_counts = [len([r for r in ranks if r == i]) for i in range(samples_per_graph)]
# print rank_counts
rank_pvalue = stats.chisquare(f_obs=rank_counts)[1]
row = score.replace("_", " ").replace(" impact score", ""), pair_pvalue, mean_pair_pvalue, rank_pvalue
mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "p-value (Wilcoxon mean test)", "p-value (Wilcoxon test)", "p-value (rank chisquared)"]
# "significant ratio (sorting test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
# mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
print "k={}".format(k)
print mean_pvalue_data
In [31]:
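# Median, mean and max running time (in seconds) per impact-score variant.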
timing_types = [score.replace("impact_score", "time") for score in score_types]
timing_data = []
for timing_type in timing_types:
row = [np.nanmedian(mean_data[timing_type]), np.nanmean(mean_data[timing_type]),
np.nanmax(mean_data[timing_type])]
timing_data.append([timing_type.replace("_", " ").replace(" time", "")] + row)
timing_data = pd.DataFrame(timing_data)
timing_data = timing_data.set_index(timing_data.columns[0])
timing_data.columns = ["median (seconds)", "mean (seconds)", "max (seconds)"]
timing_data.index.name = "impact variant"
timing_data.to_csv("timings.csv", float_format="%.0f")
timing_data
# bins = np.linspace(0, 1000, 100)
# for timing_type in timing_types:
# data[timing_type].hist(figsize=(20, 12), bins=bins, alpha=0.3)
# plt.legend(timing_types)
# plt.show()
# for timing_type in timing_types:
# for graph_name in data.graph_name.unique():
# data.loc[data.graph_name == graph_name][timing_type].hist(figsize=(20, 12), bins=bins, alpha=0.3)
# plt.title(timing_type)
# plt.legend(data.graph_name.unique())
# plt.show()
# for timing_type in timing_types:
# for field in [col for col in data.columns if ("score" not in col) and ("time" not in col) and data[col].dtype in [np.int, np.float, np.int64]]:
# data.plot.scatter(field, timing_type)
Out[31]:
In [32]:
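# Tables of graph properties (excluding scores, timings and a few derived columns) for biological and random graphs.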
property_cols = [col for col in data.columns if (col not in score_types) and (
col not in timing_types) and (
col not in ["is_random", "maximal_change_bits", "median_attractor_length", "max_attractor_length", "normalized_n_inputs"]) and (
"std" not in col)]
bio_properties = mean_data.loc[mean_data.is_random == False][property_cols]
rnd_properties = mean_data.loc[mean_data.is_random == True][property_cols]
bio_properties.to_csv("bio_graph_properties.csv", float_format='%.3g')
# rnd_properties.to_csv("rnd_graph_properties.csv", float_format='%.3g')
bio_properties
Out[32]:
In [33]:
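# Attractor- and basin-related properties: histograms, per-graph box plots, and paired Wilcoxon tests
# comparing biological against random graphs.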
attractor_properties = [p for p in bio_properties.columns if ("attractor" in p or "basin" in p)]
fig, ax = plt.subplots(nrows=len(attractor_properties)/2, ncols=2, sharey=False, sharex=False, figsize=(15, 10))
print attractor_properties
i = 0
for attractor_property in attractor_properties:
bins = np.linspace(min(rnd_properties[attractor_property]), max(rnd_properties[attractor_property]), 25)
cur_ax = ax[i / 2, i % 2]
for is_random, container in zip([False, True], [bio_properties, rnd_properties]):
bar_data = container[attractor_property].values
weights = np.ones_like(bar_data) / float(len(bar_data))
cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
cur_ax.legend(["biological", "random"])
cur_ax.set_title(['a', 'b', 'c', 'd', 'e', 'f', 'g'][i])
# cur_ax.set_title(score.replace("_", " "))
i += 1
plt.savefig("attractor_properties_hist.png")
plt.show()
fig, ax = plt.subplots(nrows=len(attractor_properties)/2, ncols=2, sharey=False, sharex=True, figsize=(15, 10))
i = 0
for attractor_property in attractor_properties:
cur_ax = ax[i / 2, i % 2]
g = seaborn.boxplot(ax=cur_ax, data=rnd_properties,
x="graph_name", y=attractor_property, hue="graph_name", dodge=False)
bio = bio_properties
seaborn.swarmplot(ax=cur_ax, x="graph_name", y=attractor_property, data=bio_properties,
size=5, color="red", marker="o", linewidth=1)
cur_ax.xaxis.set_tick_params(rotation=90)
cur_ax.set_ylabel("")
cur_ax.set_xlabel("")
cur_ax.legend_.remove()
cur_ax.set_title(['a', 'b', 'c', 'd', 'e', 'f', 'g'][i])
i += 1
plt.savefig("attractor_properties_graph_breakdown.png")
plt.show()
mean_pvalue_data = []
for attractor_property in attractor_properties:
bio = bio_properties
rnd = rnd_properties
graph_names = mean_data['graph_name'].unique()
mean_pairs = []
pairs = []
for graph_name in graph_names:
graph_bio = bio.loc[bio.graph_name == graph_name][attractor_property].values[0]
graph_rnd = rnd.loc[rnd.graph_name == graph_name][attractor_property]
mean_pairs.append((graph_bio, graph_rnd.mean()))
for rnd_point in graph_rnd:
pairs.append((graph_bio, rnd_point))
bio_mean = np.nanmean(bio[attractor_property])
rnd_mean = np.nanmean(rnd[attractor_property])
pair_pvalue = stats.wilcoxon(zip(*pairs)[0], zip(*pairs)[1])[1]
mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
row = attractor_property.replace("_", " "), bio_mean, rnd_mean, pair_pvalue#, mean_pair_pvalue
mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["attractor property", "real mean", "random mean", "p-value (Wilcoxon mean test)"]
# "p-value (Wilcoxon test)"]
# "significant ratio (sorting test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
mean_pvalue_data.to_csv("properties_pvalues.csv", float_format="%.3g")
mean_pvalue_data
Out[33]:
In [34]:
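# Parse the CellCollective models (up to size_cutoff vertices), skip models whose export failed,
# and record basic attributes (inputs, degrees, size) per graph.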
# graph_dirs = ["Trichostrongylus retortaeformis",
# "FA BRCA pathway",
# "Apoptosis Network",
# "Arabidopsis thaliana Cell Cycle",
# "Aurora Kinase A in Neuroblastoma",
# "B bronchiseptica and T retortaeformis coinfection",
# "B cell differentiation",
# "Body Segmentation in Drosophila 2013",
# "Bordetella bronchiseptica",
# "BT474 Breast Cell Line Short-term ErbB Network",
# "Budding Yeast Cell Cycle",
# "Budding Yeast Cell Cycle 2009",
# "Cardiac development",
# "Cell Cycle Transcription by Coupled CDK and Network Oscillators",
# "Cholesterol Regulatory Pathway",
# "Cortical Area Development",
# "Death Receptor Signaling",
# "Fanconi anemia and checkpoint recovery",
# "Toll Pathway of Drosophila Signaling Pathway",
# "Guard Cell Abscisic Acid Signaling",
# "HCC1954 Breast Cell Line Short-term ErbB Network",
# "Human Gonadal Sex Determination",
# "Iron acquisition and oxidative stress response in aspergillus fumigatus",
# "Lac Operon",
# "Lymphoid and myeloid cell specification and transdifferentiation",
# "Mammalian Cell Cycle",
# "Mammalian Cell Cycle 2006",
# "MAPK Cancer Cell Fate Network",
# "Metabolic Interactions in the Gut Microbiome",
# "Neurotransmitter Signaling Pathway",
# "Oxidative Stress Pathway",
# "Predicting Variabilities in Cardiac Gene",
# "Pro-inflammatory Tumor Microenvironment in Acute Lymphoblastic Leukemia",
# "Regulation of the L-arabinose operon of Escherichia coli",
# "T cell differentiation",
# "Senescence Associated Secretory Phenotype",
# "SKBR3 Breast Cell Line Long-term ErbB Network",
# "SKBR3 Breast Cell Line Short-term ErbB Network",
# "T-Cell Signaling 2006",
# "T-LGL Survival Network 2011 Reduced Network",
# "Tumour Cell Invasion and Migration",
# "Stomatal Opening Model",
# "T-LGL Survival Network 2011",
# "Processing of Spz Network from the Drosophila Signaling Pathway"]
biological_graphs = []
biological_graph_names = []
size_cutoff = 60
candidate_biological_graph_names = os.listdir("cellcollective_models")
# for graph_dir in graph_dirs:  # graph_dirs is only defined if the commented-out list above is restored
for graph_dir in candidate_biological_graph_names:
try:
G = graphs.Network.parse_boolean_tables(os.path.join("cellcollective_models", graph_dir))
if len(G.vertices) <= size_cutoff:
stochastic.estimate_path_len_to_attractor(G, n_iter=100)
biological_graphs.append(G)
biological_graph_names.append(graph_dir)
except ValueError as e:
if e.message.startswith("Model export from cellcollective failed"):
print "warning - did not load graph {}".format(graph_dir)
graph_name_to_attributes = dict()
for i, graph, name in zip(range(len(biological_graphs)), biological_graphs, biological_graph_names):
n_inputs = len([v for v in graph.vertices if len(v.predecessors()) == 0])
max_degree = max([len(v.predecessors()) for v in graph.vertices])
size = len(graph.vertices)
mean_degree = sum([len(v.predecessors()) for v in graph.vertices]) / float(size)
normalized_n_inputs = n_inputs / float(size)
graph_name_to_attributes[name] = dict(n_inputs=n_inputs, max_degree=max_degree,
size=size, mean_degree=mean_degree,
normalized_n_inputs=normalized_n_inputs, G=graph)
print "#{}; {} input nodes for graph {} of size {} and max degree {}".format(i, n_inputs, name,
size, max_degree)
In [52]:
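# Compare graphs that made it into the results ("good") with those that did not: size, edge count,
# max degree and estimated number of attractors.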
good_graphs = [name for name in graph_name_to_attributes.keys() if name in data["graph_name"].unique()]
bad_graphs = [name for name in graph_name_to_attributes.keys() if name not in good_graphs]
print len(good_graphs), len(bad_graphs)
good_sizes = [graph_name_to_attributes[name]['size'] for name in good_graphs]
bad_sizes = [graph_name_to_attributes[name]['size'] for name in bad_graphs]
bins = np.linspace(0, 100, 100)
plt.hist(good_sizes, alpha=0.5, bins=bins)
plt.hist(bad_sizes, alpha=0.5, bins=bins)
plt.show()
good_edges = [graph_name_to_attributes[name]['size'] * graph_name_to_attributes[name]['mean_degree'] for name in good_graphs]
bad_edges = [graph_name_to_attributes[name]['size'] * graph_name_to_attributes[name]['mean_degree'] for name in bad_graphs]
bins = np.linspace(0, 100, 100)
plt.hist(good_edges, alpha=0.5, bins=bins)
plt.hist(bad_edges, alpha=0.5, bins=bins)
plt.show()
good_edges = [graph_name_to_attributes[name]['max_degree'] for name in good_graphs]
bad_edges = [graph_name_to_attributes[name]['max_degree'] for name in bad_graphs]
bins = np.linspace(0, 15, 20)
plt.hist(good_edges, alpha=0.5, bins=bins)
plt.hist(bad_edges, alpha=0.5, bins=bins)
plt.show()
good_attractors = [len(stochastic.estimate_attractors(
G=graph_name_to_attributes[name]['G'], max_walk_len=50, n_walks=200, with_basins=False)) for name in good_graphs]
bad_attractors = [len(stochastic.estimate_attractors(
G=graph_name_to_attributes[name]['G'], max_walk_len=50, n_walks=200, with_basins=False)) for name in bad_graphs if graph_name_to_attributes[name]['size'] <= 10000]
bins = np.linspace(0, 100, 100)
plt.hist(good_attractors, alpha=0.5, bins=bins)
plt.hist(bad_attractors, alpha=0.5, bins=bins)
plt.show()