In [1]:
%matplotlib inline
import matplotlib.style as style
style.use('ggplot')

import os, sys
import seaborn
import pandas as pd
import numpy as np
import stochastic
from scipy import stats
from matplotlib import pyplot as plt
import graphs
plt.rcParams['figure.figsize'] = [10, 6]

def load_data(res_dir):
    data = pd.DataFrame()
    for filename in os.listdir(res_dir):
        if ".csv" in filename:
            data = pd.concat([data, pd.read_csv(os.path.join(res_dir, filename))])
            
    # exclude three acyclic graphs that might be included by accident
    data = data[~(data.graph_name.str.contains("VEGF") |
                          data.graph_name.str.contains("Toll Pathway") | data.graph_name.str.contains("Processing of Spz"))]

    return data

def treat_unequal_representation(data, min_threshold=0.8, ignore_multiple_bio=False):
    # TODO: rewrite this horror of a code
    res_data = pd.DataFrame()
    n_rnds = []
    print "#original graphs={}".format(len(data.graph_name.unique()))
    for graph_name in data.graph_name.unique():
        n_bio = len(data.loc[(data.graph_name == graph_name) & (data.is_random == False)])
        n_rnd = len(data.loc[(data.graph_name == graph_name) & (data.is_random == True)])
        if n_bio > 1:
            print "Warning - too many bio for graph {}: #bio={}, #rnd={}".format(graph_name, n_bio, n_rnd)
            if ignore_multiple_bio:
                res_data = pd.concat([res_data, data[data.graph_name == graph_name]])
            else:
                print "Sampling one bio"
                res_data = pd.concat([res_data, data[(data.graph_name == graph_name) & (data.is_random == True)]])
                res_data = pd.concat([res_data, data[(data.graph_name == graph_name) & (data.is_random == False)].
                                      sample(n=1)])
        elif n_bio == 1:
            res_data = pd.concat([res_data, data[data.graph_name == graph_name]])
        n_rnds.append(n_rnd)

    print "#original graphs with bio={}".format(len(res_data.graph_name.unique()))

    final_res_data = pd.DataFrame()
    if min_threshold <= 1:
        actual_threshold = min([n for n in n_rnds if n >= (max(n_rnds) * min_threshold)])
    else:
        actual_threshold = min_threshold
    for graph_name in res_data.graph_name.unique():
        n_rnd = len(res_data.loc[(res_data.graph_name == graph_name) & (res_data.is_random == True)])
        if n_rnd >= actual_threshold:
            final_res_data = pd.concat([final_res_data, res_data.loc[(res_data.is_random == False) & (
                res_data.graph_name == graph_name)]])
            final_res_data = pd.concat([final_res_data, res_data.loc[(res_data.is_random == True) & (
                res_data.graph_name == graph_name)].sample(actual_threshold)])
    
    return final_res_data

def convert_model_addition_scores(data, K=30): 
    # because I have to ways to normalize that, and want to be able to convert here from 1/K to 1/ATT
    data['optimization_model_addition_impact_score'] = data['optimization_model_addition_impact_score'] * K / data['num_attractors']
    return data

def extract_vertex_scores_data(list_data):
    res_data = []
    score_cols = [col for col in list_data.columns if "scores" in col]
    non_score_cols = [col for col in list_data if col not in score_cols]
    for index, row in list_data.iterrows():
        val_lists = [eval(str(val_list).
                          replace("nan", "np.nan")) for val_list in row[score_cols]]
        for vals in zip(*val_lists):
            if not np.isnan(vals[0]):
                res_data.append(list(row[non_score_cols]) + list(vals))
    res_data = pd.DataFrame(res_data)
    res_data.columns = non_score_cols + score_cols
    res_data = res_data.rename(columns={"optimization_model_impact_scores": "optimization_model_impact_score",
                                       "stochastic_model_impact_scores": "stochastic_model_impact_score"})
    return res_data

In [2]:
data = treat_unequal_representation(load_data("results/post_submission/1_dot_5_hours"), min_threshold=0.2)
mean_data = data # TODO: refactor nicely

score_types = [col for col in data if "score" in col]
print "number of graphs: {}".format(len(data.graph_name.unique()))
print "#bio={}, #rnd={}".format(len(data.loc[data.is_random == False]), len(data.loc[data.is_random == True]))
random_to_real_ratio = len(data.loc[data.is_random == True]) / len(data.loc[data.is_random == False])

vertex_data = treat_unequal_representation(load_data("results/post_submission/1_dot_5_hours/vertices"), 
                                           min_threshold=random_to_real_ratio)
print "number of graphs (vertex): {}".format(len(vertex_data.graph_name.unique()))
print "#bio={}, #rnd={} (vertex)".format(len(vertex_data.loc[vertex_data.is_random == False]), len(vertex_data.loc[vertex_data.is_random == True]))
if (len(vertex_data.loc[vertex_data.is_random == True]) / len(vertex_data.loc[vertex_data.is_random == False])) < random_to_real_ratio:
    print "Warning: vertex data has smaller random-to-real ratio, consider taking this as the filtering ratio"
vertex_data = extract_vertex_scores_data(vertex_data)

intersection_graph_names = set(mean_data.graph_name.unique()).intersection(vertex_data.graph_name.unique())
print "number of graphs in mean-vertex intersection: {}".format(len(intersection_graph_names))
mean_data = mean_data[mean_data.graph_name.isin(intersection_graph_names)]
vertex_data = vertex_data[vertex_data.graph_name.isin(intersection_graph_names)]


#original graphs=38
#original graphs with bio=34
number of graphs: 30
#bio=30, #rnd=1200
#original graphs=38
#original graphs with bio=38
number of graphs (vertex): 36
#bio=36, #rnd=1440 (vertex)
number of graphs in mean-vertex intersection: 30

mean


In [5]:
fig, ax = plt.subplots(nrows=2, ncols=2, sharey=False, sharex=False, figsize=(15, 10))
# fig.delaxes(ax[1,2]) #The indexing is zero-based here
# fig.delaxes(ax[1,1]) #The indexing is zero-based here

# fig, ax = plt.subplots(nrows=1, ncols=3, sharey=False, sharex=False, figsize=(15, 4))

i = 0
for container, score in zip([vertex_data] * 2 + [mean_data] * 2, score_types[:2] + score_types[2:]):
    bins = np.linspace(min(container[score]), max(container[score]), 25)
    cur_ax = ax[i / 2, i % 2]
    for is_random in [False, True]:
        bar_data = container.loc[container.is_random == is_random][score].values
        weights = np.ones_like(bar_data) / float(len(bar_data))
        cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
    cur_ax.legend(["biological", "random"])
    cur_ax.set_title(['a', 'b', 'c', 'd', 'e'][i])
#     cur_ax.set_title(score.replace("_", " "))
    i += 1
plt.savefig("mean_score_histograms_israndom.png", dpi=300)
plt.show()



In [5]:
fig, ax = plt.subplots(nrows=len(score_types)/2, ncols=2, sharey=False, sharex=False, figsize=(15, 10))

i = 0
score_pairs = [(s, s.replace("optimization", "stochastic")) for s in score_types if "optimization" in s]
for score_pair in score_pairs:
    for is_random in [False, True]:
        bins = np.linspace(min(min(mean_data[score_pair[0]]), min(mean_data[score_pair[1]])), 
                           max(max(mean_data[score_pair[0]]), max(mean_data[score_pair[1]])), 25)
        cur_ax = ax[i / 2, i % 2]
        for score in score_pair:
            bar_data = mean_data.loc[mean_data.is_random == is_random][score].values
            weights = np.ones_like(bar_data) / float(len(bar_data))
            cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
        cur_ax.legend(["optimization", "stochastic"])
        cur_ax.set_title(['a', 'b', 'c', 'd'][i])
#         cur_ax.set_title("{}, {}".format(score_pair[0].replace("optimization", " ").replace("_", " "), 
#                                          "random" if is_random else "biological"))
        i += 1
plt.savefig("mean_score_histograms_method.png", dpi=300)
plt.show()



In [6]:
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=False, sharex=False, figsize=(12, 5))

i = 0
score_pairs = [(s, s.replace("optimization", "stochastic")) for s in score_types if "optimization" in s]
for score_pair in score_pairs:
    cur_ax = ax[i]
    container = vertex_data if "model" in score_pair[0] else mean_data
    container.loc[container.is_random == True].plot.scatter(x=score_pair[0], y=score_pair[1], c="blue", ax=cur_ax)
    container.loc[container.is_random == False].plot.scatter(x=score_pair[0], y=score_pair[1], c="red", ax=cur_ax)
    type_description = score_pair[0].replace("optimization", " ").replace("_", " ")
    correlation = np.corrcoef(container[score_pair[0]], container[score_pair[1]])[0, 1]
    print "{}: {:.2f}".format(score_pair, correlation)
#     cur_ax.set_title("{} (correlation: {:.2f})".format(type_description, correlation))
    cur_ax.set_title(['a', 'b', 'c', 'd'][i])
    cur_ax.set_xlabel("optimization")
    cur_ax.set_ylabel("scochastic")
    
    lims = [
    np.min([cur_ax.get_xlim(), cur_ax.get_ylim()]),  # min of both axes
    np.max([cur_ax.get_xlim(), cur_ax.get_ylim()]),  # max of both axes
    ]

    # now plot both limits against eachother
    cur_ax.plot(lims, lims, '--', alpha=0.5, zorder=10)
    cur_ax.set_aspect('equal')
    cur_ax.set_xlim(lims)
    cur_ax.set_ylim(lims)

    i += 1
plt.savefig("mean_scatter_score_types.png", dpi=300)
plt.show()


('optimization_model_impact_score', 'stochastic_model_impact_score'): 0.19
('optimization_state_impact_score', 'stochastic_state_impact_score'): 0.72

In [7]:
mean_pvalue_data = []
for container, score in zip([vertex_data] * 2 + [mean_data] * 3, score_types[:2] + score_types[1:]):
    bio = container.loc[container.is_random == False]
    rnd = container.loc[container.is_random == True]
    
    n_passed = 0
    graph_names = container['graph_name'].unique()
    for graph_name in graph_names:
        graph_bio = bio.loc[bio.graph_name == graph_name][score]
        graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
#         p_value = (np.argsort(list(graph_rnd) + list(graph_bio))[-1] + 1) / float(len(graph_rnd) + 1)
#         print p_value
#         if p_value < 0.05:
#             n_passed += 1
#     proportion_passed = n_passed / float(len(graph_names))
#     row = score.replace("_", " ").replace(" impact score", ""), np.nanmean(bio[score]), np.nanmean(rnd[score]), stats.mannwhitneyu(bio[score], rnd[score])[1], proportion_passed
    row = score.replace("_", " ").replace(" impact score", ""), np.nanmean(bio[score]), np.nanmean(rnd[score]) 
#     stats.mannwhitneyu(bio[score], rnd[score])[1]
    mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "real mean", "random mean"]
# "p-value (MWW test)"]
#                             "significant ratio (sorting test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
# mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
mean_pvalue_data


Out[7]:
real mean random mean
impact variant
optimization model 0.837056 0.866986
stochastic model 0.310972 0.343692
stochastic model 0.153594 0.166365
optimization state 0.225588 0.423722
stochastic state 0.072933 0.127127

In [8]:
mean_pvalue_data = []
for score in score_types:
    bio = mean_data.loc[mean_data.is_random == False]
    rnd = mean_data.loc[mean_data.is_random == True]
    graph_names = mean_data['graph_name'].unique()
    mean_pairs = []
    pairs = []
    ranks = []
    samples_per_graph = None
    for graph_name in graph_names:
        graph_bio = bio.loc[bio.graph_name == graph_name][score].values[0]
        graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
        if samples_per_graph is None:
            samples_per_graph = len(graph_rnd) + 1
        mean_pairs.append((graph_bio, graph_rnd.mean()))
        for rnd_point in graph_rnd:
            pairs.append((graph_bio, rnd_point))
        ranks.append(np.argsort(list(graph_rnd) + [graph_bio])[-1])

    pair_pvalue = stats.wilcoxon(zip(*pairs)[0], zip(*pairs)[1])[1]
    mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
    rank_counts = [len([r for r in ranks if r == i]) for i in range(samples_per_graph)]
#     print rank_counts
    rank_pvalue = stats.chisquare(f_obs=rank_counts)[1]
    row = score.replace("_", " ").replace(" impact score", ""), pair_pvalue, mean_pair_pvalue, rank_pvalue
    mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "p-value (Wilcoxon mean test)", "p-value (Wilcoxon test)", "p-value (rank chisquared)"]
#                             "significant ratio (sorting test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
# mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
mean_pvalue_data


c:\python27\lib\site-packages\scipy\stats\morestats.py:2388: UserWarning: Warning: sample size too small for normal approximation.
  warnings.warn("Warning: sample size too small for normal approximation.")
Out[8]:
p-value (Wilcoxon mean test) p-value (Wilcoxon test) p-value (rank chisquared)
impact variant
optimization model 1.847885e-05 0.345231 1.365543e-58
stochastic model 2.984902e-23 0.012453 8.714693e-01
optimization state 5.821174e-50 0.003634 5.454239e-01
stochastic state 3.775244e-34 0.002947 7.807679e-01

In [9]:
# For vertex-based scores
mean_pvalue_data = []
for score in score_types[:2]:
    bio = vertex_data.loc[vertex_data.is_random == False]
    rnd = vertex_data.loc[vertex_data.is_random == True]
    graph_names = vertex_data['graph_name'].unique()
    mean_pairs = []
    pairs = []
    ranks = []
    samples_per_graph = None
    for graph_name in graph_names:
        graph_bio = bio.loc[bio.graph_name == graph_name][score]
        graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
        mean_pairs.append((graph_bio.mean(), graph_rnd.mean()))
    mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
    row = score.replace("_", " ").replace(" impact score", ""), mean_pair_pvalue
    mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["impact variant", "p-value (Wilcoxon test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
# mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
mean_pvalue_data


Out[9]:
p-value (Wilcoxon test)
impact variant
optimization model 0.036826
stochastic model 0.016566

In [49]:
# good cols = [col for col in data.columns if (col not in score_types) and (
#     col not in timing_types) and (
#     col not in ["is_random", "maximal_change_bits", "median_attractor_length", "max_attractor_length"]) and (
#     "std" not in col)]

bio_rows = []
rnd_rows = []
property_columns = [col for col in data.columns if ("score" not in col) and ("time" not in col) and data[col].dtype in [np.int, np.float, np.int64]]
for score, container in zip(score_types[:2] + score_types[1:], [vertex_data]*2 + [mean_data]*3):
    print score
    bio = container.loc[container.is_random == False]
    rnd = container.loc[container.is_random == True]
    rnd_rows.append([None] * (len(property_columns) + 1))
    bio_rows.append([None] * (len(property_columns) + 1))
    for i, field in enumerate(property_columns):
#         bio_corr = np.corrcoef(bio[score], bio[field])[1,0]
#         rnd_corr = np.corrcoef(rnd[score], rnd[field])[1,0]
        bio_corr_pvalue = stats.pearsonr(bio[score], bio[field])
        rnd_corr_pvalue = stats.pearsonr(rnd[score], rnd[field])
        
        bio_rows[-1][0] = score 
        rnd_rows[-1][0] = score
        bio_rows[-1][i + 1] = "{:.2f} (p={:.2E})".format(*bio_corr_pvalue)
        rnd_rows[-1][i + 1] = "{:.2f} (p={:.2E})".format(*rnd_corr_pvalue)
        if field == score:
            continue
        if (bio_corr_pvalue[1] < 0.05) or (rnd_corr_pvalue[1] < 0.05):
            print "{}\t biological corr={:.2f} (p={:.2E})\t random corr={:.2f} (p={:.2E})".format(
                field, bio_corr_pvalue[0], bio_corr_pvalue[1], rnd_corr_pvalue[0], rnd_corr_pvalue[1])
    print ""

bio_rows_df = pd.DataFrame(bio_rows)
bio_rows_df.columns = ["score"] + property_columns
bio_rows_df.index.name = "score variant"
bio_rows_df = bio_rows_df.set_index("score")
bio_rows_df.to_csv("bio_corr.csv", float_format="%.3g")

rnd_rows_df = pd.DataFrame(rnd_rows)
rnd_rows_df.columns = ["score"] + property_columns
rnd_rows_df.index.name = "score variant"
rnd_rows_df = rnd_rows_df.set_index("score")
rnd_rows_df.to_csv("rnd_corr.csv", float_format="%.3g")
# print rnd_rows_df


optimization_model_impact_score
size	 biological corr=0.14 (p=6.63E-04)	 random corr=-0.02 (p=3.15E-03)
n_inputs	 biological corr=-0.39 (p=4.12E-22)	 random corr=-0.37 (p=0.00E+00)
normalized_n_inputs	 biological corr=-0.45 (p=4.43E-29)	 random corr=-0.40 (p=0.00E+00)
max_degree	 biological corr=-0.07 (p=9.88E-02)	 random corr=0.16 (p=4.86E-126)
mean_degree	 biological corr=0.05 (p=2.39E-01)	 random corr=0.19 (p=3.16E-181)
num_attractors	 biological corr=-0.38 (p=5.27E-21)	 random corr=-0.30 (p=0.00E+00)
mean_attractor_length	 biological corr=0.35 (p=3.55E-17)	 random corr=0.24 (p=5.55E-301)
median_attractor_length	 biological corr=0.31 (p=3.21E-14)	 random corr=0.23 (p=1.47E-279)
std_attractor_length	 biological corr=0.24 (p=4.52E-09)	 random corr=0.16 (p=3.45E-133)
max_attractor_length	 biological corr=0.33 (p=4.66E-16)	 random corr=0.19 (p=7.72E-185)
std_attractor_basin	 biological corr=0.16 (p=1.25E-04)	 random corr=0.26 (p=0.00E+00)
median_attractor_basin	 biological corr=0.38 (p=8.48E-21)	 random corr=0.17 (p=5.61E-152)
max_attractor_basin	 biological corr=0.49 (p=6.40E-35)	 random corr=0.34 (p=0.00E+00)

stochastic_model_impact_score
size	 biological corr=0.08 (p=7.39E-02)	 random corr=0.05 (p=2.85E-13)
n_inputs	 biological corr=-0.01 (p=7.28E-01)	 random corr=-0.09 (p=1.24E-37)
normalized_n_inputs	 biological corr=-0.09 (p=2.48E-02)	 random corr=-0.13 (p=1.99E-82)
max_degree	 biological corr=-0.29 (p=3.82E-12)	 random corr=-0.17 (p=2.48E-142)
mean_degree	 biological corr=-0.39 (p=2.01E-21)	 random corr=-0.31 (p=0.00E+00)
num_attractors	 biological corr=-0.21 (p=4.60E-07)	 random corr=-0.16 (p=1.72E-126)
mean_attractor_length	 biological corr=0.36 (p=5.80E-19)	 random corr=0.27 (p=0.00E+00)
median_attractor_length	 biological corr=0.35 (p=2.67E-17)	 random corr=0.25 (p=5.29E-308)
std_attractor_length	 biological corr=0.34 (p=2.47E-16)	 random corr=0.22 (p=6.90E-255)
max_attractor_length	 biological corr=0.38 (p=3.58E-21)	 random corr=0.27 (p=0.00E+00)
std_attractor_basin	 biological corr=-0.09 (p=2.86E-02)	 random corr=0.02 (p=7.82E-03)

stochastic_model_impact_score
size	 biological corr=0.10 (p=6.08E-01)	 random corr=0.09 (p=1.45E-03)
n_inputs	 biological corr=-0.09 (p=6.19E-01)	 random corr=-0.14 (p=1.98E-06)
normalized_n_inputs	 biological corr=-0.13 (p=4.78E-01)	 random corr=-0.15 (p=1.14E-07)
max_degree	 biological corr=-0.68 (p=3.83E-05)	 random corr=-0.65 (p=8.85E-148)
mean_degree	 biological corr=-0.64 (p=1.53E-04)	 random corr=-0.59 (p=3.86E-111)
num_attractors	 biological corr=-0.30 (p=1.03E-01)	 random corr=-0.24 (p=1.45E-17)
mean_attractor_length	 biological corr=0.36 (p=5.19E-02)	 random corr=0.15 (p=1.16E-07)
median_attractor_length	 biological corr=0.34 (p=6.80E-02)	 random corr=0.13 (p=3.53E-06)
std_attractor_length	 biological corr=0.54 (p=1.93E-03)	 random corr=0.12 (p=3.52E-05)
max_attractor_length	 biological corr=0.48 (p=7.37E-03)	 random corr=0.15 (p=7.32E-08)

optimization_state_impact_score
size	 biological corr=-0.06 (p=7.40E-01)	 random corr=-0.22 (p=3.37E-15)
n_inputs	 biological corr=-0.17 (p=3.59E-01)	 random corr=-0.10 (p=6.38E-04)
max_degree	 biological corr=-0.20 (p=2.87E-01)	 random corr=0.09 (p=2.23E-03)
mean_degree	 biological corr=-0.18 (p=3.35E-01)	 random corr=0.15 (p=1.75E-07)
num_attractors	 biological corr=0.10 (p=5.82E-01)	 random corr=0.24 (p=4.10E-17)
mean_attractor_length	 biological corr=-0.10 (p=6.16E-01)	 random corr=0.16 (p=2.94E-08)
median_attractor_length	 biological corr=-0.16 (p=4.10E-01)	 random corr=0.14 (p=6.37E-07)
std_attractor_length	 biological corr=0.21 (p=2.55E-01)	 random corr=0.18 (p=4.12E-10)
max_attractor_length	 biological corr=0.07 (p=7.07E-01)	 random corr=0.18 (p=4.36E-10)
std_attractor_basin	 biological corr=0.20 (p=2.98E-01)	 random corr=0.23 (p=1.55E-15)
median_attractor_basin	 biological corr=-0.32 (p=8.65E-02)	 random corr=-0.35 (p=1.66E-36)
max_attractor_basin	 biological corr=-0.10 (p=5.94E-01)	 random corr=-0.14 (p=8.11E-07)

stochastic_state_impact_score
size	 biological corr=-0.32 (p=8.24E-02)	 random corr=-0.28 (p=2.33E-22)
n_inputs	 biological corr=0.15 (p=4.29E-01)	 random corr=-0.10 (p=3.17E-04)
max_degree	 biological corr=-0.18 (p=3.46E-01)	 random corr=0.13 (p=2.77E-06)
mean_degree	 biological corr=-0.03 (p=8.55E-01)	 random corr=0.21 (p=9.45E-14)
num_attractors	 biological corr=0.52 (p=3.52E-03)	 random corr=0.23 (p=5.85E-16)
mean_attractor_length	 biological corr=-0.22 (p=2.39E-01)	 random corr=0.10 (p=5.10E-04)
median_attractor_length	 biological corr=-0.24 (p=2.05E-01)	 random corr=0.07 (p=1.58E-02)
std_attractor_length	 biological corr=0.05 (p=8.06E-01)	 random corr=0.18 (p=2.67E-10)
max_attractor_length	 biological corr=-0.12 (p=5.21E-01)	 random corr=0.17 (p=2.27E-09)
std_attractor_basin	 biological corr=0.04 (p=8.19E-01)	 random corr=0.15 (p=4.10E-07)
median_attractor_basin	 biological corr=-0.39 (p=3.12E-02)	 random corr=-0.37 (p=5.40E-41)
max_attractor_basin	 biological corr=-0.33 (p=7.16E-02)	 random corr=-0.20 (p=4.84E-12)


In [15]:
fig, ax = plt.subplots(nrows=2, ncols=2, sharey=False, sharex=True, figsize=(15, 18))
# fig.delaxes(ax[2,1]) #The indexing is zero-based here

i = 0
for score, container in zip(score_types[:2] + score_types[2:], [vertex_data]*2 + [mean_data]*2):
    print score, np.nanmean(container[score])
    bins = np.linspace(min(container[score]), max(container[score]), 25)
    cur_ax = ax[i / 2, i % 2]
    g = seaborn.boxplot(ax=cur_ax, data=container.loc[container.is_random == True], 
                    x="graph_name", y=score, hue="graph_name", dodge=False)
    bio = container.loc[container.is_random == False]
    if i < 2: # vertex based
        bio = bio.groupby("graph_name").mean().reset_index()
    seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
              size=5, color="red", marker="o", linewidth=1)

    cur_ax.xaxis.set_tick_params(rotation=90)
    cur_ax.set_ylabel("")
    cur_ax.set_xlabel("")
    cur_ax.legend_.remove()
#     cur_ax.set_title(score.replace("_", " ").replace(" score", ""))
    cur_ax.set_title(['a', 'b', 'c','d', 'e'][i])
    
    i += 1
ax[1, 1].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)
# ax[1, 1].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)

plt.gcf().subplots_adjust(bottom=0.3)

plt.savefig("mean_graph_score.png", dpi=300)
plt.show()


optimization_model_impact_score 0.864382354635227
stochastic_model_impact_score 0.3409292644561798
optimization_state_impact_score 0.4159812931996136
stochastic_state_impact_score 0.12583902439024391

In [12]:
# smaller version of last plot
fig, ax = plt.subplots(nrows=2, ncols=3, sharey=False, sharex=True, figsize=(15, 12))
fig.delaxes(ax[1,2]) #The indexing is zero-based here
fig.delaxes(ax[1,1]) #The indexing is zero-based here

i = 0
for score, container in zip(score_types[:2] + score_types[2:], [vertex_data]*2 + [mean_data]*2):
    bins = np.linspace(min(container[score]), max(container[score]), 25)
    cur_ax = ax[i / 3, i % 3]
    g = seaborn.boxplot(ax=cur_ax, data=container.loc[container.is_random == True], 
                    x="graph_name", y=score, hue="graph_name", dodge=False)
    bio = container.loc[container.is_random == False]
    if i < 2: # vertex based
        bio = bio.groupby("graph_name").mean().reset_index()
    seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
              size=5, color="red", marker="o", linewidth=1)

#     cur_ax.xaxis.set_tick_params(rotation=90)
    cur_ax.axes.get_xaxis().set_visible(False)
    cur_ax.set_ylabel("")
    cur_ax.set_xlabel("")
    cur_ax.legend_.remove()
#     cur_ax.set_title(score.replace("_", " ").replace(" score", ""))
    cur_ax.set_title(['a', 'b', 'c','d', 'e'][i])    
    i += 1

handles, labels = ax[i / 3, (i - 1) % 3].get_legend_handles_labels()
labels = [(l if len(l) <= 17 else l[:25] + "...") for l in labels]

leg = fig.legend(handles, labels, bbox_to_anchor=(0.180, 0.295, 0.37, 0.11),  ncol=2)
leg.get_texts()[0].set_fontsize(6)
# ax[1, 1].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)

plt.gcf().subplots_adjust(bottom=0.3)

plt.savefig("mean_graph_score_compressed.png", bbox_extra_artists=(leg,), bbox_inches='tight', dpi=300)
plt.show()



In [61]:
# Just one one stochastic and one ILP, no labels, less graphs
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=False, sharex=True, figsize=(15, 6))
# fig.delaxes(ax[1,2]) #The indexing is zero-based here

i = 0
for score, container in zip(score_types[2:4], [mean_data]*2):
    bins = np.linspace(min(container[score]), max(container[score]), 25)
    cur_ax = ax[i % 2]
    g = seaborn.boxplot(ax=cur_ax, data=container.loc[container.is_random == True], 
                    x="graph_name", y=score, hue="graph_name", dodge=False)
    bio = container.loc[container.is_random == False]
    if i < 1: # vertex based
        bio = bio.groupby("graph_name").mean().reset_index()
    seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
              size=5, color="red", marker="o", linewidth=1)

    cur_ax.xaxis.set_tick_params(rotation=90)
    cur_ax.axes.get_xaxis().set_visible(False)
    cur_ax.set_ylabel("")
    cur_ax.set_xlabel("")
    cur_ax.legend_.remove()
    cur_ax.set_title(score.replace("_", " ").replace(" score", ""))
    cur_ax.set_title(['State ILP', 'State Stochastic'][i], fontsize=20)    
    i += 1

# handles, labels = ax[i / 2, (i - 1) % 2].get_legend_handles_labels()
# labels = [(l if len(l) <= 17 else l[:25] + "...") for l in labels]

# leg = fig.legend(handles, labels, bbox_to_anchor=(0.440, 0.313, 0.37, 0.11),  ncol=2)
# leg.get_texts()[0].set_fontsize(6)
# ax[1, 1].xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False)

plt.gcf().subplots_adjust(bottom=0.3)

plt.savefig("mean_graph_score_compressed.png", bbox_extra_artists=(leg,), bbox_inches='tight', dpi=300)
plt.show()



In [10]:
fig, ax = plt.subplots(nrows=2, ncols=2, sharey=False, sharex=True, figsize=(15, 13))

i = 0
for score in score_types[:2]:
    for is_random in [False, True]:
        bins = np.linspace(min(vertex_data[score]), max(vertex_data[score]), 25)
        cur_ax = ax[i / 2, i % 2]
        g = seaborn.boxplot(ax=cur_ax, data=vertex_data.loc[vertex_data.is_random == is_random], 
                        x="graph_name", y=score, hue="graph_name", dodge=False)
#         bio = vertex_data.loc[vertex_data.is_random == False]
#         seaborn.swarmplot(ax=cur_ax, x="graph_name", y=score, data=bio,
#               size=5, color="red", marker="o", linewidth=1)

        cur_ax.xaxis.set_tick_params(rotation=90)
        cur_ax.set_ylabel("")
        cur_ax.set_xlabel("")
        cur_ax.legend_.remove()
#         cur_ax.set_title(score.replace("_", " ").replace(" score", ""))
        cur_ax.set_title(['a', 'b', 'c','d', 'e'][i])
        i += 1

plt.gcf().subplots_adjust(bottom=0.3)

plt.savefig("vertex_graph_scores.png", dpi=300)
plt.show()


0.866256402893148
0.866256402893148
0.3428938014736245
0.3428938014736245

More bits


In [54]:
more_bits_data = treat_unequal_representation(load_data("results/graph_impact_multiple_bits"), min_threshold=0.20,
                                             ignore_multiple_bio=True)
print "number of graphs: {}".format(len(more_bits_data.graph_name.unique()))
print "#bio={}, #rnd={}".format(len(more_bits_data.loc[more_bits_data.is_random == False]), len(more_bits_data.loc[more_bits_data.is_random == True]))


#original graphs=37
Warning - too many bio for graph FA BRCA pathway: #bio=4, #rnd=1
Warning - too many bio for graph Arabidopsis thaliana Cell Cycle: #bio=4, #rnd=39
Warning - too many bio for graph B cell differentiation: #bio=4, #rnd=48
Warning - too many bio for graph Body Segmentation in Drosophila 2013: #bio=4, #rnd=54
Warning - too many bio for graph Bordetella bronchiseptica: #bio=4, #rnd=41
Warning - too many bio for graph Budding Yeast Cell Cycle 2009: #bio=4, #rnd=45
Warning - too many bio for graph Cardiac development: #bio=4, #rnd=54
Warning - too many bio for graph Cell Cycle Transcription by Coupled CDK and Network Oscillators: #bio=4, #rnd=54
Warning - too many bio for graph Cortical Area Development: #bio=4, #rnd=54
Warning - too many bio for graph Fanconi anemia and checkpoint recovery: #bio=4, #rnd=50
Warning - too many bio for graph Human Gonadal Sex Determination: #bio=4, #rnd=0
Warning - too many bio for graph Iron acquisition and oxidative stress response in aspergillus fumigatus: #bio=4, #rnd=33
Warning - too many bio for graph Lac Operon: #bio=4, #rnd=53
Warning - too many bio for graph Lymphoid and myeloid cell specification and transdifferentiation: #bio=4, #rnd=0
Warning - too many bio for graph Mammalian Cell Cycle: #bio=4, #rnd=54
Warning - too many bio for graph Mammalian Cell Cycle 2006: #bio=4, #rnd=54
Warning - too many bio for graph Neurotransmitter Signaling Pathway: #bio=4, #rnd=51
Warning - too many bio for graph Oxidative Stress Pathway: #bio=4, #rnd=54
Warning - too many bio for graph Predicting Variabilities in Cardiac Gene: #bio=4, #rnd=52
Warning - too many bio for graph Pro-inflammatory Tumor Microenvironment in Acute Lymphoblastic Leukemia: #bio=4, #rnd=18
Warning - too many bio for graph Regulation of the L-arabinose operon of Escherichia coli: #bio=4, #rnd=54
Warning - too many bio for graph T cell differentiation: #bio=3, #rnd=16
Warning - too many bio for graph T-LGL Survival Network 2011 Reduced Network: #bio=4, #rnd=54
Warning - too many bio for graph BT474 Breast Cell Line Short-term ErbB Network: #bio=2, #rnd=24
Warning - too many bio for graph HCC1954 Breast Cell Line Short-term ErbB Network: #bio=2, #rnd=23
#original graphs with bio=33
number of graphs: 27
#bio=88, #rnd=378

In [55]:
fig, ax = plt.subplots(nrows=len(score_types), ncols=2, sharey=False, sharex=False, figsize=(15, 20))

i = 0
for score in score_types:
    bins = np.linspace(min(mean_data[score]), max(mean_data[score]), 25)
    for is_random in [False, True]:
        cur_ax = ax[i / 2, i % 2]
        for n_bits in more_bits_data.maximal_change_bits.unique():
            bits = more_bits_data.loc[more_bits_data.maximal_change_bits == n_bits]
            bar_data = bits.loc[bits.is_random == is_random][score].values
            weights = np.ones_like(bar_data) / float(len(bar_data))
            cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
        cur_ax.legend(more_bits_data.maximal_change_bits.unique())
    #         cur_ax.set_title(['a', 'b', 'c', 'd'][i])
        cur_ax.set_title("{}, random={}".format(score, is_random))
    #     cur_ax.set_title(score.replace("_", " "))
        i += 1
plt.savefig("2_bit_score_histograms.png", dpi=300)
plt.show()



In [56]:
# fig, ax = plt.subplots(nrows=len(score_types), ncols=2, sharey=False, sharex=False, figsize=(15, 20))

# i = 0
for score in score_types:
#     bins = np.linspace(min(mean_data[score]), max(mean_data[score]), 25)
    for is_random in [False, True]:
#         cur_ax = ax[i / 2, i % 2]
        more_bits_data.loc[more_bits_data.is_random == is_random].groupby('maximal_change_bits')[score].mean().plot()
    plt.legend(["real", "random"])
    plt.title("{}".format(score))
    plt.show()

more_bits_data.loc[more_bits_data.is_random == is_random].groupby('maximal_change_bits')['graph_name'].count().plot()
plt.title("counts")
plt.show()
    
    
#         for n_bits in more_bits_data.maximal_change_bits.unique():
#             bits = more_bits_data.loc[more_bits_data.maximal_change_bits == n_bits]
#             bar_data = bits.loc[bits.is_random == is_random][score].values
#             weights = np.ones_like(bar_data) / float(len(bar_data))
#             cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
#         cur_ax.legend(more_bits_data.maximal_change_bits.unique())
#     #         cur_ax.set_title(['a', 'b', 'c', 'd'][i])
#         cur_ax.set_title("{}, random={}".format(score, is_random))
#     #     cur_ax.set_title(score.replace("_", " "))
#         i += 1
# plt.savefig("2_bit_score_histograms.png")
# plt.show()



In [57]:
for k in range(more_bits_data.maximal_change_bits.min(), more_bits_data.maximal_change_bits.max()):
    mean_pvalue_data = []
    for score in score_types:
        bio = more_bits_data.loc[more_bits_data.is_random == False & (more_bits_data.maximal_change_bits == k)]
        rnd = more_bits_data.loc[more_bits_data.is_random == True & (more_bits_data.maximal_change_bits == k)]
        graph_names = more_bits_data['graph_name'].unique()
        mean_pairs = []
        pairs = []
        ranks = []
        samples_per_graph = None
        for graph_name in graph_names:
            graph_bio = bio.loc[bio.graph_name == graph_name][score].values[0]
            graph_rnd = rnd.loc[rnd.graph_name == graph_name][score]
            if samples_per_graph is None:
                samples_per_graph = len(graph_rnd) + 1
            mean_pairs.append((graph_bio, graph_rnd.mean()))
            for rnd_point in graph_rnd:
                pairs.append((graph_bio, rnd_point))
            ranks.append(np.argsort(list(graph_rnd) + [graph_bio])[-1])

        pair_pvalue = stats.wilcoxon(zip(*pairs)[0], zip(*pairs)[1])[1]
        mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
        rank_counts = [len([r for r in ranks if r == i]) for i in range(samples_per_graph)]
#             print rank_counts
        rank_pvalue = stats.chisquare(f_obs=rank_counts)[1]
        row = score.replace("_", " ").replace(" impact score", ""), pair_pvalue, mean_pair_pvalue, rank_pvalue
        mean_pvalue_data.append(row)
    mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
    mean_pvalue_data.columns = ["impact variant", "p-value (Wilcoxon mean test)", "p-value (Wilcoxon test)", "p-value (rank chisquared)"]
    #                             "significant ratio (sorting test)"]
    mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
    # mean_pvalue_data.to_csv("mean_pvalues.csv", float_format="%.3g")
    print "k={}".format(k)
    print mean_pvalue_data


k=1
                    p-value (Wilcoxon mean test)  p-value (Wilcoxon test)  \
impact variant                                                              
optimization model                  9.899089e-01                 0.108809   
stochastic model                    9.805473e-14                 0.002672   
optimization state                  1.934818e-03                 0.166300   
stochastic state                    6.197526e-01                 0.989868   

                    p-value (rank chisquared)  
impact variant                                 
optimization model               1.203558e-03  
stochastic model                 5.874621e-25  
optimization state               1.517649e-01  
stochastic state                 7.895122e-01  
c:\python27\lib\site-packages\scipy\stats\morestats.py:2413: RuntimeWarning: invalid value encountered in double_scalars
  z = (T - mn - correction) / se
c:\python27\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
c:\python27\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
c:\python27\lib\site-packages\scipy\stats\_distn_infrastructure.py:1821: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)
k=2
                    p-value (Wilcoxon mean test)  p-value (Wilcoxon test)  \
impact variant                                                              
optimization model                      0.157299                      NaN   
stochastic model                        0.220120                 0.066608   
optimization state                      0.457073                 0.573169   
stochastic state                        0.717613                 0.493901   

                    p-value (rank chisquared)  
impact variant                                 
optimization model               1.806173e-01  
stochastic model                 2.637253e-11  
optimization state               1.273937e-01  
stochastic state                 1.256492e-03  
k=3
                    p-value (Wilcoxon mean test)  p-value (Wilcoxon test)  \
impact variant                                                              
optimization model                      0.705457                 0.317311   
stochastic model                        0.000146                 0.000087   
optimization state                      0.004602                 0.019941   
stochastic state                        0.037394                 0.068017   

                    p-value (rank chisquared)  
impact variant                                 
optimization model               1.768669e-02  
stochastic model                 2.983746e-11  
optimization state               3.634239e-02  
stochastic state                 2.810327e-03  

Timing and graph properties


In [31]:
timing_types = [score.replace("impact_score", "time") for score in score_types]
timing_data = []
for timing_type in timing_types:
    row = [np.nanmedian(mean_data[timing_type]), np.nanmean(mean_data[timing_type]),
                                                         np.nanmax(mean_data[timing_type])]
    timing_data.append([timing_type.replace("_", " ").replace(" time", "")] + row)
timing_data = pd.DataFrame(timing_data)
timing_data = timing_data.set_index(timing_data.columns[0])
timing_data.columns = ["median (seconds)", "mean (seconds)", "max (seconds)"]
timing_data.index.name = "impact variant"
timing_data.to_csv("timings.csv", float_format="%.0f")
timing_data

# bins = np.linspace(0, 1000, 100)
# for timing_type in timing_types:
#     data[timing_type].hist(figsize=(20, 12), bins=bins, alpha=0.3)
# plt.legend(timing_types)
# plt.show()

# for timing_type in timing_types:
#     for graph_name in data.graph_name.unique():
#         data.loc[data.graph_name == graph_name][timing_type].hist(figsize=(20, 12), bins=bins, alpha=0.3)
#     plt.title(timing_type)
#     plt.legend(data.graph_name.unique())
#     plt.show()

# for timing_type in timing_types:
#     for field in [col for col in data.columns if ("score" not in col) and ("time" not in col) and data[col].dtype in [np.int, np.float, np.int64]]:
#         data.plot.scatter(field, timing_type)


Out[31]:
median (seconds) mean (seconds) max (seconds)
impact variant
optimization model 0.096472 0.743750 10.047526
stochastic model 27.953848 83.446544 1916.533971
optimization state 13.323562 342.932171 6728.493179
stochastic state 5.966551 42.206432 1192.938316

In [32]:
bio_properties = mean_data.loc[mean_data.is_random==False][[col for col in data.columns if (col not in score_types) and (
    col not in timing_types) and (
    col not in ["is_random", "maximal_change_bits", "median_attractor_length", "max_attractor_length", "normalized_n_inputs"]) and (
    "std" not in col)]]
rnd_properties = mean_data.loc[mean_data.is_random==True][[col for col in data.columns if (col not in score_types) and (
    col not in timing_types) and (
    col not in ["is_random", "maximal_change_bits", "median_attractor_length", "max_attractor_length", "normalized_n_inputs"]) and (
    "std" not in col)]]
bio_properties.to_csv("bio_graph_properties.csv", float_format='%.3g')

# rnd_properties.to_csv("rnd_graph_properties.csv", float_format='%.3g')
bio_properties


Out[32]:
graph_name size n_inputs max_degree mean_degree num_attractors mean_attractor_length median_attractor_basin max_attractor_basin
0 Trichostrongylus retortaeformis 26 1 5 2.230769 12 4.250000 0.015806 0.294374
1 FA BRCA pathway 28 0 9 4.392857 1 2.000000 1.000000 1.000000
2 Arabidopsis thaliana Cell Cycle 14 0 8 4.714286 1 8.000000 1.000000 1.000000
4 B cell differentiation 22 5 7 1.772727 57 1.017544 0.020340 0.036945
5 Body Segmentation in Drosophila 2013 17 3 4 1.705882 10 1.000000 0.123046 0.138901
6 Bordetella bronchiseptica 33 0 6 2.393939 4 1.250000 0.034087 0.912752
8 BT474 Breast Cell Line Short-term ErbB Network 16 5 6 2.875000 82 1.000000 0.003162 0.039526
10 Budding Yeast Cell Cycle 2009 18 0 6 3.277778 1 11.000000 1.000000 1.000000
11 Cardiac development 15 1 7 2.533333 6 1.000000 0.242857 0.253480
12 Cell Cycle Transcription by Coupled CDK and Ne... 9 0 4 2.111111 2 3.000000 0.500000 0.760870
13 Cortical Area Development 5 0 4 2.800000 2 1.000000 0.500000 0.875000
14 Fanconi anemia and checkpoint recovery 15 0 8 4.400000 1 2.000000 1.000000 1.000000
16 HCC1954 Breast Cell Line Short-term ErbB Network 16 5 6 2.875000 97 1.000000 0.003023 0.042328
17 Human Gonadal Sex Determination 19 0 10 4.157895 2 1.000000 0.500000 0.534722
18 Iron acquisition and oxidative stress response... 22 2 6 1.727273 4 5.500000 0.250345 0.262770
19 Lac Operon 13 3 4 1.692308 9 1.000000 0.120200 0.140234
20 Mammalian Cell Cycle 20 1 5 2.550000 3 1.000000 0.469468 0.526592
21 Mammalian Cell Cycle 2006 10 0 6 3.500000 2 4.000000 0.500000 0.503737
22 Metabolic Interactions in the Gut Microbiome 12 1 5 2.500000 39 1.000000 0.015806 0.062171
23 Neurotransmitter Signaling Pathway 16 0 3 1.375000 4 2.500000 0.258766 0.270220
24 Oxidative Stress Pathway 19 1 4 1.684211 2 5.000000 0.500000 0.547459
25 PC12 Cell Differentiation 62 1 4 1.741935 2 1.000000 0.500000 0.639295
26 Predicting Variabilities in Cardiac Gene 15 1 7 2.533333 5 1.000000 0.245719 0.253909
27 Pro-inflammatory Tumor Microenvironment in Acu... 26 0 9 3.115385 6 1.666667 0.184327 0.239726
28 Regulation of the L-arabinose operon of Escher... 13 4 3 1.384615 21 1.476190 0.055244 0.087270
29 T cell differentiation 23 4 5 1.478261 33 1.000000 0.030528 0.051155
31 SKBR3 Breast Cell Line Short-term ErbB Network 16 5 5 2.562500 109 1.000000 0.002230 0.035688
32 T-Cell Signaling 2006 40 3 5 1.325000 10 2.300000 0.114557 0.134278
33 T-LGL Survival Network 2011 Reduced Network 18 0 4 2.388889 3 3.000000 0.047507 0.948387
34 Tumour Cell Invasion and Migration 32 2 8 4.875000 14 1.857143 0.045473 0.244011

In [33]:
attractor_properties = [p for p in bio_properties.columns if ("attractor" in p or "basin" in p)]
fig, ax = plt.subplots(nrows=len(attractor_properties)/2, ncols=2, sharey=False, sharex=False, figsize=(15, 10))
print attractor_properties

i = 0
for attractor_property in attractor_properties:
    bins = np.linspace(min(rnd_properties[attractor_property]), max(rnd_properties[attractor_property]), 25)
    cur_ax = ax[i / 2, i % 2]
    for is_random, container in zip([False, True], [bio_properties, rnd_properties]):
        bar_data = container[attractor_property].values
        weights = np.ones_like(bar_data) / float(len(bar_data))
        cur_ax.hist(bar_data, alpha=0.7, bins=bins, weights=weights)
    cur_ax.legend(["biological", "random"])
    cur_ax.set_title(['a', 'b', 'c', 'd', 'e', 'f', 'g'][i])
#     cur_ax.set_title(score.replace("_", " "))
    i += 1
plt.savefig("attractor_properties_hist.png")
plt.show()

fig, ax = plt.subplots(nrows=len(attractor_properties)/2, ncols=2, sharey=False, sharex=True, figsize=(15, 10))
i = 0
for attractor_property in attractor_properties:
    cur_ax = ax[i / 2, i % 2]
    g = seaborn.boxplot(ax=cur_ax, data=rnd_properties, 
                    x="graph_name", y=attractor_property, hue="graph_name", dodge=False)
    bio = bio_properties
    seaborn.swarmplot(ax=cur_ax, x="graph_name", y=attractor_property, data=bio_properties,
              size=5, color="red", marker="o", linewidth=1)

    cur_ax.xaxis.set_tick_params(rotation=90)
    cur_ax.set_ylabel("")
    cur_ax.set_xlabel("")
    cur_ax.legend_.remove()
    cur_ax.set_title(['a', 'b', 'c', 'd', 'e', 'f', 'g'][i])
    i += 1
plt.savefig("attractor_properties_graph_breakdown.png")
plt.show()

mean_pvalue_data = []
for attractor_property in attractor_properties:
    bio = bio_properties
    rnd = rnd_properties
    graph_names = mean_data['graph_name'].unique()
    mean_pairs = []
    pairs = []
    for graph_name in graph_names:
        graph_bio = bio.loc[bio.graph_name == graph_name][attractor_property].values[0]
        graph_rnd = rnd.loc[rnd.graph_name == graph_name][attractor_property]
        mean_pairs.append((graph_bio, graph_rnd.mean()))
        for rnd_point in graph_rnd:
            pairs.append((graph_bio, rnd_point))

    bio_mean = np.nanmean(bio[attractor_property])
    rnd_mean = np.nanmean(rnd[attractor_property])
    pair_pvalue = stats.wilcoxon(zip(*pairs)[0], zip(*pairs)[1])[1]
    mean_pair_pvalue = stats.wilcoxon(zip(*mean_pairs)[0], zip(*mean_pairs)[1])[1]
    row = attractor_property.replace("_", " "), bio_mean, rnd_mean, pair_pvalue#, mean_pair_pvalue
    mean_pvalue_data.append(row)
mean_pvalue_data = pd.DataFrame(mean_pvalue_data)
mean_pvalue_data.columns = ["attractor property", "real mean", "random mean", "p-value (Wilcoxon mean test)"]
#                             "p-value (Wilcoxon test)"]
#                             "significant ratio (sorting test)"]
mean_pvalue_data = mean_pvalue_data.set_index(mean_pvalue_data.columns[0])
mean_pvalue_data.to_csv("properties_pvalues.csv", float_format="%.3g")
mean_pvalue_data


['num_attractors', 'mean_attractor_length', 'median_attractor_basin', 'max_attractor_basin']
Out[33]:
real mean random mean p-value (Wilcoxon mean test)
attractor property
num attractors 18.133333 16.968333 3.458069e-01
mean attractor length 2.427251 2.958009 3.575274e-08
median attractor basin 0.309416 0.270216 6.761789e-01
max attractor basin 0.427860 0.417988 6.946136e-01

JUNK


In [34]:
# graph_dirs = ["Trichostrongylus retortaeformis",
# "FA BRCA pathway",
# "Apoptosis Network",
# "Arabidopsis thaliana Cell Cycle",
# "Aurora Kinase A in Neuroblastoma",
# "B bronchiseptica and T retortaeformis coinfection",
# "B cell differentiation",
# "Body Segmentation in Drosophila 2013",
# "Bordetella bronchiseptica",
# "BT474 Breast Cell Line Short-term ErbB Network",
# "Budding Yeast Cell Cycle",
# "Budding Yeast Cell Cycle 2009",
# "Cardiac development",
# "Cell Cycle Transcription by Coupled CDK and Network Oscillators",
# "Cholesterol Regulatory Pathway",
# "Cortical Area Development",
# "Death Receptor Signaling",
# "Fanconi anemia and checkpoint recovery",
# "Toll Pathway of Drosophila Signaling Pathway",
# "Guard Cell Abscisic Acid Signaling",
# "HCC1954 Breast Cell Line Short-term ErbB Network",
# "Human Gonadal Sex Determination",
# "Iron acquisition and oxidative stress response in aspergillus fumigatus",
# "Lac Operon",
# "Lymphoid and myeloid cell specification and transdifferentiation",
# "Mammalian Cell Cycle",
# "Mammalian Cell Cycle 2006",
# "MAPK Cancer Cell Fate Network",
# "Metabolic Interactions in the Gut Microbiome",
# "Neurotransmitter Signaling Pathway",
# "Oxidative Stress Pathway",
# "Predicting Variabilities in Cardiac Gene",
# "Pro-inflammatory Tumor Microenvironment in Acute Lymphoblastic Leukemia",
# "Regulation of the L-arabinose operon of Escherichia coli",
# "T cell differentiation",
# "Senescence Associated Secretory Phenotype",
# "SKBR3 Breast Cell Line Long-term ErbB Network",
# "SKBR3 Breast Cell Line Short-term ErbB Network",
# "T-Cell Signaling 2006",
# "T-LGL Survival Network 2011 Reduced Network",
# "Tumour Cell Invasion and Migration",
# "Stomatal Opening Model",
# "T-LGL Survival Network 2011",
# "Processing of Spz Network from the Drosophila Signaling Pathway"]
biological_graphs = []
biological_graph_names = []
size_cutoff = 60
candidate_biological_graph_names = os.listdir("cellcollective_models")
# for graph_dir in candidate_biological_graph_names:
for graph_dir in graph_dirs:
    try:
        G = graphs.Network.parse_boolean_tables(os.path.join("cellcollective_models", graph_dir))
        if len(G.vertices) <= size_cutoff:
            stochastic.estimate_path_len_to_attractor(G, n_iter=100)
            biological_graphs.append(G)
            biological_graph_names.append(graph_dir)
    except ValueError as e:
        if e.message.startswith("Model export from cellcollective failed"):
            print "warning - did not load graph {}".format(graph_dir)

graph_name_to_attributes = dict()
for i, graph, name in zip(range(len(biological_graphs)), biological_graphs, biological_graph_names):
    n_inputs = len([v for v in graph.vertices if len(v.predecessors()) == 0])
    max_degree = max([len(v.predecessors()) for v in graph.vertices])
    size = len(graph.vertices)
    mean_degree = sum([len(v.predecessors()) for v in graph.vertices]) / float(size)
    normaliezd_n_inputs = n_inputs / float(size)
    graph_name_to_attributes[name] = dict(n_inputs=n_inputs, max_degree=max_degree,
                                                size=size, mean_degree=mean_degree,
                                          normalized_n_inputs=normaliezd_n_inputs, G=graph)
    print "#{}; {} input nodes for graph {} of size {} and max degree {}".format(i, n_inputs, name,
                                                                                 size, max_degree)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-34-963be8e46764> in <module>()
     48 candidate_biological_graph_names = os.listdir("cellcollective_models")
     49 # for graph_dir in candidate_biological_graph_names:
---> 50 for graph_dir in graph_dirs:
     51     try:
     52         G = graphs.Network.parse_boolean_tables(os.path.join("cellcollective_models", graph_dir))

NameError: name 'graph_dirs' is not defined

In [52]:
good_graphs = [name for name in graph_name_to_attributes.keys() if name in data["graph_name"].unique()]
bad_graphs = [name for name in graph_name_to_attributes.keys() if name not in good_graphs]
print len(good_graphs), len(bad_graphs)

good_sizes = [graph_name_to_attributes[name]['size'] for name in good_graphs]
bad_sizes = [graph_name_to_attributes[name]['size'] for name in bad_graphs]
bins = np.linspace(0, 100, 100)
plt.hist(good_sizes, alpha=0.5, bins=bins)
plt.hist(bad_sizes, alpha=0.5, bins=bins)
plt.show()

good_edges = [graph_name_to_attributes[name]['size'] * graph_name_to_attributes[name]['mean_degree'] for name in good_graphs]
bad_edges = [graph_name_to_attributes[name]['size'] * graph_name_to_attributes[name]['mean_degree'] for name in bad_graphs]
bins = np.linspace(0, 100, 100)
plt.hist(good_edges, alpha=0.5, bins=bins)
plt.hist(bad_edges, alpha=0.5, bins=bins)
plt.show()

good_edges = [graph_name_to_attributes[name]['max_degree'] for name in good_graphs]
bad_edges = [graph_name_to_attributes[name]['max_degree'] for name in bad_graphs]
bins = np.linspace(0, 15, 20)
plt.hist(good_edges, alpha=0.5, bins=bins)
plt.hist(bad_edges, alpha=0.5, bins=bins)
plt.show()

good_attractors = [len(stochastic.estimate_attractors(
    G=graph_name_to_attributes[name]['G'], max_walk_len=50, n_walks=200, with_basins=False)) for name in good_graphs]
bad_attractors = [len(stochastic.estimate_attractors(
    G=graph_name_to_attributes[name]['G'], max_walk_len=50, n_walks=200, with_basins=False)) for name in bad_graphs if graph_name_to_attributes[name]['size'] <= 10000]
bins = np.linspace(0, 100, 100)
plt.hist(good_attractors, alpha=0.5, bins=bins)
plt.hist(bad_attractors, alpha=0.5, bins=bins)
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-52-b0bdfcfe389c> in <module>()
----> 1 good_graphs = [name for name in graph_name_to_attributes.keys() if name in data["graph_name"].unique()]
      2 bad_graphs = [name for name in graph_name_to_attributes.keys() if name not in good_graphs]
      3 print len(good_graphs), len(bad_graphs)
      4 
      5 good_sizes = [graph_name_to_attributes[name]['size'] for name in good_graphs]

NameError: name 'graph_name_to_attributes' is not defined

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: