In [105]:
# Imports
import matplotlib.pyplot as plt
import numpy
import pandas
import scipy
import scipy.stats
In [5]:
import os
# Using os.listdir to show the current directory
os.listdir("./")
Out[5]:
In [8]:
# Using os.listdir to show the output directory
os.listdir("output")[0:5]
Out[8]:
In [9]:
import glob
# Using glob to list the output directory
glob.glob("output/run-*")[0:5]
Out[9]:
In [13]:
run_directory = os.listdir("output")[0]
print(run_directory)
print(os.path.join(run_directory,
"parameters.csv"))
In [25]:
print(run_directory)
print(os.path.basename(run_directory))
In [101]:
# Create "complete" data frames
run_data = []
all_timeseries_data = pandas.DataFrame()
all_interaction_data = pandas.DataFrame()
# Iterate over all directories
for run_directory in glob.glob("output/run*"):
# Get the run ID from our directory name
run_id = os.path.basename(run_directory)
# Load parameter and reshape
run_parameter_data = pandas.read_csv(os.path.join(run_directory, "parameters.csv"))
run_parameter_data.index = run_parameter_data["parameter"]
# Load timeseries and interactions
run_interaction_data = pandas.read_csv(os.path.join(run_directory, "interactions.csv"))
run_interaction_data["run"] = run_id
run_ts_data = pandas.read_csv(os.path.join(run_directory, "timeseries.csv"))
run_ts_data["run"] = run_id
# Flatten parameter data into interaction and TS data
for parameter_name in run_parameter_data.index:
run_ts_data.loc[:, parameter_name] = run_parameter_data.loc[parameter_name, "value"]
if run_interaction_data.shape[0] > 0:
for parameter_name in run_parameter_data.index:
run_interaction_data.loc[:, parameter_name] = run_parameter_data.loc[parameter_name, "value"]
# Store raw run data
run_data.append({"parameters": run_parameter_data,
"interactions": run_interaction_data,
"timeseries": run_ts_data})
# Update final steps
all_timeseries_data = all_timeseries_data.append(run_ts_data)
all_interaction_data = all_interaction_data.append(run_interaction_data)
In [102]:
# let's see how many records we have.
print(all_timeseries_data.shape)
print(all_interaction_data.shape)
In [112]:
# Let's see what the data looks like.
all_timeseries_data.head()
Out[112]:
In [114]:
all_interaction_data.head()
Out[114]:
In [107]:
%matplotlib inline
# let's use groupby to find some information.
last_step_data = all_timeseries_data.groupby("run").tail(1)
In [110]:
# Simple plot
f = plt.figure()
plt.scatter(last_step_data["min_subsidy"],
last_step_data["num_infected"],
alpha=0.5)
plt.xlabel("Subsidy")
plt.ylabel("Number infected")
plt.title("Subsidy vs. number infected")
Out[110]:
In [129]:
# Let's use groupby with **multiple** variables now.
mean_infected_by_subsidy = all_timeseries_data.groupby(["run", "min_subsidy", "min_prob_hookup"])["num_infected"].mean()
std_infected_by_subsidy = all_timeseries_data.groupby(["run", "min_subsidy", "min_prob_hookup"])["num_infected"].std()
infected_by_subsidy = pandas.concat((mean_infected_by_subsidy,
std_infected_by_subsidy), axis=1)
infected_by_subsidy.columns = ["mean", "std"]
infected_by_subsidy.head()
Out[129]:
In [134]:
# Plot a distribution
f = plt.figure()
_ = plt.hist(last_step_data["num_infected"].values,
color="red",
alpha=0.5)
plt.xlabel("Number infected")
plt.ylabel("Frequency")
plt.title("Distribution of number infected")
Out[134]:
In [140]:
# Perform distribution tests for no subsidy vs. some subsidy
no_subsidy_data = last_step_data.loc[last_step_data["min_subsidy"] == 0,
"num_infected"]
some_subsidy_data = last_step_data.loc[last_step_data["min_subsidy"] > 0,
"num_infected"]
In [153]:
# Plot a distribution
f = plt.figure()
_ = plt.hist(no_subsidy_data.values,
color="red",
alpha=0.25)
_ = plt.hist(some_subsidy_data.values,
color="blue",
alpha=0.25)
plt.xlabel("Number infected")
plt.ylabel("Frequency")
plt.title("Distribution of number infected")
Out[153]:
In [147]:
# Test for normality
print(scipy.stats.shapiro(no_subsidy_data))
print(scipy.stats.shapiro(some_subsidy_data))
# Test for equal variances
print(scipy.stats.levene(no_subsidy_data, some_subsidy_data))
In [148]:
# Perform t-test
print(scipy.stats.ttest_ind(no_subsidy_data, some_subsidy_data))
# Perform rank-sum test
print(scipy.stats.ranksums(no_subsidy_data, some_subsidy_data))
In [ ]: