This notebook contains various Monte Carlo simulations for iPosition data. Three primary simulation methods are used — naive 2D simulation, histogram data-driven simulation, and Dirichlet-distribution simulation — to determine chance levels. The "actual coordinates" come either from real coordinates or from random coordinates.
First we need to import the pipeline. You'll also need to change the root_dir in the configuration cell below to wherever the data is stored on your machine.
In [1]:
from cogrecon.core.full_pipeline import full_pipeline, get_header_labels
from cogrecon.core.data_structures import TrialData, ParticipantData, AnalysisConfiguration
In [2]:
# NOTE(review): sklearn.externals.joblib is deprecated (removed in scikit-learn
# 0.23); new code should `import joblib` directly. Only the commented-out KDE
# model load below actually uses it.
from sklearn.externals import joblib
sim_iterations = 1000 # For convenience, the number of iterations each simulation configuration should run
# Define the dimensions, number of items, and iterations for each test
root_dir = r'C:\Users\Kevin\Google Drive\iPyNotebooks\iPosition'
# kde_model = joblib.load(root_dir + r'\pat_kde.pkl')
# Each entry is one simulation condition consumed by the main loop below.
# To use the KDE-driven random source instead of naive 2D, uncomment the KDE
# model load above and switch 'random_source' to 'kde' with 'model': kde_model.
test_configs = [
{'dims': 2, 'items': 2, 'iterations': sim_iterations, 'random_source': 'naive2d'}, # 'kde', 'model': kde_model},
{'dims': 2, 'items': 3, 'iterations': sim_iterations, 'random_source': 'naive2d'}, # 'kde', 'model': kde_model},
{'dims': 2, 'items': 4, 'iterations': sim_iterations, 'random_source': 'naive2d'}, # 'kde', 'model': kde_model},
{'dims': 2, 'items': 5, 'iterations': sim_iterations, 'random_source': 'naive2d'}, # 'kde', 'model': kde_model},
{'dims': 2, 'items': 6, 'iterations': sim_iterations, 'random_source': 'naive2d'}, # 'kde', 'model': kde_model},
{'dims': 2, 'items': 7, 'iterations': sim_iterations, 'random_source': 'naive2d'} # 'kde', 'model': kde_model}
]
remove_columns = [4, 18, 40, 41, 42, 43] # Some columns of our output may not average or standard-deviation easily, so we remove those
save_filename = 'naive_2d_monte_carlo.p' # The filename to save the output as we go
In [3]:
import numpy as np
import numpy.random as rand
import logging
import time
import os
import pickle
# Disable some outputs that we don't need given our circumstances
logger = logging.getLogger()
logger.disabled = True
np.seterr(invalid='ignore')
Out[3]:
In [4]:
# Helper for getting the appropriate headers for columns we keep
def get_output_labels():
    """Return the pipeline's header labels with the removed columns dropped."""
    all_labels = get_header_labels()
    return np.delete(all_labels, remove_columns)
# Helper for printing our variables as we run
def print_read_friendly(o):
    """Print each output value next to its header label in aligned columns."""
    template = "{0:55}: {1:15}"
    for label, value in zip(get_output_labels(), o):
        print(template.format(label, value))
# Helper for converting our outputs to an easy-to-save format
def get_save_data(_test_configs, _output_labels, _mean_outputs, _std_outputs, _times):
    """Bundle the run configuration and all result lists into one picklable dict."""
    return {
        'test_configs': _test_configs,
        'output_labels': _output_labels,
        'mean_outputs': _mean_outputs,
        'std_outputs': _std_outputs,
        'times': _times,
    }
# Helper for saving our data
def checkpoint_data(save_filename, data):
    """Pickle ``data`` to ``save_filename``.

    The original passed a bare ``open(...)`` to ``pickle.dump`` and never
    closed it; a context manager guarantees the handle is flushed and
    released even if pickling raises.

    :param save_filename: path of the checkpoint file to (over)write
    :param data: any picklable object
    """
    with open(save_filename, 'wb') as f:
        pickle.dump(data, f)
def generate_bounded_samples(kde, n_samples, x_range=(0, 1), y_range=(0, 1)):
    """Draw ``n_samples`` (x, y) points from ``kde``, rejection-resampling any
    point outside the given ranges until every point is in bounds.

    ``kde.sample`` returns rows ordered (y, x), so the transpose unpacks y
    first — this mirrors how the model was fitted.
    """
    y_sample, x_sample = np.transpose(kde.sample(n_samples=n_samples))
    x_lo, x_hi = x_range
    y_lo, y_hi = y_range
    all_in_bounds = False
    while not all_in_bounds:
        all_in_bounds = True
        for idx in range(len(x_sample)):
            x, y = x_sample[idx], y_sample[idx]
            if not (x_lo <= x <= x_hi and y_lo <= y <= y_hi):
                # Out of bounds: redraw this single point and re-scan the
                # whole set on the next pass.
                all_in_bounds = False
                yy, xx = np.transpose(kde.sample(n_samples=1))
                x_sample[idx] = xx[0]
                y_sample[idx] = yy[0]
    return x_sample, y_sample
# Helper for getting random data
def get_random_data(n, dims, source='naive2d', model=None):
    """Generate paired (actual, data) random point sets for one trial.

    :param n: number of items (points) to generate
    :param dims: dimensionality of each point
    :param source: 'naive2d' draws the actual points uniformly at random;
                   'kde' samples them from a fitted KDE model
    :param model: fitted KDE model (required when source == 'kde')
    :return: tuple (actual, data) as nested lists of floats in [0, 1)
    :raises ValueError: on an unrecognized source, a missing KDE model, or a
                        non-2D request with the KDE source
    """
    if source == 'naive2d':
        actual = np.array([[rand.random() for _ in range(dims)] for _ in range(n)])
    elif source == 'kde':
        if model is None:
            raise ValueError("No model provided for KDE.")
        if dims != 2:
            raise ValueError("Dimension must be 2D for KDE.")
        x, y = generate_bounded_samples(model, n)
        actual = np.array([np.array([xx, yy]) for xx, yy in zip(x, y)])
    else:
        # BUG FIX: an unknown source previously fell through to an
        # UnboundLocalError on `actual`; fail fast with a clear message.
        raise ValueError("Unrecognized random source: {0}".format(source))
    # The "response" points are always uniform-random, regardless of source.
    data = np.array([[rand.random() for _ in range(dims)] for _ in range(n)])
    return actual.tolist(), data.tolist()
In [5]:
np.seterr(divide='ignore')
# Lists to store our main outputs
mean_outputs = []
std_outputs = []
times = []
# Iterate through our configurations
for config in test_configs:
    # Get config parameters
    dims = config['dims']
    items = config['items']
    iterations = config['iterations']
    random_source = config['random_source']
    model = config.get('model')  # only present for KDE-driven configs
    # List to store each iteration output - for large iterations, this is the list that can balloon up
    outputs = []
    # Record start runtime
    start_time = time.time()
    # Iterate the number of times requested
    for _ in range(iterations):
        # Generate random data
        actual, data = get_random_data(items, dims, random_source, model)
        # Run the pipeline on a single synthetic trial
        output = full_pipeline(ParticipantData([TrialData(actual, data)]), AnalysisConfiguration(), visualize=False)[0]
        # Delete the removal columns and append the output
        output = np.delete(output, remove_columns, axis=0)
        outputs.append(output)
    # Save the runtime, mean of outputs, and standard deviation of outputs
    # (converting to float for the std computation to avoid errors)
    duration = time.time() - start_time
    try:
        avgs = np.nanmean(outputs, axis=0)
    except ZeroDivisionError:
        # Dump the raw outputs for debugging and stop processing configs
        print(np.array(outputs).tolist())
        break
    stds = np.nanstd([[float(x) for x in inner] for inner in outputs], axis=0)
    mean_outputs.append(avgs)
    std_outputs.append(stds)
    times.append(duration)
    # Checkpoint/save the data to file after every config so a crash loses at most one
    checkpoint_data(save_filename, get_save_data(test_configs, get_output_labels(), mean_outputs, std_outputs, times))
    # Print a report on this configuration for the user.
    # BUG FIX: report this config's own iteration count (was the global
    # sim_iterations, which would be wrong for configs with a different count).
    print('{0} iterations run in {1} seconds ({2} average) on {3}.'.format(iterations, duration, duration/iterations, config))
    print('_'*100)
    print_read_friendly(avgs)
    print('_'*100)
    print('_'*100)
Load the data to confirm it saved properly.
In [ ]:
# Reload the checkpoint to confirm it saved properly. A context manager
# closes the handle (the original passed a bare open() and leaked it).
with open(save_filename, "rb") as f:
    load_data = pickle.load(f)
print(load_data)
In [ ]: