In [1]:
%matplotlib notebook
from analyze import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
In [2]:
# Conditions key:
# b = buzz, s = silent
# j = Justin's WG, r = Seminar room, c = Cafeteria
# e = easy, h = hard
subject_meta = {
    14: 'bje',
    15: 'bre',
    16: 'sch',
    17: 'sce',
    18: 'sch',
    19: 'sce',
    20: 'bjh',
}
In [3]:
# Load the full dataset and extract one subject's head-distance time series.
data = load_all_data()
subject_id = 20
s20 = get_subject(data, subject_id)
baseline = get_baseline(s20)
times, widths = get_distances(s20)
# Convert absolute timestamps to seconds elapsed since the first sample
time_delta = [(t - times[0]).total_seconds() for t in times]
In [4]:
# Two stacked panels: raw series on top, outlier-filtered series below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)

# Top panel: raw data
plot_it(ax1, time_delta, widths, conditions=subject_meta[subject_id], baseline=baseline)

# Exclude outliers: keep only samples within a z-score of `z_threshold`
# (limits errors caused by signal noise)
z_threshold = 3
time_keep, width_keep = remove_outliers(time_delta, widths, z_threshold)

# Bottom panel: cleaned data
title = """Head Proximity to Computer over Time
Excluding Outliers (z = {})
Subject ID: {} Condition: {}""".format(z_threshold, subject_id, subject_meta[subject_id])
plot_it(ax2, time_keep, width_keep, conditions=subject_meta[subject_id],
        baseline=baseline, title=title)

plt.tight_layout()
plt.show()
In [5]:
# Overview figure: one panel per subject, with the same z-score outlier
# filtering applied (handled inside plot_subjects).
plot_subjects(data, subject_meta, exclude_outliers=True)
In [13]:
# NOTE(review): `csv` appears unused in this notebook (pandas handles all the
# I/O below); kept in case an unseen cell relies on it.
import csv
import pandas as pd

# Collect one DataFrame per subject, each indexed by seconds elapsed since
# that subject's first sample. `keys` records the matching subject ids for
# the pd.concat in the next cell.
df_list = []
keys = []
for subject_id in subject_meta.keys():
    subject_data = get_subject(data, subject_id)
    # presumably needed for normalization elsewhere — unused in this cell
    baseline = get_baseline(subject_data)
    times, widths = get_distances(subject_data)
    time_delta = [(t - times[0]).total_seconds() for t in times]
    # Optional: trim to about the first 20 minutes (~30 samples/minute)
    # time_delta = time_delta[:21*30]
    # widths = widths[:21*30]
    df = pd.DataFrame(pd.Series(widths, index=time_delta))
    df_list.append(df)
    keys.append(subject_id)
In [14]:
# Stack all per-subject frames into one MultiIndexed frame:
# outer level = subject id, inner level = seconds since that subject's start.
df = pd.concat(df_list, keys = keys)
# Display the combined frame (rich notebook output)
df
Out[14]:
In [8]:
# Place all subjects onto one common 1-second-resolution Timedelta index so
# the series line up column-wise; seconds with no sample become NaN and are
# filled in the next cell.
MAX_SECONDS = 1240  # spans the longest recording — TODO confirm against data
index = [pd.Timedelta(np.timedelta64(x, 's')) for x in range(1, MAX_SECONDS)]
df = pd.DataFrame(index=index)
data = load_all_data()
for subject_id in subject_meta.keys():
    subject_data = get_subject(data, subject_id)
    times, widths = get_distances(subject_data)
    # Timedelta offsets from each subject's first sample
    time_delta = [(t - times[0]) for t in times]
    # Assigning a Series reindexes it onto `index`; unmatched seconds -> NaN.
    df[subject_id] = pd.Series(widths, index=time_delta)
In [9]:
# Fill missing values: linear interpolation between neighbors, then forward-
# and backward-fill the leading/trailing NaNs interpolation cannot reach.
df_clean = df.interpolate()
# .bfill() replaces the deprecated fillna(method='backfill'); astype(int)
# replaces the deprecated DataFrame.applymap(int) for this numeric frame.
df_clean = df_clean.ffill().bfill().astype(int)
# Save to csv file for anova analysis in R
df_clean.to_csv('posture_data.csv')