In [1]:
# matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

# pandas
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# seaborn
import seaborn as sns
import statsmodels

# other stuff
import numpy as np
import scipy
import re
import pickle
import os
import datetime

# for more sophisticated display
from IPython.display import display, Math, Latex, HTML

from data_tools import *
from helper_functions import round_significant
from math import floor, ceil

%load_ext autoreload
%autoreload 2

In [2]:
files = [
#         vvvvvvvvvvvvvvvvvvvvvv=== depth=11 ===vvvvvvvvvvvvvvvvvvvvvvvv
#         './data/2017-02-22_00-56-13_YNKMQFSEMD.pickle', # active 0.5
#         './data/2017-02-22_01-37-32_LYW54AI19Q.pickle', # rollout
#         './data/2017-02-22_11-21-27_TM5WEHCC1H.pickle', # active 0.1
#         './data/2017-02-23_23-35-46_UXP32B0EI5.pickle', # rollout
#         './data/2017-02-23_23-42-35_F0MJQ7FOFU.pickle', # active 0.1, T=100, determ. rewards
#         './data/2017-02-24_00-15-41_I7ZLQVXH0E.pickle', # active 0.5, T=100, determ. rewards
#         './data/2017-02-24_14-52-38_I3P75AW17A.pickle', # active 0.1, T=100, stoch. rewards
#         './data/2017-02-24_15-00-17_MKF60HXY51.pickle', # active 0.5, T=100, stoch. rewards
#         './data/2017-02-24_15-18-09_R4IEB4F2J6.pickle', # rollout
#         './data/2017-02-24_20-57-53_CNGG7XXI5Q.pickle', # mixed 10
#         './data/2017-02-25_02-52-35_ICXAJF6IYC.pickle', # mixed harmonic, T=0.5
#         './data/2017-02-25_02-52-58_1DKJ86EK2L.pickle', # mixed exponential, T=0.5
#         './data/2017-02-26_04-47-36_0HCYMVURZH.pickle', # mixed harmonic, T=10
#         './data/2017-02-26_04-56-00_6LOUY0BO09.pickle', # mixed exponential, T=10
#         './data/2017-02-28_05-48-36_O94X1VQNJZ.pickle', # rollout
#         './data/2017-02-28_07-30-25_EAQFOTRXF7.pickle', # active 0.1
#         -----------------------------------------------------------------
#         './data/2017-02-28_18-02-26_3SHLAF9Q8A.pickle', # active 0.1
#         './data/2017-02-28_18-13-25_F9TJWHZA7L.pickle', # rollout
#         './data/2017-02-28_18-19-48_5916TQJ0R0.pickle', # mixed window
#         vvvvvvvvvvvvvvvvvvvvvv=== depth=21 ===vvvvvvvvvvvvvvvvvvvvvvvv
#         './data/2017-03-01_15-37-27_IBYN3JBD1I.pickle', # active 0.1
#         './data/2017-03-01_16-12-24_7ILEL958IR.pickle', # rollout
#         './data/2017-03-02_05-14-27_4NLAHRTJ6I.pickle', # active 0.1 (determ. reward)
#         './data/2017-03-02_09-26-28_BT4SATX1BF.pickle', # rollout (determ. reward)
#         './data/2017-03-02_13-43-18_VL6GIMQ0CT.pickle', # active 0.1 (rnd. intermediate reward)
#         './data/2017-03-02_14-08-23_BTQI300R3D.pickle', # rollout (rnd. intermediate reward)
#         './data/tmp/2017-03-02_19-24-18_B3BLQ2A2PU.pickle', # mixed window
#         './data/tmp/2017-03-02_19-40-26_X5MACJ22CU.pickle', # mixed exponential
#         -----------------------------------------------------------------
#         './data/2017-03-03_02-22-27_0I49HCWBFZ.pickle', # switch 0.1/0.05
#         './data/2017-03-03_02-40-39_6Y49ORESKK.pickle', # switch 0.1/0.01
#         './data/2017-03-03_02-51-43_GFF366XK7I.pickle', # mixed window
#         './data/2017-03-03_03-12-47_6B7ES0T5FU.pickle', # mixed exponential
#         './data/2017-03-03_14-51-24_PRCR7LMDE8.pickle',  # active 0.2
#         './data/2017-03-03_15-27-36_Q7UV2P9C6L.pickle', # active 0.01
#         -----------------------------new version----------------------------------
        './data/2017-03-05_15-13-32_6FI10TNYF6.pickle',  # rollout
        './data/2017-03-05_15-17-46_PKVE49BQD7.pickle',  # active 0.1
        './data/2017-03-05_15-53-46_6S1YAD5Y6U.pickle',  # active 0.01
        './data/2017-03-05_16-37-13_EYYUPC7JZL.pickle',  # active 0.2        
#         vvvvvvvvvvvvvvvvvvvvvv=== 6/6 ===vvvvvvvvvvvvvvvvvvvvvvvv
#         './data/2017-03-04_04-35-08_Z891API2CU.pickle',  # rollout
#         './data/2017-03-04_05-30-57_WSBZZO7Q33.pickle',  # active 0.1
#         './data/2017-03-04_16-09-12_WCT31NW5GA.pickle',  # active 0.01
]

In [3]:
for file in files:
    tmp_data = read_data([file])
    %reset_selective -f tmp_
    len(tmp_data)


reading 1 files
...done
computing extra variables...
...done
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-1cf3544a67b6> in <module>()
      2     tmp_data = read_data([file])
      3     get_ipython().magic('reset_selective -f tmp_')
----> 4     len(tmp_data)

NameError: name 'tmp_data' is not defined

In [3]:
# batch_ids = data['batch'].unique()
table = []
N = 3
fig, ax = plt.subplots(N, 1, figsize=(15, 10 * N))
legend =[[] for _ in range(len(ax))]
for idx, file in enumerate(files):
    plot_data = read_data([file])
    if len(plot_data['batch'].unique()) > 1:
        print("WARNING: multiple batches in one file:")
        for batch in plot_data['batch'].unique():
            print("    ", batch)
    batch = plot_data['batch'].unique()[0]
    
    # print progess info
    print('Processing batch ' + batch + '...')
    
    # define legend/table entries
    legend_entry = "{}{}".format(batch, plot_data['method'].unique())
    table.append([batch, len(plot_data['run ID'].unique())] + \
                 list(identify(row=list(plot_data.loc[0,:].values),
                               header=list(plot_data.columns.values))))
    
    # define parameters
    x = 'samples'
    color = sns.color_palette()[idx]
    kernel_width = 50
    kernel_exp = 2
    resolution = 200
    markersize = 3
    thick_line = 10
    thin_line = 1
    alpha = 0.25
    
    # plot best action probability
    a = ax[0]
    a.set_title('Best Action')
    print('Best Action')
    band_plot(*(mean_std(data=plot_data,
                         x=x,
                         y='best action probability',
                         kernel_width=kernel_width,
                         kernel_exp=kernel_exp,
                         resolution=resolution)[0:2]),
              color=color + (alpha,),
              ax=a,
              line_plot_args={'markersize': 0,
                              'linewidth': thick_line,
                              'label': '_nolegend_'})
    band_plot(*(mean_std(data=plot_data, x=x, y='best action probability')[0:2]),
              color=color,
              ax=a,
              line_plot_args={'markersize': markersize,
                              'linewidth': thin_line}
             )
    legend[0].append(legend_entry)
    
    # objective
    a = ax[1]
    a.set_title('Objective')
    print('Objective')
    band_plot(*(mean_std(data=plot_data,
                         x=x,
                         y='objective',
                         kernel_width=kernel_width,
                         kernel_exp=kernel_exp,
                         resolution=resolution)[0:2]),
              color=color + (alpha, ),
              ax=a,
              line_plot_args={'markersize': 0,
                              'linewidth': thick_line,
                              'label': '_nolegend_'})
    band_plot(*(mean_std(data=plot_data, x=x, y='objective')[0:2]),
              color=color,
              ax=a,
              line_plot_args={'markersize': markersize,
                              'linewidth': thin_line}
             )
    a.set_yscale("log")
    legend[1].append(legend_entry)
    
    # change of objective for active/rollout samples
    a = ax[2]
    a.set_title('Objective Change')
    print('Objective Change')
    scale = lambda x: sigmoid(x, 1e-5)
    x_vals_1, y_mean_1, y_error_1 = mean_std(data=plot_data[plot_data['type']=='active'],
                                x=x,
                                y='normalized signed objective change',
                                kernel_width=kernel_width,
                                kernel_exp=kernel_exp,
                                resolution=resolution)
    x_vals_2, y_mean_2, y_error_2 = mean_std(data=plot_data[plot_data['type']=='rollout'],
                                x=x,
                                y='normalized signed objective change',
                                kernel_width=kernel_width,
                                kernel_exp=kernel_exp,
                                resolution=resolution)
    band_plot(x_vals_1, y_mean_1, color=color + (alpha, ), scale_func=scale, ax=a,
              line_plot_args={'linewidth': thick_line,
                              'label': '_nolegend_'})
    band_plot(x_vals_2, y_mean_2, color=color + (alpha, ), scale_func=scale, ax=a,
              line_plot_args={'linewidth': thick_line,
                              'label': '_nolegend_'})
    x_vals_1, y_mean_1, y_error_1 = mean_std(data=plot_data[plot_data['type']=='active'],
                                x=x,
                                y='normalized signed objective change')
    x_vals_2, y_mean_2, y_error_2 = mean_std(data=plot_data[plot_data['type']=='rollout'],
                                x=x,
                                y='normalized signed objective change')
    band_plot(x_vals_1, y_mean_1, color=color, scale_func=scale, ax=a,
              line_plot_args={'markersize': markersize,
                              'linewidth': thin_line})
    band_plot(x_vals_2, y_mean_2, color=color, scale_func=scale, ax=a,
              line_plot_args={'linestyle': 'dotted',
                              'markersize': markersize,
                              'linewidth': thin_line})
    ticks = np.append(np.arange(-7e-5, 0, 5e-6), np.arange(0, 7e-5, 5e-6))
    a.set_yticks(scale(ticks))
    ticks = list(ticks)
    ticks[:7] = [''] * 7
    ticks[-6:] = [''] * 7
    a.set_yticklabels(ticks)
    legend[2].append(legend_entry + ' – active')
    legend[2].append(legend_entry + ' – rollout')
    
    %reset_selective plot_data

# add legends to plots etc
for idx, a in enumerate(ax):
    a.legend(legend[idx], loc='best', fontsize='12');
    a.set_xlim(0, data['samples'].max()+1)
#     a.set_xlim(0, 1000)
fig.tight_layout()
plt.savefig('plot.pdf')

# show table of methods and parameters
table = pd.DataFrame(data=table, columns=['batch', '# runs'] + list(get_discriminating_columns()))
for col in table.columns:
    if len(table[col].unique()) == 1 and col != '# runs':
        table.drop(col, axis=1, inplace=True)
display(table)


reading 1 files
...done
computing extra variables...
...done
Processing batch 2017-03-05_15-13-32_6FI10TNYF6...
Best Action
Objective
Objective Change
/home/robert/data/git_repos/active-tree-search/python/data_tools.py:30: RuntimeWarning: invalid value encountered in double_scalars
  y_mean[x_idx] = weights.dot(data[y].values) / weight_sum
/home/robert/data/git_repos/active-tree-search/python/data_tools.py:32: RuntimeWarning: invalid value encountered in double_scalars
  y_mean_std[x_idx] = np.sqrt(y_var / weight_sum)
Once deleted, variables cannot be recovered. Proceed (y/[n])?  y
reading 1 files
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-3-4785a82eaab3> in <module>()
      5 legend =[[] for _ in range(len(ax))]
      6 for idx, file in enumerate(files):
----> 7     plot_data = read_data([file])
      8     if len(plot_data['batch'].unique()) > 1:
      9         print("WARNING: multiple batches in one file:")

/home/robert/data/git_repos/active-tree-search/python/data_tools.py in read_data(list_of_files)
    196     for file in list_of_files:
    197         header, values = read_file(file_name=file)
--> 198         data.append(pd.DataFrame(data=values, columns=header))
    199     data = pd.concat(data)
    200     print('...done')

/usr/lib/python3.5/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    316 
    317                     mgr = _arrays_to_mgr(arrays, columns, index, columns,
--> 318                                          dtype=dtype)
    319                 else:
    320                     mgr = self._init_ndarray(data, index, columns, dtype=dtype,

/usr/lib/python3.5/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5387     axes = [_ensure_index(columns), _ensure_index(index)]
   5388 
-> 5389     return create_block_manager_from_arrays(arrays, arr_names, axes)
   5390 
   5391 

/usr/lib/python3.5/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
   4237 
   4238     try:
-> 4239         blocks = form_blocks(arrays, names, axes)
   4240         mgr = BlockManager(blocks, axes)
   4241         mgr._consolidate_inplace()

/usr/lib/python3.5/site-packages/pandas/core/internals.py in form_blocks(arrays, names, axes)
   4314 
   4315     if len(int_items):
-> 4316         int_blocks = _multi_blockify(int_items)
   4317         blocks.extend(int_blocks)
   4318 

/usr/lib/python3.5/site-packages/pandas/core/internals.py in _multi_blockify(tuples, dtype)
   4383     for dtype, tup_block in grouper:
   4384 
-> 4385         values, placement = _stack_arrays(list(tup_block), dtype)
   4386 
   4387         block = make_block(values, placement=placement)

/usr/lib/python3.5/site-packages/pandas/core/internals.py in _stack_arrays(tuples, dtype)
   4426     shape = (len(arrays),) + _shape_compat(first)
   4427 
-> 4428     stacked = np.empty(shape, dtype=dtype)
   4429     for i, arr in enumerate(arrays):
   4430         stacked[i] = _asarray_compat(arr)

MemoryError: 

Old Stuff


In [33]:
# combine raw data
raw_data = {}
# for file_name in ['../tmp1/data.raw', '../tmp2/data.raw', '../tmp3/data.raw', '../tmp4/data.raw']:
for file_name in ['./data.raw']:
    with open(file_name, 'rb') as file:
        for method, data in pickle.load(file).items():
            if method not in raw_data:
                raw_data[method] = []
            raw_data[method] += data
# transform data into pandas data frame
columns = None
for method_data in raw_data.values():
    for _, action_values in method_data:
        action_columns = list(action_values.keys())
        if columns is None:
            columns = action_columns
        else:
            if columns != action_columns:
                raise UserWarning("Inconsistent action columns: {} / {}".format(columns, action_columns))
columns = ['method', 'samples'] + columns
data_matrix = []
for method, method_data in raw_data.items():
    for samples, action_values in method_data:
        data_matrix.append([method, samples] +
                           [action_value for action_value in action_values.values()])
pandas_data = pd.DataFrame(data=data_matrix, columns=columns)
pickle.dump(raw_data, open('data.raw.tmp', "wb"))
pickle.dump(pandas_data, open('data.pandas.tmp', "wb"))

In [34]:
data = pickle.load(open("data.pandas.tmp", "rb"))
# data['best action probability'] = np.logical_and(data['+'] > data['•'], data['+'] > data['-'])
# data['Q-difference'] = ((data['+'] - data['•']) + (data['+'] - data['-']))/2
data['best action probability'] = data['+'] > data['•']
data['Q-difference'] = data['+'] - data['•']
display(data.head())
# sns.set(color_codes=True)
# # data = data[data['method']=='trial-rollout']
# # data = data[data['method']=='active']
# # data = data[data['samples']<=50]
# for method in data['method'].unique():
#     try:
#         ax = sns.regplot(x="samples", y="best action probability", data=data[data['method']==method],
#                          n_boot=500, y_jitter=.03, scatter=True, logistic=True)
# #         ax = sns.regplot(x="samples", y="Q-difference", data=data[data['method']==method],
# #                          n_boot=500, x_jitter=1, y_jitter=.03, scatter=True, fit_reg=False)
#     except statsmodels.tools.sm_exceptions.PerfectSeparationError:
#         print("Warning: Perfect separation for method '{}'".format(method))
# plt.legend(data['method'].unique(), loc='best');
# ax.set_xlim(0, data['samples'].max()+1)
# # ax.set_ylim(-0.2,1.4);
# # plt.savefig('plot.pdf')


method samples + best action probability Q-difference
0 (active, 1.41421356237, 0.5) 34 0.525155 0.570315 False -0.045160
1 (active, 1.41421356237, 0.5) 35 0.524125 0.570315 False -0.046190
2 (active, 1.41421356237, 0.5) 36 0.542104 0.570315 False -0.028211
3 (active, 1.41421356237, 0.5) 37 0.557369 0.570315 False -0.012946
4 (active, 1.41421356237, 0.5) 38 0.566937 0.570315 False -0.003378

In [19]:
series_data = data[data['method']==data['method'].unique()[0]]
series_data.reset_index(inplace=True, drop=True)
idx_list = []
last_sample = None
for d in series_data.values:
    if len(idx_list) == 0:
        idx_list.append(0)
    elif d[1] < last_sample:
        idx_list.append(idx_list[-1]+1)
    else:
        idx_list.append(idx_list[-1])
    last_sample = d[1]
series_data['run'] = idx_list
display(series_data.head())
sns.tsplot(data=series_data,
           time='samples',
           unit='run',
           value='best action probability',
           err_style='unit_traces')


/usr/lib/python3.5/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
method samples + best action probability Q-difference run
0 (active, 1.41421356237, 0.5) 34 0.525155 0.570315 False -0.045160 0
1 (active, 1.41421356237, 0.5) 35 0.524125 0.570315 False -0.046190 0
2 (active, 1.41421356237, 0.5) 36 0.542104 0.570315 False -0.028211 0
3 (active, 1.41421356237, 0.5) 37 0.557369 0.570315 False -0.012946 0
4 (active, 1.41421356237, 0.5) 38 0.566937 0.570315 False -0.003378 0
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8d9ba54470>

In [ ]: