Split datasets into sequences and plot them

Before you start

Refer to the README in nupic.research/projects/capybara/datasets to download and format the datasets.

Helper functions



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import patches, colors
import seaborn as sns
import pandas as pd
import numpy as np
import os



In [2]:

    
# Debugger
# Interactive debugging helper: `t` is bound to the `set_trace` method of a new
# IPython Pdb instance, so calling `t()` in a cell or function opens the debugger.
from IPython.core.debugger import Pdb
t = Pdb().set_trace



In [3]:

    
# Seaborn settings for nice plots
sns.set_style('dark')
# Preview the first 3 colors of the 'colorblind' palette, the palette that
# plot_df_labels uses to color labels.
colors_example = sns.color_palette('colorblind', 3)
sns.palplot(colors_example)



In [4]:

    
# Utility functions to generate time series data
np.random.seed(22)  # fixed seed so the generated example data is reproducible


def generate_data(period, num_series, num_points, var):
    """
    Generate noisy sine-wave time series.

    Every series is the same wave, sin(t * period), sampled at `num_points`
    evenly spaced values of t between 0 and num_points/2. Two random terms are
    added to it: noise from np.random.rand drawn independently for every point
    of every series, and a Gaussian offset (np.random.randn scaled by `var`)
    drawn once per series.

    :param period: (number) multiplier applied to t inside the sine.
    :param num_series: (int) number of series (rows) to generate.
    :param num_points: (int) number of points (columns) per series.
    :param var: (float) scale of the per-series Gaussian offset.
    :return: (np.ndarray) array of shape (num_series, num_points).
    """
    time_axis = np.linspace(0, num_points/2, num_points)
    point_noise = np.random.rand(num_series, num_points)
    series_offset = np.random.randn(num_series, 1) * var
    wave = np.sin(time_axis * period)
    return point_noise + series_offset + wave
    

def generate_series(num_labels, num_series_per_label, num_points_per_series):
    """
    Build a labelled dataframe of artificial time series.

    For every label in range(num_labels), one group of series is produced by
    generate_data with `label + 1` as the sine multiplier and a fixed noise
    scale of 0.5.

    :param num_labels: (int) number of distinct labels (groups).
    :param num_series_per_label: (int) number of series in each group.
    :param num_points_per_series: (int) number of points in each series.
    :return: (pd.DataFrame) one row per series, with columns 't0', 't1', ...
        followed by an integer 'label' column. The groups are concatenated
        as-is, so the row index restarts at 0 for every label.
    """
    var = 0.5
    frames = []
    for label in range(num_labels):
        data = generate_data(label + 1, num_series_per_label, num_points_per_series, var)
        time_columns = ['t%s' % i for i in range(len(data[0]))]
        frame = pd.DataFrame(data=data, columns=time_columns)
        frame['label'] = np.full(num_series_per_label, label, dtype=int)
        frames.append(frame)
    return pd.concat(frames)



In [5]:

    
class OffsetTooLargeException(Exception):
    """Raised by reshape_df when an offset is too large to fill the requested window."""


# Utility function to reshape a dataframe
def reshape_df(df, max_rows, max_columns, row_offset=0, col_offset=0):
    """
    Crop a dataframe to a window of rows and/or columns.

    The 'label' column never takes part in the column window: the window is
    taken over the other (feature) columns only, and 'label' is appended as the
    last column of the result when the dataframe has one. This gives the same
    number of feature columns whether 'label' is stored first or last.

    A dataframe that is smaller than the requested window is returned whole
    (for a zero offset) instead of being rejected.

    :param df: (pd.DataFrame) data to crop.
    :param max_rows: (int or None) maximum number of rows to keep. None keeps
        every row.
    :param max_columns: (int or None) maximum number of columns to keep,
        counting 'label' when the dataframe has one. None keeps every column.
    :param row_offset: (int) position of the first row to keep.
    :param col_offset: (int) position, among the feature columns, of the first
        column to keep.
    :return: (pd.DataFrame) the cropped data.
    :raises OffsetTooLargeException: if an offset is larger than the largest
        offset that still fills the window.
    """
    if max_columns is not None:
        has_label = 'label' in df.columns
        feature_columns = [c for c in df.columns if c != 'label']
        # One slot of max_columns is reserved for 'label' when it exists.
        num_features = max(0, max_columns - 1 if has_label else max_columns)
        # Clamp at 0 so a frame narrower than the window is not rejected with
        # a negative limit.
        max_col_offset = max(0, len(feature_columns) - num_features)
        if col_offset > max_col_offset:
            raise OffsetTooLargeException(
                'Column offset should be at most %s' % max_col_offset)
        columns = feature_columns[col_offset:col_offset + num_features]
        if has_label:
            columns.append('label')
        df = df.loc[:, columns]

    if max_rows is not None:
        # Same clamping as for columns: fewer rows than max_rows is fine.
        max_row_offset = max(0, len(df) - max_rows)
        if row_offset > max_row_offset:
            raise OffsetTooLargeException(
                'Row offset should be at most %s' % max_row_offset)
        df = df.iloc[row_offset:row_offset + max_rows]
    return df



In [6]:

    
# Utility function to plot groups of time series
def plot_df(df, title=None, size=1, ymin=None, ymax=None):
    """
    Plot the time series of a dataframe, one figure per label.

    For each distinct value of the 'label' column, all rows with that label
    are drawn as overlaid lines. When a label has more than one series, a
    second subplot below shows the spread across those series.

    :param df: (pd.DataFrame) one series per row. Must contain a 'label'
        column; every other column is treated as a time step.
    :param title: (str or None) y-axis label of the series plot. The spread
        plot is labelled 'var(<title>)'.
    :param size: (number) scaling factor for the figure height.
    :param ymin: (number or None) lower y limit; None leaves it automatic.
    :param ymax: (number or None) upper y limit; None leaves it automatic.
    """
    feature_mask = df.columns.values != 'label'
    for label in df.label.unique():
        data = df.loc[df.label == label, feature_mask].values
        if len(data) == 0:
            # Happens for a NaN label, which never compares equal to itself.
            continue

        # With a single series there is nothing to show across series.
        plot_uncertainty = len(data) > 1
        f, axes = plt.subplots(2 if plot_uncertainty else 1, figsize=(20, 5 * size))
        series_ax = axes[0] if plot_uncertainty else axes

        # Plot all series
        for series in data:
            series_ax.plot(series)
        series_ax.set_xlim(0, data.shape[1] - 1)
        _finish_axis(series_ax, title, ymin, ymax)

        if plot_uncertainty:
            # Show variance / uncertainty across all series
            _plot_spread(axes[1], data)
            spread_label = None if title is None else 'var(%s)' % title
            _finish_axis(axes[1], spread_label, ymin, ymax)


def _finish_axis(ax, ylabel, ymin, ymax):
    """Apply the optional y limits and y-axis label to `ax`."""
    if ymin is not None or ymax is not None:
        # matplotlib leaves a limit unchanged when it is passed as None
        ax.set_ylim(ymin, ymax)
    if ylabel is not None:
        ax.set_ylabel(ylabel)


def _plot_spread(ax, data):
    """Draw the spread across the rows of `data` (one series per row) on `ax`."""
    tsplot = getattr(sns, 'tsplot', None)
    if tsplot is not None:
        tsplot(data=data, ax=ax)
        return
    # sns.tsplot is not available in recent seaborn releases: fall back to the
    # mean across series with a band of one standard deviation around it.
    steps = np.arange(data.shape[1])
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    ax.plot(steps, mean)
    ax.fill_between(steps, mean - std, mean + std, alpha=0.2)
    ax.set_xlim(0, data.shape[1] - 1)



# Utility functions plot timeseries with color patches.
# Useful for timeseries with labels.
# Note on how to draw rectangles with mpl: http://matthiaseisen.com/pp/patterns/p0203 
def plot_df_labels(df, name):
    """
    Plot data, highlight each label in different colors, and add a legend.

    A palette legend is drawn first, then one subplot per non-label column.
    Each subplot shows the column's values across all rows, with the
    background shaded according to the row labels.

    :param df: (pd.DataFrame) data to plot. Must contain a 'label' column.
    :param name: (str) name of the plot.
    """
    # Label IDs and color palette
    label_names = sorted(df.label.unique())
    palette = sns.color_palette('colorblind', len(label_names))

    # Plot color palette legend
    plot_palette_legend(palette, label_names, 'Labels of %s' % name, alpha=0.5, size=0.5)

    # Features
    columns = [c for c in df.columns.values if c != 'label']
    nb_columns = len(columns)
    if nb_columns == 0:
        # Nothing to draw besides the legend; plt.subplots rejects nrows=0.
        return

    # Plot data
    # squeeze=False keeps `axes` two-dimensional even with a single feature
    # column, where plt.subplots would otherwise return a bare Axes object
    # that cannot be indexed.
    fig, axes = plt.subplots(nrows=nb_columns, ncols=1,
                             figsize=(20, nb_columns * 2), squeeze=False)

    labels = df.label.values
    for ax, column_name in zip(axes[:, 0], columns):
        values = df[column_name].values
        ax.plot(values)
        add_patches(ax, values, labels, label_names, palette, alpha=0.5)
        ax.set_ylabel(column_name)
    plt.tight_layout()
    

def plot_palette_legend(palette, label_names, title, alpha=0.5, size=0.5):
    """
    Draw a color palette as one row of swatches, with one tick label per color.

    :param palette: sequence of matplotlib colors as returned by
        seaborn.color_palette()
    :param label_names: tick labels, one per palette color, in palette order
    :param title: (str) title of the legend, aligned to the left
    :param alpha: (float) opacity of the swatches
    :param size: scaling factor for size of plot
    """
    num_colors = len(palette)
    swatch_positions = np.arange(num_colors)

    fig, legend_ax = plt.subplots(1, 1, figsize=(num_colors * size, size))
    legend_ax.imshow(swatch_positions.reshape(1, num_colors),
                     cmap=colors.ListedColormap(palette),
                     alpha=alpha,
                     interpolation="nearest", aspect="auto")
    legend_ax.set_title(title, loc='left')

    # One tick per swatch on x, no tick labels on y
    legend_ax.set_xticks(swatch_positions)
    legend_ax.set_xticklabels(label_names)
    legend_ax.set_yticklabels([])


def add_patches(ax, values, labels, label_ids, palette, alpha=0.5):
    """
    Shade the background of `ax` with one colored rectangle per run of
    consecutive identical labels.

    The x limits are set to [0, len(values) - 1] and the y limits to the
    range of `values`; every rectangle spans that full y range.

    :param ax: matplotlib axes to draw on.
    :param values: non-empty sequence of the values plotted on `ax`.
    :param labels: sequence of labels, indexed like `values`.
    :param label_ids: (list) the distinct labels; the position of a label in
        this list selects its color in `palette`.
    :param palette: sequence of matplotlib colors.
    :param alpha: (float) opacity of the rectangles.
    """
    bottom = min(values)
    top = max(values)
    ax.set_xlim(0, len(values) - 1)
    ax.set_ylim(bottom, top)

    def shade(left, right, label):
        # Full-height rectangle from x=left to x=right, colored by label.
        fill = palette[label_ids.index(label)]
        ax.add_patch(patches.Rectangle((left, bottom),  # (x, y)
                                       right - left,  # width
                                       top - bottom,  # height
                                       alpha=alpha,
                                       color=fill))

    run_start = 0
    run_label = labels[0]
    # Close the current run every time the label changes
    for position in range(len(values)):
        if labels[position] != run_label:
            shade(run_start, position, run_label)
            run_start = position
            run_label = labels[position]

    # The last run extends to the final sample
    shade(run_start, len(values) - 1, run_label)



In [7]:

    
# Util function to plot a summary of the sequences in a dataframe.
def plot_sequences(df, label_names, max_rows, max_columns, row_offset, col_offset, ymin, ymax, plot_size):
    """
    Print the number of sequences per label and plot a sample of each label.

    :param df: (pd.DataFrame) one sequence per row, with a 'label' column.
    :param label_names: sequence of display names, indexed by int(label).
    :param max_rows: (int or None) passed to reshape_df.
    :param max_columns: (int or None) passed to reshape_df.
    :param row_offset: (int) passed to reshape_df.
    :param col_offset: (int) passed to reshape_df.
    :param ymin: lower y limit passed to plot_df.
    :param ymax: upper y limit passed to plot_df.
    :param plot_size: figure size factor passed to plot_df.
    """
    # print() with a single pre-formatted string gives the same output on
    # Python 2 and Python 3.
    print('number of sequences: %s' % len(df))
    for label in sorted(df.label.unique()):
        label_name = label_names[int(label)]
        sequences = df[df.label == label]
        print('number of sequences for label %s: %s' % (label_name, len(sequences)))
        cropped = reshape_df(sequences, max_rows, max_columns,
                             col_offset=col_offset, row_offset=row_offset)
        plot_df(cropped, label_name, ymin=ymin, ymax=ymax, size=plot_size)



In [8]:

    
# Example with artificial data
num_labels = 2
num_series_per_label = 10
num_points_per_series = 20
df = generate_series(num_labels, num_series_per_label, num_points_per_series)

# Keep every row but at most 3 columns ('label' counts as one of them)
max_rows = None
max_columns = 3
df1 = reshape_df(df, max_rows, max_columns)
# One subplot per remaining feature column, shaded by the label of each row
plot_df_labels(df1, 'artificial data')



In [9]:

    
# Plot the first 10 series of each label, keeping every column
max_rows = 10
max_columns = None
for label in df.label.unique():
    df2 = reshape_df(df[df.label == label], max_rows, max_columns)
    plot_df(df2, title='class%s' %label)

UCR data



In [10]:

    
# Util functions to load a UCR dataset
def load(csv_path):
    """
    Read a header-less CSV of sequences into a dataframe.

    The first column is named 'label' and the remaining columns are named
    't0', 't1', ... in file order.

    :param csv_path: path of the CSV file (anything pd.read_csv accepts).
    :return: (pd.DataFrame) one row per line of the file.
    """
    frame = pd.read_csv(csv_path, header=None)
    num_time_steps = len(frame.columns) - 1
    time_columns = ['t%s' % step for step in range(num_time_steps)]
    frame.columns = ['label'] + time_columns
    return frame

def get_dataset_names(datasets_dir):
    """
    List the dataset directories located directly under `datasets_dir`.

    Only immediate sub-directories are returned, because callers build file
    paths as <datasets_dir>/<dataset_name>/...; nested directories are not
    datasets. Directories whose name starts with '.' are skipped, and their
    contents are never visited.

    :param datasets_dir: (str) directory that contains one sub-directory per
        dataset.
    :return: (list of str) sub-directory names, in the order reported by
        os.walk. Empty if `datasets_dir` cannot be listed.
    """
    # The first item yielded by os.walk describes datasets_dir itself. Nothing
    # is yielded when the directory cannot be listed, hence the default.
    _, dirs, _ = next(os.walk(datasets_dir), (datasets_dir, [], []))
    # ignore hidden / cache directories
    return [name for name in dirs if not name.startswith('.')]



In [11]:

    
# Load a UCR dataset
phase = 'test'

datasets_dir = "UCR_TS_Archive_2015"
names = get_dataset_names(datasets_dir)
idx = names.index('synthetic_control')
print(names)  # function-call form: same output on Python 2 and Python 3
dataset_name = names[idx]
# File layout: <datasets_dir>/<dataset_name>/<dataset_name>_<PHASE>
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase.upper()))
df = load(input_csv_path)









    



['50words', 'Adiac', 'ArrowHead', 'Beef', 'BeetleFly', 'BirdChicken', 'Car', 'CBF', 'ChlorineConcentration', 'CinC_ECG_torso', 'Coffee', 'Computers', 'Cricket_X', 'Cricket_Y', 'Cricket_Z', 'DiatomSizeReduction', 'DistalPhalanxOutlineAgeGroup', 'DistalPhalanxOutlineCorrect', 'DistalPhalanxTW', 'Earthquakes', 'ECG200', 'ECG5000', 'ECGFiveDays', 'ElectricDevices', 'FaceAll', 'FaceFour', 'FacesUCR', 'FISH', 'FordA', 'FordB', 'Gun_Point', 'Ham', 'HandOutlines', 'Haptics', 'Herring', 'InlineSkate', 'InsectWingbeatSound', 'ItalyPowerDemand', 'LargeKitchenAppliances', 'Lighting2', 'Lighting7', 'MALLAT', 'Meat', 'MedicalImages', 'MiddlePhalanxOutlineAgeGroup', 'MiddlePhalanxOutlineCorrect', 'MiddlePhalanxTW', 'MoteStrain', 'NonInvasiveFatalECG_Thorax1', 'NonInvasiveFatalECG_Thorax2', 'OliveOil', 'OSULeaf', 'PhalangesOutlinesCorrect', 'Phoneme', 'Plane', 'ProximalPhalanxOutlineAgeGroup', 'ProximalPhalanxOutlineCorrect', 'ProximalPhalanxTW', 'RefrigerationDevices', 'ScreenType', 'ShapeletSim', 'ShapesAll', 'SmallKitchenAppliances', 'SonyAIBORobotSurface', 'SonyAIBORobotSurfaceII', 'StarLightCurves', 'Strawberry', 'SwedishLeaf', 'Symbols', 'synthetic_control', 'ToeSegmentation1', 'ToeSegmentation2', 'Trace', 'Two_Patterns', 'TwoLeadECG', 'uWaveGestureLibrary_X', 'uWaveGestureLibrary_Y', 'uWaveGestureLibrary_Z', 'UWaveGestureLibraryAll', 'wafer', 'Wine', 'WordsSynonyms', 'Worms', 'WormsTwoClass', 'yoga']



In [12]:

    
# Settings passed to plot_sequences
label_names = range(7)  # 6 labels but the first label ID is 1, not 0.
max_rows, max_columns = 10, None
row_offset = col_offset = 0
ymin, ymax = -2, 3
plot_size = 1



In [13]:

    
plot_sequences(df, label_names,
               max_rows=max_rows, max_columns=max_columns,
               row_offset=row_offset, col_offset=col_offset,
               ymin=ymin, ymax=ymax, plot_size=plot_size)









    



number of sequences: 300
number of sequences for label 1: 50
number of sequences for label 2: 50
number of sequences for label 3: 50
number of sequences for label 4: 50
number of sequences for label 5: 50
number of sequences for label 6: 50

Synthetic data



In [16]:

    
# Health check to make sure the values are loaded well with pandas
phase = 'TRAIN'
datasets_dir = "SyntheticData"
dataset_name = 'Test1'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))

# Parse the same file with numpy and with pandas
data = np.loadtxt(input_csv_path, delimiter=',')
df = load(input_csv_path)

# The first two columns must agree between the two parsers
assert np.isclose(data[:, 0], df[df.columns.values[0]]).all()
assert np.isclose(data[:, 1], df[df.columns.values[1]]).all()



In [17]:

    
# Display the PNG stored in the same directory as the Test1 data files
from IPython.display import Image, display
display(Image('SyntheticData/Test1/Test1.png'))



In [18]:

    
# Dataset to load: <datasets_dir>/<dataset_name>/<dataset_name>_<phase>
phase = 'TRAIN'
datasets_dir = "SyntheticData"
dataset_name = 'Test1'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))
df = load(input_csv_path)

# Settings passed to plot_sequences
label_names = range(0, 2)  # 2 labels
max_rows, max_columns = 2, None
row_offset = col_offset = 0
ymin, ymax = 0, 1
plot_size = 1



In [19]:

    
plot_sequences(df, label_names,
               max_rows=max_rows, max_columns=max_columns,
               row_offset=row_offset, col_offset=col_offset,
               ymin=ymin, ymax=ymax, plot_size=plot_size)









    



number of sequences: 100
number of sequences for label 0: 50
number of sequences for label 1: 50

UCI data



In [20]:

    
# UCI labels
# Display names for the UCI labels. The list is later used as `label_names`,
# which plot_sequences indexes with int(label), so the position of a name in
# this list is the label ID it stands for.
LABELS = [
    'WALKING',
    'WALKING_UPSTAIRS',
    'WALKING_DOWNSTAIRS',
    'SITTING',
    'STANDING',
    'LAYING'
]



In [21]:

    
# Load the UCI inertial signals (train file) and preview one signal with its label
df = pd.read_csv('uci_har/inertial_signals_train.csv')
# Rename the row index from 0, 1, ... to 't0', 't1', ...
# NOTE(review): presumably each row of this file is one time step; confirm.
df = df.rename(index={i: 't%s' %i for i in range(len(df))})
df[['body_acc_x', 'label']].head()



In [25]:

    
# Directory names, base names and phases for the UCI data.
# NOTE(review): none of the cells visible in this notebook read these four
# variables; confirm whether they are still needed.
input_dir = 'uci_har'
parent_output_dir = 'uci_sequences'
base_names = ['inertial_signals', 'debug']
phases = ['train', 'test']



In [27]:

    
# Dataset to load: <datasets_dir>/<dataset_name>/<dataset_name>_<phase>
phase = 'TRAIN'
datasets_dir = "uci_sequences/inertial_signals"
dataset_name = 'body_acc_x'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))
print(input_csv_path)  # function-call form: same output on Python 2 and Python 3
df = load(input_csv_path)
# Integer labels, so that they can be used to index label_names
df.label = df.label.astype(int)
# Rename the row index from 0, 1, ... to 'sequence_0', 'sequence_1', ...
df = df.rename(index={i: 'sequence_%s' %i for i in range(len(df))})

# Settings passed to plot_sequences
label_names = LABELS
max_rows = 2
max_columns = None
row_offset = 0
col_offset = 0
ymin = -1
ymax = 1
plot_size = 1









    



uci_sequences/inertial_signals/body_acc_x/body_acc_x_TRAIN



In [28]:

    
# Preview the first rows of the loaded sequences
df.head()









    Out[28]:






  
    
      
      label
      t0
      t1
      t2
      t3
      t4
      t5
      t6
      t7
      t8
      ...
      t190
      t191
      t192
      t193
      t194
      t195
      t196
      t197
      t198
      t199
    
  
  
    
      sequence_0
      4
      0.000181
      0.010139
      0.009276
      0.005066
      0.010810
      0.004045
      0.004757
      0.006214
      0.003307
      ...
      -0.000222
      0.001576
      0.003531
      0.002285
      -0.000420
      -0.003738
      -0.006706
      -0.003148
      0.000733
      0.000668
    
    
      sequence_1
      4
      0.002162
      -0.000946
      -0.006476
      -0.003423
      -0.000610
      -0.002929
      -0.001796
      0.000956
      0.002311
      ...
      0.001109
      -0.003149
      -0.008882
      -0.010483
      -0.004482
      0.004528
      0.008167
      0.002929
      -0.004487
      -0.004717
    
    
      sequence_2
      4
      -0.001637
      -0.000097
      0.001614
      0.002619
      0.004765
      0.005851
      0.002579
      0.000677
      0.002138
      ...
      -0.000400
      0.002377
      0.005650
      0.004639
      0.001717
      -0.002094
      -0.006847
      -0.005862
      -0.002603
      -0.003230
    
    
      sequence_3
      4
      -0.001015
      0.001832
      0.001169
      0.000362
      -0.002587
      -0.002581
      0.001470
      0.003026
      0.003734
      ...
      -0.002441
      -0.002222
      -0.000514
      -0.000866
      -0.000462
      -0.000653
      -0.000948
      -0.000315
      -0.000946
      -0.000635
    
    
      sequence_4
      4
      -0.000353
      0.000120
      0.002108
      0.002159
      0.001069
      0.000453
      0.000548
      0.000977
      0.000505
      ...
      -0.003866
      -0.002742
      -0.002602
      -0.001981
      0.001679
      0.003066
      0.001767
      0.000439
      0.000822
      -0.000282
    
  

5 rows × 201 columns



In [29]:

    
plot_sequences(df, label_names,
               max_rows=max_rows, max_columns=max_columns,
               row_offset=row_offset, col_offset=col_offset,
               ymin=ymin, ymax=ymax, plot_size=plot_size)









    



number of sequences: 54
number of sequences for label WALKING: 9
number of sequences for label WALKING_UPSTAIRS: 9
number of sequences for label WALKING_DOWNSTAIRS: 9
number of sequences for label SITTING: 9
number of sequences for label STANDING: 9
number of sequences for label LAYING: 9

Artificial data



In [30]:

    
# Dataset to load: <datasets_dir>/<dataset_name>/<dataset_name>_<phase>
phase = 'ALL'
datasets_dir = "artificial_sequences"
dataset_name = 'y'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))
df = load(input_csv_path)

# Settings passed to plot_sequences
label_names = range(0, 3)  # 3 labels
max_rows, max_columns = 2, None
row_offset = col_offset = 0
ymin, ymax = -2, 12
plot_size = 1



In [31]:

    
plot_sequences(df, label_names,
               max_rows=max_rows, max_columns=max_columns,
               row_offset=row_offset, col_offset=col_offset,
               ymin=ymin, ymax=ymax, plot_size=plot_size)









    



number of sequences: 599
number of sequences for label 0: 99
number of sequences for label 1: 250
number of sequences for label 2: 250

Sensortag data



In [32]:

    
# Dataset to load: <datasets_dir>/<dataset_name>/<dataset_name>_<phase>
phase = 'ALL'
datasets_dir = "sensortag_sequences"
dataset_name = 'y'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))
df = load(input_csv_path)

# Settings passed to plot_sequences
label_names = range(0, 8)  # 7 labels; 8 entries because the label IDs seen in the output start at 1
max_rows, max_columns = 2, None
row_offset = col_offset = 0
ymin, ymax = -10, 10
plot_size = 1



In [33]:

    
plot_sequences(df, label_names,
               max_rows=max_rows, max_columns=max_columns,
               row_offset=row_offset, col_offset=col_offset,
               ymin=ymin, ymax=ymax, plot_size=plot_size)









    



number of sequences: 97
number of sequences for label 1: 15
number of sequences for label 2: 14
number of sequences for label 3: 14
number of sequences for label 4: 15
number of sequences for label 5: 14
number of sequences for label 6: 13
number of sequences for label 7: 12

	label	t0	t1	t2	t3	t4	t5	t6	t7	t8	...	t190	t191	t192	t193	t194	t195	t196	t197	t198	t199
sequence_0	4	0.000181	0.010139	0.009276	0.005066	0.010810	0.004045	0.004757	0.006214	0.003307	...	-0.000222	0.001576	0.003531	0.002285	-0.000420	-0.003738	-0.006706	-0.003148	0.000733	0.000668
sequence_1	4	0.002162	-0.000946	-0.006476	-0.003423	-0.000610	-0.002929	-0.001796	0.000956	0.002311	...	0.001109	-0.003149	-0.008882	-0.010483	-0.004482	0.004528	0.008167	0.002929	-0.004487	-0.004717
sequence_2	4	-0.001637	-0.000097	0.001614	0.002619	0.004765	0.005851	0.002579	0.000677	0.002138	...	-0.000400	0.002377	0.005650	0.004639	0.001717	-0.002094	-0.006847	-0.005862	-0.002603	-0.003230
sequence_3	4	-0.001015	0.001832	0.001169	0.000362	-0.002587	-0.002581	0.001470	0.003026	0.003734	...	-0.002441	-0.002222	-0.000514	-0.000866	-0.000462	-0.000653	-0.000948	-0.000315	-0.000946	-0.000635
sequence_4	4	-0.000353	0.000120	0.002108	0.002159	0.001069	0.000453	0.000548	0.000977	0.000505	...	-0.003866	-0.002742	-0.002602	-0.001981	0.001679	0.003066	0.001767	0.000439	0.000822	-0.000282