Split datasets into sequences and plot them

Helper functions



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import patches, colors
import seaborn as sns
import pandas as pd
import numpy as np
import os



In [2]:

    
# Debugger helper: binds `t` to the `set_trace` method of an IPython Pdb
# instance, so that calling `t()` starts the debugger at the call site.
# NOTE(review): leftover debugging aid. Several functions in this notebook use
# a local variable named `t`, which shadows this helper inside them.
from IPython.core.debugger import Pdb
t = Pdb().set_trace



In [3]:

    
# Seaborn settings for nice plots
sns.set_style('dark')
# Preview a 3-color sample of the 'colorblind' palette, the same palette
# family that plot_df_labels uses to color labels.
colors_example = sns.color_palette('colorblind', 3)
sns.palplot(colors_example)



In [4]:

    
# Utility functions to generate time series data
# Seed NumPy's global random generator: generate_data draws from it, so the
# generated series are reproducible as long as cells run in order.
np.random.seed(22)

def generate_data(period, num_series, num_points, var):
    """
    Generate a batch of noisy sinusoidal time series.

    Every series is ``sin(t * period)`` sampled at ``num_points`` evenly
    spaced values of ``t`` between 0 and ``num_points/2``, to which are added:
      * uniform noise in [0, 1), drawn independently for every point;
      * a normally distributed offset scaled by ``var``, drawn once per series.

    :param period: (number) multiplier applied to ``t`` inside the sine.
    :param num_series: (int) number of series (rows) to generate.
    :param num_points: (int) number of points (columns) per series.
    :param var: (float) scale of the per-series normal offset.
    :return: (np.ndarray) array of shape (num_series, num_points).
    """
    time_axis = np.linspace(0, num_points/2, num_points)
    signal = np.sin(time_axis * period)
    # The draw order matters for reproducibility with the global seed:
    # per-point uniform noise first, then the per-series offsets.
    point_noise = np.random.rand(num_series, num_points)
    series_offset = np.random.randn(num_series, 1) * var
    return point_noise + series_offset + signal
    

def generate_series(num_labels, num_series_per_label, num_points_per_series, var=0.5):
    """
    Build a labelled dataframe of artificial time series.

    For each label ID in ``range(num_labels)``, ``num_series_per_label`` series
    are generated with generate_data() using ``period = label + 1``, so each
    label gets a different sine multiplier.

    :param num_labels: (int) number of distinct labels (IDs 0..num_labels-1).
    :param num_series_per_label: (int) number of series (rows) per label.
    :param num_points_per_series: (int) number of points per series.
    :param var: (float) scale of the per-series offset passed to
        generate_data(). Defaults to 0.5, the value previously hard-coded.
    :return: (pd.DataFrame) one row per series, with columns 't0', 't1', ...
        followed by an integer 'label' column. The per-label frames are
        concatenated as-is, so the index restarts at 0 for every label.
    """
    column_names = ['t%s' % i for i in range(num_points_per_series)]
    dfs = []
    for label in range(num_labels):
        period = label + 1
        data = generate_data(period, num_series_per_label, num_points_per_series, var)
        df = pd.DataFrame(data=data, columns=column_names)
        df['label'] = np.full(num_series_per_label, label, dtype=int)
        dfs.append(df)
    if not dfs:
        # pd.concat() refuses an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=column_names + ['label'])
    return pd.concat(dfs)



In [5]:

    
class OffsetTooLargeException(Exception):
    """Raised by reshape_df when a row or column offset exceeds its allowed maximum."""
    
# Utility function to reshape a dataframe
def reshape_df(df, max_rows, max_columns, row_offset=0, col_offset=0):
    """
    Return a window of ``df`` with at most ``max_rows`` rows and
    ``max_columns`` columns.

    The column window slides over the feature columns only (every column
    except 'label'). The 'label' column, when present, is always kept and
    placed last, wherever it sits in ``df``. ``max_columns`` counts the label
    column, so at most ``max_columns - 1`` feature columns are returned.

    :param df: (pd.DataFrame) data to reshape.
    :param max_rows: (int or None) max number of rows to keep. None keeps all.
    :param max_columns: (int or None) max number of columns to keep, label
        column included. None keeps all columns.
    :param row_offset: (int) position of the first row to keep.
    :param col_offset: (int) position, among the feature columns, of the first
        feature column to keep.
    :return: (pd.DataFrame) the selected window.
    :raises OffsetTooLargeException: if an offset is too large for a full
        window to fit. When the frame is smaller than the requested window,
        the largest accepted offset is 0 and the whole frame is returned.
    """
    if max_columns is not None:
        all_columns = list(df.columns.values)
        # Keep 'label' out of the window so it never uses up a feature slot.
        feature_columns = [c for c in all_columns if c != 'label']
        nb_features = max(0, max_columns - 1)
        max_col_offset = max(0, len(feature_columns) - nb_features)
        if col_offset > max_col_offset:
            raise OffsetTooLargeException('Column offset should not exceed %s' % max_col_offset)
        columns = feature_columns[col_offset:col_offset + nb_features]
        if 'label' in all_columns:
            columns.append('label')
        df = df.loc[:, columns]

    if max_rows is not None:
        # Clamp at 0 so a frame shorter than max_rows is returned whole
        # instead of being rejected for offset 0.
        max_row_offset = max(0, len(df) - max_rows)
        if row_offset > max_row_offset:
            raise OffsetTooLargeException('Row offset should not exceed %s' % max_row_offset)
        df = df.iloc[row_offset:row_offset + max_rows]
    return df



In [6]:

    
# Utility function to plot groups of time series
def plot_df(df, title=None, size=1, ymin=None, ymax=None):
    """
    Plot the time series of a dataframe, one figure per label value.

    For each distinct value of the 'label' column, all the series (rows) with
    that label are drawn on a single axis. When a label has more than one
    series, a second axis shows the spread across series using sns.tsplot.

    NOTE(review): sns.tsplot has been deprecated in later seaborn releases;
    confirm that the installed seaborn version still provides it.

    :param df: (pd.DataFrame) one series per row. Must contain a 'label'
        column; every other column is treated as a time step.
    :param title: (str or None) y-axis label of the series plot. The spread
        plot is labelled 'var(<title>)'.
    :param size: (number) scaling factor applied to the figure height.
    :param ymin: (number or None) lower y limit. None leaves it automatic.
    :param ymax: (number or None) upper y limit. None leaves it automatic.
    """
    feature_mask = df.columns.values != 'label'
    for label in df.label.unique():
        data = df.loc[df.label == label, feature_mask].values

        # With a single series there is no spread across series to show.
        plot_uncertainty = len(data) > 1
        nb_axes = 2 if plot_uncertainty else 1

        # squeeze=False always yields an array of axes, so both cases share
        # the same code path.
        f, axes = plt.subplots(nb_axes, figsize=(20, 5 * size), squeeze=False)
        axes = axes.ravel()

        # Plot all series
        series_ax = axes[0]
        for series in data:
            series_ax.plot(series)
        series_ax.set_xlim(0, data.shape[1] - 1)

        # Show variance / uncertainty across all series
        if plot_uncertainty:
            sns.tsplot(data=data, ax=axes[1])

        # Apply whichever bounds were given; set_ylim leaves a limit
        # unchanged when it receives None for it.
        if ymin is not None or ymax is not None:
            for ax in axes:
                ax.set_ylim(ymin, ymax)

        if title is not None:
            series_ax.set_ylabel(title)
            if plot_uncertainty:
                axes[1].set_ylabel('var(%s)' % title)



# Utility functions plot timeseries with color patches.
# Useful for timeseries with labels.
# Note on how to draw rectangles with mpl: http://matthiaseisen.com/pp/patterns/p0203 
def plot_df_labels(df, name):
    """
    Plot data, highlight each label in different colors, and add a legend.

    A palette legend figure is drawn first. Then each feature column (every
    column except 'label') gets its own subplot, where the background is
    shaded with the color of the label active at each row.

    :param df: (pd.DataFrame) data to plot. Must contain a 'label' column.
    :param name: (str) name of the plot, used in the legend title.
    """
    # Label IDs and color palette
    label_names = sorted(df.label.unique())
    palette = sns.color_palette('colorblind', len(label_names))

    # Plot color palette legend
    plot_palette_legend(palette, label_names, 'Labels of %s' %name, alpha=0.5, size=0.5)

    # Features
    columns = [column for column in df.columns.values if column != 'label']
    nb_columns = len(columns)
    if nb_columns == 0:
        # Nothing to plot besides the legend; plt.subplots rejects 0 rows.
        return

    # Plot data. squeeze=False keeps `axes` two-dimensional even when there is
    # a single feature column (otherwise a bare Axes is returned and indexing
    # it fails).
    fig, axes = plt.subplots(nrows=nb_columns, ncols=1,
                             figsize=(20, nb_columns * 2),
                             squeeze=False)

    labels = df.label.values
    for ax, column_name in zip(axes[:, 0], columns):
        values = df[column_name].values
        ax.plot(values)
        add_patches(ax, values, labels, label_names, palette, alpha=0.5)
        ax.set_ylabel(column_name)
    plt.tight_layout()
    

def plot_palette_legend(palette, label_names, title, alpha=0.5, size=0.5):
    """
    Draw a color palette as one horizontal row of cells, used as a legend.

    :param palette: sequence of matplotlib colors as returned by
        seaborn.color_palette()
    :param label_names: tick labels, one per palette color.
    :param title: (str) title of the legend, left-aligned.
    :param alpha: (float) opacity of the color cells.
    :param size: scaling factor for size of plot
    """
    nb_colors = len(palette)
    cell_ids = np.arange(nb_colors)
    palette_cmap = colors.ListedColormap(palette)

    fig, legend_ax = plt.subplots(1, 1, figsize=(nb_colors * size, size))
    # A one-row image whose cell values 0..n-1 index into the palette colormap.
    legend_ax.imshow(cell_ids.reshape(1, nb_colors),
                     cmap=palette_cmap,
                     alpha=alpha,
                     aspect="auto",
                     interpolation="nearest")
    legend_ax.set_xticks(cell_ids)
    legend_ax.set_xticklabels(label_names)
    legend_ax.set_yticklabels([])
    legend_ax.set_title(title, loc='left')


def add_patches(ax, values, labels, label_ids, palette, alpha=0.5):
    """
    Highlight each label with different color patches.

    Sets the axis limits to the extent of ``values`` and draws one rectangle
    per run of consecutive identical labels, spanning the full value range.

    :param ax: matplotlib axis to draw on.
    :param values: sequence of plotted values (one per time step).
    :param labels: sequence of labels, aligned with ``values``.
    :param label_ids: sequence of all label IDs; the position of a label in
        this sequence selects its color in ``palette``.
    :param palette: sequence of matplotlib colors, aligned with ``label_ids``.
    :param alpha: (float) opacity of the patches.
    :raises ValueError: if a label is missing from ``label_ids``.
    """
    nb_points = len(values)
    if nb_points == 0:
        # Nothing to highlight; min()/max() and labels[0] would fail below.
        return

    # Accept any sequence (e.g. a numpy array), not only lists.
    label_ids = list(label_ids)

    minval = min(values)
    maxval = max(values)
    ax.set_xlim(0, nb_points - 1)
    ax.set_ylim(minval, maxval)

    def draw_patch(start, end, label):
        # Rectangle covering x in [start, end] over the full value range.
        color = palette[label_ids.index(label)]
        ax.add_patch(patches.Rectangle((start, minval), # (x, y)
                                       end - start, # width
                                       maxval - minval, # height
                                       alpha=alpha,
                                       color=color))

    start = 0
    curr_label = labels[0]
    # Draw a patch every time the label changes
    for t in range(1, nb_points):
        label = labels[t]
        if label != curr_label:
            draw_patch(start, t, curr_label)
            start = t
            curr_label = label

    # Draw the last patch, which ends at the last index (the right x limit)
    draw_patch(start, nb_points - 1, curr_label)



In [7]:

    
# Util function to plot a summary of the sequences in a dataframe.
def plot_sequences(df, label_names, max_rows, max_columns, row_offset, col_offset, ymin, ymax, plot_size):
    """
    Print the number of sequences per label and plot a window of each label's
    sequences.

    :param df: (pd.DataFrame) one sequence per row, with a 'label' column.
    :param label_names: sequence indexed by integer label ID, giving the
        display name of each label.
    :param max_rows: (int or None) max number of sequences plotted per label.
    :param max_columns: (int or None) max number of columns kept per label.
    :param row_offset: (int) first row of the window, within each label.
    :param col_offset: (int) first column of the window.
    :param ymin: (number or None) lower y limit of the plots.
    :param ymax: (number or None) upper y limit of the plots.
    :param plot_size: (number) scaling factor for the figure height.
    """
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('number of sequences: %s' % len(df))
    for label in sorted(df.label.unique()):
        label_name = label_names[int(label)]
        label_df = df[df.label == label]
        print('number of sequences for label %s: %s' % (label_name, len(label_df)))
        label_window = reshape_df(label_df, max_rows, max_columns,
                                  col_offset=col_offset, row_offset=row_offset)
        plot_df(label_window, label_name, ymin=ymin, ymax=ymax, size=plot_size)



In [8]:

    
# Example with artificial data
# Generate 2 labels x 10 series of 20 points each.
num_labels = 2
num_series_per_label = 10
num_points_per_series = 20
df = generate_series(num_labels, num_series_per_label, num_points_per_series)

# Keep all rows but only 3 columns (label column included), then plot each
# remaining column with the label patches in the background.
max_rows = None
max_columns = 3
df1 = reshape_df(df, max_rows, max_columns)
plot_df_labels(df1, 'artificial data')



In [9]:

    
# Plot up to 10 series per label, keeping every column.
max_rows = 10
max_columns = None
for label in df.label.unique():
    # reshape_df is applied per label so the row window is taken within it.
    df2 = reshape_df(df[df.label == label], max_rows, max_columns)
    plot_df(df2, title='class%s' %label)

UCR data

Download the zipped data here
Password to unzip: attempttopredict



In [10]:

    
# Util functions to load a UCR dataset
def load(csv_path):
    """
    Load a header-less CSV file of sequences into a dataframe.

    The first column is named 'label' and the remaining columns are named
    't0', 't1', ... in order.

    :param csv_path: (str) path to the CSV file.
    :return: (pd.DataFrame) one row per CSV line.
    """
    frame = pd.read_csv(csv_path, header=None)
    nb_time_steps = frame.shape[1] - 1
    time_step_names = ['t%s' % i for i in range(nb_time_steps)]
    frame.columns = ['label'] + time_step_names
    return frame

def get_dataset_names(datasets_dir):
    """
    List the names of the directories found under ``datasets_dir``.

    The tree is walked recursively with os.walk, so names of nested
    sub-directories are included as well. Directories whose name starts with
    '.' are left out of the result (but are still walked).

    :param datasets_dir: (str) root directory containing the datasets.
    :return: (list of str) directory names, in os.walk order.
    """
    return [dir_name
            for _root, dir_names, _files in os.walk(datasets_dir)
            for dir_name in dir_names
            if not dir_name.startswith('.')]  # ignore cached files



In [11]:

    
# Load a UCR dataset
phase = 'test'

datasets_dir = "UCR_TS_Archive_2015"
names = get_dataset_names(datasets_dir)
# Look the dataset up by name, so the listing order does not matter.
idx = names.index('synthetic_control')
# Parenthesized single-argument print works on both Python 2 and Python 3.
print(names)
dataset_name = names[idx]
# Dataset files are named <dataset>_<PHASE>, e.g. synthetic_control_TEST.
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase.upper()))
df = load(input_csv_path)









    



['50words', 'Adiac', 'ArrowHead', 'Beef', 'BeetleFly', 'BirdChicken', 'Car', 'CBF', 'ChlorineConcentration', 'CinC_ECG_torso', 'Coffee', 'Computers', 'Cricket_X', 'Cricket_Y', 'Cricket_Z', 'DiatomSizeReduction', 'DistalPhalanxOutlineAgeGroup', 'DistalPhalanxOutlineCorrect', 'DistalPhalanxTW', 'Earthquakes', 'ECG200', 'ECG5000', 'ECGFiveDays', 'ElectricDevices', 'FaceAll', 'FaceFour', 'FacesUCR', 'FISH', 'FordA', 'FordB', 'Gun_Point', 'Ham', 'HandOutlines', 'Haptics', 'Herring', 'InlineSkate', 'InsectWingbeatSound', 'ItalyPowerDemand', 'LargeKitchenAppliances', 'Lighting2', 'Lighting7', 'MALLAT', 'Meat', 'MedicalImages', 'MiddlePhalanxOutlineAgeGroup', 'MiddlePhalanxOutlineCorrect', 'MiddlePhalanxTW', 'MoteStrain', 'NonInvasiveFatalECG_Thorax1', 'NonInvasiveFatalECG_Thorax2', 'OliveOil', 'OSULeaf', 'PhalangesOutlinesCorrect', 'Phoneme', 'Plane', 'ProximalPhalanxOutlineAgeGroup', 'ProximalPhalanxOutlineCorrect', 'ProximalPhalanxTW', 'RefrigerationDevices', 'ScreenType', 'ShapeletSim', 'ShapesAll', 'SmallKitchenAppliances', 'SonyAIBORobotSurface', 'SonyAIBORobotSurfaceII', 'StarLightCurves', 'Strawberry', 'SwedishLeaf', 'Symbols', 'synthetic_control', 'ToeSegmentation1', 'ToeSegmentation2', 'Trace', 'Two_Patterns', 'TwoLeadECG', 'uWaveGestureLibrary_X', 'uWaveGestureLibrary_Y', 'uWaveGestureLibrary_Z', 'UWaveGestureLibraryAll', 'wafer', 'Wine', 'WordsSynonyms', 'Worms', 'WormsTwoClass', 'yoga']



In [12]:

    
# Plot settings for the UCR dataset loaded above; these globals are passed to
# plot_sequences in the next cell.
label_names = range(7) # 6 labels but the first label ID is 1, not 0, so index 0 is unused.
max_rows = 10  # sequences plotted per label
max_columns = None  # keep every time step
row_offset = 0
col_offset = 0
# Fixed y-axis limits shared by all the plots.
ymin=-2
ymax=3
plot_size = 1



In [13]:

    
# Print per-label counts and plot the UCR sequences with the settings above.
plot_sequences(df, label_names, max_rows, max_columns, row_offset, col_offset, ymin, ymax, plot_size)









    



number of sequences: 300
number of sequences for label 1: 50
number of sequences for label 2: 50
number of sequences for label 3: 50
number of sequences for label 4: 50
number of sequences for label 5: 50
number of sequences for label 6: 50

Synthetic data



In [14]:

    
# Run the synthetic data generator from inside the SyntheticData directory.
# NOTE(review): presumably this writes SyntheticData/Test1/Test1_TRAIN, which
# is read below; confirm against generate_synthetic_data.py.
!cd SyntheticData; python generate_synthetic_data.py

# Sanity check to make sure the values are loaded well with pandas: parse the
# same file with np.loadtxt and compare it with the output of load().
data = np.loadtxt(os.path.join('SyntheticData', 'Test1','Test1_TRAIN'),  delimiter=',')

phase = 'TRAIN'
datasets_dir = "SyntheticData"
dataset_name = 'Test1'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))

df = load(input_csv_path)
# Only the first two columns are compared (column 0 is 'label' after load()).
assert (np.isclose(data[:,0], df[df.columns.values[0]])).all()
assert (np.isclose(data[:,1], df[df.columns.values[1]])).all()



In [15]:

    
# Show the PNG stored next to the Test1 synthetic dataset.
# NOTE(review): this import lives mid-notebook rather than in the first
# (imports) cell.
from IPython.display import Image, display
display(Image('SyntheticData/Test1/Test1.png'))



In [30]:

    
# Load the Test1 synthetic training set and define the plot settings passed to
# plot_sequences in the next cell.
# NOTE(review): the execution counts of this cell and the next one (30, 31)
# are out of sequence with their neighbours (15, 18): they were run later than
# the cells that follow them.
phase = 'TRAIN'
datasets_dir = "SyntheticData"
dataset_name = 'Test1'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))
df = load(input_csv_path)

label_names = range(0,2) # 2 labels
max_rows = 2  # sequences plotted per label
max_columns = None  # keep every time step
row_offset = 0
col_offset = 0
# Fixed y-axis limits shared by all the plots.
ymin=0
ymax=1
plot_size = 1



In [31]:

    
# Print per-label counts and plot the synthetic sequences with the settings above.
plot_sequences(df, label_names, max_rows, max_columns, row_offset, col_offset, ymin, ymax, plot_size)









    



number of sequences: 100
number of sequences for label 0: 50
number of sequences for label 1: 50

UCI data



In [18]:

    
# UCI labels
# Display names of the UCI activities. The position in the list is the integer
# label ID: a later cell passes this list to plot_sequences as label_names,
# which indexes it with int(label).
LABELS = [
    'WALKING',
    'WALKING_UPSTAIRS',
    'WALKING_DOWNSTAIRS',
    'SITTING',
    'STANDING',
    'LAYING'
]



In [19]:

    
# Peek at the UCI inertial signals training CSV.
# NOTE(review): according to the output of the next cell, this file is written
# by uci_har/convert_to_csv.py there, so on a fresh checkout this cell can only
# succeed after the next one has run.
df = pd.read_csv('uci_har/inertial_signals_train.csv')
# Relabel the integer row index 0..n-1 as 't0'..'t<n-1>'.
df = df.rename(index={i: 't%s' %i for i in range(len(df))})
df[['body_acc_x', 'label']].head()



In [20]:

    
# Get UCI data ready
%run uci_har/download_dataset.py --output_dir=uci_har
%run uci_har/convert_to_csv.py --input_dir=uci_har --output_dir=uci_har --labels='[0,1,2,3,4,5]' --nb_samples=2000

# Convert UCI data to sequences
sequence_lenth=200
input_dir = 'uci_har'
parent_output_dir = 'uci_sequences'
base_names = ['inertial_signals', 'debug']
phases = ['train', 'test']

!rm -rf $parent_output_dir
print 'DELETED:', parent_output_dir

for base_name in base_names:
    for phase in phases:
        input_file = os.path.join(input_dir, '%s_%s.csv' %(base_name, phase))
        output_dir = os.path.join(parent_output_dir, base_name)
        cmd = 'convert_to_sequences.py -i %s -o %s -c %s' %(input_file, output_dir, sequence_lenth)
        print 'run cmd:', cmd
        %run $cmd









    



Downloading...
Dataset already downloaded. Did not download twice.

Extracting...
Dataset already extracted. Did not extract twice.

Train set size: 11994
Test set size: 11994
Files saved: ['uci_har/inertial_signals_train.csv', 'uci_har/inertial_signals_test.csv']
Debug files saved: ['uci_har/debug_train.csv', 'uci_har/debug_test.csv']
DELETED: uci_sequences
run cmd: convert_to_sequences.py -i uci_har/inertial_signals_train.csv -o uci_sequences/inertial_signals -c 200
Path to converted files: uci_sequences/inertial_signals/
run cmd: convert_to_sequences.py -i uci_har/inertial_signals_test.csv -o uci_sequences/inertial_signals -c 200
Path to converted files: uci_sequences/inertial_signals/
run cmd: convert_to_sequences.py -i uci_har/debug_train.csv -o uci_sequences/debug -c 200
Path to converted files: uci_sequences/debug/
run cmd: convert_to_sequences.py -i uci_har/debug_test.csv -o uci_sequences/debug -c 200
Path to converted files: uci_sequences/debug/






    





<matplotlib.figure.Figure at 0x10ac8dd10>



In [21]:

    
# Load the UCI 'body_acc_x' training sequences and define the plot settings
# passed to plot_sequences below.
phase = 'TRAIN'
datasets_dir = "uci_sequences/inertial_signals"
dataset_name = 'body_acc_x'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))
# Parenthesized single-argument print works on both Python 2 and Python 3.
print(input_csv_path)
df = load(input_csv_path)
# plot_sequences indexes label_names with the label, so make it an integer.
df.label = df.label.astype(int)
# Relabel the integer row index 0..n-1 as 'sequence_0'..'sequence_<n-1>'.
df = df.rename(index={i: 'sequence_%s' %i for i in range(len(df))})
label_names = LABELS
max_rows = 2  # sequences plotted per label
max_columns = None  # keep every time step
row_offset = 0
col_offset = 0
# Fixed y-axis limits shared by all the plots.
ymin=-1
ymax=1
plot_size = 1









    



uci_sequences/inertial_signals/body_acc_x/body_acc_x_TRAIN



In [22]:

    
# Show the first sequences: one row per sequence, 'label' then t0..t199.
df.head()









    Out[22]:






  
    
      
      label
      t0
      t1
      t2
      t3
      t4
      t5
      t6
      t7
      t8
      ...
      t190
      t191
      t192
      t193
      t194
      t195
      t196
      t197
      t198
      t199
    
  
  
    
      sequence_0
      4
      0.000181
      0.010139
      0.009276
      0.005066
      0.010810
      0.004045
      0.004757
      0.006214
      0.003307
      ...
      -0.000222
      0.001576
      0.003531
      0.002285
      -0.000420
      -0.003738
      -0.006706
      -0.003148
      0.000733
      0.000668
    
    
      sequence_1
      4
      0.002162
      -0.000946
      -0.006476
      -0.003423
      -0.000610
      -0.002929
      -0.001796
      0.000956
      0.002311
      ...
      0.001109
      -0.003149
      -0.008882
      -0.010483
      -0.004482
      0.004528
      0.008167
      0.002929
      -0.004487
      -0.004717
    
    
      sequence_2
      4
      -0.001637
      -0.000097
      0.001614
      0.002619
      0.004765
      0.005851
      0.002579
      0.000677
      0.002138
      ...
      -0.000400
      0.002377
      0.005650
      0.004639
      0.001717
      -0.002094
      -0.006847
      -0.005862
      -0.002603
      -0.003230
    
    
      sequence_3
      4
      -0.001015
      0.001832
      0.001169
      0.000362
      -0.002587
      -0.002581
      0.001470
      0.003026
      0.003734
      ...
      -0.002441
      -0.002222
      -0.000514
      -0.000866
      -0.000462
      -0.000653
      -0.000948
      -0.000315
      -0.000946
      -0.000635
    
    
      sequence_4
      4
      -0.000353
      0.000120
      0.002108
      0.002159
      0.001069
      0.000453
      0.000548
      0.000977
      0.000505
      ...
      -0.003866
      -0.002742
      -0.002602
      -0.001981
      0.001679
      0.003066
      0.001767
      0.000439
      0.000822
      -0.000282
    
  

5 rows × 201 columns



In [23]:

    
# Print per-label counts and plot the UCI sequences with the settings above.
plot_sequences(df, label_names, max_rows, max_columns, row_offset, col_offset, ymin, ymax, plot_size)









    



number of sequences: 54
number of sequences for label WALKING: 9
number of sequences for label WALKING_UPSTAIRS: 9
number of sequences for label WALKING_DOWNSTAIRS: 9
number of sequences for label SITTING: 9
number of sequences for label STANDING: 9
number of sequences for label LAYING: 9

Artificial data



In [24]:

    
# generate artificial data
!cd artificial; python generate_artificial_data.py

# Convert artificial data
sequence_lenth=8
input_dir = 'artificial'
output_dir = 'artificial_sequences'

!rm -rf $parent_output_dir
print 'DELETED:', output_dir

input_file = os.path.join(input_dir, 'binary_ampl=10.0_mean=0.0_noise=1.0.csv')
cmd = 'convert_to_sequences.py -i %s -o %s -c %s' %(input_file, output_dir, sequence_lenth)
print 'run cmd:', cmd
%run $cmd









    



==> figure saved: /Users/mleborgne/_git/nupic.research/projects/capybara/datasets/artificial/binary_ampl=10.0_mean=0.0_noise=0.0.png
DELETED: artificial_sequences
run cmd: convert_to_sequences.py -i artificial/binary_ampl=10.0_mean=0.0_noise=1.0.csv -o artificial_sequences -c 8
skip label: int
skip label: C
skip label: int
skip label: C
Path to converted files: artificial_sequences/






    





<matplotlib.figure.Figure at 0x112505f50>



In [25]:

    
# Load the converted artificial sequences ('y' dataset, 'ALL' phase) and
# define the plot settings passed to plot_sequences in the next cell.
phase = 'ALL'
datasets_dir = "artificial_sequences"
dataset_name = 'y'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))
df = load(input_csv_path)

label_names = range(0,3) # 3 labels
max_rows = 2  # sequences plotted per label
max_columns = None  # keep every time step
row_offset = 0
col_offset = 0
# Fixed y-axis limits shared by all the plots.
ymin=-2
ymax=12
plot_size = 1



In [26]:

    
# Print per-label counts and plot the artificial sequences with the settings above.
plot_sequences(df, label_names, max_rows, max_columns, row_offset, col_offset, ymin, ymax, plot_size)









    



number of sequences: 599
number of sequences for label 0: 99
number of sequences for label 1: 250
number of sequences for label 2: 250

Sensortag data



In [27]:

    
# generate data
!cd sensortag; python convert_acc_data.py

# Convert data
sequence_lenth=100
input_dir = 'sensortag/converted'
output_dir = 'sensortag_sequences'

!rm -rf $parent_output_dir
print 'DELETED:', output_dir

input_file = os.path.join(input_dir, 'sensortag_x.csv')
cmd = 'convert_to_sequences.py -i %s -o %s -c %s' %(input_file, output_dir, sequence_lenth)
print 'run cmd:', cmd
%run $cmd









    



==> figure saved: converted/sensortag_x.png
==> figure saved: converted/sensortag_y.png
==> figure saved: converted/sensortag_z.png
DELETED: sensortag_sequences
run cmd: convert_to_sequences.py -i sensortag/converted/sensortag_x.csv -o sensortag_sequences -c 100
skip label: int
skip label: C
skip label: int
skip label: C
Path to converted files: sensortag_sequences/






    





<matplotlib.figure.Figure at 0x1114c85d0>



In [28]:

    
# Load the converted sensortag sequences ('y' dataset, 'ALL' phase) and define
# the plot settings passed to plot_sequences in the next cell.
phase = 'ALL'
datasets_dir = "sensortag_sequences"
dataset_name = 'y'
input_csv_path = os.path.join(datasets_dir, dataset_name, '%s_%s' % (dataset_name, phase))
df = load(input_csv_path)

label_names = range(0,8) # 7 labels with IDs 1 to 7; index 0 is unused
max_rows = 2  # sequences plotted per label
max_columns = None  # keep every time step
row_offset = 0
col_offset = 0
# Fixed y-axis limits shared by all the plots.
ymin=-10
ymax=10
plot_size = 1



In [29]:

    
# Print per-label counts and plot the sensortag sequences with the settings above.
plot_sequences(df, label_names, max_rows, max_columns, row_offset, col_offset, ymin, ymax, plot_size)









    



number of sequences: 97
number of sequences for label 1: 15
number of sequences for label 2: 14
number of sequences for label 3: 14
number of sequences for label 4: 15
number of sequences for label 5: 14
number of sequences for label 6: 13
number of sequences for label 7: 12

	label	t0	t1	t2	t3	t4	t5	t6	t7	t8	...	t190	t191	t192	t193	t194	t195	t196	t197	t198	t199
sequence_0	4	0.000181	0.010139	0.009276	0.005066	0.010810	0.004045	0.004757	0.006214	0.003307	...	-0.000222	0.001576	0.003531	0.002285	-0.000420	-0.003738	-0.006706	-0.003148	0.000733	0.000668
sequence_1	4	0.002162	-0.000946	-0.006476	-0.003423	-0.000610	-0.002929	-0.001796	0.000956	0.002311	...	0.001109	-0.003149	-0.008882	-0.010483	-0.004482	0.004528	0.008167	0.002929	-0.004487	-0.004717
sequence_2	4	-0.001637	-0.000097	0.001614	0.002619	0.004765	0.005851	0.002579	0.000677	0.002138	...	-0.000400	0.002377	0.005650	0.004639	0.001717	-0.002094	-0.006847	-0.005862	-0.002603	-0.003230
sequence_3	4	-0.001015	0.001832	0.001169	0.000362	-0.002587	-0.002581	0.001470	0.003026	0.003734	...	-0.002441	-0.002222	-0.000514	-0.000866	-0.000462	-0.000653	-0.000948	-0.000315	-0.000946	-0.000635
sequence_4	4	-0.000353	0.000120	0.002108	0.002159	0.001069	0.000453	0.000548	0.000977	0.000505	...	-0.003866	-0.002742	-0.002602	-0.001981	0.001679	0.003066	0.001767	0.000439	0.000822	-0.000282