In [1]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from scipy.sparse import csr_matrix, vstack

%matplotlib inline

# Custom modules
import const
import func

Load data


In [2]:
# Numeric data
num_dat = func.load_data_file('train_numeric')

y = num_dat['data']['y']
data = num_dat['data']['features']
ids = num_dat['data']['ids']
n_f_names = num_dat['data']['feature_names'][1:]

del num_dat


Returning <open file '/Volumes/My Book/kaggle_bosch/train_numeric.pkl', mode 'rb' at 0x115b36a50>.pkl

In [3]:
# Load numeric data test
num_dat = func.load_data_file('test_numeric')
ids_test = num_dat['data']['ids']
data = vstack([data, num_dat['data']['features']], format='csr')
del num_dat


Returning <open file '/Volumes/My Book/kaggle_bosch/test_numeric.pkl', mode 'rb' at 0x115b36a50>.pkl

In [4]:
ids_all = pd.concat([ids, ids_test], axis=0)

In [5]:
n_1 = y[y.Response==1].index.values
n_0 = y[y.Response==0].index.values

In [6]:
ids_test.head(3)


Out[6]:
Id
0 1
1 2
2 3

In [7]:
y = y.reindex(pd.concat([ids.Id, ids_test.Id]))

In [8]:
# Load look-up table
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)


Out[8]:
line station feature_nr feat_nr_dat name_dat name_cat name_num col_dat col_num col_cat station_V2 line_V2
0 0 0 0 1.0 L0_S0_D1 NaN L0_S0_F0 0.0 0.0 NaN 0.0 1.0
1 0 0 2 3.0 L0_S0_D3 NaN L0_S0_F2 1.0 1.0 NaN 0.0 1.0
2 0 0 4 5.0 L0_S0_D5 NaN L0_S0_F4 2.0 2.0 NaN 0.0 1.0

In [10]:
# Read start time per station (discard features related to t delta)
t_station = pd.read_csv(os.path.join(const.DATA_PATH, 'feat_set_date_station.csv')).iloc[:,:129]
t_station.head(3)


Out[10]:
Id t_0.0 t_1.0 t_2.0 t_3.0 t_4.0 t_5.0 t_6.0 t_7.0 t_8.0 ... t_42.0 t_43.0 t_44.0 t_45.0 t_46.0 t_47.0 t_48.0 t_49.0 t_50.0 t_51.0
0 4 8224.0 8224.0 8224.0 NaN 8226.0 NaN NaN 8226.0 8227.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 6 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 7 161870.0 161870.0 161870.0 NaN NaN 161872.0 161872.0 NaN 161873.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3 rows × 129 columns


In [11]:
data.shape[0]


Out[11]:
2367495

Visualize each feature


In [18]:
def visualize_detrended_numeric_feature(ft_nr, showplot=True, cluster = 'cluster_n8'):
    # For a given numeric features
    # Get station of feature
    # Get station time per sample
    # Get samples with value at this feature
    # Combine both to dataframe
    # Group dataframe by sec and take mean
    # Subtract this mean from each value
    #n = 0.0
    
    
    
    # Get station number 
    station = lut[lut.feature_nr==ft_nr].station_V2.values[0]
    
    # Column nr in sparse array
    n_col = lut[lut.feature_nr==ft_nr].col_num.values[0]
    
    if not n_col>=0:
        return None
    
    # Samples with value for this feature
    i_n = data[:, n_col].nonzero()[0]
    
    # Timestamps for these values from station
    i_t = (~t_station.iloc[i_n, t_station.columns.get_loc('t_' + str(station))].isnull()).index.values

    # Samples with value and timestamp
    i = list(set(i_n) & set(i_t))
    
    # Get value, timestamp, Response and cluster for each sample and combine in DataFrame
    v = data[i, n_col].todense().A1
    t = 1.0/100*t_station.iloc[i, t_station.columns.get_loc('t_' + str(station))].values
    r = y.iloc[i]['Response'].values
    c = cluster_info.iloc[i][cluster].values
    c50 = cluster_info.iloc[i]['cluster_n50'].values
    c150 = cluster_info.iloc[i]['cluster_n150'].values
    c500 = cluster_info.iloc[i]['cluster_n500'].values
    
    df = pd.DataFrame({'i':i, 't':t,'v':v, 'r':r, 'c':c, 'c50':c50, 'c150':c150, 'c500':c500})

    # Check nan in t (if large, adjust code for t)
    print('F nr: {} | Feature values: {} | Time values: {} | Nans found: {}'.format(ft_nr,
                                                                                    len(i_n),
                                                                         len(i_t),
                                                                         (np.isnan(t)).sum()))

    # Prepare grouping window
    t_window = 0.02
    df['t_w'] = (df['t']/t_window).round(0)*t_window

    # Calculate average numeric value per time window and add to DataFrame
    mean_t_w = df.groupby('t_w')[['v']].agg('median').rename(columns={'v':'mu_v'})
    df = df.merge(mean_t_w, how='left', left_on='t_w', right_index=True)
    
    # Calculate average numeric value per time window and cluster and add to DataFrame
    mean_t_w_c = df.groupby(['t_w','c'])[['v']].agg('median').rename(columns={'v':'mu_v_c'})
    df = df.set_index(['t_w','c']).merge(mean_t_w_c, how='left', left_index=True, right_index=True).reset_index()

    # Above is faster than transform:
    # df['mu_v'] = df.groupby('t_w')[['v']].transform('mean')
    
    # Calculate some differences
    df['v_diff_abs'] = df['v'] - df['mu_v_c']
    df['v_diff_abs_abs'] = df['v_diff_abs'].abs()
    df['v_diff_rel'] = (df['v'] / df['mu_v']).abs()
    
    if showplot:
        # Define figure and axes
        plt.figure(figsize=(16,10))
        gs = gridspec.GridSpec(4, 5)
        ax0 = plt.subplot(gs[0,:]) # Raw data
        ax1 = plt.subplot(gs[1,:]) # Detrended data
        ax5 = plt.subplot(gs[2,:]) # Detrended data
        ax2 = plt.subplot(gs[3,0]) # Histogram of raw values
        ax3 = plt.subplot(gs[3,1]) # Histogram of detrended values
        ax4 = plt.subplot(gs[3,2]) # Histogram of relative values
        ax6 = plt.subplot(gs[3,3]) # Cumulative plot
        ax7 = plt.subplot(gs[3,4]) # Relative values per bin

        means = df.groupby('r').mean()
        xlim = [0, 1750]

        plt.suptitle('Feature: {} | Station: {} | <v-> = {:.4f} | <v+> = {:.4f}'.format(ft_nr,
                                                                                       station,
                                                                                       means.loc[0,'v_diff_abs_abs'],
                                                                                       means.loc[1,'v_diff_abs_abs']))
        colors = ['r','g','b','k','c','w','m','y']

        # Raw values
        for cl in cluster_info[cluster].unique():
            cl_x = df[df['c']==cl].t
            cl_y = df[df['c']==cl].v
            
            if cl_x.shape[0]>0:
                ax0.scatter(cl_x, cl_y, marker='.', color=colors[cl % 8], label='{}: {:.2f}%'.format(cl,
                                                                                             float(cl_x.shape[0])/df.shape[0]*100))

        ax0.scatter(df.t_w, df.mu_v,color='c', marker='x', s=5)
        ax0.set_ylabel('Raw value')
        ax0.set_xlim(xlim)

        # Add legend
        ax0.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=cluster_info[cluster].nunique() + 1, mode="expand", borderaxespad=0.)



        # Detrended values
        ax1.scatter(df.t, df.v_diff_abs, marker='.', color='g')
        ax1.scatter(df.t[df.r==1], df.v_diff_abs[df.r==1], color='r', marker='.', s=100)
        ax1.set_ylabel('Detrended')
        ax1.set_xlim(xlim)

        # Error rates
        e_rates = df.groupby('t_w')['r'].agg(['sum','count'])
        e_rates['r'] = e_rates['sum'] / e_rates['count']
        e_rates['r'].plot(marker='.', linestyle='None', markersize='10', ax=ax5)
        ax5.set_ylabel('Error rate')
        ax5.set_xlim(xlim)

        # Histogram of raw values
        nbins = 100
        rmax = 0.4
        freq_1, bins = np.histogram(df.loc[df['r']==1,'v'], bins=nbins, range=[-rmax, rmax], density=True)
        freq_0, bins = np.histogram(df.loc[df['r']==0,'v'], bins=nbins, range=[-rmax, rmax], density=True)

        ax2.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
        ax2.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
        ax2.set_ylabel('Raw values')

        # Histogram of values after detrending
        nbins = 100
        rmax = 0.4
        freq_1, bins = np.histogram(df.loc[df['r']==1,'v_diff_abs'], bins=nbins, range=[-rmax, rmax], density=True)
        freq_0, bins = np.histogram(df.loc[df['r']==0,'v_diff_abs'], bins=nbins, range=[-rmax, rmax], density=True)

        ax3.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
        ax3.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
        ax3.set_ylabel('Detrended')

        # Relative values per bin
        ax4.bar(bins[1:], freq_1/freq_0, color='b', width=2*rmax/nbins, alpha=0.5)
        ax4.set_ylabel('Difference')

        # Cumulative sum of abs values
        df['dummy'] = 1
        tmp0 = df[df['r']==0].set_index('v_diff_abs_abs').sort_index()['dummy'].cumsum()
        tmp1 = df[df['r']==1].set_index('v_diff_abs_abs').sort_index()['dummy'].cumsum()
        ax6.plot(tmp0.index, tmp0.values.astype(float) / tmp0.max(), color='g')
        ax6.plot(tmp1.index, tmp1.values.astype(float) / tmp1.max(), color='r')
        ax6.set_ylabel('Cumsum')

        # Histogram of relative values by detrending
        nbins = 40
        rmax = 40
        freq_0, bins = np.histogram(df.loc[df['r']==0,'v_diff_rel'], bins=nbins, range=[0, rmax], density=True)
        freq_1, bins = np.histogram(df.loc[df['r']==1,'v_diff_rel'], bins=nbins, range=[0, rmax], density=True)

        ax7.bar(bins[1:], freq_0, color='g', width=rmax/nbins, alpha=0.5)
        ax7.bar(bins[1:], freq_1, color='r', width=rmax/nbins, alpha=0.5)
        ax7.set_ylabel('Relative detrending')

        plt.show()
    
    return df

In [92]:
a = visualize_detrended_numeric_feature(3354, cluster='cluster_n25', showplot=False)


F nr: 3354 | Feature values: 2239066 | Time values: 2239066 | Nans found: 0

In [ ]:
data_detrended = data.copy()

for f_nr in lut['feature_nr'].unique():
    a = visualize_detrended_numeric_feature(f_nr, showplot=False, cluster='cluster_n25')
    if isinstance(a, pd.DataFrame):
        # Update values in sparse array
        n_col = lut[lut.feature_nr==f_nr].col_num.values[0]
        i_n = data[:, n_col].nonzero()[0]
        
        # To not mess up sparse structure
        a.loc[a['v_diff_abs']==0,'v_diff_abs'] = 1e-5
        
        data_detrended[a['i'].values, n_col] = a['v_diff_abs'].values.reshape(-1,1)
    else:
        print('No numeric data for feature {}'.format(f_nr))


F nr: 0 | Feature values: 1348366 | Time values: 1348366 | Nans found: 2
F nr: 2 | Feature values: 1348366 | Time values: 1348366 | Nans found: 2
F nr: 4 | Feature values: 1348366 | Time values: 1348366 | Nans found: 2

In [36]:
with open(os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended.pkl'), 'wb') as f:
    pickle.dump(data_detrended, f, pickle.HIGHEST_PROTOCOL)
    
#pd.concat([ids,pd.DataFrame(data_detrended.todense())], axis=1).to_csv(os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended.csv'), index=False)

In [21]:
def visualize_values_per_cluster(a):
    ''' Visualizes the distribution of numeric values per clusters
    a is the output of visualize_detrended_numeric_feature()
    '''
    cl_col = 'c50'
    n_col = 50

    for cl in range(n_col):
        print a[a[cl_col]==cl].shape[0], a[a[cl_col]==cl].v.nunique()
        if a[a[cl_col]==cl].shape[0]>100:
            f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharex=True, figsize=(16,6))
            #a[a.c500==cl].v.hist(bins=100)

            # Histogram of raw values
            nbins = 50
            rmax = 0.3
            freq_2, bins = np.histogram(a.loc[(a[cl_col]==cl),'v'], bins=nbins, range=[-rmax, rmax])
            freq_1, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax], density=True)
            freq_0, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'], bins=nbins, range=[-rmax, rmax], density=True)
            freq_1r, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax])
            ax1.set_title('<v> = {:.4f}'.format(a.loc[(a[cl_col]==cl),'v'].mean()))
            ax2.set_title('<v-> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'].mean()))
            ax3.set_title('<v+> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'].mean()))

            ax1.bar(bins[1:], freq_2, color='b', width=2*rmax/nbins, alpha=1)
            ax2.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
            ax2.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
            ax3.bar(bins[1:], freq_1r, color='r', width=2*rmax/nbins, alpha=0.5)
            plt.show()
            
def visualize_per_time(a):
    ''' Visualizes the distribution of numeric values per timeunit
    a is the output of visualize_detrended_numeric_feature()
    '''
    # square root of number of time splits
    n = 5

    f, ax = plt.subplots(n,n,sharex=True, figsize=(16,16))

    t_split = np.linspace(0, a['t'].max(), n ** 2 +1)

    for i in range(n ** 2):

        tmin = t_split[i]
        tmax = t_split[i+1]
        #print tmin, tmax
        # Histogram of raw values
        nbins = 50
        rmax = 0.3
        freq_2, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax),'v'], bins=nbins, range=[-rmax, rmax])
        freq_1, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax], density=True)
        freq_0, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==0),'v'], bins=nbins, range=[-rmax, rmax], density=True)
        freq_1r, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax])
        #ax1.set_title('<v> = {:.4f}'.format(a.loc[(a[cl_col]==cl),'v'].mean()))
        #ax2.set_title('<v-> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'].mean()))
        #ax3.set_title('<v+> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'].mean()))

        ax[i / n, i % n].bar(bins[1:], freq_2, color='b', width=2*rmax/nbins, alpha=1)
        #ax[i / 9, 9 % i].bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
        #ax[i / 9, 9 % i].bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
        #ax[i / 9, 9 % i].bar(bins[1:], freq_1r, color='r', width=2*rmax/nbins, alpha=0.5)
    plt.show()

In [20]:
def check_timestamp_coverage_per_feature():
    for n in lut.feature_nr.unique():

        station = lut[lut.feature_nr==n].station_V2.values[0]

        t_col = lut[lut.feature_nr==n].col_dat.values[0]
        # TO-DO: GET TIMING FROM FEATURE WITH TIMES

        n_col = lut[lut.feature_nr==n].col_num.values[0]

        if np.isnan(n_col):
            continue

        i_n = data[:, n_col].nonzero()[0]

        if np.isnan(t_col):
            i_t = 0.0
        else:
            i_t = set(i_n) & set(t[:, t_col].nonzero()[0])
            i_t = 1.0 * len(i_t)


        i_ts = (~t_station.iloc[i_n, t_station.columns.get_loc('t_' + str(station))].isnull()).sum()

        i_n = 1.0 * len(i_n)

        print('F: {} | S: {} | n: {} | t: {} | r: {}% t(S): {}'.format(n, 
                                                                       station, 
                                                                       i_n, 
                                                                       i_t, 
                                                                       i_t/i_n*100,
                                                                       i_ts))

In [31]:
def visualize_feature(f_nr):
    
    t_col = lut[lut.feature_nr==f_nr].col_dat.values[0]
    n_col = lut[lut.feature_nr==f_nr].col_num.values[0]
    
    if np.isnan(n_col):
        print('No numeric data for {}'.format(f_nr))
        return
    
    if np.isnan(t_col):
        print('No timestamp data for {}'.format(f_nr))
        return
    
    #try:
    if 1:
        # Get rows that have values for t AND numeric data
        nrow = list((set(t[:, t_col].nonzero()[0]) & set(data[:, n_col].nonzero()[0])))
        nrow_1 = list(set(nrow) & set(n_1))
        nrow_0 = list(set(nrow) & set(n_0))

        t0 = t[nrow_0[:250000], t_col].todense()
        d0 = data[nrow_0[:250000], n_col].todense()
        t1 = t[nrow_1, t_col].todense()
        d1 = data[nrow_1, n_col].todense()

        # Sort arrays
        s_t0 = np.argsort(t0, axis=0)
        s_t1 = np.argsort(t1, axis=0)

        t0 = t0[s_t0].ravel().T
        d0 = d0[s_t0].ravel().T
        t1 = t1[s_t1].ravel().T
        d1 = d1[s_t1].ravel().T

        df_0 = pd.DataFrame(np.concatenate([t0,d0], axis=1), columns=['t0','d0'])
        df_1 = pd.DataFrame(np.concatenate([t1,d1], axis=1), columns=['t1','d1'])
        df_0.loc[:, 't0'] = df_0['t0'].round(0)
        df_1.loc[:, 't1'] = df_1['t1'].round(0)

        df_0_agg = df_0.groupby('t0').agg(['mean','count'])
        df_1_agg = df_1.groupby('t1').agg(['mean','count'])

        df_all = df_0_agg.merge(df_1_agg, left_index=True, right_index=True, how='outer').fillna(0)

        f, (ax1, ax2, ax3, ax4) = plt.subplots(4,1,figsize=(16,6), sharex=True)
        ax1.plot(t0, d0, linestyle='None', marker='.', color='g')
        ax1.plot(t1, d1, linestyle='None', marker='.', color='r')
        ax1.set_ylabel('Value')
        ax1.set_title('Line {} | Station {} | Feature {} '.format(lut[lut.feature_nr==f_nr].line.values[0],
                                                                  lut[lut.feature_nr==f_nr].station_V2.values[0],
                                                                  f_nr))

        ax2.plot(df_0_agg.index, df_0_agg['d0']['mean'], color='g')
        ax2.plot(df_1_agg.index, df_1_agg['d1']['mean'], color='r')
        ax2.set_ylabel('Moving average')

        ax3.plot(df_all.index, df_all['d1']['count'].astype(float).div(df_all['d0']['count']), color='b')
        ax3.set_ylabel('Error rate')
        ax3.set_xlabel('Timestamp')
        ax3.set_ylim([0, 0.1])

        plt.show()
        
        return df_all

In [ ]: