In [1]:

    
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from scipy.sparse import csr_matrix, vstack

%matplotlib inline

# Custom modules
import const
import func

Load data



In [2]:

    
# Numeric data
num_dat = func.load_data_file('train_numeric')

y = num_dat['data']['y']
data = num_dat['data']['features']
ids = num_dat['data']['ids']
n_f_names = num_dat['data']['feature_names'][1:]

del num_dat









    



Returning <open file '/Volumes/My Book/kaggle_bosch/train_numeric.pkl', mode 'rb' at 0x115b36a50>.pkl



In [3]:

    
# Load numeric data test
num_dat = func.load_data_file('test_numeric')
ids_test = num_dat['data']['ids']
data = vstack([data, num_dat['data']['features']], format='csr')
del num_dat









    



Returning <open file '/Volumes/My Book/kaggle_bosch/test_numeric.pkl', mode 'rb' at 0x115b36a50>.pkl



In [4]:

    
ids_all = pd.concat([ids, ids_test], axis=0)



In [5]:

    
n_1 = y[y.Response==1].index.values
n_0 = y[y.Response==0].index.values



In [6]:

    
ids_test.head(3)



In [7]:

    
y = y.reindex(pd.concat([ids.Id, ids_test.Id]))



In [8]:

    
# Load look-up table
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)









    Out[8]:






  
    
      
      line
      station
      feature_nr
      feat_nr_dat
      name_dat
      name_cat
      name_num
      col_dat
      col_num
      col_cat
      station_V2
      line_V2
    
  
  
    
      0
      0
      0
      0
      1.0
      L0_S0_D1
      NaN
      L0_S0_F0
      0.0
      0.0
      NaN
      0.0
      1.0
    
    
      1
      0
      0
      2
      3.0
      L0_S0_D3
      NaN
      L0_S0_F2
      1.0
      1.0
      NaN
      0.0
      1.0
    
    
      2
      0
      0
      4
      5.0
      L0_S0_D5
      NaN
      L0_S0_F4
      2.0
      2.0
      NaN
      0.0
      1.0



In [10]:

    
# Read start time per station (discard features related to t delta)
t_station = pd.read_csv(os.path.join(const.DATA_PATH, 'feat_set_date_station.csv')).iloc[:,:129]
t_station.head(3)









    Out[10]:






  
    
      
      Id
      t_0.0
      t_1.0
      t_2.0
      t_3.0
      t_4.0
      t_5.0
      t_6.0
      t_7.0
      t_8.0
      ...
      t_42.0
      t_43.0
      t_44.0
      t_45.0
      t_46.0
      t_47.0
      t_48.0
      t_49.0
      t_50.0
      t_51.0
    
  
  
    
      0
      4
      8224.0
      8224.0
      8224.0
      NaN
      8226.0
      NaN
      NaN
      8226.0
      8227.0
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      6
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      7
      161870.0
      161870.0
      161870.0
      NaN
      NaN
      161872.0
      161872.0
      NaN
      161873.0
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

3 rows × 129 columns



In [11]:

    
data.shape[0]









    Out[11]:





2367495

Visualize each feature



In [18]:

    
def visualize_detrended_numeric_feature(ft_nr, showplot=True, cluster = 'cluster_n8'):
    # For a given numeric features
    # Get station of feature
    # Get station time per sample
    # Get samples with value at this feature
    # Combine both to dataframe
    # Group dataframe by sec and take mean
    # Subtract this mean from each value
    #n = 0.0
    
    
    
    # Get station number 
    station = lut[lut.feature_nr==ft_nr].station_V2.values[0]
    
    # Column nr in sparse array
    n_col = lut[lut.feature_nr==ft_nr].col_num.values[0]
    
    if not n_col>=0:
        return None
    
    # Samples with value for this feature
    i_n = data[:, n_col].nonzero()[0]
    
    # Timestamps for these values from station
    i_t = (~t_station.iloc[i_n, t_station.columns.get_loc('t_' + str(station))].isnull()).index.values

    # Samples with value and timestamp
    i = list(set(i_n) & set(i_t))
    
    # Get value, timestamp, Response and cluster for each sample and combine in DataFrame
    v = data[i, n_col].todense().A1
    t = 1.0/100*t_station.iloc[i, t_station.columns.get_loc('t_' + str(station))].values
    r = y.iloc[i]['Response'].values
    c = cluster_info.iloc[i][cluster].values
    c50 = cluster_info.iloc[i]['cluster_n50'].values
    c150 = cluster_info.iloc[i]['cluster_n150'].values
    c500 = cluster_info.iloc[i]['cluster_n500'].values
    
    df = pd.DataFrame({'i':i, 't':t,'v':v, 'r':r, 'c':c, 'c50':c50, 'c150':c150, 'c500':c500})

    # Check nan in t (if large, adjust code for t)
    print('F nr: {} | Feature values: {} | Time values: {} | Nans found: {}'.format(ft_nr,
                                                                                    len(i_n),
                                                                         len(i_t),
                                                                         (np.isnan(t)).sum()))

    # Prepare grouping window
    t_window = 0.02
    df['t_w'] = (df['t']/t_window).round(0)*t_window

    # Calculate average numeric value per time window and add to DataFrame
    mean_t_w = df.groupby('t_w')[['v']].agg('median').rename(columns={'v':'mu_v'})
    df = df.merge(mean_t_w, how='left', left_on='t_w', right_index=True)
    
    # Calculate average numeric value per time window and cluster and add to DataFrame
    mean_t_w_c = df.groupby(['t_w','c'])[['v']].agg('median').rename(columns={'v':'mu_v_c'})
    df = df.set_index(['t_w','c']).merge(mean_t_w_c, how='left', left_index=True, right_index=True).reset_index()

    # Above is faster than transform:
    # df['mu_v'] = df.groupby('t_w')[['v']].transform('mean')
    
    # Calculate some differences
    df['v_diff_abs'] = df['v'] - df['mu_v_c']
    df['v_diff_abs_abs'] = df['v_diff_abs'].abs()
    df['v_diff_rel'] = (df['v'] / df['mu_v']).abs()
    
    if showplot:
        # Define figure and axes
        plt.figure(figsize=(16,10))
        gs = gridspec.GridSpec(4, 5)
        ax0 = plt.subplot(gs[0,:]) # Raw data
        ax1 = plt.subplot(gs[1,:]) # Detrended data
        ax5 = plt.subplot(gs[2,:]) # Detrended data
        ax2 = plt.subplot(gs[3,0]) # Histogram of raw values
        ax3 = plt.subplot(gs[3,1]) # Histogram of detrended values
        ax4 = plt.subplot(gs[3,2]) # Histogram of relative values
        ax6 = plt.subplot(gs[3,3]) # Cumulative plot
        ax7 = plt.subplot(gs[3,4]) # Relative values per bin

        means = df.groupby('r').mean()
        xlim = [0, 1750]

        plt.suptitle('Feature: {} | Station: {} | <v-> = {:.4f} | <v+> = {:.4f}'.format(ft_nr,
                                                                                       station,
                                                                                       means.loc[0,'v_diff_abs_abs'],
                                                                                       means.loc[1,'v_diff_abs_abs']))
        colors = ['r','g','b','k','c','w','m','y']

        # Raw values
        for cl in cluster_info[cluster].unique():
            cl_x = df[df['c']==cl].t
            cl_y = df[df['c']==cl].v
            
            if cl_x.shape[0]>0:
                ax0.scatter(cl_x, cl_y, marker='.', color=colors[cl % 8], label='{}: {:.2f}%'.format(cl,
                                                                                             float(cl_x.shape[0])/df.shape[0]*100))

        ax0.scatter(df.t_w, df.mu_v,color='c', marker='x', s=5)
        ax0.set_ylabel('Raw value')
        ax0.set_xlim(xlim)

        # Add legend
        ax0.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=cluster_info[cluster].nunique() + 1, mode="expand", borderaxespad=0.)



        # Detrended values
        ax1.scatter(df.t, df.v_diff_abs, marker='.', color='g')
        ax1.scatter(df.t[df.r==1], df.v_diff_abs[df.r==1], color='r', marker='.', s=100)
        ax1.set_ylabel('Detrended')
        ax1.set_xlim(xlim)

        # Error rates
        e_rates = df.groupby('t_w')['r'].agg(['sum','count'])
        e_rates['r'] = e_rates['sum'] / e_rates['count']
        e_rates['r'].plot(marker='.', linestyle='None', markersize='10', ax=ax5)
        ax5.set_ylabel('Error rate')
        ax5.set_xlim(xlim)

        # Histogram of raw values
        nbins = 100
        rmax = 0.4
        freq_1, bins = np.histogram(df.loc[df['r']==1,'v'], bins=nbins, range=[-rmax, rmax], density=True)
        freq_0, bins = np.histogram(df.loc[df['r']==0,'v'], bins=nbins, range=[-rmax, rmax], density=True)

        ax2.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
        ax2.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
        ax2.set_ylabel('Raw values')

        # Histogram of values after detrending
        nbins = 100
        rmax = 0.4
        freq_1, bins = np.histogram(df.loc[df['r']==1,'v_diff_abs'], bins=nbins, range=[-rmax, rmax], density=True)
        freq_0, bins = np.histogram(df.loc[df['r']==0,'v_diff_abs'], bins=nbins, range=[-rmax, rmax], density=True)

        ax3.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
        ax3.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
        ax3.set_ylabel('Detrended')

        # Relative values per bin
        ax4.bar(bins[1:], freq_1/freq_0, color='b', width=2*rmax/nbins, alpha=0.5)
        ax4.set_ylabel('Difference')

        # Cumulative sum of abs values
        df['dummy'] = 1
        tmp0 = df[df['r']==0].set_index('v_diff_abs_abs').sort_index()['dummy'].cumsum()
        tmp1 = df[df['r']==1].set_index('v_diff_abs_abs').sort_index()['dummy'].cumsum()
        ax6.plot(tmp0.index, tmp0.values.astype(float) / tmp0.max(), color='g')
        ax6.plot(tmp1.index, tmp1.values.astype(float) / tmp1.max(), color='r')
        ax6.set_ylabel('Cumsum')

        # Histogram of relative values by detrending
        nbins = 40
        rmax = 40
        freq_0, bins = np.histogram(df.loc[df['r']==0,'v_diff_rel'], bins=nbins, range=[0, rmax], density=True)
        freq_1, bins = np.histogram(df.loc[df['r']==1,'v_diff_rel'], bins=nbins, range=[0, rmax], density=True)

        ax7.bar(bins[1:], freq_0, color='g', width=rmax/nbins, alpha=0.5)
        ax7.bar(bins[1:], freq_1, color='r', width=rmax/nbins, alpha=0.5)
        ax7.set_ylabel('Relative detrending')

        plt.show()
    
    return df



In [92]:

    
a = visualize_detrended_numeric_feature(3354, cluster='cluster_n25', showplot=False)









    



F nr: 3354 | Feature values: 2239066 | Time values: 2239066 | Nans found: 0



In [ ]:

    
data_detrended = data.copy()

for f_nr in lut['feature_nr'].unique():
    a = visualize_detrended_numeric_feature(f_nr, showplot=False, cluster='cluster_n25')
    if isinstance(a, pd.DataFrame):
        # Update values in sparse array
        n_col = lut[lut.feature_nr==f_nr].col_num.values[0]
        i_n = data[:, n_col].nonzero()[0]
        
        # To not mess up sparse structure
        a.loc[a['v_diff_abs']==0,'v_diff_abs'] = 1e-5
        
        data_detrended[a['i'].values, n_col] = a['v_diff_abs'].values.reshape(-1,1)
    else:
        print('No numeric data for feature {}'.format(f_nr))









    



F nr: 0 | Feature values: 1348366 | Time values: 1348366 | Nans found: 2
F nr: 2 | Feature values: 1348366 | Time values: 1348366 | Nans found: 2
F nr: 4 | Feature values: 1348366 | Time values: 1348366 | Nans found: 2



In [36]:

    
with open(os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended.pkl'), 'wb') as f:
    pickle.dump(data_detrended, f, pickle.HIGHEST_PROTOCOL)
    
#pd.concat([ids,pd.DataFrame(data_detrended.todense())], axis=1).to_csv(os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended.csv'), index=False)



In [21]:

    
def visualize_values_per_cluster(a):
    ''' Visualizes the distribution of numeric values per clusters
    a is the output of visualize_detrended_numeric_feature()
    '''
    cl_col = 'c50'
    n_col = 50

    for cl in range(n_col):
        print a[a[cl_col]==cl].shape[0], a[a[cl_col]==cl].v.nunique()
        if a[a[cl_col]==cl].shape[0]>100:
            f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharex=True, figsize=(16,6))
            #a[a.c500==cl].v.hist(bins=100)

            # Histogram of raw values
            nbins = 50
            rmax = 0.3
            freq_2, bins = np.histogram(a.loc[(a[cl_col]==cl),'v'], bins=nbins, range=[-rmax, rmax])
            freq_1, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax], density=True)
            freq_0, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'], bins=nbins, range=[-rmax, rmax], density=True)
            freq_1r, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax])
            ax1.set_title('<v> = {:.4f}'.format(a.loc[(a[cl_col]==cl),'v'].mean()))
            ax2.set_title('<v-> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'].mean()))
            ax3.set_title('<v+> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'].mean()))

            ax1.bar(bins[1:], freq_2, color='b', width=2*rmax/nbins, alpha=1)
            ax2.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
            ax2.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
            ax3.bar(bins[1:], freq_1r, color='r', width=2*rmax/nbins, alpha=0.5)
            plt.show()
            
def visualize_per_time(a):
    ''' Visualizes the distribution of numeric values per timeunit
    a is the output of visualize_detrended_numeric_feature()
    '''
    # square root of number of time splits
    n = 5

    f, ax = plt.subplots(n,n,sharex=True, figsize=(16,16))

    t_split = np.linspace(0, a['t'].max(), n ** 2 +1)

    for i in range(n ** 2):

        tmin = t_split[i]
        tmax = t_split[i+1]
        #print tmin, tmax
        # Histogram of raw values
        nbins = 50
        rmax = 0.3
        freq_2, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax),'v'], bins=nbins, range=[-rmax, rmax])
        freq_1, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax], density=True)
        freq_0, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==0),'v'], bins=nbins, range=[-rmax, rmax], density=True)
        freq_1r, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax])
        #ax1.set_title('<v> = {:.4f}'.format(a.loc[(a[cl_col]==cl),'v'].mean()))
        #ax2.set_title('<v-> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'].mean()))
        #ax3.set_title('<v+> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'].mean()))

        ax[i / n, i % n].bar(bins[1:], freq_2, color='b', width=2*rmax/nbins, alpha=1)
        #ax[i / 9, 9 % i].bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
        #ax[i / 9, 9 % i].bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
        #ax[i / 9, 9 % i].bar(bins[1:], freq_1r, color='r', width=2*rmax/nbins, alpha=0.5)
    plt.show()



In [20]:

    
def check_timestamp_coverage_per_feature():
    for n in lut.feature_nr.unique():

        station = lut[lut.feature_nr==n].station_V2.values[0]

        t_col = lut[lut.feature_nr==n].col_dat.values[0]
        # TO-DO: GET TIMING FROM FEATURE WITH TIMES

        n_col = lut[lut.feature_nr==n].col_num.values[0]

        if np.isnan(n_col):
            continue

        i_n = data[:, n_col].nonzero()[0]

        if np.isnan(t_col):
            i_t = 0.0
        else:
            i_t = set(i_n) & set(t[:, t_col].nonzero()[0])
            i_t = 1.0 * len(i_t)


        i_ts = (~t_station.iloc[i_n, t_station.columns.get_loc('t_' + str(station))].isnull()).sum()

        i_n = 1.0 * len(i_n)

        print('F: {} | S: {} | n: {} | t: {} | r: {}% t(S): {}'.format(n, 
                                                                       station, 
                                                                       i_n, 
                                                                       i_t, 
                                                                       i_t/i_n*100,
                                                                       i_ts))



In [31]:

    
def visualize_feature(f_nr):
    
    t_col = lut[lut.feature_nr==f_nr].col_dat.values[0]
    n_col = lut[lut.feature_nr==f_nr].col_num.values[0]
    
    if np.isnan(n_col):
        print('No numeric data for {}'.format(f_nr))
        return
    
    if np.isnan(t_col):
        print('No timestamp data for {}'.format(f_nr))
        return
    
    #try:
    if 1:
        # Get rows that have values for t AND numeric data
        nrow = list((set(t[:, t_col].nonzero()[0]) & set(data[:, n_col].nonzero()[0])))
        nrow_1 = list(set(nrow) & set(n_1))
        nrow_0 = list(set(nrow) & set(n_0))

        t0 = t[nrow_0[:250000], t_col].todense()
        d0 = data[nrow_0[:250000], n_col].todense()
        t1 = t[nrow_1, t_col].todense()
        d1 = data[nrow_1, n_col].todense()

        # Sort arrays
        s_t0 = np.argsort(t0, axis=0)
        s_t1 = np.argsort(t1, axis=0)

        t0 = t0[s_t0].ravel().T
        d0 = d0[s_t0].ravel().T
        t1 = t1[s_t1].ravel().T
        d1 = d1[s_t1].ravel().T

        df_0 = pd.DataFrame(np.concatenate([t0,d0], axis=1), columns=['t0','d0'])
        df_1 = pd.DataFrame(np.concatenate([t1,d1], axis=1), columns=['t1','d1'])
        df_0.loc[:, 't0'] = df_0['t0'].round(0)
        df_1.loc[:, 't1'] = df_1['t1'].round(0)

        df_0_agg = df_0.groupby('t0').agg(['mean','count'])
        df_1_agg = df_1.groupby('t1').agg(['mean','count'])

        df_all = df_0_agg.merge(df_1_agg, left_index=True, right_index=True, how='outer').fillna(0)

        f, (ax1, ax2, ax3, ax4) = plt.subplots(4,1,figsize=(16,6), sharex=True)
        ax1.plot(t0, d0, linestyle='None', marker='.', color='g')
        ax1.plot(t1, d1, linestyle='None', marker='.', color='r')
        ax1.set_ylabel('Value')
        ax1.set_title('Line {} | Station {} | Feature {} '.format(lut[lut.feature_nr==f_nr].line.values[0],
                                                                  lut[lut.feature_nr==f_nr].station_V2.values[0],
                                                                  f_nr))

        ax2.plot(df_0_agg.index, df_0_agg['d0']['mean'], color='g')
        ax2.plot(df_1_agg.index, df_1_agg['d1']['mean'], color='r')
        ax2.set_ylabel('Moving average')

        ax3.plot(df_all.index, df_all['d1']['count'].astype(float).div(df_all['d0']['count']), color='b')
        ax3.set_ylabel('Error rate')
        ax3.set_xlabel('Timestamp')
        ax3.set_ylim([0, 0.1])

        plt.show()
        
        return df_all



In [ ]:

	feature_nr	feat_nr_dat	name_dat	name_cat	name_num	col_dat	col_num	col_cat	line_V2
0	0	1.0	L0_S0_D1	NaN	L0_S0_F0	0.0	0.0	NaN	1.0
1	2	3.0	L0_S0_D3	NaN	L0_S0_F2	1.0	1.0	NaN	1.0
2	4	5.0	L0_S0_D5	NaN	L0_S0_F4	2.0	2.0	NaN	1.0

	Id	t_0.0	t_1.0	t_2.0	t_3.0	t_4.0	t_5.0	t_6.0	t_7.0	t_8.0	...	t_42.0	t_43.0	t_44.0	t_45.0	t_46.0	t_47.0	t_48.0	t_49.0	t_50.0	t_51.0
0	4	8224.0	8224.0	8224.0	NaN	8226.0	NaN	NaN	8226.0	8227.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	6	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	7	161870.0	161870.0	161870.0	NaN	NaN	161872.0	161872.0	NaN	161873.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN