In [3]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from scipy.sparse import csr_matrix, vstack

%matplotlib inline

# Custom modules
import const
import func

Load data


In [2]:
print const.TRAIN_FILES


['train_numeric', 'train_categorical_to_num', 'train_date']

In [57]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)


Out[57]:
line station feature_nr feat_nr_dat name_dat name_cat name_num col_dat col_num col_cat station_V2 line_V2
0 0 0 0 1.0 L0_S0_D1 NaN L0_S0_F0 0.0 0.0 NaN 0.0 1.0
1 0 0 2 3.0 L0_S0_D3 NaN L0_S0_F2 1.0 1.0 NaN 0.0 1.0
2 0 0 4 5.0 L0_S0_D5 NaN L0_S0_F4 2.0 2.0 NaN 0.0 1.0

In [127]:
dat = func.load_data_file(const.TRAIN_FILES[2])
dat_train = dat['data']['features']
id_train = dat['data']['ids']

dat = func.load_data_file(const.TEST_FILES[2])

dat_data = vstack([dat_train, dat['data']['features']], format='csr')
ids = pd.concat([id_train, dat['data']['ids']], axis=0)


Returning <open file '/Volumes/My Book/kaggle_bosch/train_date.pkl', mode 'rb' at 0x115eba540>.pkl
Returning <open file '/Volumes/My Book/kaggle_bosch/test_date.pkl', mode 'rb' at 0x115eba540>.pkl

In [128]:
print dat_data.shape
print ids.shape
ids.head(3)


(2367495, 1156)
(2367495, 1)
Out[128]:
Id
0 4
1 6
2 7

In [20]:
dat_our = pd.read_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[2] + '.csv'), nrows=1000)
dat_our.head(3)


Out[20]:
Id L0_S0_D1 L0_S0_D3 L0_S0_D5 L0_S0_D7 L0_S0_D9 L0_S0_D11 L0_S0_D13 L0_S0_D15 L0_S0_D17 ... L3_S50_D4246 L3_S50_D4248 L3_S50_D4250 L3_S50_D4252 L3_S50_D4254 L3_S51_D4255 L3_S51_D4257 L3_S51_D4259 L3_S51_D4261 L3_S51_D4263
0 4 82.24 82.24 82.24 82.24 82.24 82.24 82.24 82.24 82.24 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 6 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 7 1618.70 1618.70 1618.70 1618.70 1618.70 1618.70 1618.70 1618.70 1618.70 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3 rows × 1157 columns


In [162]:
# Load jayjay's features
#dat_jay = pd.read_csv('data_jayjay/train.csv', nrows=1000)
dat_jay = pd.read_csv('data_jayjay/train.csv')
#cat_cols = cat_jay.filter(like='CATEGORICAL').columns
#cat_jay = cat_jay[cat_cols]
#print cat_jay.shape
#cat_jay.head(3)

In [50]:
# On all samples
print (dat_our.iloc[:,1:].kurtosis(axis=1)-dat_jay['Kurtosis']).mean()
print (dat_our.iloc[:,1:].max(axis=1)!=dat_jay['Max']).sum() # Nans are different
print (dat_our.iloc[:,1:].min(axis=1)!=dat_jay['Min']).sum() # Nans are different
print (dat_our.iloc[:,1:].mean(axis=1)-dat_jay['Mean']).mean()
print (dat_our.iloc[:,1:].apply(lambda x: x.nunique(), axis=1)!=dat_jay['Unique count']).sum()


4.46731303101e-14
1
1
5.62173829899e-11
0

In [87]:
a=pd.Series([1,np.nan,2])
b=pd.Series([1,np.nan,2])

In [88]:
compare(a,b)


Out[88]:
True

In [168]:
def compare(x, y):
    '''Compare two pandas Series for equality.

    Returns True when the two Series are identical (``Series.equals``
    treats NaNs in the same positions as equal). Otherwise returns a
    short summary string of the elementwise difference.

    Note: the mean/sum of (x - y) skip NaN positions, so rows where only
    one side is NaN do not show up in the reported difference.
    '''
    if x.equals(y):
        return True
    # Not equal: report the magnitude of the elementwise difference.
    # (The original had an unreachable `return False` after this point.)
    diff = x - y
    return 'Different: mean: {} sum: {}'.format(diff.mean(), diff.sum())

In [113]:
# Line 0
line=0
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]

print('Line 0')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L0_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L0_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].mean(axis=1), dat_jay['L0_Mean'])
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L0_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L0_Unique count'])
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['DATE_L0max']) # Nans are different


Line 0
True
True
8.2864247259e-11
-1.4875440359e-14
True
True

In [ ]:
# Line 1
line=1
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]

print('Line 1')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L1_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L1_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L1_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L1_Unique count'])
print compare(dat_our.iloc[:,col_date].kurtosis(axis=1), dat_jay['DATE_L1kurt'])

In [111]:
# Line 2
line=2
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]

print('Line 2')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L2_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L2_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L2_Range'])


Line 2
True
True
True

In [115]:
# Line 3
line=3
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]

print('Line 3')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L3_Max']) 
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L3_Min']) 
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L3_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L3_Unique count'])
print compare(dat_our.iloc[:,col_date].kurtosis(axis=1), dat_jay['DATE_L3kurt'])


Line 3
True
True
-1.23021153797e-14
True
-1.89971991518e-13

In [116]:
# Between lines and stations
col_dateL0 = [int(i)+1 for i in lut[lut['line']==0].col_dat.values if not np.isnan(i)]
col_dateL2 = [int(i)+1 for i in lut[lut['line']==2].col_dat.values if not np.isnan(i)]
col_dateL3 = [int(i)+1 for i in lut[lut['line']==3].col_dat.values if not np.isnan(i)]
col_dateS37 = [int(i)+1 for i in lut[lut['station']==37].col_dat.values if not np.isnan(i)]
col_dateS26 = [int(i)+1 for i in lut[lut['station']==26].col_dat.values if not np.isnan(i)]
col_dateS30 = [int(i)+1 for i in lut[lut['station']==30].col_dat.values if not np.isnan(i)]
col_dateS34 = [int(i)+1 for i in lut[lut['station']==34].col_dat.values if not np.isnan(i)]

print('Line 3')
print compare(dat_our.iloc[:,col_dateL0].max(axis=1) - dat_our.iloc[:,col_dateL3].max(axis=1), dat_jay['L0max_L3max'])
print compare(dat_our.iloc[:,col_dateL2].max(axis=1) - dat_our.iloc[:,col_dateS37].max(axis=1), dat_jay['L2max_S37max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateL0].min(axis=1), dat_jay['L3max_L0min'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS26].max(axis=1), dat_jay['L3max_S26max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS30].max(axis=1), dat_jay['L3max_S30max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS34].min(axis=1), dat_jay['L3max_S34min'])


Line 3
-1.01295843022e-17
-5.11415987649e-16
2.70646024476e-15
2.45490308551e-16
-8.93419814403e-15
1.38480289532e-15

In [132]:
max(col_date)


Out[132]:
1156

In [225]:
# Same station

# First get max per line for all train and test samples
df = pd.DataFrame(columns=['L0max','L1max','L2max','L3max'], index=ids.Id)
for l in range(4):
    # Column positions of this line's date features. No +1 offset here:
    # dat_data is the sparse feature matrix without the leading Id column
    # (unlike the raw CSV used earlier, where +1 was needed).
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]

    # Row-wise max over the sparse date columns, densified to a 1-d array.
    df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    
    # Zero encodes "no date recorded" in the sparse matrix; map it to NaN.
    df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
    # NOTE(review): the result of .round(2) is discarded (not assigned
    # back), so this line is a no-op — confirm whether rounding was
    # intended; assigning it back would change the comparisons below.
    df['L{}max'.format(l)].round(2)

# Keep the original row position so sorting can be checked afterwards:
# the first reset turns Id into a column, the second adds the row index.
df.reset_index(inplace=True)
df.reset_index(inplace=True)

In [226]:
# Sort by ID
df.sort_values(['Id'], inplace=True)

In [227]:
# For every column, record the value of the previous / next row
# (rows were sorted by Id in the preceding cell).
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)

# Use only train id
# (features were built on train+test stacked; keep only train rows now)
df = df[df['Id'].isin(id_train.Id)]

In [228]:
df.head()


Out[228]:
index Id L0max L1max L2max L3max index_prev index_next Id_prev Id_next L0max_prev L0max_next L1max_prev L1max_next L2max_prev L2max_next L3max_prev L3max_next
0 0 4 82.269997 NaN NaN 87.290001 1183749.0 1183750.0 3.0 5.0 NaN 255.490005 671.950012 NaN 704.109985 255.500000 711.080017 256.279999
1 1 6 1313.150024 NaN NaN 1315.750000 1183750.0 2.0 5.0 7.0 255.490005 1618.729980 NaN NaN 255.500000 NaN 256.279999 1624.420044
2 2 7 1618.729980 NaN NaN 1624.420044 1.0 1183751.0 6.0 8.0 1313.150024 NaN NaN 743.400024 NaN 760.929993 1315.750000 770.280029
3 3 9 1149.219971 NaN NaN 1154.160034 1183751.0 1183752.0 8.0 10.0 NaN 907.369995 743.400024 NaN 760.929993 NaN 770.280029 911.289978
4 4 11 602.669983 NaN NaN 606.020020 1183752.0 1183753.0 10.0 12.0 907.369995 602.669983 NaN NaN NaN NaN 911.289978 623.280029

In [229]:
# Now compare
print('Line 0')
print compare(((df['L0max']==df['L0max_prev']) | (df['L0max'].isnull()) & (df['L0max_prev'].isnull())).astype(int), 
              dat_jay['sameL0'])
print compare(((df['L0max']==df['L0max_next']) | (df['L0max'].isnull()) & (df['L0max_next'].isnull())).astype(int), 
              dat_jay['sameL0 (#1)'])
print('Line 1')
print compare(((df['L1max']==df['L1max_prev']) | (df['L1max'].isnull()) & (df['L1max_prev'].isnull())).astype(int), 
              dat_jay['sameL1'])
print compare(((df['L1max']==df['L1max_next']) | (df['L1max'].isnull()) & (df['L1max_next'].isnull())).astype(int), 
              dat_jay['sameL1 (#1)'])
print('Line 2')
print compare(((df['L2max']==df['L2max_prev']) | (df['L2max'].isnull()) & (df['L2max_prev'].isnull())).astype(int), 
              dat_jay['sameL2'])
print compare(((df['L2max']==df['L2max_next']) | (df['L2max'].isnull()) & (df['L2max_next'].isnull())).astype(int), 
              dat_jay['sameL2 (#1)'])
print('Line 3')
print compare(((df['L3max']==df['L3max_prev']) | (df['L3max'].isnull()) & (df['L3max_prev'].isnull())).astype(int), 
              dat_jay['sameL3'])
print compare(((df['L3max']==df['L3max_next']) | (df['L3max'].isnull()) & (df['L3max_next'].isnull())).astype(int), 
              dat_jay['sameL3 (#1)'])


Line 0
True
True
Line 1
True
True
Line 2
True
True
Line 3
True
True

In [8]:
date_cols = ['Kurtosis', 'Max', 'Mean', 'Min', 'Range','Unique count',
             'L0_Max', 'L0_Mean', 'L0_Min', 'L0_Range', 'L0_Unique count', 'DATE_L0max',
             'L1_Max', 'L1_Min', 'L1_Range', 'L1_Unique count', 'DATE_L1kurt',
             'L2_Max', 'L2_Min', 'L2_Range', 
             'L3_Max', 'L3_Min', 'L3_Range', 'L3_Unique count', 'DATE_L3kurt', 'DATE_L3min',
             'L0max_L3max', 'L2max_S37max', 'L3max_L0min', 'L3max_S26max', 'L3max_S30max', 'L3max_S34min',
             'sameL0', 'sameL0 (#1)', 'sameL1', 'sameL1 (#1)', 'sameL2', 'sameL2 (#1)', 'sameL3', 'sameL3 (#1)',
'L1_L1_Missing value count',
'L3_L3_Missing value count',
'L3_L3_Unique count',]

In [9]:
jay_feat_diffs = ['S13min_S33min',
'S22max_S32min',
'S22min_S32min',
'S26min_S24min',
'S26min_S37min',
'S27min_S32min',
'S29max_S35max',
'S29min_S32min',
'S29min_S34min',
'S29min_S37min',
'S30min_S35min',
'S30min_S37min',
'S32max_S37min',
'S32min_S10min',
'S32min_S30min',
'S32min_S34min',
'S33max_S29min',
'S33min_S30min',
'S33min_S34min',
'S33min_S35min',
'S33min_S36min',
'S33min_S37min',
'S34min_S35min',
'S35min_L1min',
'S36max_S29min',
'S37min_S34min']
jay_feat_dates = [
'DATE_S0_max',
'DATE_S1_max',
'DATE_S10max',
'DATE_S10min',
'DATE_S11max',
'DATE_S13max',
'DATE_S13min',
'DATE_S18max',
'DATE_S19max',
'DATE_S2_max',
'DATE_S20max',
'DATE_S20min',
'DATE_S21max',
'DATE_S21min',
'DATE_S23max',
'DATE_S24max',
'DATE_S24min',
'DATE_S25max',
'DATE_S25min',
'DATE_S26max',
'DATE_S27max',
'DATE_S28max',
'DATE_S3_max',
'DATE_S30max',
'DATE_S32max',
'DATE_S32min',
'DATE_S33max',
'DATE_S34max',
'DATE_S35max',
'DATE_S36max',
'DATE_S37max',
'DATE_S38max',
'DATE_S4_max',
'DATE_S40max',
'DATE_S43max',
'DATE_S44max',
'DATE_S45min',
'DATE_S47max',
'DATE_S49max',
'DATE_S50max',
'DATE_S6_max',
'DATE_S7_max',
'DATE_S8_max',
'DATE_S8_min',
'DATE_S9_max']

In [6]:
cat_jay.filter(like='Max').columns


Out[6]:
Index([u'L0_Max', u'L3_Max', u'L1_Max', u'L2_Max', u'Max',
       u'CATEGORICAL_Max______1', u'CATEGORICAL_Max______3'],
      dtype='object')

In [8]:
cat_jay.columns[:100]


Out[8]:
Index([u'Id', u'Response', u'ratio', u'FOR100_Sum', u'L3_S33_F3857',
       u'FOR165_Sum', u'BAC165_Sum', u'BAC100_Sum', u'sameL0 (#1)',
       u'L3_S30_F3754', u'L3_S30_F3809', u'FOR165_log_lag_L2', u'L3_S30_F3744',
       u'BAC100_log_lag_L3', u'DATE_L3kurt', u'sameL3', u'sameL3 (#1)',
       u'FOR100_log_lag_L3', u'FOR165_log_lag', u'BAC165_log_lag',
       u'L3_S30_F3774', u'sameL1 (#1)', u'L3_S33_F3859', u'L3_S30_F3494',
       u'L3_S29_F3333', u'BAC165_log_lag_L3', u'L3_S30_F3759', u'L0_S1_F28',
       u'L3_S29_F3373', u'Range', u'Kurtosis', u'L3_S33_F3865', u'BAC30_Sum_S',
       u'DATE_S33max', u'FOR165_log_lag_L3', u'FOR60_Sum_S3', u'L3_S29_F3348',
       u'sameL2 (#1)', u'L0_Max', u'L3_S30_F3769', u'Response(-1)',
       u'L3_S30_F3749', u'BAC100_log_lag', u'L3_S29_F3351', u'L3_S36_F3920',
       u'FOR100_log_lag', u'BAC165_log_lag_L2', u'BAC60_Sum_S3',
       u'L3_S29_F3339', u'L3_S30_F3804', u'L3max_L0min', u'L3_S29_F3379',
       u'L3_S29_F3479', u'FOR100_log_lag_L2', u'BAC165_log_lag_L1',
       u'BAC165_log_lag_L0', u'FOR30_Sum_S', u'BAC100_log_lag_L2',
       u'L3_S29_F3342', u'L3_S29_F3327', u'L0_S2_F44', u'L3_S30_F3604',
       u'L0_S3_F100', u'L3_S30_F3534', u'L3_S29_F3354', u'L3_S30_F3574',
       u'L1_S24_F1844', u'L0_S0_F20_*_L0_S0_F20', u'L3_S30_F3639',
       u'FOR165_log_lag_L1', u'FOR100_log_lag_L1', u'FOR165_log_lag_L0',
       u'L3_S30_F3504', u'L3_S30_F3609', u'L0_S7_F138', u'L3_S29_F3321',
       u'L3_Min', u'L3_S29_F3336', u'Response(-1) (#1)', u'L0max_L3max',
       u'BAC100_log_lag_L1', u'L3_S30_F3709', u'L3_S30_F3829',
       u'FOR60_log_lag_S33', u'L3_S29_F3315', u'L0_S0_F18', u'L3_S30_F3544',
       u'L3_S29_F3330', u'L0_S1_F24', u'L0_S5_F116', u'L3_S29_F3324',
       u'CATEGORICAL_Last_____1', u'FOR60_log_lag_S30', u'L0_S0_F0',
       u'L0_S4_F109', u'L0_S9_F180', u'L3_S35_F3889', u'L1_S24_F1581',
       u'CATEGORICAL_out_out_L3_S32_F3854_class2', u'L2_Min'],
      dtype='object')

Reproduce JayJay's features


In [48]:
jay_means = cat_jay.mean()
jay_sums = cat_jay.sum()

In [95]:
def value_last_element_row(X):
    ''' Return last stored value of each row of sparse csr matrix X.

    Rows with no stored elements are returned as NaN.
    '''
    
    # Index of each row's last stored element: the entry just before the
    # next row's data starts in X.data.
    last = X.indptr[1:] - 1
    
    # Cast to float so NaN can be written below even when X.data holds an
    # integer dtype (the original raised ValueError in that case).
    output = X.data[last].astype(float)
    
    # Rows with zero stored elements indexed a neighbouring row's value
    # above; overwrite those positions with NaN.
    output[np.diff(X.indptr)==0] = np.nan
    
    return output

def max_element_row(X):
    ''' Return the maximum stored value of each row of sparse csr matrix X.

    NaN values are assumed to be encoded as zero, so any row whose maximum
    is 0 (including rows with no stored elements) is reported as NaN.
    '''
    
    row_max = np.asarray(X.max(axis=1).todense()).ravel()
    
    row_max[row_max == 0] = np.nan
    
    return row_max

def alpha_num_max_element_row(X):
    ''' Return the alphanumeric (string-wise) maximum value of each row of
    sparse csr matrix X.

    NaN values are assumed to be encoded as zero. Values are compared by
    their decimal-string representation, so e.g. '9' ranks above '10'.
    Rows without non-zero entries yield 0.
    Lazy, slow implementation; going via data/indptr would be much faster.
    '''
    
    result = []
    
    for row_idx in range(X.shape[0]):
        cols = X[row_idx, :].nonzero()[1]
        
        if cols.shape[0] == 0:
            # No non-zero entries in this row.
            result.append(0)
            continue
        
        values = X[row_idx, cols].todense().A1
        as_text = {'{:d}'.format(int(v)) for v in values}
        result.append(int(float(max(as_text))))
    
    return result

def nunique_row(X):
    ''' Return the number of distinct non-zero values per row of sparse
    csr matrix X.

    Rows without non-zero entries count as 0.
    Lazy, slow implementation; going via data/indptr would be much faster.
    '''
    
    counts = []
    
    for row_idx in range(X.shape[0]):
        cols = X[row_idx, :].nonzero()[1]
        
        if cols.shape[0] == 0:
            counts.append(0)
        else:
            distinct = {v for v in X[row_idx, cols].todense().A1}
            counts.append(len(distinct))
        
    return counts

In [10]:
# CATEGORICAL_Last_____1
n_last = cat_data[n,:].nonzero()[1][-1]
sum([2, 4, 514] == cat_data[n, n_last])
pd.Series(value_last_element_row(cat_data)).isin([2, 4, 514]).mean()


Out[10]:
0.0041672756087238238

In [65]:
# CATEGORICAL_Last_____2
pd.Series(value_last_element_row(cat_data)).isin([16, 48]).mean()


Out[65]:
0.0034889211968435821

In [149]:
## CATEGORICAL_Missing value count
pd.Series(cat_data.shape[1] - np.diff(cat_data.indptr)).mean()


Out[149]:
2082.8501723763607

In [102]:
# CATEGORICAL_Max______1 (takes a while)
list1 = [2, 8389632, 514]
pd.Series(alpha_num_max_element_row(cat_data)).isin(list1).mean()


Out[102]:
0.0027488982020651372

In [103]:
# CATEGORICAL_Max______3 (takes a while)
list3 = [3, 145, 4, 143, 8, 512, 6, 32]
pd.Series(alpha_num_max_element_row(cat_data)).isin(list3).mean()


Out[103]:
0.30736635446594585

In [148]:
# CATEGORICAL_Unique count
pd.Series(nunique_row(cat_data)).mean()


Out[148]:
1.433054529388459

In [15]:
# CATEGORICAL_out_L3_S32_F3854_class1
# CATEGORICAL_out_L3_S32_F3854_class1           0.003434
col_nr = lut[lut['name_cat']=='L3_S32_F3854'].col_cat.values[0].astype(int)

In [25]:
d = pd.Series(cat_data[:, 1986].todense().A1)
d.replace(0, np.nan, inplace=True)

In [52]:
tmp = np.zeros(d.shape)
tmp[(d==16).values] = 1
tmp[(d==48).values] = 1

In [53]:
tmp.mean()


Out[53]:
0.0034340108148109352

In [58]:
# CATEGORICAL_out_L3_S32_F3854_class2
# CATEGORICAL_out_out_L3_S32_F3854_class2       0.008123
tmp = np.zeros(d.shape)
tmp[(d==2).values] = 2
tmp[(d==4).values] = 2

In [59]:
tmp.mean()


Out[59]:
0.0081233574403990049

In [ ]: