In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# print(os.listdir("../input"))

import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor

# import matplotlib.pyplot as plt
# import seaborn as sns

# import dask.dataframe as dd
# from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)
import concurrent.futures

import pickle

from leak_cols import *
leak_list = LEAK_LIST

# Any results you write to the current directory are saved as output.
# Root directory holding the competition CSVs, relative to this notebook.
path = "../../Data/"
train = pd.read_csv(path + "train.csv", index_col = 'ID')
test = pd.read_csv(path + "test.csv", index_col = 'ID')

# Toggle for fast iteration: run the whole pipeline on a 1000-row slice.
debug = False
if debug:
    train = train[:1000]
    test = test[:1000]

# All feature columns (everything except the row ID and the regression target).
transact_cols = [f for f in train.columns if f not in ["ID", "target"]]
# log1p of the target — presumably to match an RMSLE-style metric; verify
# against the evaluation code before reusing.
y = np.log1p(train["target"]).values

# Hand-ordered "leak" columns: 40 anonymized features whose values appear to
# form a time-shifted series of the target. Order matters — cols[lag] is read
# as the leaked prediction for that lag in _get_leak below.
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
       '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
       'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
       '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212',  '66ace2992',
       'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
       '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
       '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2',  '0572565c2',
       '190db8488',  'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98'] 

from multiprocessing import Pool


/usr/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/usr/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/usr/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/usr/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/usr/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/usr/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/usr/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/usr/local/lib/python3.5/dist-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

In [2]:
# When True, compute leak targets for the train set (so accuracy against the
# known target can be measured); when False, for the test set.
IsTrain = False
# Worker count for the (currently disabled) multiprocessing path; also used as
# the per-iteration lag batch size inside get_all_leak.
CPU_CORES = 1
# Minimum number of nonzero entries a row must have before its values are
# trusted to form a match key in _get_leak.
NZ_NUM = 2

In [3]:
# def _get_leak(df, cols, search_ind, lag=0):
#     """ To get leak value, we do following:
#        1. Get string of all values after removing first two time steps
#        2. For all rows we shift the row by two steps and again make a string
#        3. Just find rows where string from 2 matches string from 1
#        4. Get 1st time step of row in 3 (Currently, there is additional condition to only fetch value if we got exactly one match in step 3)"""
#     f1 = [] #cols[:((lag+2) * -1)]
#     f2 = [] #cols[(lag+2):]
#     for ef in leak_list:
#         f1 += ef[:((lag+2) * -1)]
#         f2 += ef[(lag+2):]
#     series_str = df[f2]
#     nz = series_str.apply(lambda x: len(x[x!=0]), axis=1)
#     series_str = series_str[nz >= NZ_NUM]
#     series_str = series_str.apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)
#     series_str = series_str.drop_duplicates(keep = False) #[(~series_str.duplicated(keep = False)) | (df[cols[lag]] != 0)]
#     series_shifted_str = df.loc[search_ind, f1].apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)
#     target_rows = series_shifted_str.progress_apply(lambda x: np.where(x == series_str.values)[0])
#     # print(target_rows)
#     # del series_str, series_shifted_str
#     # target_vals = target_rows.apply(lambda x: df.loc[series_str.index[x[0]], cols[lag]] if len(x)==1 else 0)
#     target_vals = target_rows.apply(lambda x: df.loc[series_str.index[x[0]], cols[lag]] if len(x) == 1 else 0)
#         # if (len(x) > 0 and df.loc[series_str.index[x], cols[lag]].nunique() == 1) else 0)
#     return target_vals, lag

def _get_leak(df, cols, search_ind, lag=0):
    """ To get leak value, we do following:
       1. Get string of all values after removing first two time steps
       2. For all rows we shift the row by two steps and again make a string
       3. Just find rows where string from 2 matches string from 1
       4. Get 1st time step of row in 3 (Currently, there is additional condition to only fetch value if we got exactly one match in step 3)"""
    f1 = [] #cols[:((lag+2) * -1)]
    f2 = [] #cols[(lag+2):]
    for ef in leak_list:
        f1 += ef[:((lag+2) * -1)]
        f2 += ef[(lag+2):]
    series_str = df[f2]
    nz = series_str.apply(lambda x: len(x[x!=0]), axis=1)
    series_str = series_str[nz >= NZ_NUM]
    series_str['key'] = series_str.apply(tuple), axis=1)
    series_str = series_str[~series_str.duplicated(['key'], keep=False)] 
    series_str['pred'] = df[cols[lag]]
    series_str = series_str[['key', 'pred']]
    
    series_shifted_str = df.loc[search_ind, f1]
    series_shifted_str['key'] = series_shifted_str.apply(tuple), axis=1)
    series_shifted_str = series_shifted_str.reset_index()[['key', 'ID']]
    
    target_vals = series_shifted_str.merge(series_str, how='left', on='key').set_index('ID')
#     print (target_vals.head())
    target_vals = target_vals.pred.fillna(0)
    print ('Done')
    return target_vals, lag

In [4]:
def get_all_leak(df, cols=None, nlags=15):
    """
    We just recursively fetch target value for different lags.

    For lag = 0 .. nlags-1, calls _get_leak on the rows whose combined leak
    target is still 0 and keeps the first nonzero value found per row.
    Stores each lag's result as a 'leak_target_<lag>' column and the combined
    result as 'leak_target' on a copy of df, which is returned.

    NOTE(review): relies on module globals — CPU_CORES (lag batch size),
    IsTrain, train, test — and prints progress/accuracy stats each batch.
    """
    df =  df.copy()
#     print(df.head())
    if True: #with Pool(processes=CPU_CORES) as p:
        begin_ind = 0
        end_ind = 0
        # Combined leak target; 0 means "not found yet" for that row.
        leak_target = pd.Series(0, index = df.index)
        while begin_ind < nlags:
            # Process lags in batches of CPU_CORES (batch size 1 when serial).
            end_ind = min(begin_ind + CPU_CORES, nlags)
            # Only rows still lacking a leak value are searched this round.
            search_ind = (leak_target == 0)
            # print(search_ind)
            print('begin_ind: ', begin_ind, 'end_ind: ', end_ind, "search_ind_len: ", search_ind.sum())
#             res = [p.apply_async(_get_leak, args=(df, cols, search_ind, i)) for i in range(begin_ind, end_ind)]
#             for r in res:
#                 target_vals, lag = r.get()
# #                 print ('target_vale', target_vals.head())
#                 # leak_target[target_vals.index] = target_vals
#                 df['leak_target_' + str(lag)] = target_vals
            target_vals, lag = _get_leak(df, cols, search_ind, begin_ind)
            df['leak_target_' + str(lag)] = target_vals
            # Fill still-empty slots from this batch's per-lag columns;
            # earlier lags take precedence over later ones.
            for i in range(begin_ind, end_ind):
                leak_target[leak_target == 0] = df.loc[leak_target == 0, 'leak_target_' + str(i)]
            # Progress stats; zeros are placeholders overwritten below.
            leak_train = 0 #leak_target[train.index]
            leak_train_len = 0 #leak_train[leak_train != 0].shape[0]
            leak_test_len = 0 #leak_target[test.index][leak_target != 0].shape[0]
            leak_train_right_len = 0 #leak_train[leak_train.round(0) == train['target'].round(0)].shape[0]
            leak_train_right_ratio = 0 #leak_train_right_len / leak_train_len
            if IsTrain:
                # On train we can score the found leaks against the true target.
                leak_train = leak_target[train.index]
#                 print (leak_train.head())
                leak_train_len = leak_train[leak_train != 0].shape[0]
                leak_train_right_len = leak_train[leak_train.round(0) == train['target'].round(0)].shape[0]
                leak_train_right_ratio = leak_train_right_len / leak_train_len
            else:
                leak_test_len = leak_target[test.index][leak_target != 0].shape[0]
            print('Find leak in train and test: ', leak_train_len, leak_test_len, \
                "leak train right: ", leak_train_right_len, leak_train_right_ratio)
            begin_ind = end_ind
    # for i in range(nlags):
    #     df.loc[df['leak_target'] == 0, 'leak_target'] = df.loc[df['leak_target'] == 0, 'leak_target_' + str(i)]
    df['leak_target'] = leak_target
    return df

def get_pred(data, lag=2):
    """Leak lookup for a single lag over the FEATURES column list.

    A row whose leading window (FEATURES[:-lag]) uniquely matches another
    row's trailing window (FEATURES[lag:]) inherits that row's value at
    FEATURES[lag - 2]; rows without a unique match get 0.

    NOTE(review): depends on a module-level FEATURES list that is not
    defined in this notebook — confirm before calling.
    """
    def row_keys(frame):
        # Collapse each row into a hashable tuple key for exact matching.
        return frame.apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})

    queries = row_keys(data[FEATURES[:-lag]])
    lookup = row_keys(data[FEATURES[lag:]])
    lookup['pred'] = data[FEATURES[lag - 2]]
    # Keys that occur more than once are ambiguous — drop every occurrence.
    unambiguous = lookup[~lookup.duplicated(['key'], keep=False)]
    return queries.merge(unambiguous, how='left', on='key').pred.fillna(0)

def get_all_pred(data, max_lag):
    """Combine per-lag leak predictions for lags 2..max_lag.

    Each row keeps the first nonzero prediction found (smaller lags win);
    rows never matched stay 0.
    """
    combined = pd.Series(np.zeros(data.shape[0]), index=data.index)
    for lag in range(2, max_lag + 1):
        candidate = get_pred(data, lag)
        # Only fill rows that are still empty and have a nonzero candidate.
        take = (combined == 0) & (candidate != 0)
        combined[take] = candidate[take]
    return combined

# Placeholder target for test rows so train and test share the same columns.
test["target"] = 0 #train["target"].mean()

# all_df = pd.concat([train[["ID", "target"] + cols], test[["ID", "target"]+ cols]]).reset_index(drop=True)
# all_df = pd.concat([train[["target"] + cols], test[["target"]+ cols]]) #.reset_index(drop=True)
# all_df.head()

NLAGS = 38 #Increasing this might help push score a bit
# Every column involved in leak matching, plus the target itself.
# (The comprehension variable `cols` shadows the global leak-column list
# only inside the comprehension; the global is untouched.)
used_col = ["target"] + [col for cols in leak_list for col in cols]
print ('used_col length: ', len(used_col))
if IsTrain:
    all_df = get_all_leak(train[used_col], cols=cols, nlags=NLAGS)
else:
    all_df = get_all_leak(test[used_col], cols=cols, nlags=NLAGS)


used_col length:  3001
begin_ind:  0 end_ind:  1 search_ind_len:  49342
Done
Find leak in train and test:  0 2943 leak train right:  0 0
begin_ind:  1 end_ind:  2 search_ind_len:  46399
Done
Find leak in train and test:  0 4212 leak train right:  0 0
begin_ind:  2 end_ind:  3 search_ind_len:  45130
Done
Find leak in train and test:  0 4949 leak train right:  0 0
begin_ind:  3 end_ind:  4 search_ind_len:  44393
Done
Find leak in train and test:  0 5512 leak train right:  0 0
begin_ind:  4 end_ind:  5 search_ind_len:  43830
Done
Find leak in train and test:  0 5935 leak train right:  0 0
begin_ind:  5 end_ind:  6 search_ind_len:  43407
Done
Find leak in train and test:  0 6235 leak train right:  0 0
begin_ind:  6 end_ind:  7 search_ind_len:  43107
Done
Find leak in train and test:  0 6442 leak train right:  0 0
begin_ind:  7 end_ind:  8 search_ind_len:  42900
Done
Find leak in train and test:  0 6592 leak train right:  0 0
begin_ind:  8 end_ind:  9 search_ind_len:  42750
Done
Find leak in train and test:  0 6763 leak train right:  0 0
begin_ind:  9 end_ind:  10 search_ind_len:  42579
Done
Find leak in train and test:  0 6898 leak train right:  0 0
begin_ind:  10 end_ind:  11 search_ind_len:  42444
Done
Find leak in train and test:  0 7013 leak train right:  0 0
begin_ind:  11 end_ind:  12 search_ind_len:  42329
Done
Find leak in train and test:  0 7109 leak train right:  0 0
begin_ind:  12 end_ind:  13 search_ind_len:  42233
Done
Find leak in train and test:  0 7185 leak train right:  0 0
begin_ind:  13 end_ind:  14 search_ind_len:  42157
Done
Find leak in train and test:  0 7259 leak train right:  0 0
begin_ind:  14 end_ind:  15 search_ind_len:  42083
Done
Find leak in train and test:  0 7318 leak train right:  0 0
begin_ind:  15 end_ind:  16 search_ind_len:  42024
Done
Find leak in train and test:  0 7376 leak train right:  0 0
begin_ind:  16 end_ind:  17 search_ind_len:  41966
Done
Find leak in train and test:  0 7424 leak train right:  0 0
begin_ind:  17 end_ind:  18 search_ind_len:  41918
Done
Find leak in train and test:  0 7462 leak train right:  0 0
begin_ind:  18 end_ind:  19 search_ind_len:  41880
Done
Find leak in train and test:  0 7490 leak train right:  0 0
begin_ind:  19 end_ind:  20 search_ind_len:  41852
Done
Find leak in train and test:  0 7538 leak train right:  0 0
begin_ind:  20 end_ind:  21 search_ind_len:  41804
Done
Find leak in train and test:  0 7576 leak train right:  0 0
begin_ind:  21 end_ind:  22 search_ind_len:  41766
Done
Find leak in train and test:  0 7605 leak train right:  0 0
begin_ind:  22 end_ind:  23 search_ind_len:  41737
Done
Find leak in train and test:  0 7640 leak train right:  0 0
begin_ind:  23 end_ind:  24 search_ind_len:  41702
Done
Find leak in train and test:  0 7661 leak train right:  0 0
begin_ind:  24 end_ind:  25 search_ind_len:  41681
Done
Find leak in train and test:  0 7678 leak train right:  0 0
begin_ind:  25 end_ind:  26 search_ind_len:  41664
Done
Find leak in train and test:  0 7692 leak train right:  0 0
begin_ind:  26 end_ind:  27 search_ind_len:  41650
Done
Find leak in train and test:  0 7705 leak train right:  0 0
begin_ind:  27 end_ind:  28 search_ind_len:  41637
Done
Find leak in train and test:  0 7718 leak train right:  0 0
begin_ind:  28 end_ind:  29 search_ind_len:  41624
Done
Find leak in train and test:  0 7730 leak train right:  0 0
begin_ind:  29 end_ind:  30 search_ind_len:  41612
Done
Find leak in train and test:  0 7743 leak train right:  0 0
begin_ind:  30 end_ind:  31 search_ind_len:  41599
Done
Find leak in train and test:  0 7757 leak train right:  0 0
begin_ind:  31 end_ind:  32 search_ind_len:  41585
Done
Find leak in train and test:  0 7764 leak train right:  0 0
begin_ind:  32 end_ind:  33 search_ind_len:  41578
Done
Find leak in train and test:  0 7770 leak train right:  0 0
begin_ind:  33 end_ind:  34 search_ind_len:  41572
Done
Find leak in train and test:  0 7773 leak train right:  0 0
begin_ind:  34 end_ind:  35 search_ind_len:  41569
Done
Find leak in train and test:  0 7779 leak train right:  0 0
begin_ind:  35 end_ind:  36 search_ind_len:  41563
Done
Find leak in train and test:  0 7782 leak train right:  0 0
begin_ind:  36 end_ind:  37 search_ind_len:  41560
Done
Find leak in train and test:  0 7785 leak train right:  0 0
begin_ind:  37 end_ind:  38 search_ind_len:  41557
Done
Find leak in train and test:  0 7797 leak train right:  0 0

In [5]:
import time
# Timestamp suffix keeps successive runs from overwriting each other's files.
time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
    
if IsTrain:
    all_df[['target', 'leak_target']].to_csv(path + 'train_target_leaktarget_' + str(NLAGS) + "_" + str(NZ_NUM) + time_label +  '.csv')
else:
    all_df[['target', 'leak_target']].to_csv(path + 'test_target_leaktarget_' + str(NLAGS) + "_" + str(NZ_NUM) + time_label + '.csv')
# with open(path + 'leak_target_' + str(NLAGS) + '.pickle', 'wb+') as handle:
#     pickle.dump(all_df[['target', 'leak_target']], handle, protocol=pickle.HIGHEST_PROTOCOL)

# Patch a base submission: wherever a nonzero leak target was found, it
# overrides the model's prediction. NOTE(review): the base submission file
# name is hardcoded to a specific earlier run — update before reuse.
sub = pd.read_csv(path + 'sub_2018_08_13_03_19_33.csv', index_col = 'ID')
leak_target = all_df['leak_target'][test.index]
# print(leak_target)
sub.loc[leak_target[leak_target != 0].index, 'target'] = leak_target[leak_target != 0]

if not IsTrain:
    sub.to_csv(path + 'leak_sub_' + str(NLAGS) + "_" + time_label + '.csv')