In [1]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [2]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)
Out[2]:
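The original look-up table is not reproduced here; judging from how it is used below, these cells only rely on the columns `line`, `line_V2`, and `col_dat` (the column index of the corresponding date feature in the sparse matrix, NaN if there is none). A hypothetical stand-in with made-up values, just to show the expected shape:

# Hypothetical stand-in for the look-up table (made-up values; the real file comes from const.LOOK_UP_TABLE)
lut_example = pd.DataFrame({
    'line':    [0, 0, 1, 2, 3],         # production line of each feature
    'line_V2': [1, 1, 2, 3, 4],         # finer-grained line split used further below
    'col_dat': [0, 5, 12, np.nan, 20],  # column index of the date feature in the sparse matrix (NaN = none)
})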
In [7]:
use_sets = 0  # 0: both, 1: train, 2: test
dat = func.load_data_file(const.TRAIN_FILES[2])
dat_train = dat['data']['features']
id_train = dat['data']['ids']
dat = func.load_data_file(const.TEST_FILES[2])
if use_sets == 0:  # Both train and test
    dat_data = vstack([dat_train, dat['data']['features']], format='csr')
    ids = pd.concat([id_train, dat['data']['ids']], axis=0)
elif use_sets == 1:  # Only train
    dat_data = dat_train
    ids = id_train
elif use_sets == 2:  # Only test
    dat_data = dat['data']['features']
    ids = dat['data']['ids']
del dat, dat_train, id_train
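`func.load_data_file` is a project-specific helper that is not shown here; the cell above only assumes it returns a nested dict with a sparse feature matrix and the matching sample IDs. A quick sanity check on that assumed row alignment:

# Assumed structure of the dict returned by func.load_data_file (inferred from its use above):
#   dat['data']['features'] -> scipy.sparse.csr_matrix, one row per sample
#   dat['data']['ids']      -> pd.DataFrame with an 'Id' column, row-aligned with the features
# Sanity check: the stacked feature matrix and the concatenated IDs must have the same number of rows
assert dat_data.shape[0] == len(ids)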
In [8]:
# Prepare dataframe with the maximum timestamp per sample and production line
df = pd.DataFrame(columns=['L0max', 'L1max', 'L2max', 'L3max'], index=ids.Id)
for l in range(4):
    # Get column numbers of the sparse matrix belonging to this line
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]
    # Get maximum timestamp for this line
    df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    # Because of the sparse matrix format, missing values were encoded as zero
    df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
    # Round timestamps to two decimals to avoid spurious float differences
    df['L{}max'.format(l)] = df['L{}max'.format(l)].round(2)
# Get row index as column to check sorting afterwards
df.reset_index(inplace=True)
df.reset_index(inplace=True)
# Sort by ID
df.sort_values(['Id'], inplace=True)
# Add columns with next and previous timestamp per line
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)
df.set_index('Id', inplace=True)
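The idea behind the prev/next columns: after sorting by Id, `shift(1)` and `shift(-1)` put each sample next to its immediate neighbours in Id order, so consecutive parts that passed a line at the exact same time end up side by side. A tiny toy illustration with made-up Ids and timestamps:

# Toy illustration of the prev/next trick (hypothetical Ids and timestamps)
toy = pd.DataFrame({'L0max': [82.24, 82.24, np.nan]}, index=pd.Index([4, 6, 7], name='Id'))
toy['L0max_prev'] = toy['L0max'].shift(1)   # timestamp of the previous Id
toy['L0max_next'] = toy['L0max'].shift(-1)  # timestamp of the next Id
print(toy)
# Ids 4 and 6 share the same L0 timestamp, so the flags computed below will mark them as "same"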
In [9]:
# List to keep track of all the generated feature columns. Useful for some operations later.
feat_cols = []
for l in range(4):
    # Samples are flagged as the same if they have the same timestamp or are both missing on this line
    df['sameL{}_next'.format(l)] = 1 * (df['L{}max'.format(l)]==df['L{}max_next'.format(l)]).astype(int) + \
                                   1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_next'.format(l)].isnull())).astype(int)
    df['sameL{}_prev'.format(l)] = 1 * (df['L{}max'.format(l)]==df['L{}max_prev'.format(l)]).astype(int) + \
                                   1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_prev'.format(l)].isnull())).astype(int)
    feat_cols += ['sameL{}_prev'.format(l), 'sameL{}_next'.format(l)]
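Note that the two indicator terms are mutually exclusive (a missing timestamp never compares equal, not even to another missing one), so with both weights at 1 these flags are binary: 1 means the neighbouring sample looks identical on that line, 0 means it does not. A quick way to inspect how often that happens:

# Distribution of the binary same-line flags (how often neighbours share a line timestamp)
for col in feat_cols:
    print('{}: {}'.format(col, df[col].value_counts(dropna=False).to_dict()))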
In [11]:
# Restore the original (pre-sort) row order so the features line up with the sparse matrix rows
df.sort_values('index', inplace=True)
In [25]:
df.head()
Out[25]:
In [13]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_{}.csv'.format(use_sets)), index_label='ID')
In [32]:
# First get the maximum timestamp per line for all train and test samples
df = pd.DataFrame(columns=['L0max', 'L1max', 'L2max', 'L3max'], index=ids.Id)
for l in range(4):
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]
    df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
    df['L{}max'.format(l)] = df['L{}max'.format(l)].round(2)
# Get row index as column to check sorting afterwards
df.reset_index(inplace=True)
df.reset_index(inplace=True)
# Sort by ID
df.sort_values(['Id'], inplace=True)
# Add columns with next and previous timestamp per line
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)
df.set_index('Id', inplace=True)
In [33]:
feat_cols = []
for l in range(4):
    # Three-state encoding: 2 = identical timestamp, 1 = both missing, 0 = different
    df['sameL{}_next'.format(l)] = 2 * (df['L{}max'.format(l)]==df['L{}max_next'.format(l)]).astype(int) + \
                                   1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_next'.format(l)].isnull())).astype(int)
    df['sameL{}_prev'.format(l)] = 2 * (df['L{}max'.format(l)]==df['L{}max_prev'.format(l)]).astype(int) + \
                                   1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_prev'.format(l)].isnull())).astype(int)
    feat_cols += ['sameL{}_prev'.format(l), 'sameL{}_next'.format(l)]
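Compared with the first version above, the equality term now has weight 2, so each flag carries three states instead of two: 0 = different (or exactly one timestamp missing), 1 = both timestamps missing on that line, 2 = both timestamps identical. A quick check of the resulting distribution:

# Per-column counts of the three flag values (0 = different, 1 = both missing, 2 = identical)
print(df[feat_cols].apply(pd.Series.value_counts).fillna(0).astype(int))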
In [34]:
# Restore the original (pre-sort) row order so the features line up with the sparse matrix rows
df.sort_values('index', inplace=True)
In [35]:
feat_cols
Out[35]:
In [17]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_new_{}.csv'.format(use_sets)), index_label='ID')
In [18]:
line_V2s = lut['line_V2'].unique()
print(line_V2s)
In [19]:
# Get the maximum timestamp per line_V2 segment for all train and test samples
df = pd.DataFrame(columns=['L{}_V2_MAX'.format(x) for x in line_V2s], index=ids.Id)
for l in line_V2s:
    col_date = [int(i) for i in lut[lut['line_V2']==l].col_dat.values if not np.isnan(i)]
    df['L{}_V2_MAX'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    df['L{}_V2_MAX'.format(l)].replace(0, np.nan, inplace=True)
# Get row index as column to check sorting afterwards
df.reset_index(inplace=True)
df.reset_index(inplace=True)
# Sort by ID
df.sort_values(['Id'], inplace=True)
# Add columns with next and previous timestamp per line_V2 segment
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)
df.set_index('Id', inplace=True)
In [20]:
feat_cols = []
for l in line_V2s:
    # Same three-state encoding as above: 2 = identical timestamp, 1 = both missing, 0 = different
    df['sameL{}_V2_next'.format(l)] = 2 * (df['L{}_V2_MAX'.format(l)]==df['L{}_V2_MAX_next'.format(l)]).astype(int) + \
                                      1 * ((df['L{}_V2_MAX'.format(l)].isnull()) & (df['L{}_V2_MAX_next'.format(l)].isnull())).astype(int)
    df['sameL{}_V2_prev'.format(l)] = 2 * (df['L{}_V2_MAX'.format(l)]==df['L{}_V2_MAX_prev'.format(l)]).astype(int) + \
                                      1 * ((df['L{}_V2_MAX'.format(l)].isnull()) & (df['L{}_V2_MAX_prev'.format(l)].isnull())).astype(int)
    feat_cols += ['sameL{}_V2_prev'.format(l), 'sameL{}_V2_next'.format(l)]
In [21]:
# Restore the original (pre-sort) row order so the features line up with the sparse matrix rows
df.sort_values('index', inplace=True)
In [22]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_V2_same_L_new_{}.csv'.format(use_sets)), index_label='ID')