In [ ]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
import time
In [ ]:
# load full data
trn = pd.read_csv('../input/train_ver2.csv')
tst = pd.read_csv('../input/test_ver2.csv')
labels = pd.read_csv('../input/labels.csv').astype(int)
# prepare lag data
trn_dates = ['2015-01-28','2015-02-28','2015-03-28','2015-04-28','2015-05-28']
tst_dates = ['2016-01-28','2016-02-28','2016-03-28','2016-04-28','2016-05-28']
temp = trn[trn['fecha_dato'] == '2015-06-28']['ncodpers']
trn_ncodpers = temp[(labels[trn['fecha_dato'] == '2015-06-28'].sum(axis=1) > 0).values].values.tolist()
tst_ncodpers = np.unique(tst['ncodpers']).tolist()
trn_trim = trn[trn['fecha_dato'].isin(trn_dates)]
trn_trim = trn_trim[trn_trim['ncodpers'].isin(trn_ncodpers)]
tst_trim = trn[trn['fecha_dato'].isin(tst_dates)]
tst_trim = tst_trim[tst_trim['ncodpers'].isin(tst_ncodpers)]
# melt labels for trn
fecha_dato = trn['fecha_dato']
train_index = (labels[fecha_dato == '2015-06-28'].sum(axis=1) > 0)
train_index = train_index[train_index == True]
train = trn.ix[train_index.index]
train.iloc[:,24:] = labels.ix[train_index.index]
trn_june = []
for ind, (run, row) in enumerate(train.iterrows()):
for i in range(24):
if row[24+i] == 1:
temp = row[:24].values.tolist()
temp.append(i)
trn_june.append(temp)
# define and save target separately
target = pd.DataFrame(trn_june)[24].values.tolist()
target = pd.DataFrame(target)
print('# target shape : ({})'.format(len(target)))
# make full data set
trn_june = pd.DataFrame(trn_june, columns=trn.columns[:25]).iloc[:,:-1]
trn = pd.concat([trn_trim, trn_june], axis=0)
tst = pd.concat([tst_trim, tst], axis=0)
print(trn.shape, tst.shape)
In [ ]:
# clean data
skip_cols = ['fecha_dato','ncodpers']
target_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']
for col in trn.columns:
if col in skip_cols:
continue
if col == 'ind_empleado':
trn[col].fillna('S', inplace=True)
elif col == 'age':
trn[col].replace(' NA',0,inplace=True)
trn[col] = trn[col].astype(str).astype(int)
trn[col] = trn[col].astype(str).astype(int)
continue
elif col == 'fecha_alta':
trn[col] = ((pd.to_datetime(trn['fecha_dato']) - pd.to_datetime(trn[col].fillna('2015-07-01')))/ np.timedelta64(1, 'D')).astype(int)
tst[col] = ((pd.to_datetime(tst['fecha_dato']) - pd.to_datetime(tst[col]))/np.timedelta64(1, 'D')).astype(int)
continue
elif col == 'antiguedad':
trn[col].replace(' NA',-1,inplace=True)
trn[col] = trn[col].astype(str).astype(int)
tst[col] = tst[col].astype(str).astype(int)
continue
elif col == 'ult_fec_cli_1t':
trn[col] = ((pd.to_datetime(trn['fecha_dato']) - pd.to_datetime(trn[col].fillna('2015-06-30')))/ np.timedelta64(1, 'D')).astype(int)
tst[col] = ((pd.to_datetime(tst['fecha_dato']) - pd.to_datetime(tst[col].fillna('2016-01-03')))/np.timedelta64(1, 'D')).astype(int)
continue
elif col == 'indrel_1mes':
tst[col].replace('1','1.0',inplace=True)
tst[col].replace('2','1.0',inplace=True)
tst[col].replace('2.0','1.0',inplace=True)
tst[col].replace(2.0,'1.0',inplace=True)
tst[col].replace('3','3.0',inplace=True)
tst[col].replace('4','3.0',inplace=True)
tst[col].replace(4.0,'3.0',inplace=True)
tst[col].replace('4.0','3.0',inplace=True)
tst[col].replace('P','3.0',inplace=True)
elif col == 'tiprel_1mes':
tst[col].replace('N','I',inplace=True)
tst[col].replace('R','P',inplace=True)
elif col == 'indresi':
trn[col].fillna('N',inplace=True)
elif col == 'indext':
trn[col].fillna('S',inplace=True)
elif col == 'indfall':
trn[col].fillna('N',inplace=True)
elif col == 'tipodom':
trn.drop([col], axis=1, inplace=True)
tst.drop([col], axis=1, inplace=True)
continue
elif col == 'ind_actividad_cliente':
trn[col].fillna(0.0, inplace=True)
elif col == 'renta':
tst[col].replace(' NA',0,inplace=True)
trn[col].fillna(-1, inplace=True)
tst[col].fillna(-1, inplace=True)
trn[col] = trn[col].astype(str).astype(float).astype(int)
tst[col] = tst[col].astype(str).astype(float).astype(int)
continue
elif col in target_cols:
trn[col].fillna(0, inplace=True)
trn[col] = trn[col].astype(int)
tst[col].fillna(0, inplace=True)
tst[col] = tst[col].astype(int)
lb = LabelEncoder()
lb.fit(pd.concat([trn[col].astype(str), tst[col].astype(str)], axis=0))
trn[col] = lb.transform(trn[col].astype(str))
tst[col] = lb.transform(tst[col].astype(str))
In [ ]:
trn_june = trn[trn['fecha_dato'] == '2015-06-28'].drop(target_cols, axis=1)
trn_othr = trn[trn['fecha_dato'] != '2015-06-28']
tst_june = tst[tst['fecha_dato'] == '2016-06-28'].drop(target_cols, axis=1)
tst_othr = tst[tst['fecha_dato'] != '2016-06-28']
In [ ]:
st = time.time()
drop_cols = ['fecha_dato','ncodpers']
print('# Appending trn data.. {} rows'.format(trn_june.shape[0]))
trn_append = []
for i, ncodper in enumerate(trn_june['ncodpers']):
temp = trn_othr[trn_othr['ncodpers'] == ncodper].drop(drop_cols, axis=1)
if temp.shape[0] == 0:
row = ['NA']*225
else:
row = np.hstack([temp.shift(periods=i).iloc[-1,:] for i in range(temp.shape[0])]).tolist()
trn_append.append(trn_june.iloc[i].drop(drop_cols).values.tolist() + row)
if i % int(trn_june.shape[0]/10) == 0:
print('# {} rows.. {} secs..'.format(i, round(time.time() - st),2))
st = time.time()
print('# Appending tst data.. {} rows'.format(tst_june.shape[0]))
tst_append = []
for i, ncodper in enumerate(tst_june['ncodpers']):
temp = tst_othr[tst_othr['ncodpers'] == ncodper].drop(drop_cols, axis=1)
if temp.shape[0] == 0:
row = ['NA']*225
else:
row = np.hstack([temp.shift(periods=i).iloc[-1,:] for i in range(temp.shape[0])]).tolist()
tst_append.append(tst_june.iloc[i].drop(drop_cols).values.tolist() + row)
if i % int(tst_june.shape[0]/10) == 0:
print('# {} rows.. {} secs..'.format(i, round(time.time() - st),2))
# 150 secs for trn
# 9964 secs for tst
In [ ]:
colnames = trn_june.drop(drop_cols, axis=1).columns.values.tolist()
suffixes = ['_lag_one','_lag_two','_lag_thr','_lag_fou','_lag_fiv']
for suffix in suffixes:
for col in trn_othr.drop(drop_cols, axis=1).columns.values.tolist():
colnames.append(col+suffix)
print(len(colnames))
# initialize column names
trn = pd.DataFrame(trn_append, columns=colnames)
tst = pd.DataFrame(tst_append, columns=colnames)
print('# trn : {} | tst : {}'.format(trn.shape, tst.shape))
In [ ]:
trn.to_csv('../input/train_append_lb_lag.csv', index=False)
tst.to_csv('../input/test_append_lb_lag.csv', index=False)