In [1]:
import os
import sys
import operator
import random
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
import keras
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Embedding, Input, Flatten, Reshape
from keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, GlobalMaxPooling2D, MaxPooling1D
from keras.layers.merge import add, concatenate
#from keras.layers.recurrent import GRU
In [2]:
# process input data - NOT USED in stacker.py
train_df = pd.read_json('../input/train.json')
test_df = pd.read_json('../input/test.json')
train_info = train_df[['listing_id', 'interest_level']].copy()
tgts = ['low', 'medium', 'high']
target_num_map = {'low': 0, 'medium': 1, 'high': 2}
# map the class labels to the integer codes used throughout the stacker
train_info['interest_level'] = train_info['interest_level'].map(target_num_map)
train_info.sort_values('listing_id', inplace=True)
train_info.set_index('listing_id', inplace=True)
info = (train_info, sorted(test_df.listing_id.values))
with open('stacker-info.pkl', 'wb') as f:
    pickle.dump(info, f)
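Quick sanity check of the info pickle (a sketch; the chk_ names exist only for this check) to confirm the sorted alignment the stacker below relies on:
In [ ]:
# sanity-check sketch: reload the pickle and confirm the train index is sorted
# and the test ids are a sorted list, as the Stacker class expects
chk_train_info, chk_test_ids = pd.read_pickle('stacker-info.pkl')
assert chk_train_info.index.is_monotonic_increasing
assert list(chk_test_ids) == sorted(chk_test_ids)
print(chk_train_info.interest_level.value_counts())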
In [5]:
# process StackNet models (Level 1)
# attach listing_ids to the out-of-fold CV predictions and index by them
sncv_trainids = pd.read_csv('../stacknet-preleak/train_ids.csv', header=None)
sncv_preds = pd.read_csv('../stacknet-preleak/cv_preds1.csv', header=None)
ids = sncv_trainids[0].values.astype(np.int32)
#sn_tgts = ['sn_high', 'sn_medium', 'sn_low']
#sncv_preds.columns = sn_tgts
sncv_preds['listing_id'] = ids
sncv_preds.sort_values('listing_id', inplace=True)
sncv_preds.set_index('listing_id', inplace=True)
# same alignment for the test-set predictions
sn_testids = pd.read_csv('../stacknet-preleak/test_stacknet.csv', header=None)
sn_testpreds = pd.read_csv('../stacknet-preleak/cv_preds_test1.csv', header=None)
ids = sn_testids[0].values.astype(np.int32)
#sn_testpreds.columns = sn_tgts
sn_testpreds['listing_id'] = ids
sn_testpreds.sort_values('listing_id', inplace=True)
sn_testpreds.set_index('listing_id', inplace=True)
# one frame covering every listing_id, train and test
sn_preds1 = pd.concat([sn_testpreds, sncv_preds]).sort_index()
#sn_preds1.to_pickle('stacker-sn-l1.pkl')
In [7]:
# make five copies of the test predictions (presumably one per fold for a downstream consumer)
sn_testpredsa = [sn_testpreds.copy() for i in range(5)]
pickle.dump((sn_preds1, sn_testpredsa), open('stacker-sn-l1.pkl', 'wb'))
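For reference, a round-trip sketch of what this pickle now holds, a (combined preds frame, list of five test-prediction copies) tuple; the l1_ names are just for this check:
In [ ]:
# round-trip sketch: the pickle holds (combined cv+test preds, list of five test-pred copies)
l1_preds, l1_test_per_fold = pd.read_pickle('stacker-sn-l1.pkl')
print(l1_preds.shape, len(l1_test_per_fold))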
In [ ]:
# process StackNet models (Level 2)
sncv_trainids = pd.read_csv('../stacknet/train_ids.csv', header=None)
sncv_preds = pd.read_csv('../stacknet/cv_preds2.csv', header=None)
ids = sncv_trainids[0].values.astype(np.int32)
sn_tgts = ['sn_high', 'sn_medium', 'sn_low']
sncv_preds.columns = sn_tgts
sncv_preds['listing_id'] = ids
sncv_preds.sort_values('listing_id', inplace=True)
sncv_preds.set_index('listing_id', inplace=True)
sn_testids = pd.read_csv('../stacknet/test_stacknet.csv', header=None)
sn_testpreds = pd.read_csv('../stacknet/sigma_stack_pred.csv', header=None)
ids = sn_testids[0].values.astype(np.int32)
sn_testpreds.columns = sn_tgts
sn_testpreds['listing_id'] = ids
sn_testpreds.sort_values('listing_id', inplace=True)
sn_testpreds.set_index('listing_id', inplace=True)
sn_preds2 = pd.concat([sn_testpreds, sncv_preds]).sort_index()
sn_preds2.to_pickle('stacker-sn-l2.pkl')
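The Level 1 and Level 2 cells above repeat the same align-by-listing_id pattern; a sketch of a reusable loader (the load_stacknet_preds name is hypothetical, assuming the same header-less CSV layout):
In [ ]:
# hypothetical helper factoring the repeated pattern above; assumes header-less CSVs
# with listing_id in column 0 of the id file and class probabilities in the preds file
def load_stacknet_preds(ids_path, preds_path, columns=None):
    ids = pd.read_csv(ids_path, header=None)[0].values.astype(np.int32)
    preds = pd.read_csv(preds_path, header=None)
    if columns is not None:
        preds.columns = columns
    preds['listing_id'] = ids
    return preds.sort_values('listing_id').set_index('listing_id')

# e.g. the Level 2 cell collapses to:
# sncv_preds = load_stacknet_preds('../stacknet/train_ids.csv', '../stacknet/cv_preds2.csv', sn_tgts)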
In [58]:
# CV log loss of the Level 2 StackNet out-of-fold predictions (column order matches the 0/1/2 target codes)
print(log_loss(train_info.interest_level, sn_preds2.loc[train_info.index, ['sn_low', 'sn_medium', 'sn_high']]))
Code below here should wind up in stacker.py
In [34]:
class Stacker:
    def __init__(self, use, nn_shape=[(32, .1), (16, .1)]):
        self.train_info, self.test_ids = pd.read_pickle('stacker-info.pkl')
        self.nn_shape = nn_shape.copy()
        # merge all input prediction frames on listing_id
        df_nn = use[0].copy()
        for df in use[1:]:
            df_nn = pd.merge(df_nn, df, left_index=True, right_index=True)
        self.df_nn_train = df_nn.loc[self.train_info.index]
        self.df_nn_test = df_nn.loc[self.test_ids]
        self.test_x = np.array(self.df_nn_test.values)
        self.models = []
        self.df_folds = []
        self.test_preds = []

    # plenty of library code to do this, but it's simple enough
    def oneheat(self, y):
        rv = np.zeros((len(y), 3))
        for i in [0, 1, 2]:
            rv[:, i] = (y == i)
        return rv

    def buildmodel(self, num_inputs, shape=[(32, .1), (16, .1)]):
        # small dense softmax network: each (width, dropout) pair adds one block
        layers = [Input(shape=(num_inputs,))]
        for s in shape:
            layers.append(Dense(s[0], activation='relu')(layers[-1]))
            layers.append(Dropout(s[1])(layers[-1]))
        output = Dense(3, activation='softmax', name='output')(layers[-1])
        model = Model(inputs=layers[0], outputs=output)
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        return model

    def run_fold(self, train_idx, valid_idx):
        model = self.buildmodel(len(self.df_nn_train.columns), shape=self.nn_shape)
        nn_fold_train = self.df_nn_train.iloc[train_idx]
        nn_fold_valid = self.df_nn_train.iloc[valid_idx]
        tmp_train_x = np.array(nn_fold_train.values)
        tmp_valid_x = np.array(nn_fold_valid.values)
        tmp_train_y = self.oneheat(self.train_info.iloc[train_idx].interest_level)
        tmp_valid_y = self.oneheat(self.train_info.iloc[valid_idx].interest_level)
        # train with early stopping, restoring the best weights seen on the validation fold
        bst_model_path = 'tmpnnx.h5'
        ES = keras.callbacks.EarlyStopping(patience=12)
        MC = keras.callbacks.ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
        history = model.fit(tmp_train_x, tmp_train_y, batch_size=128, epochs=100, verbose=2,
                            validation_data=(tmp_valid_x, tmp_valid_y), callbacks=[MC, ES])
        model.load_weights(bst_model_path)
        tpreds = model.predict(tmp_valid_x)
        df_tmp = pd.DataFrame(tpreds)
        df_tmp.columns = ['low', 'medium', 'high']
        df_tmp['listing_id'] = nn_fold_valid.index
        df_tmp.set_index('listing_id', inplace=True)
        tgts = ['low', 'medium', 'high']
        print(log_loss(self.train_info.iloc[valid_idx].interest_level, df_tmp[tgts]))
        self.df_folds.append(df_tmp)
        self.test_preds.append(model.predict(self.test_x))
        self.models.append(model)
        return df_tmp

    def run(self, folds=8, seed=5000):
        #self.kf_nn = model_selection.StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
        #self.folds_nn = [(k[0], k[1]) for k in self.kf_nn.split(self.df_nn_train.index, self.train_info.interest_level)]
        self.kf = model_selection.KFold(n_splits=folds, shuffle=True, random_state=seed)
        self.folds_nn = [(k[0], k[1]) for k in self.kf.split(self.df_nn_train.index)]
        for fold in self.folds_nn:
            self.run_fold(fold[0], fold[1])
        # out-of-fold predictions for the whole train set, sorted by listing_id
        self.df_cv = pd.concat(self.df_folds).sort_index()
        # average the per-fold test predictions
        testarray = np.array(self.test_preds)
        self.df_test = pd.DataFrame(testarray.mean(axis=0))
        self.df_test.columns = ['low', 'medium', 'high']
        self.df_test['listing_id'] = self.test_ids
        self.df_test.set_index('listing_id', inplace=True)
        return self.df_cv, self.df_test
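As the comment in oneheat notes, library code can do the same one-hot encoding; a minimal equivalent using Keras' built-in helper:
In [ ]:
# sketch: oneheat(y) is equivalent to Keras' built-in one-hot encoder
from keras.utils import to_categorical
to_categorical(np.array([0, 2, 1, 1]), num_classes=3)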
In [35]:
# the pickle stores a (preds, per-fold test preds) tuple, so unpack it
sn_preds1, sn_testpredsa = pd.read_pickle('stacker-sn-l1.pkl')
set_x1 = pd.read_pickle('0417-test.pkl')
In [36]:
s = Stacker([set_x1, sn_preds1])
In [37]:
df_cv, df_test = s.run(folds=8)
In [38]:
log_loss(s.train_info.interest_level, df_cv[['low', 'medium', 'high']])
Out[38]:
In [39]:
df_test.to_csv('stacker-test2.csv.gz', compression='gzip')
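A quick read-back sketch to confirm the gzipped submission round-trips with listing_id plus the three class columns:
In [ ]:
# sketch: pandas infers gzip from the extension, so this reads the file just written
pd.read_csv('stacker-test2.csv.gz').head()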
In [46]:
import time
In [49]:
# current Unix time in milliseconds
int(np.floor(time.time() * 1000))
Out[49]: