Silver solution (118th place)
Original submission is here
Public LB: 0.50762
Private LB: 0.50665
Author: Igor Ivanov (vecxoz)
Email: vecxoz@gmail.com
MIT License
How to reproduce:
Put the train.json and test.json files into kag17_2sigma_renthop/data (you can download these files from the competition data page).
Run $ python3 solution.py, or just run all cells of the notebook solution.ipynb (solution.py and solution.ipynb contain completely identical Python code).
The reproduced submission is written to kag17_2sigma_renthop/reproduced_submission/reproduced_submission.csv.
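Before running, it may help to verify that all inputs are in place. Below is a minimal sanity-check sketch (not part of the original pipeline); the paths are taken from the loading cells and from the run.sh call later in the notebook:
import os
# Input files read in cell In [2]
data_files = ['data/train.json', 'data/test.json',
              'data/sample_submission.csv', 'data/listing_image_time.csv']
# run.sh launches StackNet (cell In [19])
for path in data_files + ['run.sh']:
    print(path, 'found' if os.path.exists(path) else 'MISSING')
# Output directory used by the final cell
os.makedirs('reproduced_submission', exist_ok = True)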
The dataset for this competition is just amazing.
We have all kinds of features: numerical, categorical, geospatial (lat/lon), text, pictures...
Just endless possibilities for feature engineering.
Let's look at a training example:
>>> train_df.iloc[12]
bathrooms 1.000000
bedrooms 2
building_id 67c9b420da4a365bc26a6cd0ef4a5320
created 2016-04-19 05:37:25
description ***LOW FEE. Beautiful CHERRY OAK WOODEN FLOORS...
display_address E 38th St
features [Doorman, Elevator, Laundry in Building, No Fee]
interest_level high
latitude 40.748800
listing_id 6895442
longitude -73.977000
manager_id 537e06890f6a86dbb70c187db5be4d55
photos [https://photos.renthop.com/2/6895442_34d617a5...
price 3000
street_address 137 E 38th St
The final submission is an ensemble (weighted average, with separate weights per class) of 3 first-level models.
Each first-level model is itself a meta-model: the first-level models are built on the concept of 'mixed stacking'.
Algorithms used: XGBoost, StackNet, and ExtraTrees, with the blend weights fitted by SLSQP (see the code below).
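To make the blending scheme concrete before diving into the code, here is a minimal self-contained sketch with made-up probability matrices and weights (the real per-class weights are fitted with SLSQP near the end of the notebook):
import numpy as np
rng = np.random.RandomState(0)
# Stand-ins for the (n_samples, 3) class-probability outputs of the 3 first-level models
p_xgb = rng.dirichlet(np.ones(3), size = 5)
p_stacknet = rng.dirichlet(np.ones(3), size = 5)
p_et = rng.dirichlet(np.ones(3), size = 5)
# One weight per (model, class) pair; for each class the 3 model weights sum to 1
params = np.array([0.4, 0.3, 0.5,   # XGB weights, one per class
                   0.4, 0.4, 0.3,   # StackNet weights
                   0.2, 0.3, 0.2])  # ExtraTrees weights
y_pred = params[:3] * p_xgb + params[3:6] * p_stacknet + params[6:9] * p_et
print(y_pred.shape)  # (5, 3)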
In [1]:
# Basics
import os
import sys
import gc
import re
from subprocess import check_output
# Math stack
import numpy as np
np.set_printoptions(suppress = True)
import pandas as pd
# pd.set_option('display.float_format', lambda x: '%.6f' % x)
pd.options.mode.chained_assignment = None # default = 'warn'
from scipy import sparse
from scipy.optimize import minimize
# Preprocessing and scoring
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split
from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Text (vectorizing, stemming, sentiment)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from textblob import TextBlob
# Models
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Default value to fill NaN
fill_val = 0
In [2]:
data_dir = './data/'
train_df = pd.read_json(data_dir + 'train.json')
test_df = pd.read_json(data_dir + 'test.json')
subm_df = pd.read_csv(data_dir + 'sample_submission.csv')
# Load "magic feature"
time_df = pd.read_csv(data_dir + 'listing_image_time.csv')
# Rename columns
time_df.columns = ['listing_id', 'timestamp']
#
print(train_df.shape) # (49352, 15)
print(test_df.shape) # (74659, 14)
In [3]:
y_col = 'interest_level'
r, c = train_df.shape
test_df.loc[:, y_col] = 'na'
tt_df = pd.concat([train_df, test_df], ignore_index = True)
# Merge with "magic feature"
tt_df = pd.merge(tt_df, time_df, on = 'listing_id', how = 'left')
print(tt_df.shape) # (124011, 16)
In [4]:
# bathrooms
tt_df.loc[69023, 'bathrooms'] = 2 # was 112
tt_df.loc[72329, 'bathrooms'] = 2 # was 20
tt_df.loc[113071, 'bathrooms'] = 2 # was 20
tt_df.loc[1990, 'bathrooms'] = 1 # was 10
# lat/lon - just another city - e.g. LA - leave as is
# price
tt_df.loc[25538, 'price'] = 1025 # was 111111 # real number from description
tt_df.loc[tt_df['price'] > 100000, 'price'] = 100000 # low interest_level for all
# timestamp
tt_df.loc[35264, 'timestamp'] = 1479787252 # was 1491289977 (only one record from april) # replace with last timestamp excluding this record
In [5]:
#---------------------------------------------------------------------------
# Features denoting presence of NaNs, zeros, outliers
#---------------------------------------------------------------------------
tt_df['building_id_is_zero'] = (tt_df['building_id'].apply(len) == 1).astype(np.int64)
#---------------------------------------------------------------------------
# Count/len
#---------------------------------------------------------------------------
tt_df['num_photos'] = tt_df['photos'].apply(len) # number of photos
tt_df['num_features'] = tt_df['features'].apply(len) # number of 'features'
tt_df['num_description_words'] = tt_df['description'].apply( lambda x: len(x.split(' ')) ) # number of words in description
#---------------------------------------------------------------------------
# Date/Time
#---------------------------------------------------------------------------
tt_df['created'] = pd.to_datetime(tt_df['created']) # convert the created column to datetime
# tt_df['year'] = tt_df['created'].dt.year # year is constant for this dataset
tt_df['month'] = tt_df['created'].dt.month
tt_df['day'] = tt_df['created'].dt.day
tt_df['hour'] = tt_df['created'].dt.hour
#---------------------------------------------------------------------------
# Rooms
#---------------------------------------------------------------------------
tt_df['bad_plus_bath'] = tt_df['bedrooms'] + tt_df['bathrooms']
tt_df['more_bed'] = (tt_df['bedrooms'] > tt_df['bathrooms']).astype(np.int64)
tt_df['more_bath'] = (tt_df['bedrooms'] < tt_df['bathrooms']).astype(np.int64)
tt_df['bed_bath_equal'] = (tt_df['bedrooms'] == tt_df['bathrooms']).astype(np.int64)
tt_df['bed_bath_diff'] = tt_df['bedrooms'] - tt_df['bathrooms']
tt_df['bed_bath_ration'] = tt_df['bedrooms'] / tt_df['bathrooms']
tt_df['bed_bath_ration'] = tt_df['bed_bath_ration'].replace([np.inf], np.max(tt_df.loc[tt_df['bed_bath_ration'] != np.inf, 'bed_bath_ration']) + 1)
tt_df['bed_bath_ration'].fillna(0, inplace = True)
tt_df['bath_is_int'] = (0 == tt_df['bathrooms'] % 1).astype(np.int64)
# tt_df['diff_rooms_photos'] = tt_df['num_photos'] - tt_df['bad_plus_bath']
tt_df['bed_bath_photos_ration'] = tt_df['bad_plus_bath'] / tt_df['num_photos']
tt_df['bed_bath_photos_ration'] = tt_df['bed_bath_photos_ration'].replace([np.inf], np.max(tt_df.loc[tt_df['bed_bath_photos_ration'] != np.inf, 'bed_bath_photos_ration']) + 1)
tt_df['bed_bath_photos_ration'] = tt_df['bed_bath_photos_ration'].replace([np.nan], 0)
#---------------------------------------------------------------------------
# Price
#---------------------------------------------------------------------------
tt_df['price_per_bed'] = tt_df['price'] / tt_df['bedrooms']
tt_df['price_per_bed'] = tt_df['price_per_bed'].replace([np.inf], np.max(tt_df.loc[tt_df['price_per_bed'] != np.inf, 'price_per_bed']) + 1)
tt_df['price_per_bath'] = tt_df['price'] / tt_df['bathrooms']
tt_df['price_per_bath'] = tt_df['price_per_bath'].replace([np.inf], np.max(tt_df.loc[tt_df['price_per_bath'] != np.inf, 'price_per_bath']) + 1)
tt_df['price_per_bed_plus_bath'] = tt_df['price'] / (tt_df['bedrooms'] + tt_df['bathrooms'])
tt_df['price_per_bed_plus_bath'] = tt_df['price_per_bed_plus_bath'].replace([np.inf], np.max(tt_df.loc[tt_df['price_per_bed_plus_bath'] != np.inf, 'price_per_bed_plus_bath']) + 1)
tt_df['price_per_photo'] = tt_df['price'] / tt_df['num_photos']
tt_df['price_per_photo'] = tt_df['price_per_photo'].replace([np.inf], np.max(tt_df.loc[tt_df['price_per_photo'] != np.inf, 'price_per_photo']) + 1)
#---------------------------------------------------------------------------
# Address (case may contain info)
#---------------------------------------------------------------------------
tt_df['street_address'] = tt_df['street_address'].apply(lambda x: x.lower())
tt_df['display_address'] = tt_df['display_address'].apply(lambda x: x.lower())
# tt_df['disp_addr_is_not_in_street_addr'] = tt_df[['street_address', 'display_address']].apply(lambda x: np.int(-1 == x.street_address.find(x.display_address)), axis = 1)
#---------------------------------------------------------------------------
# Lat/Lon
#---------------------------------------------------------------------------
# # latlon count (density of points)
tt_df['latlon'] = tt_df['longitude'].round(3).astype(str) + '_' + tt_df['latitude'].round(3).astype(str)
latlon_count = tt_df['latlon'].value_counts()
latlon_count = latlon_count.reset_index().rename(columns = {'index':'latlon', 'latlon':'density'})
tt_df = pd.merge(tt_df, latlon_count, on = 'latlon', how = 'left')
# Distance to New York City center
center_lat = 40.785091
center_lon = -73.968285
tt_df['euclid_dist_to_center'] = np.sqrt((tt_df['latitude'] - center_lat) ** 2 + (tt_df['longitude'] - center_lon) ** 2)
# Rotation for different angles
for angle in [15,30,45,60]:
namex = 'rot' + str(angle) + '_x'
namey = 'rot' + str(angle) + '_y'
alpha = np.pi / (180 / angle)
tt_df[namex] = tt_df['latitude'] * np.cos(alpha) + tt_df['longitude'] * np.sin(alpha)
tt_df[namey] = tt_df['longitude'] * np.cos(alpha) - tt_df['latitude'] * np.sin(alpha)
#---------------------------------------------------------------------------
# Categorical
#---------------------------------------------------------------------------
# Label encoding
categorical_cols = ['display_address', 'manager_id', 'building_id', 'street_address']
for col in categorical_cols:
le = LabelEncoder()
tt_df.loc[:, col] = le.fit_transform(tt_df[col].values)
# Manager count
man_count = tt_df['manager_id'].value_counts()
man_count = man_count.reset_index().rename(columns = {'index':'manager_id', 'manager_id':'man_count'})
tt_df = pd.merge(tt_df, man_count, on = 'manager_id', how = 'left')
# Building count
build_count = tt_df['building_id'].value_counts()
build_count = build_count.reset_index().rename(columns = {'index':'building_id', 'building_id':'build_count'})
tt_df = pd.merge(tt_df, build_count, on = 'building_id', how = 'left')
# Top5 building
build_count = tt_df['building_id'].value_counts()
p = np.percentile(build_count.values, 95)
tt_df['top_5_building'] = tt_df['building_id'].apply( lambda x: int(x in build_count.index.values[build_count.values >= p]) )
#---------------------------------------------------------------------------
# Description
# The description is essentially a list of features, so it probably adds little value beyond 'features'
#---------------------------------------------------------------------------
tt_df['number_of_new_lines'] = tt_df['description'].apply(lambda x: x.count('<br /><br />'))
tt_df['website_redacted'] = tt_df['description'].str.contains('website_redacted').astype(np.int64)
#---------------------------------------------------------------------------
# Strange
#---------------------------------------------------------------------------
tt_df['price_is_round_sousand'] = (0 == tt_df['price'] % 1000).astype(np.int64)
tt_df['price_is_round_hundred'] = (0 == tt_df['price'] % 100).astype(np.int64)
#---------------------------------------------------------------------------
# Image timestamp ('magic feature')
#---------------------------------------------------------------------------
tt_df['ts_date'] = pd.to_datetime(tt_df['timestamp'], unit = 's')
# tt_df['ts_days_passed'] = (tt_df['ts_date'].max() - tt_df['ts_date']).astype('timedelta64[D]').astype(int)
tt_df['ts_month'] = tt_df['ts_date'].dt.month
tt_df['ts_week'] = tt_df['ts_date'].dt.week
tt_df['ts_day'] = tt_df['ts_date'].dt.day
# tt_df['ts_dayofweek'] = tt_df['ts_date'].dt.dayofweek
tt_df['ts_dayofyear'] = tt_df['ts_date'].dt.dayofyear
tt_df['ts_hour'] = tt_df['ts_date'].dt.hour
tt_df['ts_tensdays'] = tt_df['ts_day'].apply(lambda x: 1 if x < 10 else 2 if x < 20 else 3)
#---------------------------------------------------------------------------
# Check NaNs
#---------------------------------------------------------------------------
print(tt_df.shape) # (124011, 60)
print('NaN: %s' % tt_df.isnull().mean().any())
In [6]:
train_df = tt_df[:r]
test_df = tt_df[r:]
In [7]:
def get_prob(df, col = None, agg_func = None):
"""
Params
------
df - Pandas dataframe
col - column of interest
agg_func - aggregation function
Return
------
Pandas dataframe ready to merge with df on manager_id
Logic
-----
We have this:
-------------
interest_level  manager_id
low             foo
medium          bar
medium          foo
high            bar
medium          foo
medium          bar
low             foo
high            foo

We want to get this:
--------------------
manager_id  prob_high  prob_low  prob_medium
bar         0.333333   NaN       0.666667
foo         0.200000   0.4       0.400000
"""
aggregate_df = df.groupby(['manager_id', 'interest_level'])[[col]].aggregate(agg_func).rename(columns = {col: 'aggregate'}).reset_index()
sum_df = aggregate_df.groupby(['manager_id'])[['aggregate']].sum().rename(columns = {'aggregate': 'sum'}).reset_index()
aggregate_df = pd.merge(aggregate_df, sum_df, on = 'manager_id', how = 'left')
aggregate_df['prob'] = aggregate_df['aggregate'] / aggregate_df['sum']
piv_df = pd.pivot_table(aggregate_df, values='prob', columns=['interest_level'], index = 'manager_id').reset_index()
name = col + '_' + agg_func
piv_df.rename(columns = {'high': 'prob_high_' + name, 'low': 'prob_low_' + name, 'medium': 'prob_medium_' + name}, inplace = True)
return piv_df
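As a quick illustration, a toy run on made-up data matching the docstring table (this is exactly how the function is called in the next cell: col = 'interest_level', agg_func = 'count'):
import pandas as pd
toy_df = pd.DataFrame({
    'manager_id':     ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'interest_level': ['low', 'medium', 'medium', 'high', 'medium', 'medium', 'low', 'high'],
})
print(get_prob(toy_df, col = 'interest_level', agg_func = 'count'))
# Expected (cf. the docstring table above):
#   manager 'bar' -> prob_high 0.333333, prob_low NaN, prob_medium 0.666667
#   manager 'foo' -> prob_high 0.200000, prob_low 0.4, prob_medium 0.400000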
In [8]:
# Init CV
kf = KFold(n_splits = 5, shuffle = True, random_state = 0)
# Init aggregation
col = 'interest_level'
agg_func = 'count'
# Init new columns
for i in ['prob_high_', 'prob_low_', 'prob_medium_']: # alphabetically
train_df.loc[:, i + col + '_' + agg_func] = fill_val
# For train set
for train_index, test_index in kf.split(train_df):
tr_df = train_df.iloc[train_index]
te_df = train_df.iloc[test_index]
piv_df = get_prob(tr_df, col = col, agg_func = agg_func)
te_df = pd.merge(te_df, piv_df, on = 'manager_id', how = 'left')
train_df.iloc[test_index, -3:] = te_df.iloc[:, -3:].values
print('Fold done')
# For test set
piv_df = get_prob(train_df, col = col, agg_func = agg_func)
test_df = pd.merge(test_df, piv_df, on = 'manager_id', how = 'left')
# Fill NaN
train_df.fillna(fill_val, inplace = True)
test_df.fillna(fill_val, inplace = True)
print(train_df.shape) # (49352, 63)
print(test_df.shape) # (74659, 63)
In [9]:
train_df.loc[:, 'interest_level'] = train_df['interest_level'].map({'high': 0, 'medium': 1, 'low': 2})
y_train = train_df['interest_level'].values
In [10]:
# Combine train and test
tt_df = pd.concat([train_df, test_df], ignore_index = True)
print(tt_df.shape) # (124011, 63)
In [11]:
# Text features from 'features'
tt_df['features'] = tt_df['features'].apply(lambda x: ' '.join(['_'.join(i.split(' ')) for i in x]))
vectorizer = CountVectorizer(stop_words = 'english', max_features = 200)
tt_sparse = vectorizer.fit_transform(tt_df['features'])
In [12]:
# Text features from 'description'
# tt_df['sentiment_polarity'] = tt_df['description'].apply(lambda x: TextBlob(x).sentiment.polarity)
# tt_df['sentiment_subjectivity'] = tt_df['description'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
In [13]:
X_cols = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price',
'listing_id', 'num_photos', 'num_features', 'num_description_words',
'month', 'day', 'hour', 'bad_plus_bath', 'more_bed',
'more_bath', 'bed_bath_equal', 'bed_bath_diff', 'bed_bath_ration',
'price_per_bed', 'price_per_bath', 'price_per_bed_plus_bath',
'price_per_photo', 'price_is_round_sousand', 'price_is_round_hundred',
'building_id_is_zero', 'bath_is_int', 'bed_bath_photos_ration',
'density', 'euclid_dist_to_center',
'prob_high_interest_level_count', 'prob_low_interest_level_count', 'prob_medium_interest_level_count',
'display_address', 'manager_id', 'building_id', 'street_address',
'man_count', 'build_count', 'top_5_building',
'ts_month', 'ts_week', 'ts_day', 'ts_dayofyear', 'ts_hour', 'ts_tensdays',
'rot15_x', 'rot15_y', 'rot30_x', 'rot30_y',
'rot45_x', 'rot45_y', 'rot60_x', 'rot60_y',
'number_of_new_lines', 'website_redacted',
]
TT = sparse.hstack([tt_df[X_cols], tt_sparse]).tocsr()
# TT = sparse.csr_matrix(tt_df[X_cols]) # without text features
# Check for NaN, INF, -INF
print('NaN -> ', bool(np.mean(np.isnan(TT.toarray())))) # should be False
print('+INF -> ', bool(np.mean(np.isinf(TT.toarray())))) # should be False
print('-INF -> ', bool(np.mean(np.isneginf(TT.toarray())))) # should be False
# Check for constant features
print('CONST -> ', bool(np.mean(TT[0] == TT.mean(axis = 0)))) # should be False
# Check for duplicate entries in column (feature) list
print('DUPL -> ', len(X_cols) != len(set(X_cols))) # should be False
# Split
X_train = TT[:r]
X_test = TT[r:]
# Shape
print('SHAPE -> ', X_train.shape, X_test.shape) # (49352, 255) (74659, 255)
In [14]:
# Init model
model = XGBClassifier(seed = 0, objective = 'multi:softprob',
learning_rate = 0.1, n_estimators = 100,
max_depth = 6, min_child_weight = 1,
subsample = 0.7, colsample_bytree = 0.7)
# Create sklearn scorer
scorer = make_scorer(log_loss, needs_proba = True)
# Run CV and get mean score
print(np.mean(cross_val_score(model, X_train, y_train, cv = 3, scoring = scorer)))
In [15]:
def oof(model, X_train, y_train, X_test, oof_test = True):
"""
Parameters
----------
Self-explanatory
oof_test - if True, then predict test set
Return
------
S_train - OOF predictions for train set
S_test - prediction for test set (fit model on full train set)
"""
# Init CV
kf = KFold(n_splits = 3, shuffle = True, random_state = 0)
# Create empty numpy arrays for stacking features
S_train = np.zeros((X_train.shape[0], 3))
S_test = np.zeros((X_test.shape[0], 3))
# Create oof predictions for train set
for i, (train_index, test_index) in enumerate(kf.split(X_train)):
X_tr = X_train[train_index]
y_tr = y_train[train_index]
X_te = X_train[test_index]
y_te = y_train[test_index]
model = model.fit(X_tr, y_tr)
y_te_pred = model.predict_proba(X_te)
S_train[test_index, :] = y_te_pred
print( 'Fold %d: %.6f' % (i, log_loss(y_te, y_te_pred)) )
# Score over full dataset (mean)
print( 'Mean: %.6f' % log_loss(y_train, S_train) )
# Create prediction for test set (fit on full train)
if oof_test:
model = model.fit(X_train, y_train)
S_test = model.predict_proba(X_test)
return (S_train, S_test)
In [16]:
# Parameters
params = {'seed': 0,
'objective': 'multi:softprob',
'eval_metric': 'mlogloss',
'num_class': 3,
'eta': 0.02,
'max_depth': 6,
'min_child_weight': 1,
'subsample': 0.7,
'colsample_bytree': 0.7,
'silent': 1,
}
# Convert data to DMatrices
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test)
# Using 3-fold CV
res = xgb.cv(
params,
dtrain,
num_boost_round = 10000,
early_stopping_rounds = 50,
nfold = 3,
seed = 0,
stratified = False,
show_stdv = True,
verbose_eval = 100
)
# Output result
n_part = res.shape[0]
n_full = int(res.shape[0] + (1/3) * res.shape[0])
print('\ncv mean + std -> [%.6f + %.6f]\nntrees -> [%d]\nntrees for full data (+1/3) -> [%d]' % (res.iloc[-1, 0], res.iloc[-1, 1], n_part, n_full))
In [17]:
# Init model
model = XGBClassifier(seed = 0, objective = 'multi:softprob',
learning_rate = 0.02, n_estimators = n_part,
max_depth = 6, min_child_weight = 1,
subsample = 0.7, colsample_bytree = 0.7)
# Get oof
xgb_oof_train, xgb_oof_test = oof(model, X_train, y_train, X_test, oof_test = False)
# Init model for test (as we train on full train set we need more rounds)
model = XGBClassifier(seed = 0, objective = 'multi:softprob',
learning_rate = 0.02, n_estimators = n_full,
max_depth = 6, min_child_weight = 1,
subsample = 0.7, colsample_bytree = 0.7)
# Fit model on full train
model = model.fit(X_train, y_train)
# Predict test
xgb_oof_test = model.predict_proba(X_test)
# Export to txt files
np.savetxt(data_dir + 'xgb_oof_train.csv', xgb_oof_train, delimiter = ',', fmt = '%.5f')
np.savetxt(data_dir + 'xgb_oof_test.csv', xgb_oof_test, delimiter = ',', fmt = '%.5f')
In [18]:
#-------------------------------------------------------------------------------
# Prepare dense input files for StackNet:
# first column in train - labels
# first column in test - dummy column (listing ids)
#-------------------------------------------------------------------------------
# Get test index to use as first dummy column in test set for StackNet
ids = test_df['listing_id'].values
# Concat oof and predictions from best model (xgb)
TT_dense = np.c_[TT.toarray(), np.r_[xgb_oof_train, xgb_oof_test]] # (124011, 258)
# Scale
scaler = StandardScaler()
TT_dense = scaler.fit_transform(TT_dense)
# Split
X_train_dense = TT_dense[:r] # (49352, 258)
X_test_dense = TT_dense[r:] # (74659, 258)
# Append target to train
X_train_dense = np.c_[y_train, X_train_dense] # (49352, 259)
# Append id to test
X_test_dense = np.c_[ids, X_test_dense] # (74659, 259)
# Export to txt files
np.savetxt(data_dir + 'train_std.csv', X_train_dense, delimiter = ',', fmt = '%.5f')
np.savetxt(data_dir + 'test_std.csv', X_test_dense, delimiter = ',', fmt = '%.5f')
In [19]:
# Run StackNet and get output
stacknet_log = check_output(['bash', 'run.sh']).decode(sys.stdout.encoding)
# Save output to file
with open(data_dir + 'stacknet_log.txt', 'w') as f:
str_len = f.write(stacknet_log)
In [20]:
# Load StackNet oof
stacknet_oof_train = np.loadtxt('stacknet_oof2.csv', delimiter = ',')
stacknet_oof_test = np.loadtxt('stacknet_oof_test2.csv', delimiter = ',')
In [21]:
# Init model
model = ExtraTreesClassifier(random_state = 0, n_jobs = -1, n_estimators = 1000,
criterion = 'entropy', max_depth = None)
# Get oof
et_oof_train, et_oof_test = oof(model, np.c_[X_train_dense[:, 1:], stacknet_oof_train],
y_train, np.c_[X_test_dense[:, 1:], stacknet_oof_test], oof_test = True)
# Export to txt files
np.savetxt(data_dir + 'et_oof_train.csv', et_oof_train, delimiter = ',', fmt = '%.5f')
np.savetxt(data_dir + 'et_oof_test.csv', et_oof_test, delimiter = ',', fmt = '%.5f')
In [22]:
# Output oof scores
print('XGB: %.6f' % log_loss(y_train, xgb_oof_train))
print('StackNet: %.6f' % log_loss(y_train, stacknet_oof_train))
print('ET: %.6f' % log_loss(y_train, et_oof_train))
In [23]:
#-------------------------------------------------------------------------------
# Blend weight optimization: one parameter per (model, class) pair, 9 in total;
# for each class the three model weights are constrained to sum to 1
#-------------------------------------------------------------------------------
def cost(params):
y_pred = params[:3] * xgb_oof_train + params[3:6] * stacknet_oof_train + params[6:9] * et_oof_train
return log_loss(y_train, y_pred)
def con1(params):
return params[0] + params[3] + params[6] - 1
def con2(params):
return params[1] + params[4] + params[7] - 1
def con3(params):
return params[2] + params[5] + params[8] - 1
# params = [0.33] * 9
# print(cost(params)) # 0.511137
n = 9
init = [0.33] * n
cons = ({'type': 'eq', 'fun': con1},
{'type': 'eq', 'fun': con2},
{'type': 'eq', 'fun': con3})
bounds = [(0, 1)] * n
res = minimize(cost, init, method = 'SLSQP', bounds = bounds, constraints = cons, options = {'maxiter': 100000})
In [25]:
print(res)
In [26]:
params = res['x']
y_pred = params[:3] * xgb_oof_test + params[3:6] * stacknet_oof_test + params[6:9] * et_oof_test
subm_df.loc[:, 'listing_id'] = test_df['listing_id'].values
subm_df.iloc[:, 1:] = y_pred
subm_df.to_csv('./reproduced_submission/reproduced_submission.csv', index = False)