In [31]:
#Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [41]:
#Get data
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
combined = pd.concat([train, test], axis=0)
combined.info()
In [51]:
#Preprocessing
train_c = train.copy()
test_c = test.copy()
train_c['label'] = train_c.y
train_c.drop(['ID', 'y'], inplace=True, axis=1)
test_c.drop('ID', inplace=True, axis=1)
In [34]:
#Histogram of y; has four peaks
train_c.label.hist(bins=1000)
plt.show()
In [35]:
#Drop the y-outlier
train_c = train_c[train_c.label < 175]
train_c.label.hist(bins=1000)
plt.show()
In [52]:
#Divide features into qualitative and quantitative
qual = []
quan = []
for col in train_c.columns[:-1]:
    if train_c[col].dtype == 'object':
        qual.append(col)
    else:
        quan.append(col)
len(qual), len(quan)
Out[52]:
There is plenty of room for feature engineering on the 8 qualitative features, but we'll set that aside for later.
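One simple option, as a sketch for later: one-hot encode the qualitative columns with pd.get_dummies, encoding train and test together so both frames end up with the same dummy columns (not used in the cells below).
In [ ]:
#Sketch only: one-hot encode the 8 qualitative features, concatenating
#train and test first so the two frames get identical dummy columns
qual_dummies = pd.get_dummies(pd.concat([train_c[qual], test_c[qual]], axis=0))
train_qual = qual_dummies.iloc[:len(train_c)]
test_qual = qual_dummies.iloc[len(train_c):]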
In [53]:
#Drop near-constant quantitative features (mean < 0.01: almost all 0; mean > 0.99: almost all 1)
for col in quan:
    if train_c[col].mean() < 0.01 or train_c[col].mean() > 0.99:
        train_c.drop(col, inplace=True, axis=1)
        test_c.drop(col, inplace=True, axis=1)
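The same filter can be phrased with scikit-learn's VarianceThreshold: a binary column with mean p has variance p*(1-p), so the 0.01/0.99 mean cutoffs correspond to a variance threshold of about 0.01 * 0.99 = 0.0099 (a sketch, equivalent to the loop above):
In [ ]:
#Sketch: the same near-constant filter via sklearn; a binary column with
#mean p has variance p*(1-p), so mean outside [0.01, 0.99] means variance < 0.0099
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(threshold=0.01 * 0.99)
vt.fit(train[quan])  #fit on the original frame; train_c has already had columns dropped
kept = [col for col, keep in zip(quan, vt.get_support()) if keep]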
In [60]:
#For now we use only the remaining quantitative features to make predictions
#(positions 8:-1 skip the 8 qualitative columns in front and 'label' at the end)
quan_features = train_c.columns[8:-1]
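The positional slice leans on column order (8 qualitative columns in front, label at the back); selecting by name, as in this sketch, gives the same list without that assumption.
In [ ]:
#Equivalent selection that doesn't depend on column order
quan_features = [col for col in train_c.columns if col not in qual and col != 'label']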
In [61]:
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')  #silence warnings raised during the grid searches
From here on we try a range of estimators and use GridSearchCV to iteratively tune their hyperparameters.
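For regressors, GridSearchCV defaults to the estimator's own .score method, so every best_score_ below is a mean 5-fold cross-validated R² and the models can be compared directly. The repeated fit-and-search pattern could be wrapped in a small helper like this sketch (the cells below keep it inline):
In [ ]:
#Sketch of a reusable tuner; default scoring for regressors is R^2
def tune(estimator, param_grid, cv=5):
    gs = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv)
    gs.fit(train_c[quan_features], train_c.label)
    return gs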
In [62]:
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge_cv = GridSearchCV(estimator=ridge, param_grid={'alpha':np.arange(1, 50, 1)}, cv=5)
ridge_cv.fit(train_c[quan_features], train_c.label)
Out[62]:
In [69]:
ridge_cv.best_score_
Out[69]:
In [72]:
from sklearn.linear_model import Lasso
lasso = Lasso()
#Start alpha at 0.005: Lasso with alpha=0 is unsupported (it reduces to plain least squares)
lasso_cv = GridSearchCV(estimator=lasso, param_grid={'alpha':np.arange(0.005, 0.05, 0.005)}, cv=5)
lasso_cv.fit(train_c[quan_features], train_c.label)
Out[72]:
In [73]:
lasso_cv.best_score_
Out[73]:
In [81]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
params = {'max_depth': np.arange(5, 8),
          'min_samples_split': np.arange(3, 6)}
rf_cv = GridSearchCV(estimator=rf, param_grid=params, cv=5)
rf_cv.fit(train_c[quan_features], train_c.label)
Out[81]:
In [82]:
rf_cv.best_score_
Out[82]:
In [85]:
from sklearn.linear_model import ElasticNet
en = ElasticNet()
params = {'alpha': np.arange(0.01, 0.05, 0.005),
          'l1_ratio': np.arange(0.1, 0.9, 0.1)}
en_cv = GridSearchCV(estimator=en, param_grid=params, cv=5)
en_cv.fit(train_c[quan_features], train_c.label)
Out[85]:
In [86]:
en_cv.best_score_
Out[86]:
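Before stacking, the tuned base models' CV scores can be lined up for a quick comparison (a sketch using the searches above):
In [ ]:
#Mean 5-fold CV R^2 of each tuned base model
for name, gs in [('ridge', ridge_cv), ('lasso', lasso_cv),
                 ('random forest', rf_cv), ('elastic net', en_cv)]:
    print(name, round(gs.best_score_, 4))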
In [98]:
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
#Stack the tuned base models with a linear meta-regressor
lin = LinearRegression()
basic_regressors = [ridge_cv.best_estimator_, lasso_cv.best_estimator_,
                    rf_cv.best_estimator_, en_cv.best_estimator_]
stacker = StackingRegressor(regressors=basic_regressors, meta_regressor=lin)
stacker.fit(train_c[quan_features], train_c.label)
pred = stacker.predict(train_c[quan_features])
r2_score(train_c.label, pred)  #R^2 on the training set
Out[98]:
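The R² above is computed on the training data and is therefore optimistic. Since mlxtend's StackingRegressor follows the scikit-learn estimator API, cross-validation gives a fairer estimate (a sketch):
In [ ]:
#Cross-validated R^2 of the stack; the training-set score above is optimistic
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(stacker, train_c[quan_features], train_c.label, cv=5)
print(cv_scores.mean())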
In [99]:
#Build the submission file
result = pd.DataFrame()
result['ID'] = test.ID
result['y'] = stacker.predict(test_c[quan_features])
result.to_csv('./stackedprediction.csv', index=False)