In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import ensemble
import xgboost as xgb
from sklearn.metrics import log_loss, make_scorer, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.feature_selection import f_classif
import random
# seed both RNGs so results are reproducible
random.seed(2016)
np.random.seed(2016)
%matplotlib inline
In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()
Out[2]:
In [3]:
train.describe()
Out[3]:
In [4]:
# Check for null values
train.isnull().any().any()
Out[4]:
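A per-column breakdown is often more useful than the single boolean above; a minimal sketch:
In [ ]:
# columns with the most missing values
train.isnull().sum().sort_values(ascending=False).head(10)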
In [14]:
# features that are string
types = train.columns.to_series().groupby(train.dtypes).groups
ctext = types[np.dtype('object')]
ctext
Out[14]:
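An equivalent, shorter way to list the string-typed columns (same set as ctext above):
In [ ]:
# same list of object-typed columns via select_dtypes
train.select_dtypes(include=['object']).columns.tolist()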
In [6]:
# fill NaNs in categorical columns with each column's mode
for c in ctext:
    mode = train[c].mode()[0]
    train[c] = train[c].fillna(mode)
    mode = test[c].mode()[0]
    test[c] = test[c].fillna(mode)
train[ctext].head()
Out[6]:
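The `[0]` index is needed because `.mode()` returns a Series rather than a scalar; a tiny illustrative check:
In [ ]:
# .mode() can return more than one value when there are ties, hence the [0]
pd.Series(['A', 'B', 'A', 'B']).mode()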
In [7]:
# transform string columns to numbers; fit the encoder on train and test
# together so both sets get the same label mapping
for c in ctext:
    enc = LabelEncoder()
    enc.fit(pd.concat([train[c], test[c]], axis=0))
    train[c] = enc.transform(train[c])
    test[c] = enc.transform(test[c])
train[ctext].head()
Out[7]:
In [8]:
# fill NaNs in numerical columns with each column's mean
columns = list(test.columns)
for c in columns:
    mean = train[c].mean()
    train[c] = train[c].fillna(mean)
    mean = test[c].mean()
    test[c] = test[c].fillna(mean)
train.head()
Out[8]:
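The imports cell pulls in an imputer that is never used; a minimal sketch of the same mean-fill done with scikit-learn's SimpleImputer instead, fitted on train and reused for test (an assumption: this imputes test with train statistics, which differs slightly from the loop above):
In [ ]:
# sketch: mean-fill via SimpleImputer, fitted on train and applied to both sets
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
train[columns] = imp.fit_transform(train[columns])
test[columns] = imp.transform(test[columns])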
In [9]:
# Check for null values
train.isnull().any().any(), test.isnull().any().any()
Out[9]:
In [10]:
# Save transformed datasets
train.to_csv("train_new.csv", index=False)
test.to_csv("test_new.csv", index=False)
In [11]:
y_train = train['target']
train_data = train.drop(['ID', 'target'], axis=1)
features_names = list(train_data.columns.values)
selector = SelectKBest(f_classif, k=40).fit(train_data, y_train)
train_data.shape
Out[11]:
In [12]:
fe = selector.get_support()
fe
Out[12]:
In [13]:
features_selected = [features_names[i] for i in fe.nonzero()[0]]
features_selected
Out[13]:
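To see how the kept columns rank, the fitted selector's univariate F-scores can be inspected; a minimal sketch:
In [ ]:
# rank the F-scores behind SelectKBest, highest first
pd.Series(selector.scores_, index=features_names).sort_values(ascending=False).head(10)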
In [14]:
# dataset with selected features
train_new = pd.concat([train['ID'], train[features_selected]], axis=1)
test_new = pd.concat([test['ID'], test[features_selected]], axis=1)
train_new.head()
Out[14]:
In [18]:
# shared model parameters: ne=n_estimators, md=max_depth, mf=max_features, rs=random_state
g = {'ne': 200, 'md': 20, 'mf': 40, 'rs': 2016}
etc = ensemble.ExtraTreesClassifier(n_estimators=g['ne'], max_depth=g['md'],
max_features=g['mf'], random_state=g['rs'], criterion='entropy',
min_samples_split= 4, min_samples_leaf= 2, verbose = 0, n_jobs =-1)
etr = ensemble.ExtraTreesRegressor(n_estimators=g['ne'], max_depth=g['md'],
max_features=g['mf'], random_state=g['rs'], min_samples_split= 4,
min_samples_leaf= 2, verbose = 0, n_jobs =-1)
rfc = ensemble.RandomForestClassifier(n_estimators=g['ne'], max_depth=g['md'],
max_features=g['mf'], random_state=g['rs'], criterion='entropy',
min_samples_split= 4, min_samples_leaf= 2, verbose = 0, n_jobs =-1)
rfr = ensemble.RandomForestRegressor(n_estimators=g['ne'], max_depth=g['md'],
max_features=g['mf'], random_state=g['rs'], min_samples_split= 4,
min_samples_leaf= 2, verbose = 0, n_jobs =-1)
xgr = xgb.XGBRegressor(n_estimators=g['ne'], max_depth=g['md'], seed=g['rs'],
missing=np.nan, learning_rate=0.02, subsample=0.9, colsample_bytree=0.85,
objective='reg:linear')
xgc = xgb.XGBClassifier(n_estimators=g['ne'], max_depth=g['md'], seed=g['rs'],
missing=np.nan, learning_rate=0.02, subsample=0.9, colsample_bytree=0.85,
objective='binary:logistic') #try 'binary:logitraw'
#clf = {'etc':etc, 'etr':etr, 'rfc':rfc, 'rfr':rfr, 'xgr':xgr, 'xgc':xgc}
#clf = {'etc':etc, 'rfc':rfc, 'xgc':xgc}
clf = {'etr':etr, 'rfr':rfr, 'rfc':rfc}
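The grid search below runs with an empty grid, so it only cross-validates the fixed settings above; if tuning were wanted, a small illustrative grid (values are assumptions, not tuned results) might look like this:
In [ ]:
# illustrative grid only; the values are placeholders, not tuned results
param_grid_example = {
    'n_estimators': [200, 400],
    'max_depth': [10, 20],
    'min_samples_leaf': [1, 2, 4],
}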
In [19]:
# train models with GridSearchCV
import time
start_time = time.time()
id_test = test['ID']
y_pred = []
best_score = 0.0
id_results = id_test[:]
LL = make_scorer(log_loss, greater_is_better=False)
for c in clf:
    if c[:1] != "x":  # not xgboost
        model = GridSearchCV(
            estimator=clf[c],
            param_grid={},
            n_jobs=-1,
            cv=2,
            verbose=0,
            scoring=LL)
        model.fit(train_new, y_train.values)
        if c[-1:] != "c":  # regressor
            y_pred = model.predict(test_new)
            print("Ensemble Model: ", c, " Best CV score: ", model.best_score_,
                  " Time: ", round(((time.time() - start_time) / 60), 2))
        else:  # classifier
            best_score = (
                log_loss(y_train.values, model.predict_proba(train_new))) * -1
            y_pred = model.predict_proba(test_new)[:, 1]
            print("Ensemble Model: ", c, " Best CV score: ", best_score,
                  " Time: ", round(((time.time() - start_time) / 60), 2))
    else:  # xgboost
        X_fit, X_eval, y_fit, y_eval = train_test_split(
            train_new,
            y_train,
            test_size=0.35,
            train_size=0.65,
            random_state=g['rs'])
        model = clf[c]
        model.fit(
            X_fit,
            y_fit.values,
            early_stopping_rounds=20,
            eval_metric="logloss",
            eval_set=[(X_eval, y_eval)],
            verbose=0)
        if c == "xgr":  # xgboost regressor
            best_score = (
                log_loss(y_train.values, model.predict(train_new))) * -1
            y_pred = model.predict(test_new)
        else:  # xgboost classifier
            best_score = (
                log_loss(y_train.values, model.predict_proba(train_new))) * -1
            y_pred = model.predict_proba(test_new)[:, 1]
        print("Ensemble Model: ", c, " Best CV score: ", best_score, " Time: ",
              round(((time.time() - start_time) / 60), 2))
    # clip predictions into [0, 1] before storing them
    for i in range(len(y_pred)):
        if y_pred[i] < 0.0:
            y_pred[i] = 0.0
        if y_pred[i] > 1.0:
            y_pred[i] = 1.0
    df_in = pd.DataFrame({"ID": id_test, c: y_pred})
    id_results = pd.concat([id_results, df_in[c]], axis=1)
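The element-wise clipping at the end of each iteration can also be written as a single vectorized call; an equivalent sketch:
In [ ]:
# vectorized equivalent of the clipping loop above
y_pred = np.clip(y_pred, 0.0, 1.0)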
In [20]:
id_results['avg'] = id_results.drop('ID', axis=1).apply(np.average, axis=1)
id_results['min'] = id_results.drop('ID', axis=1).apply(min, axis=1)
id_results['max'] = id_results.drop('ID', axis=1).apply(max, axis=1)
id_results['diff'] = id_results['max'] - id_results['min']
for i in range(10):
    print(i, len(id_results[id_results['diff'] > (i / 10)]))
id_results.to_csv("results_analysis.csv", index=False)
ds = id_results[['ID', 'avg']]
ds.columns = ['ID', 'PredictedProb']
ds.to_csv('submission.csv', index=False)
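The submission above is an unweighted average of the model columns; a hedged sketch of a weighted blend, where the weights are placeholders rather than tuned values:
In [ ]:
# weighted blend of the individual model predictions; weights are illustrative only
weights = {'etr': 0.4, 'rfr': 0.3, 'rfc': 0.3}
blend = sum(id_results[m] * w for m, w in weights.items())
pd.DataFrame({'ID': id_results['ID'], 'PredictedProb': blend}).to_csv(
    'submission_weighted.csv', index=False)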
In [ ]: