In [1]:
import pandas
from pandas import DataFrame
import statsmodels
import matplotlib.pyplot as plt
import numpy
import importlib
from sklearn.preprocessing import MinMaxScaler
# cross_validation and grid_search were merged into model_selection in scikit-learn 0.18
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost
import multiprocessing
# N_JOBS = multiprocessing.cpu_count()
N_JOBS = 7
In [23]:
train = pandas.read_csv("train.csv")
test = pandas.read_csv("test.csv")
In [4]:
train.head()
Out[4]:
In [5]:
train.describe()
Out[5]:
In [6]:
test.head()
Out[6]:
In [7]:
test.describe()
Out[7]:
In [41]:
%matplotlib inline
train = pandas.read_csv("train.csv")
test = pandas.read_csv("test.csv")
fig, axes = plt.subplots(ncols = 4, nrows = 2)
fig.set_size_inches(18, 5)
train.loc[train.Cabin.isnull(), "Cabin"] = "U"  # "U" = unknown cabin
train.insert(len(train.columns), "CabinN", train["Cabin"].str[0])  # deck letter
train.pivot_table(values='PassengerId', index='Pclass', columns='Survived', aggfunc='count').plot(ax=axes[0][0], kind='bar', stacked=True)
train.pivot_table(values='PassengerId', index='Sex', columns='Survived', aggfunc='count').plot(ax=axes[0][1], kind='bar', stacked=True)
train.pivot_table(values='PassengerId', index='Embarked', columns='Survived', aggfunc='count').plot(ax=axes[0][2], kind='bar', stacked=True)
train.pivot_table(values='PassengerId', index='SibSp', columns='Survived', aggfunc='count').plot(ax=axes[0][3], title='SibSp')
train.pivot_table(values='PassengerId', index='Parch', columns='Survived', aggfunc='count').plot(ax=axes[1][0], title='Parch')
train.pivot_table(values='PassengerId', index='CabinN', columns='Survived', aggfunc='count').plot(ax=axes[1][1], kind='bar', stacked=True, title='CabinN')
Out[41]:
In [54]:
%matplotlib inline
plt.scatter(train['Fare'], train['Survived'])
plt.show()
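# The binary target makes the scatter above mostly two overlapping stripes;
# a sketch of an arguably clearer view (the bin count is an arbitrary
# choice, not from the original analysis):
train[train.Survived == 0]['Fare'].plot(kind='hist', bins=40, alpha=.5, label='died')
train[train.Survived == 1]['Fare'].plot(kind='hist', bins=40, alpha=.5, label='survived')
plt.xlabel('Fare')
plt.legend()
plt.show()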
In [44]:
#
# Preprocessing and scaling
#
import llama
llama = importlib.reload(llama)
train = pandas.read_csv("train.csv")
test = pandas.read_csv("test.csv")
test_pid = test['PassengerId']
train.loc[train.Cabin.isnull(), "Cabin"] = "U"
train.insert(len(train.columns), "CabinN", train["Cabin"].str[0])
test.loc[test.Cabin.isnull(), "Cabin"] = "U"
test.insert(len(test.columns), "CabinN", test["Cabin"].str[0])
llama.replace_nan_fair(train)
llama.replace_nan_age(train)
llama.replace_nan_fair(test)
llama.replace_nan_age(test)
llama.set_family_size(train)
llama.set_family_size(test)
llama.set_title_column(train, test)
train = train.drop("Title", 1)
test = test.drop("Title", 1)
# columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch']
columns_to_drop2 = ['Cabin', 'CabinN_U']
dummy_columns = ['Pclass', 'Sex', 'Embarked', 'CabinN']
train = llama.make_dummies(llama.drop_columns(train, columns_to_drop), dummy_columns)
test = llama.make_dummies(llama.drop_columns(test, columns_to_drop), dummy_columns)
train = llama.drop_columns(train, columns_to_drop2)
test = llama.drop_columns(test, columns_to_drop2)
# train has one deck-"T" cabin but test has none, so the dummy column is
# added to test by hand (all zeros) to keep the two column sets aligned
test.insert(len(test.columns), 'CabinN_T', 0)
llama.normalise(train, test, ['Fare', 'Age'])
print(train.columns)
print(len(train.columns))
print(test.columns)
print(len(test.columns))
train.describe()
train.head()
Out[44]:
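# The llama helper module is not included in this notebook. What follows is
# a hypothetical reconstruction inferred from the call sites above; the
# names (including the "fair" spelling) are the module's own, but every
# body here is a guess and may differ from the original.
import pandas
from sklearn.preprocessing import MinMaxScaler

def replace_nan_fair(df):
    # fill missing fares with the median fare, in place
    df.loc[df.Fare.isnull(), "Fare"] = df["Fare"].median()

def replace_nan_age(df):
    # fill missing ages with the median age, in place
    df.loc[df.Age.isnull(), "Age"] = df["Age"].median()

def set_family_size(df):
    # siblings/spouses + parents/children + the passenger themselves
    df.insert(len(df.columns), "FamilySize", df["SibSp"] + df["Parch"] + 1)

def set_title_column(train, test):
    # extract the honorific ("Mr", "Mrs", ...) from Name; since the raw
    # Title column is dropped again right after the call, the real helper
    # presumably also adds dummy-encoded Title_* features, as sketched here
    for df in (train, test):
        df["Title"] = df["Name"].str.extract(r",\s*([^.]+)\.", expand=False)
    for t in sorted(set(train["Title"]) & set(test["Title"])):
        train["Title_" + t] = (train["Title"] == t).astype(int)
        test["Title_" + t] = (test["Title"] == t).astype(int)

def drop_columns(df, columns):
    return df.drop(columns, axis=1)

def make_dummies(df, columns):
    return pandas.get_dummies(df, columns=columns)

def normalise(train, test, columns):
    # scale the listed columns to [0, 1]; the scaler is fit on train only
    scaler = MinMaxScaler()
    train[columns] = scaler.fit_transform(train[columns])
    test[columns] = scaler.transform(test[columns])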
In [56]:
%matplotlib inline
train.insert(0, 'PassengerId', numpy.arange(len(train)))  # synthetic ids, only needed so pivot_table has something to count
train.pivot_table(values='PassengerId', index='FamilySize', columns='Survived', aggfunc='count').plot(kind='bar', stacked=True)
Out[56]:
In [46]:
#
# Shuffling for Cross Validation
#
if "PassengerId" in train.columns:
train = train.drop("PassengerId", 1)
train_y = train['Survived']
train_X = train.drop('Survived', 1)
train_shuf = StratifiedShuffleSplit(train_y, n_iter = 5, test_size = .2, random_state = 123)
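# Sketch of what the splitter produces: five pairs of row-index arrays, with
# the survival rate of train_y preserved in each 20% hold-out.
for train_idx, test_idx in train_shuf.split(train_X, train_y):
    print(len(train_idx), len(test_idx), train_y.iloc[test_idx].mean())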
In [47]:
#
# Classifiers
#
def run_grid_search(train_X, train_y, clf, params, cv, n_jobs=N_JOBS):
    gs = GridSearchCV(clf, params, n_jobs=n_jobs, cv=cv, verbose=1)
    gs = gs.fit(train_X, train_y)
    print("Best estimator:\n", gs.best_estimator_)
    print("Grid search score:\t", gs.best_score_)
    clf = gs.best_estimator_
    # re-score the winner on fresh splits, so the reported score is not
    # just an artefact of the tuning folds
    cv_new = StratifiedShuffleSplit(n_splits=10, test_size=.2, random_state=345)
    score = cross_val_score(clf, train_X, train_y, cv=cv_new)
    print("CV score:\t", score, '\n=> ', score.mean(), ' (+-', score.std(), ')\n', sep='')
    return clf
# clfs = [ensemble.RandomForestClassifier(),
# linear_model.LogisticRegression(C = 1),
# svm.SVC(C = 10000)]
# for clf in clfs:
# print(clf)
# score = cross_val_score(clf, train_X, train_y, cv=train_shuf)
# print(score, '\n=> ', score.mean(), ' (+-', score.std(),')\n', sep = '')
# GRADIENT BOOSTING
# parameters = {'loss': ['ls', 'lad', 'huber', 'quantile'],
# 'n_estimators': [30, 100, 300, 600, 1000]}
# 'max_depth': [1, 2, 3, 5, 7],
# 'subsample': [1],
# 'max_features': ['auto']}
# parameters = {'loss': ['deviance', 'exponential'],
# 'n_estimators': [15, 30, 50, 75],
# 'max_depth': [1, 2, 3],
# 'subsample': [1, .7, .3],
# 'max_features': [4, 5, 6]}
# clf = GradientBoostingClassifier()
# gs = GridSearchCV(clf, parameters, n_jobs = N_JOBS, cv = train_shuf)
# gs = gs.fit(train_X, train_y)
# best_logreg = gs.best_estimator_
# LOGISTIC REGRESSION
# parameters = {'C': 10.**numpy.arange(-1, 0, .05), 'penalty': ['l2']}
# clf = linear_model.LogisticRegression()
# gs = GridSearchCV(clf, parameters, n_jobs = N_JOBS, cv = train_shuf)
# gs = gs.fit(train_X, train_y)
# best_logreg = gs.best_estimator_
# SVM
# parameters = {'kernel': ['linear'],
# 'C': 10. ** numpy.arange(-1.8, -1.7, .001)
# }
# clf = svm.SVC()
# gs = GridSearchCV(clf, parameters, n_jobs = N_JOBS, cv = train_shuf)
# gs = gs.fit(train_X, train_y)
# print("LINEAR")
# print(gs.best_score_)
# print(gs.best_estimator_)
# parameters = {'kernel': ['rbf'],
# 'C': 10. ** numpy.arange(3, 3.4, .02),
# 'degree': [2],
# 'gamma': numpy.arange(.2, .3, .005)
# }
# clf = svm.SVC()
# gs = GridSearchCV(clf, parameters, n_jobs = N_JOBS, cv = train_shuf)
# gs = gs.fit(train_X, train_y)
# print("RBF")
# print(gs.best_score_)
# print(gs.best_estimator_)
# parameters = {'kernel': ['poly'],
# 'C': 10. ** numpy.arange(5, 6, .01),
# 'gamma': ['auto'],
# 'coef0': 10. ** numpy.arange(-5, 5, 1)
# }
# gs = GridSearchCV(clf, parameters, n_jobs = N_JOBS, cv = train_shuf)
# gs = gs.fit(train_X, train_y)
# print("POLY")
# print(gs.best_score_)
# print(gs.best_estimator_)
# clf = svm.SVC()
# parameters = {'kernel': ['sigmoid'],
# 'C': 10. ** numpy.arange(1.8, 2.5, .1),
# 'gamma': ['auto'],
# 'coef0': 10. ** numpy.arange(-5, 5, 1)
# }
# gs = GridSearchCV(clf, parameters, n_jobs = N_JOBS,cv = train_shuf)
# gs = gs.fit(train_X, train_y)
# print("SIGMOID")
# print(gs.best_score_)
# print(gs.best_estimator_)
# clf_list.append(RandomForestClassifier())
# temp_par = {'n_estimators': [15, 30, 50, 75, 130, 200, 300, 400, 500, 600, 700],
# 'max_features': numpy.arange(2, len(train_X.columns), 2),
# 'bootstrap': [True, False],
# 'criterion': ['gini', 'entropy']
# }
# par_list.append(temp_par)
# clf_list.append(ExtraTreesClassifier())
# temp_par = {'n_estimators': [15, 30, 50, 75, 130, 200, 300, 400, 500, 600, 700],
# 'max_features': numpy.arange(2, len(train_X.columns), 2),
# 'bootstrap': [True, False],
# 'criterion': ['gini', 'entropy']
# }
# par_list.append(temp_par)
# ENSEMBLES
clf_list = []
par_list = []
clf_list.append(GradientBoostingClassifier())
temp_par = {'loss': ['log_loss', 'exponential'],  # 'deviance' was renamed to 'log_loss' in scikit-learn 1.1
            'n_estimators': [15, 30, 40, 50, 65, 75, 85, 100],
            'max_features': [11, 12, 13, 14],
            'max_depth': [6, 7, 8, 9],
            'min_samples_leaf': [2, 3, 4, 5],
            'min_samples_split': [3, 4, 5, 6],
            'learning_rate': [.007]
            }
par_list.append(temp_par)
for clf, params in zip(clf_list, par_list):
    run_grid_search(train_X, train_y, clf, params, train_shuf)
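# The grid above has 2*8*4*4*4*4 = 4096 candidates, so each search fits
# roughly 20k models. A sketch of a cheaper alternative over the same
# space (n_iter=100 is an arbitrary budget, not from the original run):
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(GradientBoostingClassifier(), temp_par, n_iter=100,
                        cv=train_shuf, n_jobs=N_JOBS, random_state=123, verbose=1)
rs.fit(train_X, train_y)
print(rs.best_score_, rs.best_params_)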
In [ ]:
#
# Writing the output
#
# Best gradient-boosting model from the grid search above; 'deviance' is
# renamed to 'log_loss' and the removed presort argument is dropped
clf = GradientBoostingClassifier(learning_rate=0.007, loss='log_loss',
                                 max_depth=7, max_features=13,
                                 min_samples_leaf=2, min_samples_split=6,
                                 n_estimators=100, subsample=1.0)
postfix = ".gb"
print(clf)
clf.fit(train_X, train_y)
result = DataFrame()
result.insert(0, 'PassengerId', test_pid)
result.insert(1, "Survived", clf.predict(test))
result.to_csv("out" + postfix + ".csv", index=False)
In [60]:
#
# Stacking
#
import llama
llama = importlib.reload(llama)
# Best SVM
'''
SVC(C=1737.8008287493763, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=2, gamma=0.21000000000000002,
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
'''
# decision_function_shape=None is no longer accepted; defaults are used for
# everything except the tuned parameters
model_svm = SVC(C=1737.8008287493763, gamma=0.21000000000000002, kernel='rbf',
                probability=True)  # probability=True so predict_proba works for stacking / soft voting
# Best LogReg
'''
LogisticRegression(C=0.22387211385683412, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
'''
model_logreg = LogisticRegression(C=0.22387211385683412, penalty='l2',
                                  solver='liblinear')  # liblinear ignores n_jobs; deprecated multi_class dropped
# Best RF
'''
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
max_depth=None, max_features=3, max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=130, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
'''
model_rf = RandomForestClassifier(bootstrap=True, criterion='entropy',
                                  max_features=3, n_estimators=130,
                                  n_jobs=N_JOBS)
# Best GB
model_gb = GradientBoostingClassifier(learning_rate=0.1, loss='log_loss',
                                      max_depth=2, max_features=5,
                                      n_estimators=50, subsample=1.0)
df = DataFrame()
llama.insert_predictions(df, llama.stacking_model_predict(model_svm, train_X, train_y, train_X), 'SVM')
llama.insert_predictions(df, llama.stacking_model_predict(model_logreg, train_X, train_y, train_X), 'LogReg')
llama.insert_predictions(df, llama.stacking_model_predict(model_rf, train_X, train_y, train_X), 'RF')
df.head()
Out[60]:
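# Hypothetical reconstruction of the two llama stacking helpers, inferred
# from the call sites above (the real module is not part of this notebook,
# so names and details may differ). Out-of-fold predictions are used when
# the frame being scored is the training frame itself, so the labels do
# not leak into the meta-features.
from sklearn.model_selection import cross_val_predict

def stacking_model_predict(model, train_X, train_y, new_X):
    if new_X is train_X:
        # out-of-fold survival probabilities for the training rows
        return cross_val_predict(model, train_X, train_y, cv=5,
                                 method='predict_proba')[:, 1]
    model.fit(train_X, train_y)
    return model.predict_proba(new_X)[:, 1]

def insert_predictions(df, predictions, name):
    # append one model's predictions as a named meta-feature column
    df.insert(len(df.columns), name, predictions)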
In [61]:
# parameters = {'C': 10.**numpy.arange(-1.4, -.9, .0001), 'penalty': ['l1']}
# stack_model = linear_model.LogisticRegression()
stack_model = VotingClassifier(estimators = [#("svm", model_svm),
("logreg", model_logreg),
("rf", model_rf),
("gb", model_gb)],
voting = 'soft')
# gs = GridSearchCV(stack_model, parameters, n_jobs = N_JOBS, cv = train_shuf)
# gs = gs.fit(df, train_y)
# print(gs.best_estimator_)
# print(gs.best_score_)
print(stack_model)
score = cross_val_score(stack_model, train_X, train_y, cv=train_shuf)
print(score, '\n=> ', score.mean(), ' (+-', score.std(),')\n', sep = '')
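# For context, each member's own accuracy on the same splits (sketch; exact
# numbers vary between runs because the models set no random_state):
for name, model in [('logreg', model_logreg), ('rf', model_rf), ('gb', model_gb)]:
    member_score = cross_val_score(model, train_X, train_y, cv=train_shuf)
    print(name, member_score.mean())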
In [62]:
#
# Writing the stacking output
#
# df = DataFrame()
# insert_predictions(df, stacking_model_predict(model_svm, train_X, train_y, test), 'SVM')
# insert_predictions(df, stacking_model_predict(model_logreg, train_X, train_y, test), 'LogReg')
# insert_predictions(df, stacking_model_predict(model_rf, train_X, train_y, test), 'RF')
# clf = gs.best_estimator_
# print(clf)
stack_model.fit(train_X, train_y)
result = DataFrame()
result.insert(0, 'PassengerId', test_pid)
result.insert(1, "Survived", stack_model.predict(test))
result.to_csv("out_stacking.csv", index = False)
In [63]:
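#
# Self-training on pseudo-labels
#
# The test rows, labelled with the voting ensemble's own predictions, are
# appended to the training set and a gradient-boosting model is re-tuned on
# the combined data. Caveat: the grid-search score below is optimistic,
# since roughly a third of the "labels" are the previous model's guesses.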
new_train_X = pandas.concat([train_X, test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
new_train_y = numpy.concatenate([train_y, stack_model.predict(test)], axis=0)
new_train_shuf = StratifiedShuffleSplit(n_splits=10, test_size=.2, random_state=123)
parameters = {'loss': ['log_loss', 'exponential'],
              'n_estimators': [15, 30, 50, 75],
              'max_depth': [1, 2, 3],
              'subsample': [1., .7, .3],
              'max_features': [4, 5, 6]}
clf = GradientBoostingClassifier()
gs = GridSearchCV(clf, parameters, n_jobs=N_JOBS, cv=new_train_shuf)
gs = gs.fit(new_train_X, new_train_y)
print(gs.best_estimator_)
print(gs.best_score_)
In [64]:
clf = gs.best_estimator_
print(clf)
clf.fit(new_train_X, new_train_y)
result = DataFrame()
result.insert(0, 'PassengerId', test_pid)
result.insert(1, "Survived", clf.predict(test))
result.to_csv("out_usetest.csv", index = False)