In [26]:
import pandas
from pandas import DataFrame
import statsmodels
import matplotlib.pyplot as plt
import numpy
import importlib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
import xgboost as xgb
import multiprocessing
# N_JOBS = multiprocessing.cpu_count()
N_JOBS = 7
In [7]:
train = pandas.read_csv("train.csv")
test = pandas.read_csv("test.csv")
In [4]:
train.head()
Out[4]: [head of train omitted]
In [5]:
train.describe()
Out[5]: [summary statistics of train omitted]
In [6]:
test.head()
Out[6]: [head of test omitted]
In [7]:
test.describe()
Out[7]: [summary statistics of test omitted]
In [53]:
%matplotlib inline
fig, axes = plt.subplots(ncols = 5)
fig.set_size_inches(18, 5)
train.pivot_table('PassengerId', 'Pclass', 'Survived', 'count').plot(ax = axes[0], kind='bar', stacked=True)
train.pivot_table('PassengerId', 'Sex', 'Survived', 'count').plot(ax = axes[1], kind='bar', stacked=True)
train.pivot_table('PassengerId', 'Embarked', 'Survived', 'count').plot(ax = axes[2], kind='bar', stacked=True)
train.pivot_table('PassengerId', ['SibSp'], 'Survived', 'count').plot(ax=axes[3], title='SibSp')
train.pivot_table('PassengerId', ['Parch'], 'Survived', 'count').plot(ax=axes[4], title='Parch')
Out[53]: [stacked bar charts of survival counts by Pclass, Sex, and Embarked; line plots by SibSp and Parch]
In [54]:
%matplotlib inline
plt.scatter(train['Fare'], train['Survived'])
plt.show()
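Survived only takes the values 0 and 1, so this scatter overplots heavily. One alternative view, a suggestion added here rather than part of the original notebook, is a pair of fare histograms split by outcome:

train.groupby('Survived')['Fare'].plot(kind='hist', bins=40, alpha=.5, legend=True)  # one histogram per class, overlaid
plt.xlabel('Fare')
plt.show()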
In [17]:
#
# Preprocessing and scaling
#
import llama
llama = importlib.reload(llama)
train = pandas.read_csv("train.csv")
test = pandas.read_csv("test.csv")
test_pid = test['PassengerId']
# CabinN: deck letter (first character of the cabin code), 'U' when the cabin is unknown
train.loc[train.Cabin.isnull(), "Cabin"] = "U"
train.insert(len(train.columns), "CabinN", [x[0] for x in train["Cabin"]])
test.loc[test.Cabin.isnull(), "Cabin"] = "U"
test.insert(len(test.columns), "CabinN", [x[0] for x in test["Cabin"]])
llama.replace_nan_fair(train)
llama.replace_nan_age(train)
llama.replace_nan_fair(test)
llama.replace_nan_age(test)
llama.set_family_size(train)
llama.set_family_size(test)
llama.set_title_column(train, test)
train = train.drop("Title", 1)
test = test.drop("Title", 1)
# columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch']
columns_to_drop2 = ['Cabin', 'CabinN_U']
dummy_columns = ['Pclass', 'Sex', 'Embarked', 'CabinN']
train = llama.make_dummies(llama.drop_columns(train, columns_to_drop), dummy_columns)
test = llama.make_dummies(llama.drop_columns(test, columns_to_drop), dummy_columns)
train = llama.drop_columns(train, columns_to_drop2)
test = llama.drop_columns(test, columns_to_drop2)
# deck 'T' occurs only in train, so give test a matching all-zero dummy column
test.insert(len(test.columns), 'CabinN_T', 0)
llama.normalise(train, test, ['Fare', 'Age'])
print(train.columns)
print(len(train.columns))
print(test.columns)
print(len(test.columns))
train.describe()
train.head()
Out[17]: [column lists, lengths, and head of the preprocessed train omitted]
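The llama module is the author's own helper library and isn't included in the notebook. Below is a minimal sketch of what the helpers called above might look like, inferred purely from the call sites; every body is a guess, and only the names and signatures come from the code:

# llama.py -- hypothetical reconstruction
import pandas
from sklearn.preprocessing import MinMaxScaler

def replace_nan_fair(df):
    # fill missing fares with the median fare
    df.loc[df.Fare.isnull(), 'Fare'] = df['Fare'].median()

def replace_nan_age(df):
    # fill missing ages with the median age
    df.loc[df.Age.isnull(), 'Age'] = df['Age'].median()

def set_family_size(df):
    # FamilySize = siblings/spouses + parents/children + the passenger themself
    df.insert(len(df.columns), 'FamilySize', df['SibSp'] + df['Parch'] + 1)

def drop_columns(df, columns):
    return df.drop([c for c in columns if c in df.columns], axis=1)

def make_dummies(df, columns):
    # one-hot encode the given categorical columns (yields e.g. CabinN_U, CabinN_T)
    return pandas.get_dummies(df, columns=columns)

def normalise(train, test, columns):
    # scale both frames in place with statistics fitted on train only
    scaler = MinMaxScaler().fit(train[columns])
    train[columns] = scaler.transform(train[columns])
    test[columns] = scaler.transform(test[columns])

# set_title_column(train, test) is omitted: it presumably derives a Title
# feature from Name, but its exact behavior can't be recovered from the calls.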
In [56]:
%matplotlib inline
# re-create a synthetic id so pivot_table has a column to count
train.insert(0, 'PassengerId', list(range(len(train))))
train.pivot_table('PassengerId', 'FamilySize', 'Survived', 'count').plot(kind='bar', stacked=True)
Out[56]: [stacked bar chart of survival counts by FamilySize omitted]
In [31]:
#
# Shuffling for Cross Validation
#
if "PassengerId" in train.columns:
train = train.drop("PassengerId", 1)
train_y = train['Survived']
train_X = train.drop('Survived', 1)
train_shuf = StratifiedShuffleSplit(train_y, n_iter = 5, test_size = .3, random_state = 123)
# train_shuf = KFold(len(train_y), n_folds=10, shuffle=True)
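With the model_selection API the splitter only stores its parameters; the actual stratified folds are produced when GridSearchCV or cross_val_score calls its split method. A quick way to see what it yields:

# each iteration gives index arrays whose class balance mirrors train_y
for train_idx, test_idx in train_shuf.split(train_X, train_y):
    print(len(train_idx), len(test_idx), train_y.iloc[test_idx].mean())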
In [35]:
#
# Classifiers
#
def run_grid_search(train_X, train_y, clf, params, cv, n_jobs = N_JOBS):
    gs = GridSearchCV(clf, params, n_jobs = n_jobs, cv = cv, verbose = 1)
    gs = gs.fit(train_X, train_y)
    print("Best estimator:\n", gs.best_estimator_)
    print("Grid search score:\t", gs.best_score_)
    clf = gs.best_estimator_
    # re-score the winning estimator on a fresh set of splits
    cv_new = StratifiedShuffleSplit(n_splits = 10, test_size = .2, random_state = 345)
    # cv_new = KFold(n_splits = 10, shuffle = True)
    score = cross_val_score(clf, train_X, train_y, cv = cv_new)
    print("CV score:\t", score, '\n=> ', score.mean(), ' (+-', score.std(),')\n', sep = '')
    return clf
# help(xgb.XGBClassifier)
gbm = xgb.XGBClassifier
# ENSEMBLES
clf_list = []
par_list = []
clf_list.append(gbm(learning_rate = .004))
temp_par = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
            'n_estimators': [20, 50, 100, 200, 300],
            'reg_alpha': [0, .3, .6, 1],
            'base_score': [.3, .5, .75],
            # 'reg:linear' is a regression objective; only the logistic one fits a classifier
            'objective': ['binary:logistic'],
            'gamma': 100. ** numpy.arange(.1, 1, .1)
            }
par_list.append(temp_par)
best_clf = run_grid_search(train_X, train_y, clf_list[0], par_list[0], train_shuf, 1)
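roc_auc_score and confusion_matrix are imported at the top but never used; a possible inspection cell for the tuned model, evaluated on a single stratified hold-out split (a sketch added here, not part of the original run), could look like this:

from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(train_X, train_y, test_size = .3,
                                          stratify = train_y, random_state = 123)
best_clf.fit(X_tr, y_tr)
proba = best_clf.predict_proba(X_te)[:, 1]  # P(Survived = 1)
print("Hold-out AUC:", roc_auc_score(y_te, proba))
print(confusion_matrix(y_te, best_clf.predict(X_te)))

The next cell refits on the full training set, so refitting best_clf on the reduced split here is harmless.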
In [36]:
#
# Writing the output
#
clf = best_clf
postfix = ".gbm"
print(clf)
clf.fit(train_X, train_y)
result = DataFrame()
result.insert(0, 'PassengerId', test_pid)
result.insert(1, "Survived", clf.predict(test))
result.to_csv("out" + postfix + ".csv", index = False)