In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
# Load the data files
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
# Size of the training set (used later to split the combined frame back)
train_samples = train.shape[0]
In [2]:
train.head(10)
Out[2]:
In [3]:
print("Missing values:")
for f in train.columns:
if pd.concat((train,test))[f].isnull().any():
print("- {}: {:.1f}%".format(f, 100 * pd.concat((train,test))[f].isnull().sum()/len(pd.concat((train,test)))))
In [4]:
train.Embarked.unique()
Out[4]:
In [5]:
train.groupby(by='Survived').PassengerId.count()
# Roughly twice as many passengers did not survive as survived
Out[5]:
In [6]:
# Extract the title between the comma and the period of each name
train.Name.apply(lambda s: s.split(". ")[0].split(", ")[1]).unique()
Out[6]:
In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

def scale(X, fit_scaler=False):
    # Input is a dataframe.
    # Assigning through X[X.columns] keeps X a dataframe
    # (the scaler itself returns a numpy array).
    if fit_scaler:
        X[X.columns] = scaler.fit_transform(X[X.columns])
    else:
        X[X.columns] = scaler.transform(X[X.columns])
    return X

def preprocess(df):
    X = df[['Pclass', 'Sex']].copy()
    # Feature engineering
    X.Sex = X.Sex.map({'female': 1, 'male': 0})
    X['Family'] = df.SibSp + df.Parch  # siblings/spouses + parents/children aboard
    return X
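A brief usage sketch for scale() and preprocess(), assuming the scaler should be fit on the training frame only and then reused on test:

In [ ]:
# Fit MinMaxScaler on the training features, reuse it for test
X_tr = scale(preprocess(train), fit_scaler=True)
X_te = scale(preprocess(test))  # transform only, with the already-fitted scaler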
In [8]:
# Plot a scatter matrix of the features, coloured by survival
def plot_matrix(X_train, y_train):
    colormap = {0: 'firebrick', 1: 'steelblue'}
    colors = np.vectorize(colormap.get)(y_train)
    pd.plotting.scatter_matrix(X_train, c=colors, marker='o', s=30,
                               hist_kwds={'bins': 15}, figsize=(9, 9));
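A usage sketch for plot_matrix(), assuming the preprocess() features above and the raw Survived labels:

In [ ]:
# Scatter matrix of the engineered features, coloured by survival
plot_matrix(preprocess(train), train.Survived.values)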
In [9]:
def save_to_file(clf, X_test):
    import os
    predictions = clf.predict(X_test)
    passengerId = 892  # first PassengerId in the Kaggle test set
    file = "PassengerId,Survived" + os.linesep
    for i in range(len(X_test)):
        file += "{},{}".format(passengerId, int(predictions[i])) + os.linesep
        passengerId += 1
    # Save to file
    with open('attempt.txt', 'w') as f:
        f.write(file)
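For reference, the same submission file can be built more compactly with pandas; a hedged sketch (save_to_file_pd is a hypothetical helper, and 892 is the first PassengerId of the Kaggle test set):

In [ ]:
def save_to_file_pd(clf, X_test, path='attempt.txt'):
    # Build the submission as a DataFrame and let pandas handle the CSV
    sub = pd.DataFrame({
        'PassengerId': np.arange(892, 892 + len(X_test)),
        'Survived': clf.predict(X_test).astype(int),
    })
    sub.to_csv(path, index=False)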
In [10]:
# Baseline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

def baseline(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
    dummy = DummyClassifier(random_state=0)
    # y arrives as a column vector, so flatten it
    dummy.fit(X_train, y_train.values.reshape(-1))
    acc = dummy.score(X_val.values, y_val.values.reshape(-1))
    print('Accuracy: {:.2f}\n'.format(acc))
    # Combined report with precision, recall and F1 per class
    print(classification_report(y_val, dummy.predict(X_val), target_names=['Not Survived', 'Survived']))
In [11]:
X_train = preprocess(train)
y_train = train[['Survived']]
baseline(X_train, y_train)
In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score

def check_model(X, y):
    rfc = RandomForestClassifier(random_state=0)
    scores = cross_validate(rfc, X, y, cv=10, scoring='accuracy', return_train_score=True)
    print("Train scores: {:.3f}".format(scores['train_score'].mean()))
    print("Test scores: {:.3f}".format(scores['test_score'].mean()))
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_val)
    print("Accuracy: {:.3f}".format(accuracy_score(y_val, y_pred)))
    y_probs = rfc.predict_proba(X_val)
    auc_score = roc_auc_score(y_val, y_probs[:, 1])  # avoid shadowing the imported auc()
    print("AUC: {:.3f}".format(auc_score))
    print(classification_report(y_val, y_pred, target_names=['Not Survived', 'Survived']))
    return rfc
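roc_curve is imported above but never plotted; a minimal sketch that draws a validation ROC curve for any fitted classifier (plot_roc is a hypothetical helper, names follow check_model):

In [ ]:
def plot_roc(clf, X_val, y_val):
    # ROC from the predicted probability of the positive class
    fpr, tpr, _ = roc_curve(y_val, clf.predict_proba(X_val)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()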
In [477]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score

def grid_search(X, y, test_size=0.25):
    max_range = np.arange(3, X.shape[1] + 1, 5)  # unused
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=0)
    '''
    Best params: {'min_samples_leaf': 3,
                  'n_estimators': 50,
                  'bootstrap': True,
                  'max_features': 'sqrt',
                  'max_depth': 8,
                  'min_samples_split': 13,
                  'class_weight': 'balanced'}
    '''
    # Broad grid from an earlier search (kept for reference, overwritten below)
    params = {
        'n_estimators': [50, 100, 150],
        'max_features': ['sqrt'],
        'max_depth': [8, 10, 12, 50],
        'class_weight': ['balanced', {1: 2}],
        'min_samples_split': [5, 10, 13, 15],
        'min_samples_leaf': [1, 3, 5, 7],
        'bootstrap': [True, False],
    }
    # Refined grid actually used
    params = {'n_estimators': [150, 200, 300, 500],
              'bootstrap': [True],
              'class_weight': [{1: 2}],
              'max_depth': [25],
              'max_features': ['sqrt']}
    rfc = RandomForestClassifier(n_jobs=2, random_state=0)
    grid_rfc = GridSearchCV(rfc, param_grid=params, cv=10, n_jobs=2, scoring='accuracy')
    grid_rfc.fit(X_train, y_train)
    best_rfc = grid_rfc.best_estimator_
    y_pred = best_rfc.predict(X_val)
    print("Accuracy: {:.3f}".format(accuracy_score(y_val, y_pred)))
    y_probs = best_rfc.predict_proba(X_val)
    auc_score = roc_auc_score(y_val, y_probs[:, 1])
    print("AUC: {:.3f}".format(auc_score))
    print("Best params: {}\n".format(grid_rfc.best_params_))
    print(classification_report(y_val, y_pred, target_names=['Not Survived', 'Survived']))
    return best_rfc
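grid_search() only returns the best estimator; the fitted GridSearchCV also exposes cv_results_ for comparing all candidates. A hedged sketch (show_cv_results is a hypothetical helper, assuming the GridSearchCV object itself is kept):

In [ ]:
def show_cv_results(grid, top=5):
    # Rank the grid candidates by mean cross-validated accuracy
    res = pd.DataFrame(grid.cv_results_)
    cols = ['params', 'mean_test_score', 'std_test_score']
    print(res.sort_values('mean_test_score', ascending=False)[cols].head(top))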
In [478]:
X_train = preprocess(train).values
y_train = train[['Survived']].values.reshape(-1)
rfc = check_model(X_train, y_train)
In [479]:
X_test = preprocess(test)
save_to_file(rfc, X_test)
In [480]:
def preprocess_2(df):
    X = df[['Pclass', 'Sex']].copy()
    # Feature engineering
    X.Sex = X.Sex.map({'female': 1, 'male': 0})
    # Combination of siblings/spouses and parents/children counts
    # (the small constant avoids division by zero when both are 0)
    X['Family'] = (df.SibSp * df.Parch) / (df.SibSp + df.Parch + 0.0001)
    X['Age'] = df.Age.fillna(df.Age.median())
    # Impute missing fares with the median fare of the passenger's class
    group_pclass_fare = df.groupby(by='Pclass').Fare.median()
    X['Fare'] = np.where(df.Fare.isnull(), group_pclass_fare[df.Pclass], df.Fare)
    return X
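The np.where imputation in preprocess_2 can equivalently be written with fillna plus map, staying in pandas; a sketch on the test frame:

In [ ]:
# Map each passenger's Pclass to the median fare of that class
medians = test.groupby('Pclass').Fare.median()
fare_filled = test.Fare.fillna(test.Pclass.map(medians))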
In [481]:
X_train = preprocess_2(train)
rfc = check_model(X_train, y_train)
rfc = grid_search(X_train, y_train, test_size=0.1)
In [482]:
X_test = preprocess_2(test)
save_to_file(rfc, X_test)
In [483]:
def process_sex(data):
    data.Sex = data.Sex.map({'female': 0, 'male': 1})
    return data

def process_embarked(data):
    # Fill missing ports with the most common value
    most_common = data['Embarked'].value_counts().index[0]
    data.Embarked = data.Embarked.fillna(most_common)
    # Alternative: 'U' for unknown
    #data.Embarked = data.Embarked.fillna('U')
    #data.Embarked = data.Embarked.map({'S':0,'C':1,'Q':2,'U':3})
    dummies = pd.get_dummies(data.Embarked, prefix='Embarked')
    data = pd.concat([data, dummies], axis=1)
    return data

def process_family(data):
    data['Family'] = data.SibSp + data.Parch

    def getFamilySize(num):
        if num == 0:
            return 'alone'
        elif num <= 2:
            return 'small'
        elif num == 3:
            return 'medium'
        else:
            return 'large'

    data['FamilySize'] = data.Family.apply(getFamilySize)
    dummies = pd.get_dummies(data.FamilySize, prefix='FamilySize')
    data = pd.concat([data, dummies], axis=1).drop('FamilySize', axis=1)
    return data

def process_name(data):
    # Map the raw titles extracted from the names to a small set of categories
    dict_names = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty"
    }
    # "Braund, Mr. Owen Harris" -> "Mr"
    data['Name'] = data.Name.apply(lambda s: s.split(". ")[0].split(", ")[1])
    data.Name = data.Name.map(dict_names)
    dummies = pd.get_dummies(data.Name, prefix='Name')
    data = pd.concat([data, dummies], axis=1)
    return data

def process_age(data):
    # Impute missing ages with the median of the Sex/Pclass/Title group
    grouped_name = data.groupby(by=['Sex', 'Pclass', 'Name']).Age.median()
    data.Age = data.apply(lambda r: grouped_name[r.Sex, r.Pclass, r.Name] if np.isnan(r.Age) else r.Age, axis=1)
    # Just in case there is no median for some Sex-Pclass-Name group
    if data.Age.isnull().any():
        grouped_name_2 = data.groupby(by=['Sex', 'Pclass']).Age.median()
        data.Age = data.apply(lambda r: grouped_name_2[r.Sex, r.Pclass] if np.isnan(r.Age) else r.Age, axis=1)
        print('Age imputed from Sex-Pclass only')
    return data
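
# Hedged alternative: the row-wise apply above can be vectorised with
# groupby/transform. A sketch only (same Sex/Pclass/Name assumptions,
# not called in the pipeline below).
def process_age_fast(data):
    med = data.groupby(['Sex', 'Pclass', 'Name']).Age.transform('median')
    data.Age = data.Age.fillna(med)
    # Fallback when an entire Sex/Pclass/Name group has no observed ages
    med2 = data.groupby(['Sex', 'Pclass']).Age.transform('median')
    data.Age = data.Age.fillna(med2)
    return data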
def process_fare(data):
    group_pclass_fare = data.groupby(by='Pclass').Fare.median()
    data.Fare = np.where(data.Fare.isnull(), group_pclass_fare[data.Pclass], data.Fare)
    return data

def process_cabin(data):
    # The first letter of the cabin encodes the deck
    data['Deck'] = data.Cabin.str[0]
    data.loc[data.Deck.isnull(), 'Deck'] = 'U'  # unknown
    #data.Deck = data.Deck.map({'NaN':0, 'F':1, 'E':2, 'C':3, 'D':4, 'B':5, 'G':6, 'A':7, 'T':8})
    dummies = pd.get_dummies(data.Deck, prefix='Deck')
    data = pd.concat([data, dummies], axis=1)
    #data['Room'] = np.where(data.Cabin.isnull(), 999, data.Cabin.str.split().str.get(0).str[1:])
    #data.Room = pd.to_numeric(data.Room)
    #data.loc[data.Room.isnull(), 'Room'] = 999
    return data

def process_ticket(data):
    # Split the ticket into an optional code prefix and a trailing number
    #data['TicketNumber'] = data.Ticket.str.extractall(r"(.*\s)?(.+)")[1]
    data['TicketNumber'] = data.Ticket.str.extract(r"(.*\s)?(.+)", expand=True)[1]
    data['TicketCode'] = data.Ticket.str.extract(r"(.*\s)?(.+)", expand=True)[0]
    data.TicketCode = data.TicketCode.fillna('NAN')
    dummies = pd.get_dummies(data.TicketCode, prefix='TicketCode')
    data = pd.concat([data, dummies], axis=1)
    # Special case: tickets labelled 'LINE' have no number
    data.TicketNumber.replace('LINE', '0', inplace=True)
    data.TicketNumber = data.TicketNumber.astype('int64')
    dummies = pd.get_dummies(data.TicketNumber, prefix='TicketNumber')
    data = pd.concat([data, dummies], axis=1)
    data["TicketGroupSize"] = data.groupby('Ticket')['Ticket'].transform('count')
    return data
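
# Hedged sanity check of the ticket regex on hypothetical sample values
# (for illustration only; not called in the pipeline below).
def demo_ticket_regex():
    samples = pd.Series(['A/5 21171', 'PC 17599', '113803', 'LINE'])
    parts = samples.str.extract(r"(.*\s)?(.+)", expand=True)
    print(parts)  # column 0: optional code prefix, column 1: trailing token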
def process_pclass(data):
    dummies = pd.get_dummies(data.Pclass, prefix='Pclass')
    data = pd.concat([data, dummies], axis=1)
    return data

def process(data):
    data = process_sex(data)
    data = process_embarked(data)
    data = process_family(data)
    data = process_name(data)
    data = process_age(data)
    data = process_fare(data)
    data = process_cabin(data)
    data = process_ticket(data)
    data = process_pclass(data)
    # Drop the raw columns that have been replaced by engineered features
    doNotInclude = ['PassengerId', 'Name', 'Pclass', 'Cabin', 'Deck', 'Ticket', 'Embarked', 'TicketCode', 'TicketNumber']
    data = data.drop(doNotInclude, axis=1)
    return data
In [484]:
# Concatenate train and test so that the dummy features generated from
# categorical columns are identical in both sets
data = pd.concat([train, test])
data = process(data)
processed_train = data.iloc[:train_samples]
processed_test = data.iloc[train_samples:]
In [485]:
X_train = processed_train.drop('Survived', axis=1).values
y_train = processed_train[['Survived']].values.ravel()
check_model(X_train, y_train)
Out[485]:
In [486]:
clf = grid_search(X_train, y_train)
In [487]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, clf.predict(X_train))
Out[487]:
In [488]:
X_test = processed_test.drop('Survived', axis=1)
save_to_file(clf, X_test)
In [489]:
def CHECK():
    # Compare the current submission with a previously saved one;
    # the score is the agreement between the two, not true accuracy.
    import pandas as pd
    from sklearn.metrics import accuracy_score
    other = pd.read_csv('attempt_79904.txt')
    mine = pd.read_csv('attempt.txt')
    data = pd.merge(other, mine, on='PassengerId')
    acc = accuracy_score(data.Survived_x, data.Survived_y)
    print("Acc: {}".format(acc))

CHECK()
In [ ]: