In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer  # note: later scikit-learn releases move this to sklearn.impute.SimpleImputer

In [2]:
train_df = pd.read_csv('../data/raw/train.csv')
train_df.sample(5)


Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
711 712 0 1 Klaber, Mr. Herman male NaN 0 0 113028 26.5500 C124 S
559 560 1 3 de Messemaeker, Mrs. Guillaume Joseph (Emma) female 36.0 1 0 345572 17.4000 NaN S
769 770 0 3 Gronnestad, Mr. Daniel Danielsen male 32.0 0 0 8471 8.3625 NaN S
626 627 0 2 Kirkland, Rev. Charles Leonard male 57.0 0 0 219533 12.3500 NaN Q
598 599 0 3 Boulos, Mr. Hanna male NaN 0 0 2664 7.2250 NaN C

In [3]:
test_df = pd.read_csv('../data/raw/test.csv')
test_df.sample(5)


Out[3]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
204 1096 2 Andrew, Mr. Frank Thomas male 25.0 0 0 C.A. 34050 10.5000 NaN S
103 995 3 Johansson Palmquist, Mr. Oskar Leander male 26.0 0 0 347070 7.7750 NaN S
205 1097 1 Omont, Mr. Alfred Fernand male NaN 0 0 F.C. 12998 25.7417 NaN C
202 1094 1 Astor, Col. John Jacob male 47.0 1 0 PC 17757 227.5250 C62 C64 C
169 1061 3 Hellstrom, Miss. Hilda Maria female 22.0 0 0 7548 8.9625 NaN S

In [4]:
# Fix the SibSp/Parch category levels up front so that get_dummies later produces the
# same dummy columns for train and test, even when some level never appears in one of them.
# http://stackoverflow.com/a/37451867/436721
# Note: astype('category', categories=...) was removed in later pandas -- see the sketch after this cell.
train_df["SibSp"] = train_df["SibSp"].astype('category',categories=[0,1,2,3,4,5,6,7,8,9])
train_df["Parch"] = train_df["Parch"].astype('category',categories=[0,1,2,3,4,5,6,7,8,9])

test_df["SibSp"] = test_df["SibSp"].astype('category',categories=[0,1,2,3,4,5,6,7,8,9])
test_df["Parch"] = test_df["Parch"].astype('category',categories=[0,1,2,3,4,5,6,7,8,9])

In [5]:
def xtract(df, test=False):
    """One-hot encode the categorical columns and keep only the features we actually use."""
    cpy = df.copy()

    cpy = pd.concat([cpy, pd.get_dummies(df["Sex"], prefix='sex')], axis=1)
    cpy = pd.concat([cpy, pd.get_dummies(df["Pclass"], prefix='class')], axis=1)
    cpy = pd.concat([cpy, pd.get_dummies(df["SibSp"], prefix='sib')], axis=1)
    cpy = pd.concat([cpy, pd.get_dummies(df["Parch"], prefix='parch')], axis=1)
    cpy = pd.concat([cpy, pd.get_dummies(df["Embarked"], prefix='emb')], axis=1)

    # only the columns we'll actually use
    prefixes = ('sex', 'class', 'sib', 'parch', 'emb')
    if test:
        # the Kaggle test set has no Survived column; keep PassengerId for the submission file
        keep = [col for col in list(cpy) if col.startswith(prefixes) or col in ("Age", "Survived", "PassengerId")]
        return cpy[keep]
    else:
        keep = [col for col in list(cpy) if col.startswith(prefixes) or col in ("Age", "Survived")]
        return cpy[keep].dropna()

train_df_clean = xtract(train_df)
train_df_clean.head()


Out[5]:
Survived Age sex_female sex_male class_1 class_2 class_3 sib_0 sib_1 sib_2 ... parch_3 parch_4 parch_5 parch_6 parch_7 parch_8 parch_9 emb_C emb_Q emb_S
0 0 22.0 0 1 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
1 1 38.0 1 0 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 1 0 0
2 1 26.0 1 0 0 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 1 35.0 1 0 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
4 0 35.0 0 1 0 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 30 columns
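
As an aside, the five pd.get_dummies calls inside xtract can be collapsed into one; a sketch, assuming the same prefixes (the column selection and dropna() that xtract does would still be needed afterwards):

dummies = pd.get_dummies(train_df,
                         columns=["Sex", "Pclass", "SibSp", "Parch", "Embarked"],
                         prefix=["sex", "class", "sib", "parch", "emb"])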


In [6]:
test_df_clean = xtract(test_df,test=True)
test_df_clean.sample(5)


Out[6]:
PassengerId Age sex_female sex_male class_1 class_2 class_3 sib_0 sib_1 sib_2 ... parch_3 parch_4 parch_5 parch_6 parch_7 parch_8 parch_9 emb_C emb_Q emb_S
205 1097 NaN 0 1 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
0 892 34.5 0 1 0 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 1 0
355 1247 50.0 0 1 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
161 1053 7.0 0 1 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 1 0 0
245 1137 41.0 0 1 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 30 columns


In [7]:
# split the cleaned training frame into features and labels:
# column 0 is Survived, the remaining columns are the features
X = []
y = []

for row in train_df_clean.values:
    X.append(row[1:])
    y.append(row[0])

X = np.array(X)
y = np.array(y)

# hold out 10% of the labelled data for a quick accuracy check
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
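
The loop above is just slicing; an equivalent sketch using the DataFrame directly, with a fixed seed so the split is repeatable (the random_state value is arbitrary):

X = train_df_clean.drop("Survived", axis=1).values
y = train_df_clean["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)

Without a random_state, the 10% split -- and therefore the accuracy reported further down -- changes from run to run.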

In [8]:
# standardize the features (fit on the training split only);
# comment out this block to see how much scaling matters for the MLP
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
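
Fitting the scaler on the training split only, and reusing that same fitted scaler for every later transform, is the important part. A Pipeline makes this harder to get wrong; a minimal sketch of that alternative (it would replace both this cell and the classifier cell below):

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, solver='lbfgs'))
pipe.fit(X_train, y_train)    # the scaler inside the pipeline is fit on X_train only
pipe.score(X_test, y_test)    # X_test is scaled with that same fitted scaler before scoring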

In [9]:
# the lbfgs solver tends to converge quickly on small datasets like this one
clf = MLPClassifier(random_state=1,solver='lbfgs')
clf.fit(X_train,y_train)
metrics.accuracy_score(y_test,clf.predict(X_test))


Out[9]:
0.70833333333333337
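
With only 72 held-out rows, a single split gives a fairly noisy accuracy estimate. Reusing the pipeline idea from the earlier sketch, cross-validation over all of the cleaned training data would give a steadier number; a sketch, assuming 5 folds:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

cv_pipe = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, solver='lbfgs'))
cross_val_score(cv_pipe, X, y, cv=5).mean()   # scaling is refit inside each fold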

In [10]:
X_test.shape


Out[10]:
(72, 29)

In [11]:
# peel off PassengerId (column 0) for the submission file; the remaining columns are the model inputs
passenger_ids = []
X_out = []

for row in test_df_clean.values:
    passenger_ids.append(row[0])
    X_out.append(row[1:])
    
X_out = np.array(X_out)

In [12]:
# the Kaggle test set still has missing Age values (the incomplete training rows were simply dropped);
# Imputer replaces NaNs with the column mean by default
imp = Imputer()
imp.fit(X_train)
X_out = imp.transform(X_out)
X_out.shape


Out[12]:
(418, 29)
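
Two caveats about this cell. First, Imputer was later replaced by SimpleImputer in sklearn.impute (the default mean strategy is the same). Second, the classifier was trained on standardized features, but X_out is imputed against the already-scaled X_train and never scaled itself, so the submission rows reach the model on a different footing than the training rows. A sketch of a variant that keeps them consistent (not what was run here, and it would change the predictions below; X still holds the unscaled training features at this point, and the variable names are illustrative):

imp = Imputer()                                         # SimpleImputer() on newer scikit-learn
imp.fit(X)                                              # learn column means from the unscaled training features
X_out_scaled = scaler.transform(imp.transform(X_out))   # impute, then apply the scaler fitted earlier
y_out_alt = clf.predict(X_out_scaled)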

In [13]:
y_out = clf.predict(X_out)

In [14]:
out_df = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': y_out
})
out_df.head()


Out[14]:
PassengerId Survived
0 892.0 0.0
1 893.0 1.0
2 894.0 1.0
3 895.0 1.0
4 896.0 1.0

In [15]:
out_df["PassengerId"] = out_df["PassengerId"].apply(lambda dbl: int(dbl))
out_df["Survived"] = out_df["Survived"].apply(lambda dbl: int(dbl))
out_df.head()


Out[15]:
PassengerId Survived
0 892 0
1 893 1
2 894 1
3 895 1
4 896 1

In [16]:
# write the submission file
out_df.to_csv("../data/interim/nn.csv", index=False)