In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22.
# SimpleImputer (scikit-learn >= 0.20) is its replacement and shares the
# default column-mean strategy, so bind it to the name the notebook uses.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer
In [2]:
# Load the raw training set (path is relative to the notebook's working directory).
train_df = pd.read_csv('../data/raw/train.csv')
# Preview five randomly chosen rows; no seed is passed, so the rows differ between runs.
train_df.sample(5)
Out[2]:
In [3]:
# Load the raw test set (path is relative to the notebook's working directory).
test_df = pd.read_csv('../data/raw/test.csv')
# Preview five randomly chosen rows; no seed is passed, so the rows differ between runs.
test_df.sample(5)
Out[3]:
In [4]:
# dummies for unseen data
# http://stackoverflow.com/a/37451867/436721
# Pin SibSp and Parch to one fixed category list so that pd.get_dummies emits
# the same dummy columns for both frames, even for counts that never occur in
# one of the files. Values outside the list become NaN.
# `Series.astype('category', categories=...)` was deprecated and later removed
# from pandas, so the Categorical is built explicitly instead.
COUNT_CATEGORIES = list(range(10))
for frame in (train_df, test_df):
    for column in ("SibSp", "Parch"):
        frame[column] = pd.Categorical(frame[column], categories=COUNT_CATEGORIES)
In [5]:
def xtract(df, test=False):
    """Build the model-ready frame: one-hot encoded categoricals plus ``Age``.

    Dummy columns are generated for ``Sex``, ``Pclass``, ``SibSp``, ``Parch``
    and ``Embarked`` (prefixes ``sex``, ``class``, ``sib``, ``parch`` and
    ``emb``) and appended to the right of the original columns. The result
    keeps, in their existing left-to-right order, the lead column, ``Age``
    and every column whose name starts with one of those prefixes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five categorical columns named above. It is not
        modified.
    test : bool, default False
        ``False``: the lead column is ``Survived`` and rows containing any
        missing value are dropped.
        ``True``: the lead column is ``PassengerId`` and rows with missing
        values are kept. ``Survived`` is never kept in this mode, so a
        labelled frame cannot shift or leak into the feature columns.

    Returns
    -------
    pandas.DataFrame
        The filtered frame described above.
    """
    dummy_sources = (
        ("Sex", "sex"),
        ("Pclass", "class"),
        ("SibSp", "sib"),
        ("Parch", "parch"),
        ("Embarked", "emb"),
    )
    dummy_prefixes = tuple(prefix for _, prefix in dummy_sources)
    lead_column = "PassengerId" if test else "Survived"

    # pd.concat returns a new frame, so no defensive copy of `df` is needed.
    encoded = pd.concat(
        [df] + [pd.get_dummies(df[source], prefix=prefix) for source, prefix in dummy_sources],
        axis=1,
    )

    # only columns we'll actually use
    keep = [
        col for col in encoded.columns
        if col.startswith(dummy_prefixes) or col in ("Age", lead_column)
    ]
    selected = encoded[keep]
    return selected if test else selected.dropna()
# Training frame: xtract is called with its default test=False, so rows with
# any missing value are dropped.
train_df_clean = xtract(train_df)
# Show the first rows to verify the selected columns.
train_df_clean.head()
Out[5]:
In [6]:
# Submission frame: test=True keeps PassengerId and keeps rows with missing
# values (they are imputed further down instead of being dropped).
test_df_clean = xtract(test_df,test=True)
# Preview five randomly chosen rows; no seed is passed, so the rows differ between runs.
test_df_clean.sample(5)
Out[6]:
In [7]:
# Split the cleaned frame positionally: column 0 is used as the target and all
# remaining columns as features.
# NOTE(review): this presumes Survived is the first kept column, i.e. that
# train.csv lists Survived before Age - confirm against the file.
# The float cast keeps the arrays numeric even when the dummy columns are bool.
train_values = train_df_clean.values.astype(float)
X = train_values[:, 1:]
y = train_values[:, 0]
# Fixed seed so the 10% hold-out split, and the accuracy computed on it, is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)
In [8]:
# Standardize the features with statistics learned from the training split
# only; comment this block out to compare against unscaled inputs.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
In [9]:
# Fit a multi-layer perceptron (L-BFGS solver, fixed seed) on the training
# split, then report its accuracy on the held-out split.
clf = MLPClassifier(solver='lbfgs', random_state=1).fit(X_train, y_train)
holdout_predictions = clf.predict(X_test)
metrics.accuracy_score(y_test, holdout_predictions)
Out[9]:
In [10]:
# Shape of the held-out split, as a quick sanity check.
X_test.shape
Out[10]:
In [11]:
# Split the submission frame positionally: column 0 is used as the passenger
# id and all remaining columns as features.
# NOTE(review): this presumes PassengerId is the first kept column of
# test_df_clean - confirm against test.csv.
# The float cast keeps the array numeric even when the dummy columns are bool;
# missing values stay NaN.
test_values = test_df_clean.values.astype(float)
passenger_ids = test_values[:, 0].tolist()
X_out = test_values[:, 1:]
In [12]:
# Fill missing submission features with the per-column training mean.
imp = Imputer()
# X_train has already been standardized in place, so its column means are ~0,
# while X_out is still in raw units. Undo the scaling before fitting so the
# imputer learns raw-unit means.
# If the scaling block above is commented out, `scaler` does not exist; use
# imp.fit(X_train) in that case.
imp.fit(scaler.inverse_transform(X_train))
X_out = imp.transform(X_out)
X_out.shape
Out[12]:
In [13]:
# clf was fitted on features standardized by `scaler`, so the submission
# features must pass through the same fitted scaler before prediction.
# Scaling here rather than overwriting X_out keeps this cell safe to re-run.
# If the scaling block above is commented out, `scaler` does not exist; use
# clf.predict(X_out) in that case.
y_out = clf.predict(scaler.transform(X_out))
In [14]:
# Assemble the submission table: one predicted label per passenger id.
submission_columns = {'PassengerId': passenger_ids, 'Survived': y_out}
out_df = pd.DataFrame(submission_columns)
out_df.head()
Out[14]:
In [15]:
# Both columns must be written as integers in the submission file. Cast them
# with a vectorized astype instead of a per-element Python lambda; int64 is
# named explicitly so the dtype does not depend on the platform.
out_df["PassengerId"] = out_df["PassengerId"].astype('int64')
out_df["Survived"] = out_df["Survived"].astype('int64')
out_df.head()
Out[15]:
In [16]:
# Write the submission file without the DataFrame index column.
# NOTE(review): to_csv does not create missing directories, so
# ../data/interim must already exist.
out_df.to_csv("../data/interim/nn.csv", index=False)