In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer  # note: later scikit-learn releases move this to sklearn.impute.SimpleImputer

In [2]:
train_df = pd.read_csv('../data/raw/train.csv')
train_df.sample(5)


Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
711 712 0 1 Klaber, Mr. Herman male NaN 0 0 113028 26.5500 C124 S
559 560 1 3 de Messemaeker, Mrs. Guillaume Joseph (Emma) female 36.0 1 0 345572 17.4000 NaN S
769 770 0 3 Gronnestad, Mr. Daniel Danielsen male 32.0 0 0 8471 8.3625 NaN S
626 627 0 2 Kirkland, Rev. Charles Leonard male 57.0 0 0 219533 12.3500 NaN Q
598 599 0 3 Boulos, Mr. Hanna male NaN 0 0 2664 7.2250 NaN C

In [3]:
test_df = pd.read_csv('../data/raw/test.csv')
test_df.sample(5)


Out[3]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
204 1096 2 Andrew, Mr. Frank Thomas male 25.0 0 0 C.A. 34050 10.5000 NaN S
103 995 3 Johansson Palmquist, Mr. Oskar Leander male 26.0 0 0 347070 7.7750 NaN S
205 1097 1 Omont, Mr. Alfred Fernand male NaN 0 0 F.C. 12998 25.7417 NaN C
202 1094 1 Astor, Col. John Jacob male 47.0 1 0 PC 17757 227.5250 C62 C64 C
169 1061 3 Hellstrom, Miss. Hilda Maria female 22.0 0 0 7548 8.9625 NaN S

In [4]:
# Fix the SibSp/Parch category levels up front so that get_dummies later produces the
# same dummy columns for train and test, even when some level never appears in one of them.
# http://stackoverflow.com/a/37451867/436721
# Note: astype('category', categories=...) was removed in later pandas -- see the sketch after this cell.
train_df["SibSp"] = train_df["SibSp"].astype('category',categories=[0,1,2,3,4,5,6,7,8,9])
train_df["Parch"] = train_df["Parch"].astype('category',categories=[0,1,2,3,4,5,6,7,8,9])

test_df["SibSp"] = test_df["SibSp"].astype('category',categories=[0,1,2,3,4,5,6,7,8,9])
test_df["Parch"] = test_df["Parch"].astype('category',categories=[0,1,2,3,4,5,6,7,8,9])

In [5]:
def xtract(df, test=False):
    """One-hot encode the categorical columns and keep only the features we actually use."""
    cpy = df.copy()

    cpy = pd.concat([cpy, pd.get_dummies(df["Sex"], prefix='sex')], axis=1)
    cpy = pd.concat([cpy, pd.get_dummies(df["Pclass"], prefix='class')], axis=1)
    cpy = pd.concat([cpy, pd.get_dummies(df["SibSp"], prefix='sib')], axis=1)
    cpy = pd.concat([cpy, pd.get_dummies(df["Parch"], prefix='parch')], axis=1)
    cpy = pd.concat([cpy, pd.get_dummies(df["Embarked"], prefix='emb')], axis=1)

    # only the columns we'll actually use
    prefixes = ('sex', 'class', 'sib', 'parch', 'emb')
    if test:
        # the Kaggle test set has no Survived column; keep PassengerId for the submission file
        keep = [col for col in list(cpy) if col.startswith(prefixes) or col in ("Age", "Survived", "PassengerId")]
        return cpy[keep]
    else:
        keep = [col for col in list(cpy) if col.startswith(prefixes) or col in ("Age", "Survived")]
        return cpy[keep].dropna()

train_df_clean = xtract(train_df)
train_df_clean.head()


Out[5]:
Survived Age sex_female sex_male class_1 class_2 class_3 sib_0 sib_1 sib_2 ... parch_3 parch_4 parch_5 parch_6 parch_7 parch_8 parch_9 emb_C emb_Q emb_S
0 0 22.0 0 1 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
1 1 38.0 1 0 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 1 0 0
2 1 26.0 1 0 0 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 1 35.0 1 0 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
4 0 35.0 0 1 0 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 30 columns
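
As an aside, the five pd.get_dummies calls inside xtract can be collapsed into one; a sketch, assuming the same prefixes (the column selection and dropna() that xtract does would still be needed afterwards):

dummies = pd.get_dummies(train_df,
                         columns=["Sex", "Pclass", "SibSp", "Parch", "Embarked"],
                         prefix=["sex", "class", "sib", "parch", "emb"])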


In [6]:
test_df_clean = xtract(test_df,test=True)
test_df_clean.sample(5)


Out[6]:
PassengerId Age sex_female sex_male class_1 class_2 class_3 sib_0 sib_1 sib_2 ... parch_3 parch_4 parch_5 parch_6 parch_7 parch_8 parch_9 emb_C emb_Q emb_S
205 1097 NaN 0 1 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
0 892 34.5 0 1 0 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 1 0
355 1247 50.0 0 1 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
161 1053 7.0 0 1 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 1 0 0
245 1137 41.0 0 1 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 30 columns


In [7]:
# split the cleaned training frame into features and labels:
# column 0 is Survived, the remaining columns are the features
X = []
y = []

for row in train_df_clean.values:
    X.append(row[1:])
    y.append(row[0])

X = np.array(X)
y = np.array(y)

# hold out 10% of the labelled data for a quick accuracy check
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
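
The loop above is just slicing; an equivalent sketch using the DataFrame directly, with a fixed seed so the split is repeatable (the random_state value is arbitrary):

X = train_df_clean.drop("Survived", axis=1).values
y = train_df_clean["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)

Without a random_state, the 10% split -- and therefore the accuracy reported further down -- changes from run to run.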

In [8]:
# standardize the features (fit on the training split only);
# comment out this block to see how much scaling matters for the MLP
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
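
Fitting the scaler on the training split only, and reusing that same fitted scaler for every later transform, is the important part. A Pipeline makes this harder to get wrong; a minimal sketch of that alternative (it would replace both this cell and the classifier cell below):

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, solver='lbfgs'))
pipe.fit(X_train, y_train)    # the scaler inside the pipeline is fit on X_train only
pipe.score(X_test, y_test)    # X_test is scaled with that same fitted scaler before scoring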

In [9]:
# the lbfgs solver tends to converge quickly on small datasets like this one
clf = MLPClassifier(random_state=1,solver='lbfgs')
clf.fit(X_train,y_train)
metrics.accuracy_score(y_test,clf.predict(X_test))


Out[9]:
0.70833333333333337
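
With only 72 held-out rows, a single split gives a fairly noisy accuracy estimate. Reusing the pipeline idea from the earlier sketch, cross-validation over all of the cleaned training data would give a steadier number; a sketch, assuming 5 folds:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

cv_pipe = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, solver='lbfgs'))
cross_val_score(cv_pipe, X, y, cv=5).mean()   # scaling is refit inside each fold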

In [10]:
X_test.shape


Out[10]:
(72, 29)

In [11]:
# peel off PassengerId (column 0) for the submission file; the remaining columns are the model inputs
passenger_ids = []
X_out = []

for row in test_df_clean.values:
    passenger_ids.append(row[0])
    X_out.append(row[1:])
    
X_out = np.array(X_out)

In [12]:
# the Kaggle test set still has missing Age values (the incomplete training rows were simply dropped);
# Imputer replaces NaNs with the column mean by default
imp = Imputer()
imp.fit(X_train)
X_out = imp.transform(X_out)
X_out.shape


Out[12]:
(418, 29)
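
Two caveats about this cell. First, Imputer was later replaced by SimpleImputer in sklearn.impute (the default mean strategy is the same). Second, the classifier was trained on standardized features, but X_out is imputed against the already-scaled X_train and never scaled itself, so the submission rows reach the model on a different footing than the training rows. A sketch of a variant that keeps them consistent (not what was run here, and it would change the predictions below; X still holds the unscaled training features at this point, and the variable names are illustrative):

imp = Imputer()                                         # SimpleImputer() on newer scikit-learn
imp.fit(X)                                              # learn column means from the unscaled training features
X_out_scaled = scaler.transform(imp.transform(X_out))   # impute, then apply the scaler fitted earlier
y_out_alt = clf.predict(X_out_scaled)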

In [13]:
y_out = clf.predict(X_out)

In [14]:
out_df = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': y_out
})
out_df.head()


Out[14]:
PassengerId Survived
0 892.0 0.0
1 893.0 1.0
2 894.0 1.0
3 895.0 1.0
4 896.0 1.0

In [15]:
out_df["PassengerId"] = out_df["PassengerId"].apply(lambda dbl: int(dbl))
out_df["Survived"] = out_df["Survived"].apply(lambda dbl: int(dbl))
out_df.head()


Out[15]:
PassengerId Survived
0 892 0
1 893 1
2 894 1
3 895 1
4 896 1

In [16]:
# write the submission file
out_df.to_csv("../data/interim/nn.csv", index=False)