In [15]:
import pandas as pd
import numpy as np

In [9]:
data = pd.read_csv('resources/titanic.csv')

In [10]:
print(data.head(10))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54.0      0   
7                     Palsson, Master. Gosta Leonard    male   2.0      3   
8  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
9                Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
5      0            330877   8.4583   NaN        Q  
6      0             17463  51.8625   E46        S  
7      1            349909  21.0750   NaN        S  
8      2            347742  11.1333   NaN        S  
9      0            237736  30.0708   NaN        C  

In [11]:
prepared_data = data[["Pclass", "Fare", "Age", "Sex"]]

In [13]:
print(type(prepared_data))
print(prepared_data.head(10))


<class 'pandas.core.frame.DataFrame'>
   Pclass     Fare   Age     Sex
0       3   7.2500  22.0    male
1       1  71.2833  38.0  female
2       3   7.9250  26.0  female
3       1  53.1000  35.0  female
4       3   8.0500  35.0    male
5       3   8.4583   NaN    male
6       1  51.8625  54.0    male
7       3  21.0750   2.0    male
8       3  11.1333  27.0  female
9       2  30.0708  14.0  female

In [35]:
pdf1 = prepared_data[np.isfinite(prepared_data['Age'])]
pdf = pdf1[pdf1['Sex'].notnull()]


from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
dicts = {}

label.fit(pdf.Sex.drop_duplicates())
dicts['Sex'] = list(label.classes_)
pdf.Sex = label.transform(pdf.Sex)

print(pdf.head(10))
print(len(pdf))


    Pclass     Fare   Age  Sex
0        3   7.2500  22.0    1
1        1  71.2833  38.0    0
2        3   7.9250  26.0    0
3        1  53.1000  35.0    0
4        3   8.0500  35.0    1
6        1  51.8625  54.0    1
7        3  21.0750   2.0    1
8        3  11.1333  27.0    0
9        2  30.0708  14.0    0
10       3  16.7000   4.0    0
714

In [33]:
surv = data[["Survived", "Age", "Sex"]]
surv1 = surv[np.isfinite(surv['Age'])]
survived = surv1[surv1['Sex'].notnull()]
survived.Sex = survived

print(survived.head(10))
print(len(survived))


    Survived   Age     Sex
0          0  22.0    male
1          1  38.0  female
2          1  26.0  female
3          1  35.0  female
4          0  35.0    male
6          0  54.0    male
7          0   2.0    male
8          1  27.0  female
9          1  14.0  female
10         1   4.0  female
714

In [37]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=241)
clf.fit(pdf, survived.Survived)


Out[37]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=241,
            splitter='best')

In [40]:
print(pdf.head(1))
print(clf.feature_importances_)


   Pclass  Fare   Age  Sex
0       3  7.25  22.0    1
[0.14000522 0.30343647 0.2560461  0.30051221]

In [41]:
print("Age", "Sex")


('Age', 'Sex')

In [ ]: