In [28]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the result of every top-level expression in a cell,
# not only the last one, so several bare expressions per cell all render.
InteractiveShell.ast_node_interactivity = "all"
In [29]:
# Read the passenger table and use the PassengerId column as the row index.
df = pd.read_csv(
    '../data/titanic.csv',
    index_col='PassengerId',
)
# Preview the first rows of the loaded frame.
df.head()
Out[29]:
In [30]:
# Restrict the frame to the target plus the four model features, then drop
# every row that has a missing value in any of those columns.
df_no_missing = df[
    ['Survived', 'Pclass', 'Fare', 'Age', 'Sex']
].dropna()

# Feature matrix (the 'Sex' column still holds strings at this point).
X_train_withStrings = df_no_missing.loc[:, ['Pclass', 'Fare', 'Age', 'Sex']]

# Target vector.
y_train = df_no_missing.loc[:, 'Survived']
def strings_to_int(df, target_column):
    """Return a copy of ``df`` with ``target_column`` encoded as integers.

    Every distinct value in the column receives an integer code in order of
    first appearance: the first value encountered becomes 0, the next new
    value becomes 1, and so on. The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``target_column``.
    target_column : str
        Name of the column whose values are replaced by integer codes.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``target_column`` holds the integer codes.
    """
    df_mod = df.copy()
    targets_to_rename = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets_to_rename)}
    # Use ``map`` instead of ``replace``: ``replace`` on an object column relies
    # on implicit downcasting to end up with an integer dtype, which recent
    # pandas versions deprecate. ``map`` performs a plain element-wise lookup
    # and yields an integer column directly, because every value in the column
    # is a key of ``map_to_int`` by construction.
    df_mod[target_column] = df_mod[target_column].map(map_to_int)
    return df_mod
# Encode the string-valued 'Sex' column as integers to obtain the final
# feature matrix.
X_train = strings_to_int(
    df=X_train_withStrings,
    target_column="Sex",
)
# Preview the encoded features.
X_train.head()
Out[30]:
In [31]:
# Decision tree with a fixed random_state so repeated runs give the same tree.
clf = DecisionTreeClassifier(
    random_state=241,
)
# Fit on the encoded features and the 'Survived' target.
clf.fit(X=X_train, y=y_train)
Out[31]:
In [32]:
# Pair each fitted feature importance with the name of its feature column.
importances = pd.Series(
    data=clf.feature_importances_,
    index=X_train.columns.tolist(),
)
importances
# Print the names of the two most important features, separated by a space.
print(' '.join(importances.sort_values(ascending=False).index[:2]))
Out[32]: