In [ ]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
# Pretty display for notebooks
%matplotlib inline
# Seed every random number generator the notebook can draw from.
# scikit-learn estimators created without an explicit random_state use
# NumPy's global generator, so seeding only Python's `random` module is not
# enough to make results repeatable across runs.
import random
random.seed(42)
np.random.seed(42)
# Load the dataset (path is resolved relative to the current working directory)
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)
# Print the first few entries of the RMS Titanic data
display(full_data.head())
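下面是每位乘客具备的各种特征:
- Survived:存活结果(0 = 未存活;1 = 存活)
- Pclass:社会阶层(1 = 上层;2 = 中层;3 = 底层)
- Name:乘客姓名
- Sex:乘客性别
- Age:乘客年龄(某些条目为 NaN)
- SibSp:一起上船的兄弟姐妹和配偶人数
- Parch:一起上船的父母和子女人数
- Ticket:乘客的票号
- Fare:乘客支付的票价
- Cabin:乘客的客舱号(某些条目为 NaN)
- Embarked:乘客的登船港(C = 瑟堡;Q = 皇后镇;S = 南安普顿)

因为我们对每位乘客或船员的存活情况感兴趣,因此我们可以从此数据集中删除 Survived 特征,并将其存储在单独的变量 outcomes 中。我们将使用这些结果作为预测目标。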
运行以下代码单元格,以从数据集中删除特征 Survived 并将其存储到 outcomes 中。
In [ ]:
# Separate the prediction target from the predictors:
# `outcomes` keeps the 'Survived' column, `features_raw` keeps everything else.
outcomes = full_data.loc[:, 'Survived']
features_raw = full_data.drop(columns=['Survived'])
# Preview the first rows of the dataset now that 'Survived' is gone
display(features_raw.head())
In [ ]:
# One-hot encode the raw features: categorical (string-valued) columns are
# expanded into one indicator column per distinct value.
features = pd.get_dummies(data=features_raw)
现在用 0 填充任何空白处。
In [ ]:
# Replace every remaining missing value (NaN) with 0.0, then preview the result
features = features.fillna(value=0.0)
display(features.head(n=5))
In [ ]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    outcomes,
    test_size=0.2,
    random_state=42,
)
In [ ]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier
# Define the classifier and fit it to the training split.
# The cells that follow call `model.predict`, so `model` must be a fitted
# estimator here. random_state is pinned so the fitted tree is the same on
# every run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
In [ ]:
# Making predictions on both the training and the held-out test split
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Calculate the accuracy (fraction of samples whose predicted label matches)
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
# The second score comes from the test split, so label it as testing accuracy
print('The testing accuracy is', test_accuracy)
In [ ]:
# TODO: Train the model
# TODO: Make predictions
# TODO: Calculate the accuracy