In [41]:
# Imports
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
In [22]:
# Load the train and test datasets to create two DataFrames
#train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
#train = pd.read_csv(train_url)
#train.to_csv('dataset/titanic_train.csv')
In [23]:
#test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
#test = pd.read_csv(test_url)
#test.to_csv('dataset/titanic_test.csv')
In [24]:
train = pd.read_csv('dataset/titanic_train.csv')
test = pd.read_csv('dataset/titanic_test.csv')
In [25]:
# Print the head of the train and test DataFrames
print(train.head())
In [26]:
print(test.head())
In [27]:
# understanding the data
train.describe()
Out[27]:
In [28]:
print(train.shape, test.shape)
In [29]:
# value count
train["Survived"].value_counts()
Out[29]:
In [30]:
# proportion
train["Survived"].value_counts(normalize = True)
Out[30]:
In [31]:
# Passengers that survived vs passengers that passed away
print(train["Survived"].value_counts())
# As proportions
print(train["Survived"].value_counts(normalize = True))
# Males that survived vs males that passed away
print(train["Survived"][train["Sex"]=="male"].value_counts())
# Females that survived vs Females that passed away
print(train["Survived"][train["Sex"]=="female"].value_counts())
# Normalized male survival
print(train["Survived"][train["Sex"]=="male"].value_counts(normalize = True))
# Normalized female survival
print(train["Survived"][train["Sex"]=="female"].value_counts(normalize = True))
In [32]:
# Create the column Child and initialize it to NaN
train["Child"] = np.nan
# Assign 1 to passengers under 18, 0 to those 18 or older (missing ages stay NaN)
train.loc[train["Age"] >= 18, "Child"] = 0
train.loc[train["Age"] < 18, "Child"] = 1
In [33]:
# Print normalized Survival Rates for passengers under 18
print(train["Survived"][train["Child"] == 1].value_counts(normalize = True))
# Print normalized Survival Rates for passengers 18 or older
print(train["Survived"][train["Child"] == 0].value_counts(normalize = True))
In [34]:
# Create a copy of test: test_one
test_one = test.copy()
# Initialize a Survived column to 0
test_one["Survived"] = 0
# Set Survived to 1 where Sex equals "female" and print the `Survived` column from `test_one`
test_one.loc[test_one["Sex"] == "female", "Survived"] = 1
print(test_one["Survived"][:10])
We saw that the Age variable has missing values. Missingness is a whole subject in and of itself, but here we use a simple imputation technique: substitute each missing value with the median of all the present values.
train["Age"] = train["Age"].fillna(train["Age"].median())
In [35]:
# Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
train["Age"] = train["Age"].fillna(train["Age"].median())
# Impute the Embarked variable as "S" where value is NaN
train["Embarked"] = train["Embarked"].fillna("S")
# Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
#Print the Sex and Embarked columns
print(train["Sex"][:4])
print(train["Embarked"][:4])
In [36]:
# Create the target and features numpy arrays: target, features_one
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
print(features_one)
print(target)
#np.isnan(features_one["Sex"])
#np.isnan(features_one).any()
In [37]:
# Fit your first decision tree: my_tree_one
my_tree_one = DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)
# Look at the importance and score of the included features
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))
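A near-perfect training score from an unpruned tree is optimistic, since the tree can effectively memorize the data. A minimal cross-validation sketch on the same arrays gives a more honest estimate (random_state is added here only for reproducibility):
In [ ]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation on the same features; expect a noticeably lower mean score
cv_scores = cross_val_score(DecisionTreeClassifier(random_state=1), features_one, target, cv=5)
print(cv_scores.mean())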
In [38]:
# Convert the male and female groups to integer form
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
test["Age"] = test["Age"].fillna(test["Age"].median())
# Impute the Embarked variable as "S" where value is NaN
test["Embarked"] = test["Embarked"].fillna("S")
# Convert the Embarked classes to integer form
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2
In [39]:
# Impute the single missing Fare value (row 152) with the median
test.loc[152, "Fare"] = test["Fare"].median()
# Extract the features from the test set: Pclass, Sex, Age, and Fare.
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values
# Make your prediction using the test set
my_prediction = my_tree_one.predict(test_features)
print(my_prediction)
# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, index=PassengerId, columns=["Survived"])
print(my_solution)
# Check that your data frame has 418 entries
print(my_solution.shape)
# Write your solution to a csv file with the name my_solution.csv
my_solution.to_csv("my_solution.csv", index_label = ["PassengerId"])
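Reading the file back is a cheap way to confirm the submission has the expected PassengerId and Survived columns and 418 rows:
In [ ]:
# Verify the written submission file
check = pd.read_csv("my_solution.csv")
print(check.shape)
print(check.head())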
In [42]:
# Create a new array with the added features: features_two
features_two = train[["Pclass","Age","Sex","Fare", "SibSp", "Parch", "Embarked"]].values
# Control overfitting by setting max_depth to 10 and min_samples_split to 5: my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=1)
my_tree_two = my_tree_two.fit(features_two, target)
# Print the score of the new decision tree
print(my_tree_two.score(features_two, target))
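The constrained tree scores lower on the training data than the unpruned one, which is expected: max_depth and min_samples_split trade training fit for generalization. As a quick check (assuming scikit-learn >= 0.21, which provides get_depth), the fitted depth can be inspected directly:
In [ ]:
# Depth actually reached by the fitted tree; at most the max_depth of 10 set above
print(my_tree_two.get_depth())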
In [43]:
# Create train_two with the newly defined feature
train_two = train.copy()
train_two["family_size"] = np.sum(train.SibSp + train.Parch + 1)
# Create a new feature set and add the new feature
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values
# Define the tree classifier, then fit the model
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)
# Print the score of this decision tree
print(my_tree_three.score(features_three, target))
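To inspect the splits this tree learned, export_text (available in scikit-learn >= 0.21) prints the rules as plain text; the feature names below simply mirror the column order of features_three.
In [ ]:
# Print the top of the learned rule set (truncated to depth 2 for readability)
print(tree.export_text(my_tree_three,
                       feature_names=["Pclass", "Sex", "Age", "Fare",
                                      "SibSp", "Parch", "family_size"],
                       max_depth=2))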
In [44]:
# Import the `RandomForestClassifier`
from sklearn.ensemble import RandomForestClassifier
# We want the Pclass, Age, Sex, Fare, SibSp, Parch, and Embarked variables
features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
# Building and fitting my_forest
forest = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100, random_state=1)
my_forest = forest.fit(features_forest, target)
# Print the score of the fitted random forest
print(my_forest.score(features_forest, target))
# Compute predictions on our test set features then print the length of the prediction vector
test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
pred_forest = my_forest.predict(test_features)
print(len(pred_forest))
In [45]:
# Request and print the .feature_importances_ attribute
print(my_tree_two.feature_importances_)
print(my_forest.feature_importances_)
# Compute and print the mean accuracy score for both models
# (features_two and features_forest hold the same columns in the same order)
print(my_tree_two.score(features_two, target))
print(my_forest.score(features_forest, target))
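The raw importance arrays are hard to read on their own; zipping them with the column names (in the same order as features_forest) makes the comparison concrete:
In [ ]:
# Pair each forest importance with its feature name
for name, importance in zip(["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"],
                            my_forest.feature_importances_):
    print(name, importance)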
In [ ]: