In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import datasets, svm, tree, preprocessing, metrics, model_selection
import sklearn.ensemble as ske
from IPython.display import display
%matplotlib inline
In [2]:
# Displaying data
full_data = pd.read_csv("train.csv")
# Print the first few entries of the Titanic data
display(full_data.head())
In [3]:
# Dropping PassengerId as it is an arbitrary identifier with no predictive value
full_data = full_data.drop("PassengerId", axis=1)
# Store the 'Survived' feature in a new variable
outcomes = full_data['Survived']
In [4]:
# Creating an accuracy function
def accuracy(truth, pred):
    """ Returns accuracy score for input truth and predictions. """
    # Ensure that the number of predictions matches the number of outcomes
    if len(truth) == len(pred):
        # Calculate and return the accuracy as a percent
        return "Predictions have an accuracy of {:.2f}%.".format((truth == pred).mean() * 100)
    else:
        return "Number of predictions does not match number of outcomes!"
Using the RMS Titanic data, how accurate would a prediction be that none of the passengers survived?
In [5]:
print(accuracy(outcomes, np.zeros(outcomes.count())))
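As a sanity check, this baseline is just the overall death rate, which can be read off directly (a quick extra cell, not in the original notebook):

In [ ]:
# Fraction of passengers in each outcome class; the 'died' (0) fraction
# equals the accuracy of the all-zeros prediction above
outcomes.value_counts(normalize=True)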
In [6]:
# Mean of each numeric column by passenger class
full_data.groupby('Pclass').mean(numeric_only=True)
Out[6]:
(table: mean of each numeric column by passenger class)
In [7]:
class_sex_group = full_data.groupby(['Pclass','Sex']).mean(numeric_only=True)
display(class_sex_group)
In [8]:
class_sex_group['Survived'].plot.bar()
Out[8]:
(bar chart: survival rate by passenger class and sex)
In [9]:
group_by_age = pd.cut(full_data["Age"], np.arange(0, 90, 10))
age_grouping = full_data.groupby(group_by_age).mean(numeric_only=True)
age_grouping['Survived'].plot.bar()
Out[9]:
(bar chart: survival rate by ten-year age bracket)
In [10]:
full_data.count()
# Dropping the Cabin column as that information is missing for most passengers
full_data = full_data.drop("Cabin", axis=1)
full_data = full_data.dropna()
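A quick check (a sketch added here, not part of the original run) that the cleaning step worked: no column should report missing values afterwards.

In [ ]:
# Remaining missing values per column, and the number of rows kept after dropna()
print(full_data.isnull().sum())
print(len(full_data), "rows remaining")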
In [11]:
def preprocess_titanic_df(df):
    processed_df = df.copy()
    le = preprocessing.LabelEncoder()
    # Encode the categorical columns as integers (the encoder is refit per column)
    processed_df.Sex = le.fit_transform(processed_df.Sex)
    processed_df.Embarked = le.fit_transform(processed_df.Embarked)
    # Name and Ticket are free-text fields the classifiers cannot use directly
    processed_df = processed_df.drop(['Name','Ticket'], axis=1)
    return processed_df

processed_df = preprocess_titanic_df(full_data)
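A quick look at the encoded frame confirms that Sex and Embarked are now integer codes (the same LabelEncoder instance is simply refit for each column, which is fine since only the transformed values are kept):

In [ ]:
display(processed_df.head())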
In [12]:
X = processed_df.drop(['Survived'], axis=1).values
y = processed_df['Survived'].values
In [13]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
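The split above is random, so downstream scores will vary from run to run; passing a fixed seed makes it reproducible (a sketch, with 42 as an arbitrary choice):

In [ ]:
# Reproducible variant of the same 80/20 split
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42)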
In [14]:
clf_dt = tree.DecisionTreeClassifier(max_depth=10)
clf_dt.fit(X_train, y_train)
clf_dt.score(X_test, y_test)
Out[14]:
(decision tree accuracy on the held-out test set)
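With max_depth=10 the tree has enough capacity to overfit; comparing train and test accuracy is a quick check (a sketch added here, not part of the original run):

In [ ]:
print("train accuracy: %.4f" % clf_dt.score(X_train, y_train))
print("test accuracy:  %.4f" % clf_dt.score(X_test, y_test))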
In [15]:
import graphviz

feature_names = list(processed_df.drop(['Survived'], axis=1).columns)
dot_data = tree.export_graphviz(clf_dt, out_file=None,
                                feature_names=feature_names,
                                class_names=['Died', 'Survived'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph
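To save the rendered tree to disk instead of displaying it inline, graphviz's Source.render writes a file (PDF by default):

In [ ]:
# Writes titanic_tree.pdf next to the notebook
graph.render("titanic_tree")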
In [ ]:
shuffle_validator = model_selection.ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)

def test_classifier(clf):
    # Mean accuracy (and spread) over 20 random 80/20 splits
    scores = model_selection.cross_val_score(clf, X, y, cv=shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))
In [ ]:
test_classifier(clf_dt)
In [ ]:
clf_rf = ske.RandomForestClassifier(n_estimators=50)
test_classifier(clf_rf)
In [ ]:
clf_gb = ske.GradientBoostingClassifier(n_estimators=50)
test_classifier(clf_gb)
In [ ]:
eclf = ske.VotingClassifier([('dt', clf_dt), ('rf', clf_rf), ('gb', clf_gb)])
test_classifier(eclf)
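VotingClassifier defaults to hard majority voting. Since all three estimators expose predict_proba, averaging predicted probabilities (soft voting) is a one-line variant worth comparing:

In [ ]:
# Soft-voting variant of the same ensemble (a sketch, not in the original notebook)
eclf_soft = ske.VotingClassifier([('dt', clf_dt), ('rf', clf_rf), ('gb', clf_gb)], voting='soft')
test_classifier(eclf_soft)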