In [116]:
# This line configures matplotlib to show figures embedded in the notebook,
# instead of opening a new window for each figure.
%matplotlib inline
In [117]:
# Use the pandas library to read in the csv file
import pandas as pd
# Read the CSV (path is relative to the notebook's working directory) into a
# pandas dataframe and assign it to the titanic variable
titanic = pd.read_csv("titanic_data.csv")
In [118]:
# Display the first 5 rows of the dataframe as a quick sanity check of the load
titanic.head(5)
Out[118]:
In [119]:
# Dimensions of the dataframe as a (rows, columns) tuple
titanic.shape
Out[119]:
In [120]:
# Show the dtype pandas inferred for every column (class label and features)
titanic.dtypes
Out[120]:
In [121]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
titanic.describe()
Out[121]:
In [122]:
# Histogram of passenger ages.
# Import pyplot under its conventional alias: binding the top-level `matplotlib`
# package to `plt` forces every call through `plt.pyplot` and clashes with the
# usual meaning of the `plt` name.
import matplotlib.pyplot as plt
# Age may contain missing values (they are backfilled later in the notebook);
# drop them so the histogram is built from real ages only.
ages = titanic['Age'].dropna()
fig, axis = plt.subplots()
axis.hist(ages, bins=10, range=(ages.min(), ages.max()))
axis.set_title('Age Distribution')
axis.set_xlabel('Age')
axis.set_ylabel('# Passengers')
plt.show()
In [123]:
# Get the counts of males and females in the data.
# Note: count() reports the number of non-null values of every other column per
# Sex group, so columns with missing data show fewer entries than the group size.
titanic.groupby('Sex').count()
Out[123]:
In [124]:
# Get summary statistics of the numeric columns, computed separately per Sex group
titanic.groupby('Sex').describe()
Out[124]:
In [125]:
# Retrieve the distinct values of the Sex column (returned in order of first appearance)
titanic['Sex'].unique()
Out[125]:
In [126]:
# Cross-tabulate Sex against Survived (cast to bool so the columns read False/True)
crosstab = pd.crosstab(titanic['Sex'], titanic['Survived'].astype(bool))
# One stacked bar per Sex; colors are applied in column order, so red should be
# False (did not survive) and green True (survived)
crosstab.plot(kind='bar', stacked=True, color=['red','green'], grid=False)
Out[126]:
In [127]:
# Start here with post 2 code
In [128]:
import matplotlib.pyplot as plt
# Set display to a more appealing style.
# The `pd.options.display.mpl_style` option was deprecated and later removed from
# pandas (setting it raises on current versions); matplotlib's own style sheets
# are the replacement, and 'ggplot' gives the comparable look.
plt.style.use('ggplot')
# Plot the numeric variables using a box plot chart
titanic.boxplot(column=['Age', 'SibSp', 'Parch', 'Fare'])
Out[128]:
In [129]:
# Draw one histogram per listed numeric column on a 10x10 inch figure
titanic.hist(column=['Age', 'SibSp', 'Parch', 'Fare'], figsize=[10,10])
Out[129]:
In [130]:
# Show identifying columns for the passengers whose Fare is above 300
titanic[titanic['Fare'] > 300][['PassengerId', 'Name', 'Sex', 'Fare']]
Out[130]:
In [131]:
# Per-column overview: non-null counts and dtypes (useful for spotting missing values)
titanic.info()
In [132]:
# Look at the first ten Age values
titanic['Age'][0:10]
Out[132]:
In [133]:
# Mean age (pandas skips missing values by default)
titanic['Age'].mean()
Out[133]:
In [134]:
# Median age (pandas skips missing values by default); used below to fill missing ages
titanic['Age'].median()
Out[134]:
In [135]:
# Create a new column in the data frame and assign it values from the existing Age
# column, so missing ages can be filled in without altering the original column
titanic['AgeBackFill'] = titanic['Age']
In [136]:
# Check that the values were copied over (first 10 rows, Age next to AgeBackFill)
titanic[['Sex', 'Age', 'AgeBackFill']].head(10)
Out[136]:
In [137]:
# Fill AgeBackFill with the median age wherever the original Age is missing
median_age = titanic['Age'].median()
age_is_missing = titanic['Age'].isnull()
titanic.loc[age_is_missing, 'AgeBackFill'] = median_age
In [138]:
# Check that the missing values were backfilled (rows with no Age should now
# show the median in AgeBackFill)
titanic[['Sex', 'Age', 'AgeBackFill']].head(10)
Out[138]:
In [139]:
# Encode Sex as an integer feature: female -> 0, male -> 1.
# Any value not present in the mapping becomes NaN, which makes astype(int) raise.
titanic['Gender'] = titanic['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
In [140]:
# Check the transformation by viewing Sex next to its numeric Gender code
titanic[['Sex', 'Gender']].head(10)
Out[140]:
In [141]:
# Count passengers per Embarked value (rows with a missing Embarked are excluded by groupby)
titanic[['PassengerId', 'Embarked']].groupby('Embarked').count()
Out[141]:
In [142]:
# Build the new column in one step: copy Embarked and replace missing entries with 'S'
# (presumably the most frequent value in the counts above; confirm against that output)
titanic['EmbarkedBackFill'] = titanic['Embarked'].fillna('S')
# Confirm the backfill worked by showing only the rows where Embarked was missing
embarked_missing = titanic['Embarked'].isnull()
titanic[embarked_missing][['Embarked', 'EmbarkedBackFill']]
Out[142]:
In [143]:
# Replace each embarkation letter with its integer code, one code at a time
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for letter, code in embarked_codes.items():
    titanic.loc[titanic['EmbarkedBackFill'] == letter, 'EmbarkedBackFill'] = code
# The column still has object dtype after the replacements; make it a true integer column
titanic['EmbarkedBackFill'] = titanic['EmbarkedBackFill'].astype(int)
In [144]:
# Engineered feature: FamilySize is the sum of the SibSp and Parch columns
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
# Engineered feature: the product of the (backfilled) age and the passenger class
titanic['Age*Class'] = titanic['AgeBackFill'] * titanic['Pclass']
In [145]:
# Re-check columns, non-null counts and dtypes now that the new columns exist
titanic.info()
In [146]:
# Create a new data frame and remove the columns we will not use.
# drop() returns a new frame, so `titanic` itself keeps all of its columns.
titanic_clean = titanic.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1)
In [147]:
# Confirm which columns remain after the drop
titanic_clean.info()
In [148]:
# Extract the cleaned frame as a NumPy array
train_data = titanic_clean.values
# NOTE(review): displaying the whole array produces a very large output, and
# train_data is not referenced again in the visible cells; consider train_data[:5]
train_data
Out[148]:
In [149]:
# Overlay AgeBackFill histograms for each Survived group (alpha=0.4 keeps overlaps visible)
titanic_clean.groupby('Survived').AgeBackFill.hist(alpha=0.4)
Out[149]:
In [150]:
# Overlay Fare histograms for each Survived group (alpha=0.4 keeps overlaps visible)
titanic_clean.groupby('Survived').Fare.hist(alpha=0.4)
Out[150]:
In [151]:
# Overlay FamilySize histograms for each Survived group (alpha=0.4 keeps overlaps visible)
titanic_clean.groupby('Survived').FamilySize.hist(alpha=0.4)
Out[151]:
In [152]:
# `scatter_matrix` lives in `pandas.plotting` in current pandas; the old
# `pandas.tools.plotting` module has been removed. Fall back to the old location
# only for very old installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the label and the model features, with a kernel
# density estimate of each variable on the diagonal
scatter_columns = ['Survived', 'Pclass', 'AgeBackFill', 'Gender',
                   'EmbarkedBackFill', 'FamilySize', 'Fare']
scatter_matrix(titanic_clean[scatter_columns], alpha=0.2, figsize=(15, 15), diagonal='kde')
Out[152]:
In [153]:
# Start here with post 3 code
In [154]:
# Import function to split the data.
# `sklearn.cross_validation` has been removed from scikit-learn; the function now
# lives in `sklearn.model_selection`. Fall back to the old module only for very
# old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Select the columns to predict the target
predictors = ["Pclass", "Fare", "AgeBackFill", "Gender", "EmbarkedBackFill", "FamilySize"]
# Specify target column
target = "Survived"
# Set features
X = titanic_clean[predictors]
# Set targets
y = titanic_clean[target]
# Split data into 80% training and 20% test datasets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)
In [155]:
import numpy as np
# Utility function to draw a confusion matrix
def plot_confusion_matrix(cm, target, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix; rows are drawn as true labels, columns as predicted labels.
    target : pandas Series or array-like
        Label values; their distinct entries become the tick labels.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cell shading.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # Use the *sorted* distinct labels. sklearn's confusion_matrix orders its rows
    # and columns by sorted label, whereas Series.unique() returns labels in order
    # of first appearance, which could swap the tick labels.
    labels = np.unique(target)
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Write the count in the middle of every cell (x is the column, y is the row)
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, '%.0f' % cm[row, col],
                     horizontalalignment='center',
                     verticalalignment='center',
                     color='red',
                     fontsize=20
                     )
    # Adjust the layout last so the axis labels are taken into account
    plt.tight_layout()
In [156]:
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Initialize our algorithm class.
# NOTE: linear regression yields continuous scores rather than class labels, so the
# predictions have to be thresholded before they can be compared with y_test.
algo = LinearRegression()
# fit the model
algo.fit(X_train, y_train)
# predict on our test data
y_pred = algo.predict(X_test)
In [157]:
from sklearn.metrics import confusion_matrix
# Convert the continuous predictions into 0/1 labels, in place, with a 0.5 cut-off.
# The two assignments do not interfere: the value written by the first line (1)
# is never matched by the second condition (<= .5).
y_pred[y_pred > .5] = 1
y_pred[y_pred <=.5] = 0
# Tabulate true vs. predicted labels for the held-out test set and draw the result
cm = confusion_matrix(y_test, y_pred)
plt.figure()
plot_confusion_matrix(cm, titanic_clean[target])
In [158]:
# Import the logistic regression class
from sklearn.linear_model import LogisticRegression
# Initialize our algorithm class (fixed random_state for repeatable results)
algo = LogisticRegression(random_state=0)
# fit the model
algo.fit(X_train, y_train)
# predict on our test data; predict() returns class labels, so no thresholding is needed
y_pred = algo.predict(X_test)
In [159]:
# confusion_matrix is also imported in an earlier cell; repeating the import is harmless
from sklearn.metrics import confusion_matrix
# Tabulate true vs. predicted labels for the current y_pred and draw the result
cm = confusion_matrix(y_test, y_pred)
plt.figure()
plot_confusion_matrix(cm, titanic_clean[target])
In [160]:
# Import the Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Initialize our algorithm class (fixed random_state for repeatable results)
algo = DecisionTreeClassifier(random_state=0)
# fit the model
algo.fit(X_train, y_train)
# predict on our test data
y_pred = algo.predict(X_test)
In [161]:
# Tabulate true vs. predicted labels for the current y_pred and draw the result
cm = confusion_matrix(y_test, y_pred)
plt.figure()
plot_confusion_matrix(cm, titanic_clean[target])
In [162]:
# Import the Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
# Initialize our algorithm class: 100 trees, fixed random_state for repeatable results
algo = RandomForestClassifier(n_estimators=100, random_state=0)
# fit the model
algo.fit(X_train, y_train)
# predict on our test data
y_pred = algo.predict(X_test)
In [163]:
# Tabulate true vs. predicted labels for the current y_pred and draw the result
cm = confusion_matrix(y_test, y_pred)
plt.figure()
plot_confusion_matrix(cm, titanic_clean[target])
In [ ]:
# Start here with post 4 code
In [164]:
# Use the pandas library to read in the csv file
import pandas as pd
# This will create a pandas dataframe and assign it to the validation variable
validation = pd.read_csv("validation_ob.csv")
In [165]:
# transform: apply the same feature engineering that was applied to the training data
validation['AgeBackFill'] = validation['Age']
# Fill missing ages with the training-set median. It is computed from `titanic`
# instead of being hard-coded, so it always matches the value used in training.
validation.loc[validation['Age'].isnull(), 'AgeBackFill'] = titanic['Age'].median()
# Encode Sex as an integer: female -> 0, male -> 1
validation['Gender'] = validation['Sex'].map({'female': 0, 'male': 1}).astype(int)
# Copy Embarked, filling missing entries with 'S' exactly as for the training data
validation['EmbarkedBackFill'] = validation['Embarked'].fillna('S')
# Replace each embarkation letter with the integer code used in training
for letter, code in {'S': 0, 'C': 1, 'Q': 2}.items():
    validation.loc[validation['EmbarkedBackFill'] == letter, 'EmbarkedBackFill'] = code
validation['EmbarkedBackFill'] = validation['EmbarkedBackFill'].astype(int)
# Engineered features, defined the same way as for the training data
validation['FamilySize'] = validation['SibSp'] + validation['Parch']
validation['Age*Class'] = validation['AgeBackFill'] * validation['Pclass']
# Drop the columns the model does not use; `validation` itself keeps all columns
data = validation.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1)
# transform end
In [166]:
# Select the columns to predict the target (same list as used for training)
predictors = ["Pclass","Fare", "AgeBackFill", "Gender", "EmbarkedBackFill", "FamilySize"]
# Specify target column
target = "Survived"
# Set features
# NOTE: X and y are rebound here to the validation data, replacing the training X and y
X = data[predictors]
# Set targets
y = data[target]
In [167]:
# Predict on the validation features with whatever estimator `algo` currently holds
# (the random forest, if the cells were run top to bottom)
y_pred = algo.predict(X)
# Tabulate true vs. predicted labels for the validation set and draw the result
cm = confusion_matrix(y, y_pred)
plt.figure()
plot_confusion_matrix(cm, data[target])