In [1]:
##############################################################################
#
# Workshop: How to develop a personalised machine learning-based application
#
# Notebook 2: Classification
#
##############################################################################
In [2]:
# jupyter notebook instructions:
# - Every cell can be executed separately from the rest.
# - You can execute cells in a non-sequential order (but be careful of
# the dependencies between them).
# - Execute a cell by pressing the play button or Shift+Enter.
In [37]:
# Import necessary modules
import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')
In [12]:
# Define the sigmoid function
def sigmoid(t):
    return 1 / (1 + np.e ** (-t))
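In [ ]:
# Quick sanity check (an extra cell, not needed for the rest): the sigmoid
# squashes any real number into (0, 1), and sigmoid(0) == 0.5.
print(sigmoid(np.array([-10, 0, 10])))  # -> [~0.00005, 0.5, ~0.99995]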
In [13]:
# Define a value range
x = np.arange(-6, 6, 0.1)
y = sigmoid(x)
In [15]:
# Plot the sigmoid function for the given value space
plt.plot(x, y)
Out[15]: [figure: the S-shaped sigmoid curve]
In [16]:
# Kaggle's Titanic competition is a great playground for experimenting
# with data analysis and various algorithms.
# It can be found here: https://www.kaggle.com/c/titanic
# We'll use logistic regression to predict whether a passenger survived.
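In [ ]:
# How the pieces fit together (a minimal sketch with made-up numbers):
# logistic regression computes a linear score w . x + b and passes it
# through the sigmoid above to get a probability in (0, 1).
w = np.array([0.5, -1.2])       # hypothetical coefficients
b = 0.3                         # hypothetical intercept
x_example = np.array([1.0, 2.0])  # hypothetical feature vector
print(sigmoid(np.dot(w, x_example) + b))  # probability of the positive class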
In [17]:
# Load the data in a Pandas dataframe
titanic_df = pd.read_csv("../data/titanic_train_set.csv")
In [43]:
# Let's have a look at the data
print(titanic_df.columns)
titanic_df.sample(3)
Out[43]: [printed column index and a table of 3 sample rows]
In [22]:
# Prepare the data so it can be used by scikit-learn
cabin = titanic_df["cabin"].astype(str)       # Convert the cabin to string
cabin2 = np.array([c[0] for c in cabin])      # Keep only the first letter of the cabin (the deck)
titanic_df["cabin"] = pd.Categorical(cabin2)  # Replace the old cabin attribute with its categorical version
# Use the median age as the default value when age is unknown (NA)
titanic_df["age"] = np.where(titanic_df["age"].isnull(),  # If there is no value in 'age'
                             28,                          # set it to 28 (the median age),
                             titanic_df["age"])           # otherwise keep the existing value
In [32]:
# Initialize the encoder
label_encoder = preprocessing.LabelEncoder()
# Convert the sex, class and cabin attributes to numeric ones
encoded_sex = label_encoder.fit_transform(titanic_df["sex"])
encoded_class = label_encoder.fit_transform(titanic_df["pclass"])
encoded_cabin = label_encoder.fit_transform(titanic_df["cabin"])
features_training = pd.DataFrame([encoded_class,
                                  encoded_cabin,
                                  encoded_sex,
                                  titanic_df["age"]]).T
# Initialize the model
log_res_model = linear_model.LogisticRegression()
# Train the model
log_res_model.fit(X = features_training,
                  y = titanic_df["survived"])
# Print the model's intercept and coefficients
print("Intercept: %s" % log_res_model.intercept_)
print("Coefficients: %s" % log_res_model.coef_)
In [33]:
# Predict for the given data
predictions = log_res_model.predict(X = features_training)
In [42]:
# And the predictions look like...
predictions[:10]
Out[42]: [array of ten 0/1 predictions]
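In [ ]:
# Besides hard 0/1 labels, the model can also return probabilities,
# which is useful when a confidence estimate is needed:
probabilities = log_res_model.predict_proba(X = features_training)
probabilities[:5]  # columns: P(survived = 0), P(survived = 1)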
In [34]:
# Compare predictions vs. actual figures (confusion matrix:
# rows = predicted, columns = actual)
pd.crosstab(predictions, titanic_df["survived"])
Out[34]: [2x2 table of predicted vs. actual counts]
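In [ ]:
# The same comparison via scikit-learn's confusion matrix; note the
# orientation differs from the crosstab above (here rows = actual,
# columns = predicted):
metrics.confusion_matrix(y_true = titanic_df["survived"], y_pred = predictions)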
In [35]:
# Calculate the model's accuracy
log_res_model.score(X = features_training,
y = titanic_df["survived"])
Out[35]: [accuracy score as a float]
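In [ ]:
# Caveat: the accuracy above is measured on the same data the model was
# trained on, which is optimistic. A held-out split gives a more honest
# estimate (a minimal sketch):
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features_training, titanic_df["survived"],
    test_size = 0.2, random_state = 0)
held_out_model = linear_model.LogisticRegression().fit(X_train, y_train)
print(held_out_model.score(X = X_test, y = y_test))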
In [40]:
# Or view a summary of common classification metrics
#
# Precision: tp / (tp + fp)
# Recall:    tp / (tp + fn)
# f1 (F-beta with beta = 1): weighted harmonic mean of precision and recall,
#            where an F-beta score reaches its best value at 1
#            and its worst score at 0.
# support:   the number of occurrences of each class in the actual y.
print(metrics.classification_report(y_true = titanic_df["survived"],
                                    y_pred = predictions))
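In [ ]:
# The same precision and recall computed by hand from the confusion matrix
# counts, to connect the formulas above to the report (binary case only):
tn, fp, fn, tp = metrics.confusion_matrix(titanic_df["survived"],
                                          predictions).ravel()
print("Precision (class 1): %.3f" % (tp / (tp + fp)))
print("Recall    (class 1): %.3f" % (tp / (tp + fn)))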
In [ ]: