In [1]:
##############################################################################
#
# Workshop: How to develop a personalised machine learning-based application
#
# Notebook 2: Classification
#
##############################################################################

In [2]:
# Jupyter notebook instructions:
# - Every cell can be executed separately from the rest.
# - You can execute cells in a non-sequential order (but be careful of
#   the dependencies between them).
# - Execute a cell by pressing the play button or Shift+Enter.

In [37]:
# Import necessary modules
import numpy as np
import scipy.stats as stats
import pandas as pd

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')

In [12]:
# Define the sigmoid function: sigmoid(t) = 1 / (1 + e^(-t))
def sigmoid(t):
    return 1 / (1 + np.exp(-t))
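
In [ ]:
# As a quick sanity check (a minimal sketch), evaluate the sigmoid at a
# few points: it maps any real number into (0, 1), which is what lets us
# read the outputs of a logistic regression model as probabilities.
print(sigmoid(0))   # 0.5 exactly
print(sigmoid(6))   # ~0.998, saturating towards 1
print(sigmoid(-6))  # ~0.002, saturating towards 0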

In [13]:
# Define a value range
x = np.arange(-6, 6, 0.1)
y = sigmoid(x)

In [15]:
# Plot the sigmoid function for the given value space
plt.plot(x, y)


Out[15]:
[<matplotlib.lines.Line2D at 0x7fa0b30e2f90>]

In [16]:
# Kaggle's Titanic problem is a great problem to experiment with 
# data analysis and various algorithms. 
# It can be found here: https://www.kaggle.com/c/titanic
# We'll use logistic regression to predict which passengers survived.

In [17]:
# Load the data in a Pandas dataframe
titanic_df = pd.read_csv("../data/titanic_train_set.csv")
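
In [ ]:
# Before modelling, it helps to check the size of the dataset and which
# columns have missing values. A minimal sketch using standard pandas calls:
print(titanic_df.shape)           # (rows, columns)
print(titanic_df.isnull().sum())  # missing values per column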

In [43]:
# Let's have a look at the data
print(titanic_df.columns)
titanic_df.sample(3)


Index([u'survived', u'pclass', u'name', u'sex', u'age', u'sibsp', u'parch',
       u'ticket', u'fare', u'cabin', u'embarked'],
      dtype='object')
Out[43]:
     survived  pclass  name                      sex     age   sibsp  parch  ticket      fare     cabin  embarked
354         0       3  Yousif, Mr. Wazli         male    28.0      0      0  2647         7.2250  n      C
479         1       3  Hirvonen, Miss. Hildur E  female   2.0      0      1  3101298     12.2875  n      S
589         0       3  Murdlin, Mr. Joseph       male    28.0      0      0  A./5. 3235   8.0500  n      S

In [22]:
# Prepare the data so it can be used by scikit-learn
cabin = titanic_df["cabin"].astype(str)  # Convert the cabin values to strings

# Keep only the first letter of each cabin (the deck); unknown cabins become 'n'
cabin_deck = np.array([c[0] for c in cabin])

titanic_df["cabin"] = pd.Categorical(cabin_deck)  # Replace the old cabin attribute with its categorical version

# Use the median age (28 in this dataset) as the default value when age is unknown (NA)
titanic_df["age"] = np.where(titanic_df["age"].isnull(),  # If there is no value in 'age',
                             28,                          # set it to the median, 28;
                             titanic_df["age"])           # otherwise keep the existing value

In [32]:
# Initialize encoder
label_encoder = preprocessing.LabelEncoder()

# Convert the sex, class, and cabin attributes to numeric ones
encoded_sex = label_encoder.fit_transform(titanic_df["sex"])
encoded_class = label_encoder.fit_transform(titanic_df["pclass"])
encoded_cabin = label_encoder.fit_transform(titanic_df["cabin"])

features_training = pd.DataFrame([encoded_class,
                                  encoded_cabin,
                                  encoded_sex,
                                  titanic_df["age"]]).T

# Initialize the model
log_res_model = linear_model.LogisticRegression()

# Train the model
log_res_model.fit(X = features_training,
                  y = titanic_df['survived'])

# Print the model's intercept and coefficients
print("Intercept: %s" % log_res_model.intercept_)
print("Coefficients: %s" % log_res_model.coef_)


Intercept: [ 3.32716302]
Coefficients: [[-0.90790164 -0.06426483 -2.43179802 -0.0265924 ]]
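
In [ ]:
# The coefficients above are log-odds, so they can be hard to read
# directly. Exponentiating them gives odds ratios: the multiplicative
# change in the odds of survival per unit increase in each feature.
# A minimal interpretation sketch (the order matches features_training):
for name, coef in zip(["pclass", "cabin", "sex", "age"], log_res_model.coef_[0]):
    print("%s: odds ratio %.3f" % (name, np.exp(coef)))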

In [33]:
# Predict for the given data
predictions = log_res_model.predict(X = features_training)

In [42]:
# And the predictions look like...
predictions[:10]


Out[42]:
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1])
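
In [ ]:
# The hard 0/1 labels come from thresholding predicted probabilities at
# 0.5. A minimal sketch using predict_proba to inspect those probabilities:
probabilities = log_res_model.predict_proba(X = features_training)
print(probabilities[:5])  # columns: P(survived = 0), P(survived = 1)
print((probabilities[:5, 1] > 0.5).astype(int))  # should match predict() above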

In [34]:
# Compare predictions vs actual figures (Confusion Matrix)
pd.crosstab(predictions, titanic_df["survived"])


Out[34]:
survived    0    1
row_0
0         463   98
1          86  244
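
In [ ]:
# Accuracy can be read straight off the confusion matrix: the correct
# predictions sit on the diagonal. A quick check by hand:
# (463 + 244) / (463 + 98 + 86 + 244) = 707 / 891 ≈ 0.7935
print((463 + 244) / float(463 + 98 + 86 + 244))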

In [35]:
# Calculate the model's accuracy
log_res_model.score(X = features_training,
                    y = titanic_df["survived"])


Out[35]:
0.79349046015712688

In [40]:
# Or view a summary of common classification metrics
#
# Precision: tp / (tp + fp)
# Recall: tp / (tp + fn)
# f1-score: the weighted harmonic mean of precision and recall
#           (an F-beta score, which reaches its best value at 1
#           and its worst at 0)
# support: the number of occurrences of each class in the actual y
print(metrics.classification_report(y_true = titanic_df["survived"],
                                    y_pred = predictions))


             precision    recall  f1-score   support

          0       0.83      0.84      0.83       549
          1       0.74      0.71      0.73       342

avg / total       0.79      0.79      0.79       891
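
In [ ]:
# Another common summary is the area under the ROC curve (AUC), which is
# computed from the predicted probabilities rather than the hard labels.
# A minimal sketch using scikit-learn's roc_auc_score:
survival_scores = log_res_model.predict_proba(X = features_training)[:, 1]
print(metrics.roc_auc_score(titanic_df["survived"], survival_scores))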


In [ ]: