Homepage: https://github.com/tien-le/kaggle-titanic
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random
In [2]:
# Read the training and testing corpora from the CSV files under output/.
# Training corpus
trn_corpus_after_preprocessing = pd.read_csv(
    filepath_or_buffer="output/trn_corpus_after_preprocessing.csv"
)
# Testing corpus
tst_corpus_after_preprocessing = pd.read_csv(
    filepath_or_buffer="output/tst_corpus_after_preprocessing.csv"
)
In [3]:
#tst_corpus_after_preprocessing[tst_corpus_after_preprocessing["Fare"].isnull()]
In [4]:
# Print the DataFrame.info() summary of the training corpus, a dashed separator,
# and then the same summary for the testing corpus so the two can be compared.
trn_corpus_after_preprocessing.info()
print("-"*36)
tst_corpus_after_preprocessing.info()
One definition: "Machine learning is the semi-automated extraction of knowledge from data"
Unsupervised learning: Extracting structure from data
High-level steps of supervised learning:
First, train a machine learning model using labeled data
Then, make predictions on new data for which the label is unknown
The primary goal of supervised learning is to build a model that "generalizes": It accurately predicts the future rather than the past!
In [ ]:
In [5]:
# Show the column labels of the training corpus.
trn_corpus_after_preprocessing.columns
Out[5]:
In [6]:
# Columns excluded from the feature matrices: 'Survived' is the response,
# and 'PassengerId' is presumably just a row identifier (confirm against the data).
list_of_non_preditor_variables = [
    'Survived',
    'PassengerId',
]
In [7]:
# Split the training corpus into the feature matrix (x_train) and the response (y_train).
# Index.difference removes the non-predictor labels and returns the remaining labels
# sorted, so the feature columns come out in sorted order. .copy() detaches both
# objects from the source frame.
x_train = trn_corpus_after_preprocessing.loc[
    :, trn_corpus_after_preprocessing.columns.difference(list_of_non_preditor_variables)
].copy()
y_train = trn_corpus_after_preprocessing.loc[:, 'Survived'].copy()
# Alternatives that were tried and left disabled (.ix has since been removed from pandas):
#x_train = trn_corpus_after_preprocessing.ix[:, trn_corpus_after_preprocessing.columns != 'Survived']
#y_train = trn_corpus_after_preprocessing.ix[:,"Survived"]
#y_train = trn_corpus_after_preprocessing.iloc[:,-1]
#y_train = trn_corpus_after_preprocessing[trn_corpus_after_preprocessing.columns[-1]]
#x_train
In [8]:
#y_train
In [9]:
# Show which columns ended up in the training feature matrix.
x_train.columns
Out[9]:
In [10]:
# check the types of the features and response
#print(type(x_train))
#print(type(x_test))
In [11]:
# Split the testing corpus the same way as the training corpus:
# every column except the non-predictors becomes a feature, 'Survived' is the response.
# Index.difference returns the labels sorted, so the column order is deterministic.
x_test = tst_corpus_after_preprocessing.loc[
    :, tst_corpus_after_preprocessing.columns.difference(list_of_non_preditor_variables)
].copy()
y_test = tst_corpus_after_preprocessing.loc[:, 'Survived'].copy()
# Alternatives that were tried and left disabled (.ix has since been removed from pandas):
#x_test = tst_corpus_after_preprocessing.ix[:, trn_corpus_after_preprocessing.columns != 'Survived']
#y_test = tst_corpus_after_preprocessing.ix[:,"Survived"]
#y_test = tst_corpus_after_preprocessing.iloc[:,-1]
#y_test = tst_corpus_after_preprocessing[tst_corpus_after_preprocessing.columns[-1]]
In [12]:
#x_test
In [13]:
#y_test
In [14]:
# Display the first 5 rows of the training features (head() defaults to 5 rows).
x_train.head()
Out[14]:
In [15]:
# Display the last 5 rows of the training features (tail() defaults to 5 rows).
x_train.tail()
Out[15]:
In [16]:
# Check the shape of the training feature DataFrame as a (rows, columns) tuple.
x_train.shape
Out[16]:
What are the features?
What is the response?
What else do we know?
Note that if the response variable is continuous, this is a regression problem.
In [ ]:
In [17]:
from sklearn import tree
# Fit a decision tree classifier on the training features.
# random_state is pinned because the tree builder breaks ties between equally good
# splits at random; without a fixed seed the fitted tree (and every score derived
# from it) can change from one run of the notebook to the next.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(x_train, y_train)
In [18]:
# Export the decision tree fitted on the Titanic training data in Graphviz format
# and render it to a PDF file.
import os

import pydotplus

# Option 1: write a .dot file that Graphviz's command-line tool can convert, e.g.
#   dot -Tpdf titanic.dot -o titanic.pdf
# export_graphviz returns None when out_file is given, so its result is deliberately
# not assigned (assigning it to `f` would overwrite the open file handle's name).
with open("output/titanic.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)
# The intermediate .dot file is removed again; the PDF is produced directly below.
os.unlink('output/titanic.dot')

# Option 2: with pydotplus installed, build the graph from the DOT source in memory
# and write the PDF (or any other supported file type) straight from Python.
dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("output/titanic.pdf")
Out[18]:
In [19]:
#The export_graphviz exporter also supports a variety of aesthetic options,
#including coloring nodes by their class (or value for regression)
#and using explicit variable and class names if desired.
#IPython notebooks can also render these plots inline using the Image() function.
#NOTE: the snippet below is wrapped in a triple-quoted string, so it is NOT executed;
#the cell merely evaluates to that string.
#NOTE(review): before re-enabling it, check feature_names - x_train.columns[1:] skips the
#first column although clf was fitted on every column of x_train - and class_names, which
#lists a single label (presumably one name per class is needed for a binary target; confirm).
"""from IPython.display import Image
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names= list(x_train.columns[1:]), #iris.feature_names,
class_names= ["Survived"], #iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())"""
Out[19]:
In [20]:
# Score the fitted tree on the test split; for a classifier, score() is the mean accuracy.
print("accuracy score: ", clf.score(x_test,y_test))
Classification accuracy: percentage of correct predictions
In [21]:
# Predict the class label of every test sample with the fitted tree.
y_pred_class = clf.predict(x_test)
# Class probabilities are available too: for a tree they are the fraction of training
# samples of each class in the leaf a sample falls into. The trailing semicolon stops
# the notebook from echoing the resulting array.
clf.predict_proba(x_test);
In [22]:
# Accuracy: share of test samples whose predicted class equals the true class.
from sklearn import metrics
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))
Null accuracy: accuracy that could be achieved by always predicting the most frequent class
In [23]:
# Examine the class distribution of the testing set: value_counts() (a pandas Series
# method) returns the number of samples per class, most frequent class first.
y_test.value_counts()
Out[23]:
In [24]:
# Calculate the fraction of ones (a value between 0 and 1, not a percentage).
# The mean equals that fraction only if the response is coded 0/1 - assumed here.
y_test.mean()
Out[24]:
In [25]:
# Calculate the fraction of zeros (a value between 0 and 1, not a percentage),
# as the complement of the fraction of ones; assumes the response is coded 0/1.
1 - y_test.mean()
Out[25]:
In [26]:
# Calculate null accuracy (for binary classification problems coded as 0/1):
# the larger of the two class fractions, i.e. the accuracy obtained by always
# predicting the most frequent class.
max(y_test.mean(), 1 - y_test.mean())
Out[26]:
In [27]:
# Calculate null accuracy (for multi-class classification problems):
# value_counts() lists the most frequent class first, so head(1) is its count;
# dividing by the number of test samples turns it into a fraction.
y_test.value_counts().head(1) / len(y_test)
Out[27]:
Comparing the true and predicted response values
In [28]:
# Show the first 25 true labels next to the first 25 predicted labels.
# The __future__ import keeps print() behaving as a function on Python 2 as well.
from __future__ import print_function
print('True:', y_test.values[:25])
print('Pred:', y_pred_class[:25])
Conclusion: ???
In [29]:
# IMPORTANT: first argument is true values, second argument is predicted values.
# In the printed matrix, rows correspond to true classes and columns to predicted classes.
print(metrics.confusion_matrix(y_test, y_pred_class))
Basic terminology
In [30]:
# Keep the confusion matrix and give each of its four cells a name.
# Rows are true classes and columns are predicted classes; with labels coded 0/1
# (assumed), row/column 0 is the negative class and row/column 1 the positive class.
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
TP = confusion[1][1]
In [31]:
# Print the four confusion-matrix counts in the order TP, TN, FP, FN.
print(TP, TN, FP, FN)
Classification Accuracy: Overall, how often is the classifier correct?
In [32]:
# Accuracy from the confusion-matrix counts (correct predictions over all predictions),
# followed by scikit-learn's value for comparison.
print((TN + TP) / float(FN + FP + TN + TP))
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))
Classification Error: Overall, how often is the classifier incorrect?
In [33]:
# Classification error from the confusion-matrix counts (wrong predictions over all
# predictions), followed by the same quantity derived from scikit-learn's accuracy.
print((FN + FP) / float(FN + FP + TN + TP))
print(1 - metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))
Specificity: When the actual value is negative, how often is the prediction correct?
In [34]:
# Specificity: true negatives over all actual negatives.
print(TN / float(FP + TN))
False Positive Rate: When the actual value is negative, how often is the prediction incorrect?
In [35]:
# False positive rate: false positives over all actual negatives (1 - specificity).
print(FP / float(FP + TN))
Precision: When a positive value is predicted, how often is the prediction correct?
In [36]:
# Precision from the confusion-matrix counts (true positives over all predicted
# positives), followed by scikit-learn's value for comparison.
print(TP / float(FP + TP))
print(metrics.precision_score(y_true=y_test, y_pred=y_pred_class))
In [37]:
# Report precision, recall and F1 for the current predictions.
# The first label is spelled "Precision" so the printed output is correct and searchable.
print("Precision: ", metrics.precision_score(y_test, y_pred_class))
print("Recall: ", metrics.recall_score(y_test, y_pred_class))
print("F1 score: ", metrics.f1_score(y_test, y_pred_class))
Many other metrics can be computed: F1 score, Matthews correlation coefficient, etc.
Conclusion:
Which metrics should you focus on?
LinearSVC (Linear Support Vector Classification) is similar to SVC with parameter kernel='linear', but it is implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.
Ref: http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
In [38]:
from sklearn import svm
# Fit a linear support vector classifier on the training features.
# random_state is pinned because the liblinear solver shuffles the data with a
# pseudo-random generator; a fixed seed keeps the fitted model reproducible.
model = svm.LinearSVC(random_state=0)
model.fit(x_train, y_train)
Out[38]:
In [39]:
# Score the current `model` on the test split (mean accuracy for a classifier) and print it.
acc_score = model.score(x_test, y_test)
print("Accuracy score: ", acc_score)
In [40]:
# Predict test labels with the current `model`.
# This overwrites the y_pred_class produced by the decision tree in the earlier cell.
y_pred_class = model.predict(x_test)
In [41]:
from sklearn import metrics
In [42]:
# Confusion matrix of the current predictions (rows: true classes, columns: predicted).
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)
print(confusion_matrix)
http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
A comparison of several classifiers in scikit-learn on synthetic datasets. The point of this example is to illustrate the nature of the decision boundaries of different classifiers. This should be taken with a grain of salt, as the intuition conveyed by these examples does not necessarily carry over to real datasets.
Particularly in high-dimensional spaces, data can more easily be separated linearly and the simplicity of classifiers such as naive Bayes and linear SVMs might lead to better generalization than is achieved by other classifiers.
The plots show training points in solid colors and testing points semi-transparent. The lower right shows the classification accuracy on the test set.
In [43]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap
In [ ]:
In [44]:
#classifiers
In [45]:
#x_train
In [46]:
#sns.pairplot(x_train)
In [47]:
# Standardise the features (zero mean, unit variance per column).
# The scaler is fitted on the training data only and the SAME fitted scaler is applied
# to the test data. Fitting a second scaler on the test set would standardise the two
# splits with different means/variances and leak test-set statistics into evaluation.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
In [48]:
# Peek at the first row of the scaled training matrix.
x_train_scaled[0]
Out[48]:
In [49]:
# Length of the first scaled row, i.e. the number of feature columns after scaling.
len(x_train_scaled[0])
Out[49]:
In [50]:
# Wrap the scaled training matrix in a DataFrame that reuses x_train's column labels.
df_x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns)
In [51]:
# Display the first 5 rows of the scaled training features.
df_x_train_scaled.head()
Out[51]:
In [52]:
#sns.pairplot(df_x_train_scaled)
In [53]:
# Compare several scikit-learn classifiers on the standardised features.
# Each display name is paired with its estimator in ONE list, so disabling or
# reordering an entry can never shift the labels printed next to the scores
# (two separate parallel lists drift apart silently when one entry is commented out).
# random_state is pinned on every estimator that uses randomness so a fresh
# "Restart & Run All" reproduces the same accuracies.
named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=0)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10,
                                             max_features=1, random_state=0)),
    ("Neural Net", MLPClassifier(alpha=1, random_state=0)),
    ("AdaBoost", AdaBoostClassifier(random_state=0)),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    # Disabled because fitting takes too long:
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)),
]
# Parallel lists kept for any cell that refers to `names` / `classifiers` directly.
names = [pair[0] for pair in named_classifiers]
classifiers = [pair[1] for pair in named_classifiers]

# Fit each classifier on the scaled training data and report its test accuracy.
for name, model in named_classifiers:
    model.fit(x_train_scaled, y_train)
    acc_score = model.score(x_test_scaled, y_test)
    print(name, " - accuracy score: ", acc_score)
#end for
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Ref: http://scikit-learn.org/stable/modules/tree.html
Decision trees can also be applied to regression problems, using the DecisionTreeRegressor class.
As in the classification setting, the fit method will take as argument arrays X and y, only that in this case y is expected to have floating point values instead of integer values:
In [54]:
from sklearn import tree
# Fit a decision tree regressor on the same data and score it on the test split.
# For a regressor, score() is the coefficient of determination (R-squared), not accuracy.
# random_state is pinned so that random tie-breaking between equally good splits
# cannot change the fitted tree between runs.
# Note: this rebinds `clf`, which previously held the decision tree classifier.
clf = tree.DecisionTreeRegressor(random_state=0)
clf = clf.fit(x_train, y_train)
clf.score(x_test,y_test)
Out[54]:
In [55]:
#clf.predict(x_test)
In [ ]:
In [ ]:
In [ ]:
Recall that Simple Linear Regression is given by the following equation: $y = \alpha + \beta x$
Our goal is to solve the values $\alpha$ and $\beta$ that minimize the cost function.
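$$\beta = \frac{cov(x,y)}{var(x)}$$where $cov(x,y)$ denotes the covariance of $x$ and $y$, a measure of how much the two variables change together, and $var(x)$ denotes the variance of $x$, a measure of how far a set of values is spread out.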
Note that:
Having solved $\beta$, we can estimate $\alpha$ using the following formula: $$\alpha = \overline{y} - \beta \overline{x}$$
We evaluate the model using r-squared, which measures how well the observed values of the response variable are predicted by the model. In the case of simple linear regression, r-squared is equal to the square of Pearson's r. Computed this way, r-squared is a number between zero and one. Computed by other methods (such as the `score` method of scikit-learn regressors, used below), r-squared can be negative if the model performs extremely poorly.
In [56]:
from sklearn.linear_model import LinearRegression
In [57]:
# Fit ordinary least squares on the training features and report R-squared on the test split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(x_train, y_train)
r_squared = model.score(x_test, y_test)
print("R-squared: %.4f" % r_squared)
Formally, multiple linear regression is the following model:
$$y = \alpha+\beta_1x_1+\beta_2x_2+...+\beta_nx_n$$or
$$Y = X\beta$$where $Y$ denotes a column vector of the values of the response variables for training, $\beta$ denotes a column vector of the values of the model's parameters, $X$ is called the design matrix, an $m \times n$ dimensional matrix of the values of the features.
We can solve $\beta$ as follows:
$$\beta = \left( X^TX \right)^{-1}X^TY$$Note that - code python:
from numpy import dot, transpose
from numpy.linalg import inv
beta = dot(inv(dot(transpose(X),X)), dot(transpose(X), Y))
In [58]:
from sklearn.linear_model import LinearRegression
In [59]:
# Fit the multiple linear regression model and predict the response for the test features.
model = LinearRegression().fit(x_train, y_train)
predictions = model.predict(x_test)
In [60]:
# Per-sample comparison of predictions and targets, left disabled:
#for i in range(predictions.size):
# print("Predicted: %.2f, Target: %.2f" %(predictions[i], y_test[i]))
# R-squared of the fitted linear model on the test split.
r_squared = model.score(X=x_test, y=y_test)
print("R-squared: %.4f" % r_squared)
In [61]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
In [62]:
# Re-create and fit a plain linear model; as the cell's last expression,
# fit() echoes the fitted estimator in the notebook output.
model = LinearRegression()
model.fit(X=x_train, y=y_train)
Out[62]:
In [63]:
# 100 evenly spaced points from 0 to 26 (both end points included).
xx = np.linspace(start=0, stop=26, num=100)
# Plotting of the fitted line over this grid was left disabled:
#yy = np.linspace(0, 26, 100)
#yy = model.predict(xx.reshape(xx.shape[0],1))
#plt.plot(xx, yy)
In [64]:
# Expand the features with all degree-2 polynomial terms.
# The featurizer is fitted on the training data and then only APPLIED to the test data:
# transform() returns the expanded test matrix, whereas fit() would return the
# featurizer object itself (not data) and would refit it on the test set.
quadratic_featurizer = PolynomialFeatures(degree=2)
x_train_quadratic = quadratic_featurizer.fit_transform(x_train)
x_test_quadratic = quadratic_featurizer.transform(x_test)
In [65]:
# Display the first 5 rows of the original (non-polynomial) training features for reference.
x_train.head()
Out[65]:
In [66]:
# Fit a linear model on the degree-2 polynomial training features; as the cell's last
# expression, fit() echoes the fitted estimator in the notebook output.
model_quadratic = LinearRegression()
model_quadratic.fit(X=x_train_quadratic, y=y_train)
# Evaluation on the polynomial test features, left disabled:
#predictions = model_quadratic.predict(x_test_quadratic)
#r_squared = model_quadratic.score(x_test_quadratic, y_test)
#r_squared
#print("R-squared: %.4f" %r_squared)
Out[66]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: