This is a Python-based introduction to machine learning, covering only the basics and intended for a half-day session.
Some reasons for using Python are given at https://www.quora.com/Why-is-Python-a-language-of-choice-for-data-scientists. Also, in May 2017, Python overtook R as the most popular language among the 2900 voters in KDnuggets' annual poll.
Machine learning is the process of iteratively learning a model from data. The model can then be used to make predictions about previously unseen data (supervised learning). By providing the algorithm with many observations from a domain, it infers the model automatically.
One example is the classification of email into the classes spam and not-spam. The observation here is the email itself together with metadata about it.
A more detailed explanation is found at https://github.com/amueller/scipy_2015_sklearn_tutorial/blob/master/notebooks/01.1%20Introduction%20to%20Machine%20Learning.ipynb
Machine learning infographic from Microsoft Azure: Which questions can machine learning answer?
Machine learning can be divided into three types of learning problems (by learning style):
This tutorial will be on supervised learning.
Useful Python Data Science packages included with Anaconda:
Check out https://github.com/rasbt/watermark (the %watermark magic), which makes reproducible analyses easier.
In [ ]:
# Support for type hints
import typing
import numpy as np
import pandas as pd # Data Analysis Library: high-performance, easy-to-use data structures and data analysis tools
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sb # Statistical data visualization
# Display plots inside notebook
%matplotlib inline
# To make inline plots not blurry on retina Macbook
%config InlineBackend.figure_format = 'retina'
# Machine learning sample datasets and algorithms
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
In [ ]:
from matplotlib.colors import ListedColormap
# Create color maps for 3-class classification problem, as with iris
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
cmap_same = ListedColormap(['k', 'k', 'k'])
def plot_iris_knn(classify=True):
    iris = load_iris()
    X = iris.data[:, 2:]  # we only take the last two features (petal length/width);
                          # we could avoid this slicing by using a two-dimensional dataset
    y = iris.target

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X, y)

    x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
    y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    if classify:
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    if classify:
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    else:
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_same)
    plt.xlabel('petal length (cm)')
    plt.ylabel('petal width (cm)')
    plt.title("Score = {:.2f}".format(knn.score(X, y)))
    plt.axis('tight')
    plt.show()
In [ ]:
# Set classify to False to show how the dataset looks without labels (e.g. for clustering with unsupervised learning)
plot_iris_knn(classify=True)
There are several different workflows to choose from, but the Cross Industry Standard Process for Data Mining (CRISP-DM) remains one of the most widely used, according to a KDnuggets poll from 2014. The workflows mostly differ in how they divide up the steps. References for CRISP-DM:
Other workflows:
The following classification example with the Iris flowers is based on the excellent (and much more detailed) notebook https://github.com/rhiever/Data-Analysis-and-Machine-Learning-Projects/blob/master/example-data-science-notebook/Example%20Machine%20Learning%20Notebook.ipynb.
A similar tutorial is also given at machinelearningmastery.com.
In [ ]:
# Load dataset
iris_dataset = load_iris()
# Describe (provided description in dataset)
print(iris_dataset.DESCR)
In [ ]:
# Loading data into pandas Dataframe
print(iris_dataset.target_names)
iris_data = pd.DataFrame(iris_dataset.data, columns=iris_dataset.feature_names)
# Convert 0,1,2 in target column to iris_dataset.target_names
iris_data['class'] = iris_dataset.target
iris_data['class'] = iris_data['class'].apply(lambda i: iris_dataset.target_names[i])
iris_data.head()
In [ ]:
# Enter your solution here
In [ ]:
# Summary statistics about dataset
iris_data.describe()
In [ ]:
# Summary statistics for each class, but limiting to only petal length/width
iris_data.groupby('class')[['petal length (cm)', 'petal width (cm)']].describe(percentiles=[])
Note:
In [ ]:
iris_data.plot(figsize=(10,8))
The x-axis is the flower (row) number; rows 0-49 are one class of flowers, and so on. From the plot it seems that petal length/width are the most useful features for classifying the flowers.
In [ ]:
# Plot pairwise relationships in dataset
# Note: we can only plot in 2D (or 3D), so that's a visualization restriction even when we work with higher-dimensional data.
sb.pairplot(iris_data, hue='class')
The diagonal displays the distribution of each feature; the off-diagonal plots display the pairwise relationships (correlations) between the features.
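The numeric counterpart of the off-diagonal plots is the correlation matrix, which pandas can compute directly (a small aside):
In [ ]:
# Pairwise correlations between the numeric features (dropping the non-numeric class column)
iris_data.drop('class', axis=1).corr()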
In [ ]:
# setosa has an outlier on sepal width
print(iris_data.columns.values)
# Create histogram for column 'sepal width (cm)' and rows that have class=='setosa'.
iris_data[iris_data['class'] == 'setosa']['sepal width (cm)'].hist()
In [ ]:
plt.figure(figsize=(10, 10))
# Making boxplots of each of the features, split on class
for column_index, column in enumerate(iris_data.columns):
    if column == 'class':
        continue
    plt.subplot(2, 2, column_index + 1)
    # Boxplot
    sb.boxplot(x='class', y=column, data=iris_data)
    # Violinplot is like boxplot but box is scaled according to density of the data
    #sb.violinplot(x='class', y=column, data=iris_data)
In [ ]:
# Example of constructing new features.
iris_data['petal area'] = iris_data['petal length (cm)'] * iris_data['petal width (cm)']
iris_data[['petal area']].plot()
iris_data.groupby('class')['petal area'].describe(percentiles=[])
iris_data.drop(['petal area'], inplace=True, axis=1)
A greedy learning algorithm:
Gini impurity is a measure of misclassification (a small sketch follows below this list).
The impurity measure defines how well the classes are separated.
In general, the impurity measure should satisfy:
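To make this concrete, here is a minimal sketch using the standard Gini formula (the gini_impurity helper is only for illustration): the Gini impurity of a node with class proportions p_k is 1 - Σ p_k², which is 0 for a pure node and grows as the node gets more mixed.
In [ ]:
# Minimal sketch: Gini impurity of a node, given the number of samples of each class in it
def gini_impurity(class_counts):
    proportions = np.array(class_counts) / np.sum(class_counts)
    return 1.0 - np.sum(proportions ** 2)

print(gini_impurity([50, 0, 0]))    # pure node -> 0.0
print(gini_impurity([25, 25, 0]))   # two classes, evenly mixed -> 0.5
print(gini_impurity([50, 50, 50]))  # three classes, evenly mixed -> ~0.67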
In [ ]:
# 1. Using Decision Tree as modeling technique:
decision_tree_classifier = DecisionTreeClassifier()
In [ ]:
# 2. Generate test design
# Load data in input (X) and output (y) variables
y = iris_data['class'].values
X = iris_data.drop('class', axis=1).values
print("Shape of input: %s" % str(X.shape))
print("Shape of output: %s" % str(y.shape))
from sklearn.model_selection import train_test_split
# split data into (75%) training and (25%) test set.
# Fixing random state for demonstration purpose
(X_train, X_test, y_train, y_test) = train_test_split(X, y, train_size=0.75, random_state=1)
print("Shape of training data: %s" % str(X_train.shape))
print("Shape of test data: %s" % str(X_test.shape))
In [ ]:
# 3. Build model
# Train the classifier on the training set
decision_tree_classifier.fit(X_train, y_train)
In [ ]:
# Visualize the decision tree
from IPython.display import Image
from sklearn import tree
import pydotplus
def visualize_decision_tree(decision_tree_classifier):
    dot_data = tree.export_graphviz(decision_tree_classifier, out_file=None,
                                    feature_names=iris_dataset.feature_names,
                                    class_names=iris_dataset.target_names,
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    return Image(graph.create_png())
visualize_decision_tree(decision_tree_classifier)
TODO: There are different ways of measuring performance. What is a good way? It depends on the problem. See https://en.wikipedia.org/wiki/Confusion_matrix
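As one example of such a measure, the confusion matrix of the decision tree on the held-out test set can be computed with scikit-learn (a small sketch that jumps slightly ahead; test-set predictions are discussed below):
In [ ]:
# Confusion matrix for the classifier trained above, evaluated on the test set.
# Rows are true classes, columns are predicted classes.
from sklearn.metrics import confusion_matrix
print(iris_dataset.target_names)
print(confusion_matrix(y_test, decision_tree_classifier.predict(X_test),
                       labels=iris_dataset.target_names))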
In [ ]:
# What is the result on the training set?
decision_tree_classifier.score(X_train, y_train)
100 % correct on training set! Is that good?
In [ ]:
# Predict class on whole test set
y_pred = decision_tree_classifier.predict(X_test)
# Compare prediction of first element in test set with real class
print("Predicted class first element: {}".format(y_pred[0]))
# Real class:
print("Real class first element: {}".format(y_test[0]))
#print(y_pred)
#print(y_test)
correct_predictions = np.sum(y_pred == y_test)
print("Count number of predictions equal to real class: {:d}".format(correct_predictions))
total_predictions = y_pred.shape[0]
print("Total number of predictions: {:d}".format(total_predictions))
accuracy = correct_predictions / total_predictions
print("Accuracy {}/{}={:2f}".format(correct_predictions, total_predictions, accuracy))
In [ ]:
# Validate the classifier on the testing set using classification accuracy
decision_tree_classifier.score(X_test, y_test)
Is that good? Depends on the sampling in train/test-split, business problem, ...
In [ ]:
# Check how accuracy varies with different samples (random_state is not fixed)
model_accuracies = []
for repetition in range(1000):
    (X_train, X_test, y_train, y_test) = train_test_split(X, y, train_size=0.75)
    decision_tree_classifier = DecisionTreeClassifier()
    decision_tree_classifier.fit(X_train, y_train)
    classifier_accuracy = decision_tree_classifier.score(X_test, y_test)
    model_accuracies.append(classifier_accuracy)

ax = sb.distplot(model_accuracies)
ax.set_title('Average score: {:.2f} ({:.3f})'.format(np.mean(model_accuracies), np.std(model_accuracies)))
There are big variations in how our model performs for different subsets of the training data.
It does not generalize well to new data (X_test) => overfitting.
This problem is the main reason that most data scientists perform k-fold cross-validation on their models: Split the original data set into k subsets, use one of the subsets as the testing set, and the rest of the subsets are used as the training set. This process is then repeated k times such that each subset is used as the testing set exactly once. 10-fold cross-validation is the most common choice, so let's use that here. Performing 10-fold cross-validation on our data set looks something like this:
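Before using scikit-learn's helpers below, here is a minimal sketch of doing this by hand with KFold (variable names are illustrative):
In [ ]:
# Minimal sketch of 10-fold cross-validation "by hand" with KFold
from sklearn.model_selection import KFold

fold_scores = []
kf = KFold(n_splits=10, shuffle=True, random_state=1)
for train_index, test_index in kf.split(X):
    clf = DecisionTreeClassifier()
    clf.fit(X[train_index], y[train_index])
    fold_scores.append(clf.score(X[test_index], y[test_index]))
print("Fold accuracies:", ["{:.2f}".format(s) for s in fold_scores])
print("Mean accuracy: {:.2f}".format(np.mean(fold_scores)))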
In [ ]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
def plot_cv(cv, n_samples):
    masks = []
    for train, test in cv:
        mask = np.zeros(n_samples, dtype=bool)
        mask[test] = 1
        masks.append(mask)

    plt.figure(figsize=(15, 15))
    plt.imshow(masks, interpolation='none')
    plt.ylabel('Fold')
    plt.xlabel('Row #')

skf = StratifiedKFold(n_splits=10)
plot_cv(skf.split(X, y), len(y))
Stratified k-fold cross-validation: stratified means that the class proportions are kept the same across all of the folds, so each fold remains a representative subset of our data set (e.g., so we don't end up with 100% Iris setosa entries in one of the folds).
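As a quick sanity check, we can verify that each test fold contains all three classes in roughly equal proportions (a small sketch):
In [ ]:
# Class counts in each test fold produced by StratifiedKFold
for fold, (train_index, test_index) in enumerate(StratifiedKFold(n_splits=10).split(X, y)):
    classes, counts = np.unique(y[test_index], return_counts=True)
    print("Fold {}: {}".format(fold, dict(zip(classes, counts))))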
In [ ]:
from sklearn.model_selection import cross_val_score
decision_tree_classifier = DecisionTreeClassifier()
# cross_val_score returns a list of the scores, which we can visualize
# to get a reasonable estimate of our classifier's performance
cv_scores = cross_val_score(decision_tree_classifier, X, y, cv=10)
sb.distplot(cv_scores)
plt.title('Average score: {:.2f} ({:.3f})'.format(np.mean(cv_scores), np.std(cv_scores)))
In [ ]:
# max_depth=1 gives low score
decision_tree_classifier = DecisionTreeClassifier(max_depth=1)
cv_scores = cross_val_score(decision_tree_classifier, X, y, cv=10)
sb.distplot(cv_scores, kde=False)
plt.title('Average score: {:.2f}'.format(np.mean(cv_scores)))
One common, systematic way to find the best parameters for a given model and data set is Grid Search.
In [ ]:
from sklearn.model_selection import GridSearchCV
decision_tree_classifier = DecisionTreeClassifier()
# Which parameters can be searched
print(decision_tree_classifier.get_params())
parameter_grid = {'max_depth': [1, 2, 3, 4, 5],
'max_features': [1, 2, 3, 4]}
grid_search = GridSearchCV(decision_tree_classifier,
param_grid=parameter_grid,
cv=10)
grid_search.fit(X, y)
print('Best score: {:.2f}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
In [ ]:
grid_visualization = grid_search.cv_results_['mean_test_score']
depth_count = len(parameter_grid['max_depth']) # 5
feature_count = len(parameter_grid['max_features']) # 4
grid_visualization = np.array(grid_visualization)
grid_visualization.shape = (depth_count, feature_count)
sb.heatmap(grid_visualization, cmap='Blues')
plt.xticks(np.arange(feature_count) + 0.5, grid_search.param_grid['max_features'])
plt.yticks(np.arange(depth_count) + 0.5, grid_search.param_grid['max_depth'][::-1])
plt.xlabel('max_features')
plt.ylabel('max_depth')
In [ ]:
decision_tree_classifier = DecisionTreeClassifier()
parameter_grid = {'criterion': ['gini', 'entropy'],
'splitter': ['best', 'random'],
'max_depth': [1, 2, 3, 4, 5],
'max_features': [1, 2, 3, 4]}
grid_search = GridSearchCV(decision_tree_classifier,
param_grid=parameter_grid,
cv=10)
grid_search.fit(X, y)
print('Best score: {:.2f}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
In [ ]:
best_decision_tree_classifier = grid_search.best_estimator_
best_decision_tree_classifier
In [ ]:
visualize_decision_tree(best_decision_tree_classifier)
A powerful idea: Combining weak learners to make a strong one.
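To make the idea concrete before turning to Random Forests, here is a toy sketch (not exactly how RandomForestClassifier works internally): train several small, randomized trees on bootstrap samples of the training data and let them vote on each test sample. The exact accuracies will vary from run to run.
In [ ]:
# Toy sketch of bagging: bootstrap samples + majority vote over many weak trees
n_trees = 25
all_predictions = []
for i in range(n_trees):
    # Bootstrap sample: draw len(X_train) rows with replacement
    sample_idx = np.random.choice(len(X_train), size=len(X_train), replace=True)
    weak_tree = DecisionTreeClassifier(max_depth=2, max_features=2)
    weak_tree.fit(X_train[sample_idx], y_train[sample_idx])
    all_predictions.append(weak_tree.predict(X_test))
votes = np.array(all_predictions)  # shape: (n_trees, number of test samples)
majority = np.array([pd.Series(votes[:, j]).mode()[0] for j in range(votes.shape[1])])
print("Single weak tree accuracy: {:.2f}".format(np.mean(votes[0] == y_test)))
print("Majority-vote accuracy:    {:.2f}".format(np.mean(majority == y_test)))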
In [ ]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier()
parameter_grid = {'n_estimators': [5, 10, 25, 50],
'criterion': ['gini', 'entropy'],
'max_features': [1, 2, 3, 4],
'warm_start': [True, False]}
grid_search = GridSearchCV(random_forest_classifier,
param_grid=parameter_grid,
cv=10)
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
grid_search.best_estimator_
In [ ]:
# Based on https://github.com/jakevdp/PMLC-2014/blob/master/fig_code/linear_regression.py
max_x = 30
np.random.seed(0)
def generate_linear_data(n: int) -> typing.Tuple[np.ndarray, np.ndarray]:
    a = 0.5
    b = 1.0
    # generate n random values in the range [0, max_x)
    x = np.sort(max_x * np.random.random(n))
    # y = a*x + b with noise
    noise = np.random.normal(size=x.shape)
    y = a * x + b + noise
    return x, y
def create_linear_model(x: np.ndarray, y: np.ndarray) -> LinearRegression:
    # create a linear regression model
    ## linear regression attempts to draw the straight line that
    ## best minimizes the residual sum of squares between
    ## the model and the observed responses in the dataset
    model = LinearRegression()
    # scikit-learn expects 2D input, so reshape x from (n,) to (n, 1)
    X = x.reshape(-1, 1)
    model.fit(X, y)
    # From http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression.score:
    # Score for LinearRegression: the coefficient R^2 is defined as (1 - u/v),
    # where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and
    # v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum().
    # The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse).
    print("Goodness-of-fit on samples: {:.2f} (1.0 is perfect fit)".format(model.score(X, y)))
    # The *-operator unpacks an argument list (https://docs.python.org/3/tutorial/controlflow.html#unpacking-argument-lists)
    print("Model parameters: a = {:.2f}, b = {:.2f}".format(*model.coef_, model.intercept_))
    return model
def plot_results(x_samples: np.ndarray, y_samples: np.ndarray, model):
    # 100 values with equal distance from 0 to max_x
    x_pred = np.linspace(0, max_x, 100)
    # predict y from the data
    y_pred = model.predict(x_pred.reshape(-1, 1))

    ax = plt.axes()
    # Scatter plot samples
    ax.scatter(x_samples, y_samples, label='Samples')
    # Plot prediction model
    ax.plot(x_pred, y_pred, label='Model')

    # Plot error lines
    label = False
    for x, y in zip(x_samples, y_samples):
        # predict expects a 2D array, so wrap the single x value
        y_model = model.predict(np.array([[x]]))[0]
        # plot vertical error line from model to sample
        # 2 x,y pairs: (x_sample, y_model) and (x_sample, y_sample)
        if not label:  # only print label on first error line
            ax.plot([x, x], [y_model, y], 'r', label='Error')
            label = True
        else:
            ax.plot([x, x], [y_model, y], 'r')

    # Plotting specifics
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend(loc='best')
    ax.axis('tight')
In [ ]:
# Try with larger sample size than 20, e.g. 1000, and see that model parameters are closer to original line: a=0.5, b=1.0
# Try a few runs with sample size 2. Explain the Goodness-of-fit and model parameters.
x, y = generate_linear_data(20)
model = create_linear_model(x, y)
plot_results(x, y, model)
plt.show()
The line is fitted to the samples/points by minimizing the total (squared) error.
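As a quick sanity check (a sketch using only np.polyfit), the same least-squares line can be computed in closed form and should match the scikit-learn model above:
In [ ]:
# Closed-form least-squares fit of a degree-1 polynomial; should agree with the model above
a_fit, b_fit = np.polyfit(x, y, deg=1)
print("np.polyfit:   a = {:.2f}, b = {:.2f}".format(a_fit, b_fit))
print("scikit-learn: a = {:.2f}, b = {:.2f}".format(model.coef_[0], model.intercept_))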
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
n_samples = 30
max_x = 1
plot_degrees = [1, 4, 15]
degrees = list(range(1,16))
MSE = [] # list of mean squared error for given degree index
true_fun = lambda x: np.cos(1.5 * np.pi * x)
X_array = np.sort(np.random.rand(n_samples))
noise = np.random.normal(size=X_array.shape, scale=0.1)
y = true_fun(X_array) + noise
# Changing shape from (n_samples, ) to (n_samples, 1).
X = X_array.reshape(-1,1)
plt.figure(figsize=(14, 5))
plot_idx = 0
for i in range(len(degrees)):
    polynomial_features = PolynomialFeatures(degree=degrees[i],
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X, y)

    # Evaluate the models using crossvalidation
    scores = cross_val_score(pipeline, X, y,
                             scoring="neg_mean_squared_error", cv=10)

    # Training error decreases with increasing model complexity (increasing degree of fitted polynomial)
    y_pred = pipeline.predict(X)
    y_true = y
    MSE.append(mean_squared_error(y_true, y_pred))
    print("Degree: {} -- MSE {:.4f} -- CV score {:.4f}".format(degrees[i], MSE[i], -scores.mean()))

    if degrees[i] in plot_degrees:
        ax = plt.subplot(1, len(plot_degrees), plot_idx + 1)
        plt.setp(ax, xticks=(), yticks=())
        X_test = np.linspace(0, max_x, 100)
        plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
        plt.plot(X_test, true_fun(X_test), label="True function")
        plt.scatter(X.reshape(-1), y, label="Samples")
        plt.xlabel("x")
        plt.ylabel("y")
        plt.xlim((0, max_x))
        plt.ylim((-2, 2))
        plt.legend(loc="best")
        plt.title("Degree {}\nMSE = {:.3f} (+/- {:.2e})".format(
            plot_degrees[plot_idx], -scores.mean(), scores.std()))
        plot_idx += 1

plt.show()
ax_train_error = plt.axes()
ax_train_error.plot(degrees, MSE)
plt.title('Training error vs complexity')
Task: find a model of the correct complexity for the given data set.
In [ ]:
# See http://www.neural.cz/dataset-exploration-boston-house-pricing.html
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
# Loading Boston House Prices dataset
# This dataset is already cleaned (no missing values)
boston_dataset = load_boston()
# Print data set Characteristics
print(boston_dataset.DESCR)
# A dataframe in Pandas is similar to a table
df = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
# target here is price (MEDV), but note that we could have chosen another one!
df['target'] = boston_dataset.target
A model is an assumption about how the world works. What could be a reasonable model for predicting house price?
In [ ]:
#print(df.describe(percentiles=[]).T)
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('default')
df.boxplot()
In [ ]:
# Pandas styling the dataframe
def highlight_max(s):
    '''
    Highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

# Coloring the target column green. For more colors: http://www.w3schools.com/cssref/css_colors.asp
df.head(10).style.apply(lambda s: ['background-color: darkseagreen']*s.size, axis=0, subset=['target'])
In [ ]:
from pandas.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal='kde')
In [ ]:
import seaborn as sns # conventional alias
#sns.set(style="white")
# Compute the correlation matrix
corr = df.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
#cmap = sns.diverging_palette(220, 10, as_cmap=True)
# By calling heatmap twice, all correlations with absolute value above 0.6 are highlighted
sns.heatmap(corr, mask=mask, annot=True)
sns.heatmap(corr, mask= abs(corr) < 0.6, cbar=False, annot=True)
#annot_kws={"weight": "bold"})
# Draw the heatmap with the mask and correct aspect ratio
#sns.heatmap(corr, cmap=cmap, vmax=.3,
# square=True, xticklabels=5, yticklabels=5,
# linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
In [ ]:
# Look further at the correlation between RM, LSTAT and house price (MEDV)
sns.jointplot(df['RM'], df['target'])
sns.jointplot(df['LSTAT'], df['target'])
In [ ]:
# Which of the features exhibit a linear correlation with the target?
corr.iloc[-1].apply(abs).sort_values(ascending=True)
In [ ]:
#X = df[['RM', 'LSTAT']]
X = df[['LSTAT']]
y = df['target']
X.boxplot()
In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Split data into (80%) training and (20%) test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Assuming a linear relationship between the chosen feature(s) and house price
model = LinearRegression()
# Train model
model.fit(X_train, y_train)
In [ ]:
def show_results(model, X_test, y_test):
    # Predict y-values from model:
    y_pred = model.predict(X_test)
    # The coefficients
    #print('Coefficients: \n', model.coef_)
    # The mean squared error
    print("Mean squared error: %.2f" % np.mean((y_pred - y_test) ** 2))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % model.score(X_test, y_test))

    # If there are several features, plot against only one of them
    if X_test.shape[1] > 1:
        # Getting the second column
        X_linear = X_test.iloc[:, 1]
    else:
        X_linear = X_test
    # Plot outputs
    n = y_pred.shape[0]
    plt.scatter(X_linear, y_test, color='black')
    X_model = np.linspace(0, X_test.values.max(), n)
    plt.plot(X_model, model.predict(X_model.reshape(-1, 1)), color='blue', linewidth=3)
    plt.show()
In [ ]:
show_results(model, X_test, y_test)
Linear fit does not seem ideal. Let's try quadratic!
In [ ]:
# Code based on http://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Input
# X: one feature
# y: target values
# degree: highest order term in polynomial feature
def train_poly(X, y, degree=2):
    # It is still called linear regression, since it is a linear combination of polynomial terms.
    linear_regression = LinearRegression()
    polynomial_features = PolynomialFeatures(degree=degree)
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X, y)
    # Evaluate the models using crossvalidation
    scores = cross_val_score(pipeline, X, y,
                             scoring="neg_mean_squared_error", cv=10)
    print("Degree {}\nMSE = {:.2e} (+/- {:.2e})".format(
        degree, -scores.mean(), scores.std()))
    return pipeline
In [ ]:
model_poly = train_poly(X_train, y_train)
show_results(model_poly, X_test, y_test)
In [ ]:
show_results(train_poly(X_train, y_train, degree=5), X_test, y_test)
In [ ]: