"Machine learning gives computers the ability to learn without explicitly programmed." - Arthur Samuel, 1959
Machine learning grew out of pattern recognition and computational learning theory in AI. It is the study and construction of algorithms that learn from data and make predictions on it by building a model from sample inputs.
Supervised Learning: the correct output is known for each training example; the goal is to predict the output for a new input vector.
Unsupervised Learning: the goal is to learn an internal representation of the input that captures regularities and structure in the data, without any labels.
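The contrast can be made concrete with scikit-learn (loaded later in this notebook). The sketch below is illustrative only: a supervised classifier is fit on (input, label) pairs, while an unsupervised clustering model sees only the inputs.
In [ ]:
# Minimal supervised vs. unsupervised contrast (illustrative sketch)
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
X, y = load_iris(return_X_y=True)
# Supervised: the labels y guide the fit
clf = LogisticRegression().fit(X, y)
# Unsupervised: only X is used; structure is inferred without labels
km = KMeans(n_clusters=3).fit(X)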
Sklearn class and function reference page http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model
Machine Learning course from the Computer Science Department, University of Toronto http://www.cs.toronto.edu/~urtasun/courses/CSC411_Fall16/CSC411_Fall16.html
TPOT for optimized machine learning pipelines https://rhiever.github.io/tpot/
Keras package for neural networks https://keras.io/
Tensorflow Playground for Deep Neural Networks http://playground.tensorflow.org/#activation=tanh&batchSize=10&dataset=circle&regDataset=reg-plane&learningRate=0.03&regularizationRate=0&noise=0&networkShape=4,2&seed=0.69754&showTestData=false&discretize=false&percTrainData=50&x=true&y=true&xTimesY=false&xSquared=false&ySquared=false&cosX=false&sinX=false&cosY=false&sinY=false&collectStats=false&problem=classification&initZero=false&hideText=false
In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Datasets
import numpy as np
import pandas as pd
from sklearn import datasets
In [ ]:
iris = datasets.load_iris()
iris_df = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
columns = iris['feature_names'] + ['target'])
iris_df.head()
In [ ]:
# Feature Correlations
iris_df.iloc[:,:4].corr()
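The same correlation matrix can also be viewed as a heatmap; a minimal sketch with matplotlib, assuming the iris_df defined above:
In [ ]:
# Correlation heatmap of the four iris features (sketch)
corr = iris_df.iloc[:, :4].corr()
plt.figure(figsize=(5, 5))
plt.imshow(corr, interpolation='nearest', cmap=plt.cm.Blues)
plt.colorbar()
ticks = np.arange(len(corr.columns))
plt.xticks(ticks, corr.columns, rotation=45, ha='right')
plt.yticks(ticks, corr.columns)
plt.title('Iris feature correlations')
plt.show()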
In [ ]:
# Feature Relationships
X = iris.data
Y = iris.target
key = {0: ('red', 'Iris setosa'),
1: ('blue', 'Iris versicolor'),
2: ('green', 'Iris virginica')}
colors = [key[index][0] for index in Y]
# Plot the training points
# Sepal information
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=colors)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.title('Sepal Information')
# Petal information
plt.subplot(222)
plt.scatter(X[:, 2], X[:, 3], c=colors)
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.title('Petal Information')
# Length information
plt.subplot(223)
plt.scatter(X[:, 0], X[:, 2], c=colors)
plt.xlabel('Sepal length')
plt.ylabel('Petal length')
plt.title('Length Information')
# Width information
plt.subplot(224)
plt.scatter(X[:, 1], X[:, 3], c=colors)
plt.xlabel('Sepal width')
plt.ylabel('Petal width')
plt.title('Width Information')
# Plot legend
patches = [matplotlib.patches.Patch(color=color, label=label) for color, label in key.values()]
plt.legend(handles=patches, labels=[label for _, label in key.values()],
bbox_to_anchor = (0.1,-0.1,1,1), bbox_transform = plt.gcf().transFigure)
plt.show()
In [ ]:
# Applying PCA to iris features
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(iris.data)
plt.figure(figsize=(6, 6))
plt.scatter(iris_pca[:, 0], iris_pca[:, 1], c=colors)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA')
plt.legend(handles=patches, labels=[label for _, label in key.values()],
bbox_to_anchor = (0.2,-0.1,1,1), bbox_transform = plt.gcf().transFigure)
plt.show()
In [ ]:
# Interpreting features from PCA
from scipy import stats
pca_corr = pd.DataFrame(columns = iris['feature_names'])
for i in range(2):
    print('PC%d explained variance = %.2f%s\n' % (i+1, pca.explained_variance_ratio_[i]*100, '%'))
    corr = []
    for j in range(4):
        corr.append(stats.pearsonr(iris_pca[:, i], iris.data[:, j])[0])
    pca_corr.loc[i, ] = corr
pca_corr = pca_corr.rename(index={0: 'PC1', 1: 'PC2'})
pca_corr
In [ ]:
# Scaling data
from sklearn import preprocessing
X = iris.data
Y = iris.target
# Fit the scaler on the complete dataset and transform it in one step
X_scaled = preprocessing.StandardScaler().fit_transform(X)
# Alternatively, fit the scaler separately (e.g. on a subset of the data) and then transform
#scaler = preprocessing.StandardScaler().fit(X)
#X_scaled = scaler.transform(X)
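When the data will later be split into training and test sets, a common pattern is to fit the scaler on the training portion only and reuse it for the test portion, so that no test-set statistics leak into preprocessing. A minimal sketch (train_test_split is also used in later cells):
In [ ]:
# Fit the scaler on the training split only, then transform both splits (sketch)
from sklearn.model_selection import train_test_split
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.2, random_state=0)
scaler = preprocessing.StandardScaler().fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)  # uses the training means and variances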
In [ ]:
# Scaled Feature Relationships
# Plot the training points
# Sepal information
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=colors)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.title('Sepal Information')
# Petal information
plt.subplot(222)
plt.scatter(X_scaled[:, 2], X_scaled[:, 3], c=colors)
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.title('Petal Information')
# Length information
plt.subplot(223)
plt.scatter(X_scaled[:, 0], X_scaled[:, 2], c=colors)
plt.xlabel('Sepal length')
plt.ylabel('Petal length')
plt.title('Length Information')
# Width information
plt.subplot(224)
plt.scatter(X_scaled[:, 1], X_scaled[:, 3], c=colors)
plt.xlabel('Sepal width')
plt.ylabel('Petal width')
plt.title('Width Information')
# Plot legend
patches = [matplotlib.patches.Patch(color=color, label=label) for color, label in key.values()]
plt.legend(handles=patches, labels=[label for _, label in key.values()],
bbox_to_anchor = (0.1,-0.1,1,1), bbox_transform = plt.gcf().transFigure)
plt.show()
In [ ]:
# Applying PCA to iris features
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
iris_pca_scaled = pca.fit_transform(X_scaled)
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.scatter(iris_pca_scaled[:, 0], iris_pca_scaled[:, 1], c=colors)
plt.xlim([-3,3])
plt.ylim([-3,3])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA with Scaled Data')
# PCA on original data from the previous cell
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(iris.data)
plt.subplot(122)
plt.scatter(iris_pca[:, 0], iris_pca[:, 1], c=colors)
plt.xlim([-3,3])
plt.ylim([-3,3])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA')
plt.legend(handles=patches, labels=[label for _, label in key.values()],
bbox_to_anchor = (0.1,-0.1,1,1), bbox_transform = plt.gcf().transFigure)
plt.show()
In [ ]:
# Interpreting features from PCA
from scipy import stats
pca_corr = pd.DataFrame(columns = iris['feature_names'])
# The pca object above was last fit on the unscaled data, so refit it on the scaled data
pca = PCA(n_components=2).fit(X_scaled)
for i in range(2):
    print('PC%d explained variance = %.2f%s\n' % (i+1, pca.explained_variance_ratio_[i]*100, '%'))
    corr = []
    for j in range(4):
        corr.append(stats.pearsonr(iris_pca_scaled[:, i], X_scaled[:, j])[0])
    pca_corr.loc[i, ] = corr
pca_corr = pca_corr.rename(index={0: 'PC1', 1: 'PC2'})
pca_corr
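The loadings can also be read directly from the fitted PCA object: pca.components_ holds one row of feature weights per principal component. A minimal sketch, assuming the PCA fit on X_scaled above:
In [ ]:
# PCA loadings taken directly from the fitted model (sketch)
loadings = pd.DataFrame(pca.components_,
                        columns=iris['feature_names'],
                        index=['PC1', 'PC2'])
loadings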
In [ ]:
# Iris with 2 classes
iris_df2 = iris_df[(iris_df.target == 0) | (iris_df.target == 1)]
# Plot 2-Class Iris data
iris2 = np.asarray(iris_df2.iloc[:, :4])
Y2 = list(iris_df2.target)
key = {0: ('red', 'Iris setosa'),
1: ('blue', 'Iris versicolor')}
colors = [key[index][0] for index in Y2]
# Plot the training points
# Sepal information
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.scatter(iris2[:, 0], iris2[:, 1], c=colors)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.title('Sepal Information')
# Width information
plt.subplot(122)
plt.scatter(iris2[:, 2], iris2[:, 3], c=colors)
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.title('Petal Information')
# Plot legend
patches = [matplotlib.patches.Patch(color=color, label=label) for color, label in key.values()]
plt.legend(handles=patches, labels=[label for _, label in key.values()],
bbox_to_anchor = (0.1,-0.1,1,1), bbox_transform = plt.gcf().transFigure)
plt.show()
In [ ]:
# Binary Classification - Logistic Regression and Linear Regression as a means of Classification
from sklearn import linear_model
# Model fit
lr = linear_model.LogisticRegression()
#lr = linear_model.LinearRegression()
lr.fit(iris2, Y2)
# Predict values
Y_pred = lr.predict(iris2)
# Calculate performance - accuracy
lr.score(iris2, Y2)
# Using some features
#lr.fit(iris2[:,:2], Y2)
#Y_pred = lr.predict(iris2[:,:2])
#lr.score(iris2[:,:2], Y2)
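With only the two sepal features, the fitted decision boundary can be drawn over the scatter plot; a minimal sketch, refitting the model on the first two columns as in the commented lines above:
In [ ]:
# Decision regions of logistic regression on the two sepal features (sketch)
lr2 = linear_model.LogisticRegression()
lr2.fit(iris2[:, :2], Y2)
x_min, x_max = iris2[:, 0].min() - 0.5, iris2[:, 0].max() + 0.5
y_min, y_max = iris2[:, 1].min() - 0.5, iris2[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
zz = lr2.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.figure(figsize=(6, 6))
plt.contourf(xx, yy, zz, alpha=0.2, cmap=plt.cm.coolwarm)
plt.scatter(iris2[:, 0], iris2[:, 1], c=colors)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.title('Logistic regression decision regions')
plt.show()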
In [ ]:
# Multiclass Classification - Logistic Regression
from sklearn import linear_model
# Load data to classify: Complete, partial, scaled, PCA applied
X = iris.data
#X = iris.data[:,:2]
#X = X_scaled
#X = iris_pca
Y = iris.target
# Model fit
lr_multi = linear_model.LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg')
lr_multi.fit(X, Y)
# Predict values
Y_pred = lr_multi.predict(X)
# Calculate performance - accuracy
lr_multi.score(X, Y)
In [ ]:
# Model validation: train/test split
from sklearn.model_selection import train_test_split
# Split labeled data into training and test set
X = iris.data
#X = iris.data[:,:2]
#X = X_scaled
#X = iris_pca
X_train, X_test, Y_train, Y_test = train_test_split(X, iris.target, test_size=0.2, random_state=0)
print('Training set size')
print(X_train.shape, Y_train.shape)
print('Test set size')
print(X_test.shape, Y_test.shape)
# Model fit
lr_multi = linear_model.LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg')
lr_multi.fit(X_train, Y_train)
# Predict values
Y_pred = lr_multi.predict(X_test)
# Calculate performance - accuracy
lr_multi.score(X_test, Y_test)
In [ ]:
# Plot confusion matrix
from sklearn.metrics import confusion_matrix
import itertools
# Enter values
model = lr_multi
data = X_test
actual_target = Y_test
predicted_target = Y_pred
classes = ['Iris setosa', 'Iris versicolor', 'Iris virginica']
# Calculates confusion matrix from actual and predicted target values
cnf = confusion_matrix(actual_target, predicted_target)
# Accuracy results
acc = model.score(data, actual_target)*100
def plot_confusion_matrix(cm, classes, acc):
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix: Acc %.2f%%' % acc)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, ha='right')
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
plot_confusion_matrix(cm=cnf, classes=classes, acc=acc)
In [ ]:
# Classification using Support Vector Machines
from sklearn import svm
from sklearn.model_selection import cross_val_score
# Split labeled data into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=0)
# Model fit - SVM with linear kernel
svm_ = svm.SVC(kernel='linear')
# Model fit - SVM with non-linear kernel (RBF) for data not linearly separable
#svm_ = svm.SVC(kernel='rbf')
svm_.fit(X_train, Y_train)
# Predict values
Y_pred = svm_.predict(X_test)
# Calculate performance - accuracy
print('Held-out test accuracy: %.2f' % svm_.score(X_test, Y_test))
scores = cross_val_score(svm_, iris.data, iris.target, cv=5)
print('\n5-fold CV accuracies:')
print(scores)
print('\n5-fold CV average accuracy: %.2f' % np.mean(scores))
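The kernel and regularization strength can also be chosen by a cross-validated grid search instead of being fixed by hand. A minimal sketch with an illustrative parameter grid (the values shown are not tuned):
In [ ]:
# Cross-validated hyperparameter search for the SVM (sketch)
from sklearn.model_selection import GridSearchCV
param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}  # illustrative values
grid = GridSearchCV(svm.SVC(), param_grid, cv=5)
grid.fit(X_train, Y_train)
print('Best parameters:', grid.best_params_)
print('Best 5-fold CV accuracy: %.2f' % grid.best_score_)
print('Held-out test accuracy: %.2f' % grid.score(X_test, Y_test))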
In [ ]:
# Working with image dataset of handwritten digits
digits = datasets.load_digits()
X_d = digits.data
Y_d = digits.target
print('Digits image shape: %d x %d pixels' % (digits.images.shape[1], digits.images.shape[2]))
print('Digits data shape: %d x %d' % (digits.data.shape[0], digits.data.shape[1]))
print('Digit targets: ', set(Y_d))
In [ ]:
# Model fit - using cross-validation
X_train, X_test, Y_train, Y_test = train_test_split(digits.data, digits.target, test_size=0.4, random_state=0)
lr_multi_d = linear_model.LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg')
# SVM produces higher accuracy (try kernel = 'rbf')
#lr_multi_d = svm.SVC(kernel='linear')
lr_multi_d.fit(X_train, Y_train)
# Predict values
Y_pred = lr_multi_d.predict(X_test)
# Enter values for confusion matrix
model = lr_multi_d
data = X_test
actual_target = Y_test
predicted_target = Y_pred
classes = digits.target_names
# Calculates confusion matrix from actual and predicted target values
cnf = confusion_matrix(actual_target, predicted_target)
# Accuracy results
acc = model.score(data, actual_target)*100
plot_confusion_matrix(cm=cnf, classes=classes, acc=acc)
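To see what the classifier gets wrong, the misclassified test digits can be reshaped back to 8x8 images and displayed; a minimal sketch:
In [ ]:
# Show a few misclassified test digits (sketch)
wrong = np.where(Y_pred != Y_test)[0]
plt.figure(figsize=(12, 3))
for k, idx in enumerate(wrong[:8]):
    plt.subplot(1, 8, k + 1)
    plt.imshow(X_test[idx].reshape(8, 8), cmap=plt.cm.gray_r)
    plt.title('true %d, pred %d' % (Y_test[idx], Y_pred[idx]))
    plt.axis('off')
plt.show()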
In [ ]:
# K-means Clustering
from sklearn.cluster import KMeans
X = iris.data
Y = iris.target
# Initialization is important in k-means
# k-means++ selects initial cluster centers for k-means clustering in a way that speeds up convergence
km = KMeans(n_clusters = 3, init='k-means++')
# Random initialization performs worse
#km = KMeans(n_clusters = 3, init='random')
# Cluster centers learned from a previous successful run
#km = KMeans(n_clusters = 3, init=cc)
Y_pred = km.fit_predict(X)
cc = km.cluster_centers_
# Visualize datapoints with targets and cluster centers
# Actual targets
key = {0: ('red', 'Iris setosa'),
1: ('blue', 'Iris versicolor'),
2: ('green', 'Iris virginica')}
colors_target = [key[index][0] for index in Y]
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=colors_target)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.title('Targets')
patches = [matplotlib.patches.Patch(color=color, label=label) for color, label in key.values()]
plt.legend(handles=patches, labels=[label for _, label in key.values()],
bbox_to_anchor = (0.1,-0.1,1,1), bbox_transform = plt.gcf().transFigure)
# Cluster Results
key = {0: ('pink', 'Cluster-1'),
1: ('orange', 'Cluster-2'),
2: ('purple', 'Cluster-3')}
colors_cluster = [key[index][0] for index in Y_pred]
plt.subplot(122)
plt.scatter(X[:, 0], X[:, 1], c=colors_cluster)
plt.scatter(cc[:, 0], cc[:, 1], c='black', marker='x', s=60) # Plot cluster centers
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.title('Clusters')
# Plot legend
patches = [matplotlib.patches.Patch(color=color, label=label) for color, label in key.values()]
plt.legend(handles=patches, labels=[label for _, label in key.values()],
bbox_to_anchor = (0.075,-0.3,1,1), bbox_transform = plt.gcf().transFigure)
plt.show()
In [ ]:
# Enter values for confusion matrix
model = km
data = X
actual_target = Y
predicted_target = Y_pred
classes = iris.target_names
# Plot cluster results and labels
cnf = confusion_matrix(actual_target, predicted_target)
plt.imshow(cnf, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Clusters with corresponding targets')
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, np.unique(Y_pred))
plt.yticks(tick_marks, classes)
thresh = cnf.max() / 2.
for i, j in itertools.product(range(cnf.shape[0]), range(cnf.shape[1])):
    plt.text(j, i, cnf[i, j], horizontalalignment='center',
             color='white' if cnf[i, j] > thresh else 'black')
plt.ylabel('True label')
plt.xlabel('Cluster assignment')
plt.show()
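Because k-means assigns arbitrary cluster IDs, the diagonal of this matrix is not an accuracy by itself; a permutation-invariant score such as the adjusted Rand index summarizes how well clusters and targets agree. A minimal sketch:
In [ ]:
# Permutation-invariant agreement between clusters and targets (sketch)
from sklearn.metrics import adjusted_rand_score
print('Adjusted Rand index: %.2f' % adjusted_rand_score(Y, Y_pred))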
In [ ]:
boston = datasets.load_boston()
boston_df = pd.DataFrame(data = np.c_[boston['data'], boston['target']],
columns = list(boston['feature_names']) + ['target'])
print('Boston Housing Prices: dataframe shape')
print(boston_df.shape)
boston_df.head()
In [ ]:
# Correlation between features and target value
from scipy import stats
X = boston_df
Y = boston.target
features = boston['feature_names']
# Plot correlation for each feature
i=1
plt.figure(figsize=(20, 20))
for f in features:
    corr = stats.pearsonr(list(X[f]), Y)[0]
    plt.subplot(4, 4, i)
    plt.scatter(list(X[f]), Y)
    plt.xlabel(f)
    plt.ylabel('Target Housing Price')
    plt.title('%s and target correlation = %.2f' % (f, corr))
    i += 1
In [ ]:
# Linear Regression
from sklearn import linear_model
# Data for regression
X = boston.data
Y = boston.target
# Apply PCA
pca = PCA(n_components=3)
#X = pca.fit_transform(X)
# 2 features with highest correlation
#X = np.c_[list(boston_df.RM), list(boston_df.LSTAT)]
# Model - Linear, Lasso, Ridge, normalized or not
#lr = linear_model.LinearRegression(normalize=False)
lr = linear_model.Lasso(normalize=False)
#lr = linear_model.Ridge(normalize=False)
# Model fit
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
lr.fit(X_train, Y_train)
# Score: R^2 (coefficient of determination) on the test set
lr.score(X_test, Y_test)
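Beyond the single R^2 value returned by .score, the test-set error can be reported explicitly; a minimal sketch using sklearn.metrics:
In [ ]:
# Explicit regression metrics on the held-out test set (sketch)
from sklearn.metrics import mean_squared_error, r2_score
Y_pred = lr.predict(X_test)
print('Test MSE: %.2f' % mean_squared_error(Y_test, Y_pred))
print('Test R^2: %.2f' % r2_score(Y_test, Y_pred))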
In [ ]:
# Plot Percentage of Variance Explained with additional Principal Components
exp_var = []
for i in range(1, X.shape[1] + 1):
    pca = PCA(n_components=i)
    pca.fit(X)
    exp_var.append(sum(pca.explained_variance_ratio_))
plt.figure(figsize=(6, 6))
plt.plot(range(1, X.shape[1] + 1), exp_var)
plt.xlabel('Number of PCs')
plt.ylabel('Total % of variance explained')
plt.title('PCA')
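scikit-learn can also choose the number of components automatically: passing a float to n_components keeps just enough components to explain at least that fraction of the variance. A minimal sketch:
In [ ]:
# Retain enough components to explain 95% of the variance (sketch)
pca95 = PCA(n_components=0.95)
pca95.fit(X)
print('Components retained: %d' % pca95.n_components_)
print('Variance explained: %.2f%%' % (100 * sum(pca95.explained_variance_ratio_)))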
In [ ]:
# Classification
from tpot import TPOTClassifier
from sklearn import datasets
# pip install tpot
data = datasets.load_iris()
#data = datasets.load_digits()
export_file = 'tpot_pipeline_iris.py'
X = data.data
Y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export(export_file)
In [ ]:
# Regression
from tpot import TPOTRegressor
from sklearn import datasets
data = datasets.load_boston()
export_file = 'tpot_pipeline_boston.py'
X = data.data
Y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export(export_file)
y_pred = tpot.predict(X_test)
print('\nCorrelation between predicted and actual target values: %.2f' % stats.pearsonr(y_pred, y_test)[0])
In [ ]: