Pip in a virtual environment

# Set up a virtual environment
/path/to/python3 -m venv ml_tutorial
source ml_tutorial/bin/activate

# Upgrade pip to newest version
python -m pip install --upgrade pip

# Install the dependencies
python -m pip install numpy scipy scikit-learn

Conda

# After installing conda or miniconda
conda create -n ml_tutorial python=3.5 scikit-learn
source activate ml_tutorial

# Activate the Conda environment on Windows:
activate ml_tutorial

Optional

pip install jupyter matplotlib

# Inside a Conda environment (these are already installed in the base distribution)
conda install jupyter matplotlib

Testing Your Installation

If the following works you are ready for this workshop.

import numpy as np
import sklearn

# Load the workshop dataset (a zipped numpy archive) and unpack the two
# arrays stored inside it.  Replace 'path/to/data.npz' with the actual
# location of the file you downloaded.
dataset = np.load('path/to/data.npz')
images = dataset['images']
labels = dataset['labels']

In [ ]:
# Import the libraries


# Load the dataset

In [ ]:
import numpy as np

# Optional for plotting
import matplotlib.pyplot as plt

# Optional for notebook display
%matplotlib inline

# Load the compressed numpy archive and unpack the two arrays it holds.
data = np.load('data.npz')
images = data['images']
labels = data['labels']

# Shape is (n_images, height, width); the dtype matters for the pixel
# arithmetic later in the notebook (see the datatype exercise below).
print(images.shape)
print(images.dtype)

In [ ]:
# Image Properties

In [ ]:
# Image Properties

# Get the first image:
# Indexing the first axis selects one image from the stack.
first_image = images[0] # Or equivalently: images[0, :, :]
print(first_image.shape)  # (height, width) of a single image

# Get first three images:
# Slicing keeps the leading axis, so the result is still a 3-D stack.
several_images = images[:3]
print(several_images.shape)

Our Data


In [ ]:
# Slicing and dicing

In [ ]:
# Slicing and dicing
# Crop a 16x16 window from the image by slicing rows 4..19 and
# columns 4..19 (assumes the images are at least 20x20 — confirm size).
cropped_image = first_image[4:20,4:20]
print(cropped_image.shape)

In [ ]:
# Getting our data in the right format

In [ ]:
# Getting our data in the right format

# scikit-learn expects a 2-D (n_samples, n_features) matrix, so flatten
# every image into a single feature vector.  Using len(images) instead
# of the hard-coded sample count (730) keeps this working if the
# dataset changes size; -1 lets numpy infer the feature dimension.
X = np.reshape(images, (len(images), -1))
print(X.shape)

In [ ]:
# Modifying arrays

In [ ]:
# Modifying arrays

# NOTE: if the pixels are stored as uint8, adding/subtracting wraps
# around (e.g. 250 + 30 -> 24) and multiplying by a float promotes the
# result to float64 — this is exactly the datatype pitfall asked about
# in the exercises below.
increase_brightness = first_image + 30
decrease_brightness = first_image - 30
increase_contrast = first_image * 1.5
decrease_contrast = first_image * 0.5

# Stack each pair side by side for visual comparison.
brightness_compare = np.hstack((increase_brightness, decrease_brightness))
contrast_compare = np.hstack((increase_contrast, decrease_contrast))  # fixed typo: was "constrast_compare"

plt.figure(figsize = (15, 12))
plt.title('Brightness')
plt.axis('off')
plt.imshow(brightness_compare, cmap='gray')

plt.figure(figsize = (15, 12))
plt.title('Contrast')
plt.axis('off')
plt.imshow(contrast_compare, cmap='gray')

Questions/Exercises

  1. Why do we have to be careful with the datatype for numerical operations?

| Tag Number | Pattern   | Details                                      |
|------------|-----------|----------------------------------------------|
| 0          | Rectangle | 100 bees that were the control group         |
| 1          | Circle    | 100 bees that were treated with caffeine     |
| 2          | Blank     | Single queen in the colony received this tag |

In [ ]:
# The labels to go with the images

In [ ]:
# The labels to go with the images

labels = data['labels']

# One integer tag class per image; printing every 100th element gives a
# quick look at how the classes are laid out in the array.
print(labels.shape)
print(labels[::100])

# Conventional scikit-learn name for the target vector.
y = labels

In [ ]:
# Split off a testing set

In [ ]:
# Split off a testing set

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; fixing random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=4)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [ ]:
# Visualise the data

In [ ]:
# Visualise the data
from sklearn.decomposition import PCA

# Project the high-dimensional feature vectors down to 2 unsupervised
# principal components so the whole dataset can be scatter-plotted.
pca = PCA(n_components=2)
fit_trans_X = pca.fit(X).transform(X)
plt.figure(figsize = (35, 20))
# Colour each point by its class label.
plt.scatter(fit_trans_X[:, 0], fit_trans_X[:, 1], c=y, s=400)

In [ ]:
# A better transformation

In [ ]:
# A better transformation

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Unlike PCA, LDA is supervised: it uses the labels to find the
# projection that best separates the classes.  Fit on the training
# split only, so the test set remains unseen.
lda = LDA(n_components=2)
lda_model = lda.fit(X_train, y_train)
X_trans = lda_model.transform(X_train)
plt.figure(figsize = (35, 20))
# Colour each projected training point by its class label.
plt.scatter(X_trans[:, 0], X_trans[:, 1], c=y_train, s=400)


In [ ]:
# Now lets make some predictions

In [ ]:
# Now lets make some predictions
from sklearn import svm

# Support vector classifier trained on the 2-D LDA projection of the
# training data (X_trans) rather than the raw pixels.
clf = svm.SVC(gamma=0.0001, C=10)
clf.fit(X_trans, y_train)

In [ ]:
# How well did we do?

In [ ]:
# How well did we do?
# Apply the LDA projection learned on the training data to the test
# set, then classify the projected points with the fitted SVM.
transform_testing_set = lda.transform(X_test)
y_pred = clf.predict(transform_testing_set)

from sklearn import metrics

# Fraction of test images whose predicted tag class matches the truth.
print (metrics.accuracy_score(y_test, y_pred))

Exercises

  1. Replace the SVM classifier with any other classifier from the scikit-learn library.
  2. Compare the classification performance both with and without LDA.
  3. What happens if we evaluate our performance on the training set?

In [ ]:
# Let's build a pipeline

In [ ]:
# Let's build a pipeline
from sklearn.pipeline import Pipeline

# Start by creating the individual steps in the pipeline.
# LDA can produce at most n_classes - 1 components; with the 3 tag
# classes in this dataset that is a maximum of 2 — modern scikit-learn
# raises a ValueError for anything larger (the original used 20).
lda = LDA(n_components=2)
svc = svm.SVC(gamma=0.0001, C=10)

# Chain the steps: each sample is LDA-transformed, then classified.
pipeline = Pipeline([('lda', lda),
                     ('svc', svc)])

pipeline.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
pipeline.score(X_test, y_test)

In [ ]:
# Let's write an adapter for our images to the sklearn API
from sklearn.base import TransformerMixin, BaseEstimator

class CropUnwrap(TransformerMixin, BaseEstimator):
    """Crop a border of `crop_pixels` from each image and flatten it.

    Adapts a stack of 2-D images, shape (n_samples, height, width), to
    the flat (n_samples, n_features) layout the scikit-learn API expects.
    """

    def __init__(self, crop_pixels=0):
        # Number of pixels to trim from every edge of each image.
        self.crop_pixels = crop_pixels

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # Use the actual image size instead of the hard-coded 24 so the
        # transformer works for any image dimensions.
        c = self.crop_pixels
        n_samples, height, width = X.shape
        X_crop = X[:, c:height - c, c:width - c]
        # BUG FIX: the original reshaped the *uncropped* X with the
        # uncropped dimensions, so crop_pixels had no effect on the
        # output at all.
        n, h, w = X_crop.shape
        return X_crop.reshape((n, h * w))

In [ ]:
# Put it all together

In [ ]:
# Put it all together
unwrap = CropUnwrap()
# LDA supports at most n_classes - 1 = 2 components for this 3-class
# problem; modern scikit-learn raises for larger values (was 20).
lda = LDA(n_components=2)
svc = svm.SVC(gamma=0.0001)

pipeline = Pipeline([('unwrap', unwrap),
                     ('lda', lda),
                     ('svm', svc)])

# Now we don't need X_train — the pipeline starts from the raw image
# stack, because CropUnwrap flattens it into feature vectors first.
pipeline.fit(images, labels)

In [ ]:
# Finding good parameters

# NOTE: sklearn.grid_search was removed in scikit-learn 0.20;
# RandomizedSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each pipeline step's parameters.  LDA allows at
# most n_classes - 1 = 2 components here, so only 1 and 2 are valid
# (the original list of 5..50 would make every fit fail).
search_range = {'lda__n_components': [1, 2],
                'unwrap__crop_pixels': [0, 2, 4, 6, 8],
                'svm__C': [1, 10, 100, 1000, 1e4, 1e5]}

# Sample 20 random parameter combinations and cross-validate each one.
searcher = RandomizedSearchCV(pipeline, search_range, n_iter=20)

searcher.fit(images, labels)

In [ ]:
# Testing it out

In [ ]:
# Testing it out
print(searcher.best_score_)   # best cross-validated score found
print(searcher.best_params_)  # parameter combination that achieved it
# NOTE: this scores on the same data the search was fitted on, so the
# number is optimistic — see question 1 below.
searcher.score(images, labels)

Questions/Exercises

  1. Why is searcher.score(images, labels) not an accurate measure of performance? How do we fix it?
  2. Try out some other classifiers in the pipeline - can you improve the final performance on the test set? What about with other search parameters?
  3. Is accuracy really the best measure of performance for this dataset?

In [ ]:
# Solution to one:

# Hold out 20% of the raw images *before* running the search, so the
# final score is measured on data the search never saw.
image_train, image_test, label_train, label_test = train_test_split(images, 
                                                                    labels, 
                                                                    test_size=0.2)
searcher.fit(image_train, label_train)
searcher.score(image_test, label_test)

In [ ]:


In [ ]: