Pip in a virtual environment

# Set up a virtual environment
/path/to/python3 -m venv ml_tutorial
source ml_tutorial/bin/activate

# Upgrade pip to newest version
python -m pip install --upgrade pip

# Install the dependencies
python -m pip install numpy scipy scikit-learn

Conda

# After installing conda or miniconda
conda create -n ml_tutorial python=3.5 scikit-learn
source activate ml_tutorial

# Activate the Conda environment on Windows:
activate ml_tutorial

Optional

pip install jupyter matplotlib

# Inside a Conda environment (these are already installed in the base distribution)
conda install jupyter matplotlib

Testing Your Installation

If the following works you are ready for this workshop.

import numpy as np
import sklearn

# Load the workshop dataset (a zipped numpy archive) and unpack the two
# arrays stored inside it.  Replace 'path/to/data.npz' with the actual
# location of the file you downloaded.
dataset = np.load('path/to/data.npz')
images = dataset['images']
labels = dataset['labels']

In [ ]:
# Import the libraries


# Load the dataset

In [ ]:
import numpy as np

# Optional for plotting
import matplotlib.pyplot as plt

# Optional for notebook display
%matplotlib inline

# Load the compressed numpy archive and unpack the two arrays it holds.
data = np.load('data.npz')
images = data['images']
labels = data['labels']

# Shape is (n_images, height, width); the dtype matters for the pixel
# arithmetic later in the notebook (see the datatype exercise below).
print(images.shape)
print(images.dtype)

In [ ]:
# Image Properties

In [ ]:
# Image Properties

# Get the first image:
# Indexing the first axis selects one image from the stack.
first_image = images[0] # Or equivalently: images[0, :, :]
print(first_image.shape)  # (height, width) of a single image

# Get first three images:
# Slicing keeps the leading axis, so the result is still a 3-D stack.
several_images = images[:3]
print(several_images.shape)

Our Data


In [ ]:
# Slicing and dicing

In [ ]:
# Slicing and dicing
# Crop a 16x16 window from the image by slicing rows 4..19 and
# columns 4..19 (assumes the images are at least 20x20 — confirm size).
cropped_image = first_image[4:20,4:20]
print(cropped_image.shape)

In [ ]:
# Getting our data in the right format

In [ ]:
# Getting our data in the right format

# scikit-learn expects a 2-D (n_samples, n_features) matrix, so flatten
# every image into a single feature vector.  Using len(images) instead
# of the hard-coded sample count (730) keeps this working if the
# dataset changes size; -1 lets numpy infer the feature dimension.
X = np.reshape(images, (len(images), -1))
print(X.shape)

In [ ]:
# Modifying arrays

In [ ]:
# Modifying arrays

# NOTE: if the pixels are stored as uint8, adding/subtracting wraps
# around (e.g. 250 + 30 -> 24) and multiplying by a float promotes the
# result to float64 — this is exactly the datatype pitfall asked about
# in the exercises below.
increase_brightness = first_image + 30
decrease_brightness = first_image - 30
increase_contrast = first_image * 1.5
decrease_contrast = first_image * 0.5

# Stack each pair side by side for visual comparison.
brightness_compare = np.hstack((increase_brightness, decrease_brightness))
contrast_compare = np.hstack((increase_contrast, decrease_contrast))  # fixed typo: was "constrast_compare"

plt.figure(figsize = (15, 12))
plt.title('Brightness')
plt.axis('off')
plt.imshow(brightness_compare, cmap='gray')

plt.figure(figsize = (15, 12))
plt.title('Contrast')
plt.axis('off')
plt.imshow(contrast_compare, cmap='gray')

Questions/Exercises

  1. Why do we have to be careful with the datatype for numerical operations?

| Tag Number | Pattern   | Details                                      |
|------------|-----------|----------------------------------------------|
| 0          | Rectangle | 100 bees that were the control group         |
| 1          | Circle    | 100 bees that were treated with caffeine     |
| 2          | Blank     | Single queen in the colony received this tag |

In [ ]:
# The labels to go with the images

In [ ]:
# The labels to go with the images

labels = data['labels']

# One integer tag class per image; printing every 100th element gives a
# quick look at how the classes are laid out in the array.
print(labels.shape)
print(labels[::100])

# Conventional scikit-learn name for the target vector.
y = labels

In [ ]:
# Split off a testing set

In [ ]:
# Split off a testing set

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; fixing random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=4)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [ ]:
# Visualise the data

In [ ]:
# Visualise the data
from sklearn.decomposition import PCA

# Project the high-dimensional feature vectors down to 2 unsupervised
# principal components so the whole dataset can be scatter-plotted.
pca = PCA(n_components=2)
fit_trans_X = pca.fit(X).transform(X)
plt.figure(figsize = (35, 20))
# Colour each point by its class label.
plt.scatter(fit_trans_X[:, 0], fit_trans_X[:, 1], c=y, s=400)

In [ ]:
# A better transformation

In [ ]:
# A better transformation

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Unlike PCA, LDA is supervised: it uses the labels to find the
# projection that best separates the classes.  Fit on the training
# split only, so the test set remains unseen.
lda = LDA(n_components=2)
lda_model = lda.fit(X_train, y_train)
X_trans = lda_model.transform(X_train)
plt.figure(figsize = (35, 20))
# Colour each projected training point by its class label.
plt.scatter(X_trans[:, 0], X_trans[:, 1], c=y_train, s=400)


In [ ]:
# Now lets make some predictions

In [ ]:
# Now lets make some predictions
from sklearn import svm

# Support vector classifier trained on the 2-D LDA projection of the
# training data (X_trans) rather than the raw pixels.
clf = svm.SVC(gamma=0.0001, C=10)
clf.fit(X_trans, y_train)

In [ ]:
# How well did we do?

In [ ]:
# How well did we do?
# Apply the LDA projection learned on the training data to the test
# set, then classify the projected points with the fitted SVM.
transform_testing_set = lda.transform(X_test)
y_pred = clf.predict(transform_testing_set)

from sklearn import metrics

# Fraction of test images whose predicted tag class matches the truth.
print (metrics.accuracy_score(y_test, y_pred))

Exercises

  1. Replace the SVM classifier with any other classifier from the scikit-learn library.
  2. Compare the classification performance both with and without LDA.
  3. What happens if we evaluate our performance on the training set?

In [ ]:
# Let's build a pipeline

In [ ]:
# Let's build a pipeline
from sklearn.pipeline import Pipeline

# Start by creating the individual steps in the pipeline.
# LDA can produce at most n_classes - 1 components; with the 3 tag
# classes in this dataset that is a maximum of 2 — modern scikit-learn
# raises a ValueError for anything larger (the original used 20).
lda = LDA(n_components=2)
svc = svm.SVC(gamma=0.0001, C=10)

# Chain the steps: each sample is LDA-transformed, then classified.
pipeline = Pipeline([('lda', lda),
                     ('svc', svc)])

pipeline.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
pipeline.score(X_test, y_test)

In [ ]:
# Let's write an adapter for our images to the sklearn API
from sklearn.base import TransformerMixin, BaseEstimator

class CropUnwrap(TransformerMixin, BaseEstimator):
    """Crop a border of `crop_pixels` from each image and flatten it.

    Adapts a stack of 2-D images, shape (n_samples, height, width), to
    the flat (n_samples, n_features) layout the scikit-learn API expects.
    """

    def __init__(self, crop_pixels=0):
        # Number of pixels to trim from every edge of each image.
        self.crop_pixels = crop_pixels

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # Use the actual image size instead of the hard-coded 24 so the
        # transformer works for any image dimensions.
        c = self.crop_pixels
        n_samples, height, width = X.shape
        X_crop = X[:, c:height - c, c:width - c]
        # BUG FIX: the original reshaped the *uncropped* X with the
        # uncropped dimensions, so crop_pixels had no effect on the
        # output at all.
        n, h, w = X_crop.shape
        return X_crop.reshape((n, h * w))

In [ ]:
# Put it all together

In [ ]:
# Put it all together
unwrap = CropUnwrap()
# LDA supports at most n_classes - 1 = 2 components for this 3-class
# problem; modern scikit-learn raises for larger values (was 20).
lda = LDA(n_components=2)
svc = svm.SVC(gamma=0.0001)

pipeline = Pipeline([('unwrap', unwrap),
                     ('lda', lda),
                     ('svm', svc)])

# Now we don't need X_train — the pipeline starts from the raw image
# stack, because CropUnwrap flattens it into feature vectors first.
pipeline.fit(images, labels)

In [ ]:
# Finding good parameters

# NOTE: sklearn.grid_search was removed in scikit-learn 0.20;
# RandomizedSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each pipeline step's parameters.  LDA allows at
# most n_classes - 1 = 2 components here, so only 1 and 2 are valid
# (the original list of 5..50 would make every fit fail).
search_range = {'lda__n_components': [1, 2],
                'unwrap__crop_pixels': [0, 2, 4, 6, 8],
                'svm__C': [1, 10, 100, 1000, 1e4, 1e5]}

# Sample 20 random parameter combinations and cross-validate each one.
searcher = RandomizedSearchCV(pipeline, search_range, n_iter=20)

searcher.fit(images, labels)

In [ ]:
# Testing it out

In [ ]:
# Testing it out
print(searcher.best_score_)   # best cross-validated score found
print(searcher.best_params_)  # parameter combination that achieved it
# NOTE: this scores on the same data the search was fitted on, so the
# number is optimistic — see question 1 below.
searcher.score(images, labels)

Questions/Exercises

  1. Why is searcher.score(images, labels) not an accurate measure of performance? How do we fix it?
  2. Try out some other classifiers in the pipeline - can you improve the final performance on the test set? What about with other search parameters?
  3. Is accuracy really the best measure of performance for this dataset?

In [ ]:
# Solution to one:

# Hold out 20% of the raw images *before* running the search, so the
# final score is measured on data the search never saw.
image_train, image_test, label_train, label_test = train_test_split(images, 
                                                                    labels, 
                                                                    test_size=0.2)
searcher.fit(image_train, label_train)
searcher.score(image_test, label_test)

In [ ]:


In [ ]: