In [2]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import seaborn as sns
sns.set()
import numpy as np
import itertools
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
# Save a nice dark grey as a variable
almost_black = '#262626'
%run '/home/glemaitre/anaconda/lib/python2.7/site-packages/modshogun.py'
import modshogun
Generate some data, with a dimensionality reduction if needed, so that there is something to observe in a 2D feature space.
In [3]:
# Generate some data
x, y = make_classification(n_features=2, n_redundant=0, n_informative=1, class_sep=1.,
                           n_clusters_per_class=1, n_samples=10000, weights=[0.1, 0.9],
                           random_state=9)
# Instantiate a PCA object for the sake of easy visualisation
# pca = PCA(n_components = 2)
# Fit and transform x to visualise inside a 2D feature space
# x_vis = pca.fit_transform(x)
# Plot the original data
# Plot the two classes
palette = sns.color_palette()
plt.scatter(x[y==0, 0], x[y==0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
plt.scatter(x[y==1, 0], x[y==1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
plt.legend()
plt.show()
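To quantify the imbalance just generated (an added illustration), we can simply count the samples per class; with weights=[0.1, 0.9], class #0 is expected to be the minority one.
In [ ]:
# Count the number of samples in each class
print(np.bincount(y))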
Compute an LMNN metric for each sample of the minority class.
In [10]:
from collections import Counter
# Compute the classes representation
stat_class = Counter(y)
# Find the minority class key
label_min_class = min(stat_class, key=stat_class.get)
label_maj_class = max(stat_class, key=stat_class.get)
# Save the sample index of this class
idx_min_class = np.ravel(np.nonzero(y == label_min_class))
idx_maj_class = np.ravel(np.nonzero(y == label_maj_class))
from sklearn.neighbors import NearestNeighbors
neighbours_required = 7
# Create an object NN only for the minority class
min_class_NN = NearestNeighbors(n_neighbors=neighbours_required, metric='l2')#, n_jobs=-1)
min_class_NN.fit(x[idx_min_class, :], y[idx_min_class])
# Create an object NN only for the majority class
max_class_NN = NearestNeighbors(n_neighbors=neighbours_required, metric='l2')#, n_jobs=-1)
max_class_NN.fit(x[idx_maj_class, :], y[idx_maj_class])
# Create an object NN for the whole dataset
data_NN = NearestNeighbors(n_neighbors=1, metric='l2')#,n_jobs=-1)
data_NN.fit(x, y)
# Compute all the distance of the k-NN on the minority class on the original data for the point of the minority class
dist_min, ind_min = min_class_NN.kneighbors(x[idx_min_class, :])
# Compute all the distance of the k-NN on the majority class on the original data for the point of the minority class
dist_max, ind_max = max_class_NN.kneighbors(x[idx_min_class, :])
# Find the largest distance for the both above distances
dist_min = np.max(dist_min, axis=1)
dist_max = np.max(dist_max, axis=1)
# Select the max_dist between dist_min and dist_max to ensure a minimum of neighbours
#max_dist = dist_max.copy()
max_dist = dist_min.copy()
#max_dist[np.nonzero(dist_min < dist_max)] = dist_max[np.nonzero(dist_min < dist_max)]
#max_dist[np.nonzero(dist_min > dist_max)] = dist_min[np.nonzero(dist_min > dist_max)]
# Now find the samples to consider with the distance extracted using only the minority class
# We need to loop because the search radius changes from one sample to the next
s_considers = []
for s, d, tmpi in zip(x[idx_min_class], max_dist, idx_min_class):
    dist, ind = data_NN.radius_neighbors(X=np.atleast_2d(s), radius=d)
    a = ind[0]
    # Move the index of the current sample to the first position
    idxint = np.nonzero(a == tmpi)
    a[idxint] = a[0]
    a[0] = tmpi
    s_considers.append(a)
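# Added illustration: inspect the neighbourhoods just built; their sizes vary
# because each sample uses its own k-NN distance as the search radius.
neigh_sizes = np.array([len(a) for a in s_considers])
print('%d neighbourhoods, sizes between %d and %d'
      % (len(s_considers), neigh_sizes.min(), neigh_sizes.max()))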
from metric_learn import LMNN, ITML, SDML, LSML
cov_mat = []
idx_cons = []
for s in s_considers:
    # Extract the interesting data
    x_s = x[np.ravel(s), :]
    y_s = y[np.ravel(s)]
    # Check if there is any imposter
    stat = Counter(y_s)
    print stat
    num_constraints = 200
    if (len(stat) > 1):
        if (stat[label_min_class] > stat[label_maj_class]):
            #print (stat[label_maj_class] / stat[label_min_class])
            if ((float(stat[label_min_class]) / float(stat[label_maj_class])) > 1 and
                    (float(stat[label_min_class]) / float(stat[label_maj_class])) < 2):
                # Fit the LMNN for these data
                s_lmnn = LMNN(k=1, convergence_tol=1e-9, min_iter=5000, max_iter=5000)
                s_lmnn.fit(x_s, y_s)
                #s_lmnn = ITML(max_iters=5000, convergence_threshold=1e-9)
                #s_lmnn.fit(x_s, ITML.prepare_constraints(y_s, x_s.shape[0], num_constraints))
                figure, axis = plt.subplots(1, 1)
                plt.scatter(x_s[y_s==0, 0], x_s[y_s==0, 1], label="Class #0", alpha=0.5,
                            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
                plt.scatter(x_s[y_s==1, 0], x_s[y_s==1, 1], label="Class #1", alpha=0.5,
                            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
                cm = np.matrix(s_lmnn.metric())
                print np.matrix(cm).I
                elli = make_covariance_ellipse(np.matrix(cm).I, x_s[0], 1)
                axis.add_artist(elli)
                plt.show()
                # Store the covariance matrix
                cov_mat.append(np.matrix(s_lmnn.metric()))
                idx_cons.append(True)
            else:
                cov_mat.append(np.ma.cov(x_s.T))
                idx_cons.append(False)
        else:
            cov_mat.append(np.ma.cov(x_s.T))
            idx_cons.append(False)
    else:
        cov_mat.append(np.ma.cov(x_s.T))
        idx_cons.append(False)
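As a small added summary of the loop above, the cell below counts how many minority samples received a learned LMNN metric (idx_cons set to True) rather than the empirical covariance fallback.
In [ ]:
# Added check: number of neighbourhoods for which an LMNN metric was fitted
n_lmnn = int(np.sum(idx_cons))
print('LMNN metric learned for %d / %d minority samples' % (n_lmnn, len(idx_cons)))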
In [ ]:
xxxx = [172, 6541, 8670, 9961]
print x[xxxx, :]
Function to plot a covariance ellipse at a chosen number of standard deviations.
In [ ]:
def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Plots an `nstd` sigma error ellipse based on the specified covariance
    matrix (`cov`). Additional keyword arguments are passed on to the
    ellipse patch artist.

    Parameters
    ----------
    cov : The 2x2 covariance matrix to base the ellipse on
    pos : The location of the center of the ellipse. Expects a 2-element
        sequence of [x0, y0].
    nstd : The radius of the ellipse in numbers of standard deviations.
        Defaults to 2 standard deviations.
    ax : The axis that the ellipse will be plotted on. Defaults to the
        current axis.
    Additional keyword arguments are passed on to the ellipse patch.

    Returns
    -------
    A matplotlib ellipse artist
    """
    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    if ax is None:
        ax = plt.gca()
    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))
    # Width and height are "full" widths, not radius
    width, height = 2 * nstd * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)
    ax.add_artist(ellip)
    ellip.set_alpha(.1)
    return ellip
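This helper is not called elsewhere in the notebook; the cell below is an added sketch of how it could be used, drawing the 2-sigma ellipse of the minority-class empirical covariance (it assumes x, idx_min_class, palette and almost_black from the earlier cells are still defined).
In [ ]:
# Added example: 2-sigma ellipse of the minority-class empirical covariance
x_min = x[idx_min_class, :]
fig, ax = plt.subplots(1, 1)
ax.scatter(x_min[:, 0], x_min[:, 1], alpha=0.5, facecolor=palette[0],
           edgecolor=almost_black, linewidth=0.15)
plot_cov_ellipse(np.cov(x_min.T), x_min.mean(axis=0), nstd=2, ax=ax)
plt.show()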
In [11]:
# Plot the data
figure, axis = plt.subplots(1,1)
#plt.figure(figsize=(18,10))
plt.scatter(x[y==0, 0], x[y==0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
plt.scatter(x[y==1, 0], x[y==1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
# For each minority sample, plot the ellipse of its learned metric
for cm, s, cs in zip(cov_mat, x[idx_min_class], idx_cons):
    if (cs == True):
        #print np.matrix(cm).I
        #print s
        elli = make_covariance_ellipse(np.matrix(cm).I, s, std=2.)
        axis.add_artist(elli)
plt.legend()
plt.show()
In [ ]:
print np.sum(np.sum(np.cov(x_s.T)))
In [ ]:
x = np.array([[0,0],[-1,0.1],[0.3,-0.05],[0.7,0.3],[-0.2,-0.6],[-0.15,-0.63],[-0.25,0.55],[-0.28,0.67]])
y = np.array([0,0,0,0,1,1,2,2])
In [ ]:
import matplotlib.pyplot as pyplot
%matplotlib inline
def plot_data(features, labels, axis, alpha=1.0):
    # separate features according to their class
    X0, X1, X2 = features[labels==0], features[labels==1], features[labels==2]
    # class 0 data
    axis.plot(X0[:,0], X0[:,1], 'o', color='green', markersize=12, alpha=alpha)
    # class 1 data
    axis.plot(X1[:,0], X1[:,1], 'o', color='red', markersize=12, alpha=alpha)
    # class 2 data
    axis.plot(X2[:,0], X2[:,1], 'o', color='blue', markersize=12, alpha=alpha)
    # set axes limits
    axis.set_xlim(-1.5, 1.5)
    axis.set_ylim(-1.5, 1.5)
    axis.set_aspect('equal')
    axis.set_xlabel('x')
    axis.set_ylabel('y')
figure,axis = plt.subplots(1,1)
plot_data(x,y,axis)
axis.set_title('Toy data set')
plt.show()
In [5]:
def make_covariance_ellipse(covariance, mean, std=2):
    import matplotlib.patches as patches
    import scipy.linalg as linalg
    # the ellipse is centered at `mean`
    # eigenvalue decomposition of the covariance matrix (w are eigenvalues and v eigenvectors),
    # keeping only the real part
    w, v = linalg.eigh(covariance)
    # normalize the eigenvector corresponding to the largest eigenvalue
    u = v[0]/linalg.norm(v[0])
    # angle in degrees
    angle = 180.0/np.pi*np.arctan(u[1]/u[0])
    # fill Gaussian ellipse at `std` standard deviations
    ellipse = patches.Ellipse(mean, std*w[0]**0.5, std*w[1]**0.5, 180+angle, color='orange', alpha=0.3)
    return ellipse
# represent the Euclidean distance
figure,axis = plt.subplots(1,1)
plot_data(x,y,axis)
ellipse = make_covariance_ellipse(np.eye(2), np.array([0, 0]))
axis.add_artist(ellipse)
axis.set_title('Euclidean distance')
plt.show()
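The ellipse above is a level set of the Mahalanobis distance d_M(a, b) = sqrt((a - b)^T M (a - b)); with M equal to the identity this reduces to the Euclidean distance. The cell below is an added illustration (the helper name mahalanobis is ours) making that connection explicit on two of the toy points.
In [ ]:
# Added illustration: Mahalanobis distance; with M = I it matches the
# Euclidean distance represented by the ellipse above.
def mahalanobis(a, b, M):
    diff = np.asarray(a) - np.asarray(b)
    return np.sqrt(np.dot(diff, np.dot(M, diff)))
print(mahalanobis(x[0], x[1], np.eye(2)))
print(np.linalg.norm(x[0] - x[1]))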
In [ ]:
# number of target neighbours per example
k = 2
lmnn = LMNN(k, min_iter=50, max_iter=1000, convergence_tol=1e-9)
lmnn.fit(x, y)
In [ ]:
# get the linear transform from LMNN
L = lmnn.transformer()
# square the linear transform to obtain the Mahalanobis distance matrix
M = np.matrix(lmnn.metric())
print M.I
# represent the distance given by LMNN
figure,axis = plt.subplots(1,1)
plot_data(x,y,axis)
ellipse = make_covariance_ellipse(M.I, np.array([0, 0]))
axis.add_artist(ellipse)
axis.set_title('LMNN distance')
plt.show()
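Assuming the metric_learn API used above (transformer() returning the linear map L and metric() returning the Mahalanobis matrix), the added check below verifies the relation M = L^T L stated in the comments.
In [ ]:
# Added check: the Mahalanobis matrix is the square of the linear transform
print(np.allclose(np.dot(L.T, L), lmnn.metric()))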
In [14]:
import numpy
x = numpy.array([[0,0],[-1,0.1],[0.3,-0.05],[0.7,0.3],[-0.2,-0.6],[-0.15,-0.63],[-0.25,0.55],[-0.28,0.67]])
y = numpy.array([0,0,0,0,1,1,2,2])
import matplotlib.pyplot as pyplot
%matplotlib inline
def plot_data(features, labels, axis, alpha=1.0):
    # separate features according to their class
    X0, X1, X2 = features[labels==0], features[labels==1], features[labels==2]
    # class 0 data
    axis.plot(X0[:,0], X0[:,1], 'o', color='green', markersize=12, alpha=alpha)
    # class 1 data
    axis.plot(X1[:,0], X1[:,1], 'o', color='red', markersize=12, alpha=alpha)
    # class 2 data
    axis.plot(X2[:,0], X2[:,1], 'o', color='blue', markersize=12, alpha=alpha)
    # set axes limits
    axis.set_xlim(-1.5, 1.5)
    axis.set_ylim(-1.5, 1.5)
    axis.set_aspect('equal')
    axis.set_xlabel('x')
    axis.set_ylabel('y')
figure,axis = pyplot.subplots(1,1)
plot_data(x,y,axis)
axis.set_title('Toy data set')
pyplot.show()
def make_covariance_ellipse(covariance):
    import matplotlib.patches as patches
    import scipy.linalg as linalg
    # the ellipse is centered at (0,0)
    mean = numpy.array([0, 0])
    # eigenvalue decomposition of the covariance matrix (w are eigenvalues and v eigenvectors),
    # keeping only the real part
    w, v = linalg.eigh(covariance)
    # normalize the eigenvector corresponding to the largest eigenvalue
    u = v[0]/linalg.norm(v[0])
    # angle in degrees
    angle = 180.0/numpy.pi*numpy.arctan(u[1]/u[0])
    # fill Gaussian ellipse at 2 standard deviations
    ellipse = patches.Ellipse(mean, 2*w[0]**0.5, 2*w[1]**0.5, 180+angle, color='orange', alpha=0.3)
    return ellipse
# represent the Euclidean distance
figure,axis = pyplot.subplots(1,1)
plot_data(x,y,axis)
ellipse = make_covariance_ellipse(numpy.eye(2))
axis.add_artist(ellipse)
axis.set_title('Euclidean distance')
pyplot.show()
from modshogun import RealFeatures, MulticlassLabels
features = RealFeatures(x.T)
labels = MulticlassLabels(y.astype(numpy.float64))
from modshogun import LMNN
# number of target neighbours per example
k = 1
lmnn = LMNN(features,labels,k)
# set an initial transform as a start point of the optimization
init_transform = numpy.eye(2)
lmnn.set_maxiter(2000)
lmnn.train(init_transform)
# get the linear transform from LMNN
L = lmnn.get_linear_transform()
# square the linear transform to obtain the Mahalanobis distance matrix
M = numpy.matrix(numpy.dot(L.T,L))
# represent the distance given by LMNN
figure,axis = pyplot.subplots(1,1)
plot_data(x,y,axis)
ellipse = make_covariance_ellipse(M.I)
axis.add_artist(ellipse)
axis.set_title('LMNN distance')
pyplot.show()
# project original data using L
lx = numpy.dot(L,x.T)
print L
# represent the data in the projected space
figure,axis = pyplot.subplots(1,1)
plot_data(lx.T,y,axis)
plot_data(x,y,axis,0.3)
ellipse = make_covariance_ellipse(numpy.eye(2))
axis.add_artist(ellipse)
axis.set_title('LMNN\'s linear transform')
pyplot.show()
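As a final added check, the Euclidean distance between two projected points should coincide with the Mahalanobis distance under M between the original points, since M = L^T L.
In [ ]:
# Added check: ||L x_i - L x_j|| equals sqrt((x_i - x_j)^T M (x_i - x_j))
i, j = 0, 4
d_proj = numpy.linalg.norm(lx[:, i] - lx[:, j])
diff = x[i] - x[j]
d_maha = numpy.sqrt(numpy.dot(diff, numpy.dot(numpy.asarray(M), diff)))
print('%.6f vs %.6f' % (d_proj, d_maha))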