FDMS TME2

Paul Willot


In [1]:
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import copy

from sklearn.datasets import fetch_mldata
from sklearn import cross_validation
from sklearn import base
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
#mnist = fetch_mldata('iris')

import matplotlib.pyplot as plt

Data generation


In [3]:
ds = sklearn.datasets.make_classification(n_samples=20000,
                                          n_features=30,    # 30 features
                                          n_informative=5,  # only 5 informatives ones
                                          n_redundant=0,
                                          n_repeated=3,     # and 3 duplicate
                                          n_classes=2,
                                          n_clusters_per_class=1,
                                          weights=None,
                                          flip_y=0.03,
                                          class_sep=0.8,
                                          hypercube=True,
                                          shift=0.0,
                                          scale=1.0,
                                          shuffle=True,
                                          random_state=None)
X= ds[0]
y= ds[1]

# labels: [0,1] -> [-1,1]
for idx,i in enumerate(y):
    if (i==0):
        y[idx]=-1

In [4]:
print(X[0])
print(y[0])


[-2.17203579 -1.73878073 -1.75401013 -0.89961043  0.92517207 -1.66479672
 -0.10310872  1.82915384 -1.15478618 -2.17203579  0.00618371  0.92517207
  1.66346587 -0.76517213 -0.91434611  0.8060977   0.35508     1.28384287
 -0.03080881 -1.22900547  0.81055976  1.22569563 -0.72297449  0.81055976
 -0.03063604 -0.2256332  -2.05217595 -1.54350934  0.10835675  0.18635222]
1

L1

Advantage: good features selection

L1 gradient pseudocode

procedureGradientL1(θ,X,Y,I;ε) for it = 1, I do for i = 1, n do idx ← random(l) θ′ ← θ − ε∇θ L(θ)(idx) for j=1,n do if θj′ ∗ θj < 0 then θj ← 0 else θ j ← θ j′ end if end for end for Afficher L(θ) et accuracy(θ) pour controle end for return θ end procedure L(θ) = 1/l * somme de 1 a l:( yi − fθ(xi))2 + λC(θ) )

In [5]:
class GradientDescent(base.BaseEstimator):
    def __init__(self,theta,lamb,eps):
        self.theta=theta
        self.eps=eps
        self.lamb=lamb
        self.used_features=len(theta)

    def fit(self,X,y,nbIt=1000,printevery=-1):
        l=len(X)
        xTrans = X.transpose()
        
        for i in xrange(0,nbIt):
            #index = np.random.randint(l)
            loss = np.dot(X, self.theta) - y
            cost = np.sum(loss ** 2) * (1 / l) + (self.lamb*np.linalg.norm(self.theta))
            gradient = np.dot(xTrans,(np.dot(self.theta,xTrans)-y))
            
            if i%(nbIt/100)==0:
                thetaprime = self.theta - self.eps * (np.sign(theta)*self.lamb)
            else:
                thetaprime = self.theta - self.eps * gradient
            
            for k in xrange(0,len(theta)):
                self.theta[k] = 0 if thetaprime[k]*theta[k]<0 else thetaprime[k]
            
            
            if printevery!=-1 and i%printevery==0:
                    print("Iteration %s | Cost: %f | Score: %.03f" % (str(i).ljust(6), cost,self.score(X,y)))
                    ttt = self.nb_used_features()
                    print("%d features used"%(ttt))
                    self.used_features=ttt
            elif i%1000==0:
                ttt = self.nb_used_features()
                self.used_features=ttt
            
                
    def predict(self,x):
        ret=[]
        for i in x:
            ret.append(1 if np.dot(i,self.theta)>0 else -1)
        return ret
    
    def score(self,X,y):
        cpt=0.0
        allpred = self.predict(X)
        for idx,i in enumerate(allpred):
            cpt += 1 if i==y[idx] else 0
        return cpt/len(X)
    
    def nb_used_features(self):
        cpt=0
        for ii in self.theta:
            if ii==0:
                cpt+=1
        return len(self.theta)-cpt

In [6]:
theta = copy.deepcopy(X[0])
lamb=500
eps=0.00001

gd = GradientDescent(theta,lamb,eps)

In [7]:
nbIterations = 5000
gd.fit(X,y,nbIterations,printevery=nbIterations/10)
scores = cross_validation.cross_val_score(gd, X, y, cv=5,scoring="accuracy")
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))


Iteration 0      | Cost: 3304.296262 | Score: 0.641
30 features used
Iteration 500    | Cost: 176.169854 | Score: 0.817
11 features used
Iteration 1000   | Cost: 176.169854 | Score: 0.817
11 features used
Iteration 1500   | Cost: 176.169854 | Score: 0.817
11 features used
Iteration 2000   | Cost: 176.169854 | Score: 0.817
11 features used
Iteration 2500   | Cost: 176.169854 | Score: 0.817
11 features used
Iteration 3000   | Cost: 176.169854 | Score: 0.817
11 features used
Iteration 3500   | Cost: 176.169854 | Score: 0.817
11 features used
Iteration 4000   | Cost: 176.169854 | Score: 0.817
11 features used
Iteration 4500   | Cost: 176.169854 | Score: 0.817
11 features used
Cross validation scores: [ 0.81475  0.81825  0.81825  0.815    0.822  ], mean: 0.82

Selecting lambda

We have only 5 informatives features over 30, and 3 redundancies.
We thrive to reach this number of features, while keeping a good classification score.


In [10]:
eps=0.00001
la = []
cross_sc = []
used_features = []

for lamb in np.arange(0,4000,200):
    theta = copy.deepcopy(X[0])
    gd = GradientDescent(theta,lamb,eps)
    nbIterations = 4000
    gd.fit(X,y,nbIterations)
    scoresSvm = cross_validation.cross_val_score(gd, X, y, cv=5,scoring="accuracy")
    print("Lamda: %s | Cross val mean: %.03f | Features: %d"%(str(lamb).ljust(5),np.mean(scoresSvm),gd.used_features))
    #print("Lamda: %.02f | Cross val mean: %.02f | Features: %d"%(lamb,gd.score(X,y),gd.used_features))
    cross_sc.append(np.mean(scoresSvm))
    la.append(lamb)
    used_features.append(gd.used_features)


Lamda: 0     | Cross val mean: 0.832 | Features: 30
Lamda: 200   | Cross val mean: 0.836 | Features: 29
Lamda: 400   | Cross val mean: 0.837 | Features: 21
Lamda: 600   | Cross val mean: 0.840 | Features: 18
Lamda: 800   | Cross val mean: 0.840 | Features: 16
Lamda: 1000  | Cross val mean: 0.842 | Features: 14
Lamda: 1200  | Cross val mean: 0.843 | Features: 12
Lamda: 1400  | Cross val mean: 0.845 | Features: 12
Lamda: 1600  | Cross val mean: 0.845 | Features: 11
Lamda: 1800  | Cross val mean: 0.844 | Features: 9
Lamda: 2000  | Cross val mean: 0.842 | Features: 9
Lamda: 2200  | Cross val mean: 0.841 | Features: 9
Lamda: 2400  | Cross val mean: 0.838 | Features: 5
Lamda: 2600  | Cross val mean: 0.834 | Features: 5
Lamda: 2800  | Cross val mean: 0.833 | Features: 5
Lamda: 3000  | Cross val mean: 0.832 | Features: 5
Lamda: 3200  | Cross val mean: 0.832 | Features: 5
Lamda: 3400  | Cross val mean: 0.833 | Features: 5
Lamda: 3600  | Cross val mean: 0.832 | Features: 5
Lamda: 3800  | Cross val mean: 0.832 | Features: 5

In [11]:
fig, ax1 = plt.subplots()

ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')

ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')

ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()



L2

The difference between the L1 and L2 regularization is that L1 work on the sum of the weights, and L2 work on the sum of the square of the weights, and is therefor more sensitive to outliers.

Advantage: good predictions with significants constraints


In [22]:
class GradientDescentL2(base.BaseEstimator):
    def __init__(self,theta,lamb,eps):
        self.theta=theta
        self.eps=eps
        self.lamb=lamb
        self.used_features=len(theta)

    def fit(self,X,y,nbIt=1000,printevery=-1):
        l=len(X)
        xTrans = X.transpose()
        
        for i in xrange(0,nbIt):
            index = np.random.randint(l)
            loss = np.dot(X, self.theta) - y
            cost = np.sum(loss ** 2) * (1 / l) + (self.lamb*np.linalg.norm(self.theta))**2
            gradient = np.dot(xTrans,(np.dot(self.theta,xTrans)-y))
            
            if i%(nbIt/100)==0:
                thetaprime = self.theta - self.eps * (np.sign(theta)*self.lamb)
            else:
                thetaprime = self.theta - self.eps * gradient
            
            for k in xrange(0,len(theta)):
                self.theta[k] = 0 if thetaprime[k]*theta[k]<0 else thetaprime[k]
            
            
            if printevery!=-1 and i%printevery==0:
                    print("Iteration %s | Cost: %f | Score: %.03f" % (str(i).ljust(6), cost,self.score(X,y)))
                    ttt = self.nb_used_features()
                    print("%d features used"%(ttt))
                    self.used_features=ttt
            elif i%1000==0:
                ttt = self.nb_used_features()
                self.used_features=ttt
            
                
    def predict(self,x):
        ret=[]
        for i in x:
            ret.append(1 if np.dot(i,self.theta)>0 else -1)
        return ret
    
    def score(self,X,y):
        cpt=0.0
        allpred = self.predict(X)
        for idx,i in enumerate(allpred):
            cpt += 1 if i==y[idx] else 0
        return cpt/len(X)
    
    def nb_used_features(self):
        cpt=0
        for ii in self.theta:
            if ii==0:
                cpt+=1
        return len(self.theta)-cpt

Test with only 200 samples


In [57]:
ds = sklearn.datasets.make_classification(n_samples=200,
                                          n_features=30,    # 30 features
                                          n_informative=5,  # only 5 informatives ones
                                          n_redundant=0,
                                          n_repeated=3,     # and 3 duplicate
                                          n_classes=2,
                                          n_clusters_per_class=1,
                                          weights=None,
                                          flip_y=0.01,
                                          class_sep=0.8,
                                          hypercube=True,
                                          shift=0.0,
                                          scale=1.0,
                                          shuffle=True,
                                          random_state=None)
X= ds[0]
y= ds[1]

# labels: [0,1] -> [-1,1]
for idx,i in enumerate(y):
    if (i==0):
        y[idx]=-1

In [58]:
theta = copy.deepcopy(X[0])
lamb=2000
eps=0.00001

gd = GradientDescentL2(theta,lamb,eps)
#gd.tmp

In [59]:
nbIterations = 5000
gd.fit(X,y,nbIterations,printevery=nbIterations/10)
scores = cross_validation.cross_val_score(gd, X, y, cv=5,scoring="accuracy")
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))


Iteration 0      | Cost: 165083619.961018 | Score: 0.517
29 features used
Iteration 500    | Cost: 384214.774171 | Score: 0.766
9 features used
Iteration 1000   | Cost: 384136.704331 | Score: 0.766
9 features used
Iteration 1500   | Cost: 384136.680772 | Score: 0.766
9 features used
Iteration 2000   | Cost: 384136.680765 | Score: 0.766
9 features used
Iteration 2500   | Cost: 384136.680765 | Score: 0.766
9 features used
Iteration 3000   | Cost: 384136.680765 | Score: 0.766
9 features used
Iteration 3500   | Cost: 384136.680765 | Score: 0.766
9 features used
Iteration 4000   | Cost: 384136.680765 | Score: 0.766
9 features used
Iteration 4500   | Cost: 384136.680765 | Score: 0.766
9 features used
Cross validation scores: [ 0.73    0.7925  0.7575  0.7525  0.765 ], mean: 0.76

Selecting lambda

Similar to L1


In [60]:
eps=0.00001
la = []
cross_sc = []
used_features = []

for lamb in np.arange(0,4000,200):
    theta = copy.deepcopy(X[0])
    gd = GradientDescentL2(theta,lamb,eps)
    nbIterations = 5000
    gd.fit(X,y,nbIterations)
    scoresSvm = cross_validation.cross_val_score(gd, X, y, cv=5,scoring="accuracy")
    print("Lamda: %s | Cross val mean: %.03f | Features: %d"%(str(lamb).ljust(5),np.mean(scoresSvm),gd.used_features))
    cross_sc.append(np.mean(scoresSvm))
    la.append(lamb)
    used_features.append(gd.used_features)


Lamda: 0     | Cross val mean: 0.762 | Features: 30
Lamda: 200   | Cross val mean: 0.763 | Features: 29
Lamda: 400   | Cross val mean: 0.764 | Features: 24
Lamda: 600   | Cross val mean: 0.762 | Features: 23
Lamda: 800   | Cross val mean: 0.763 | Features: 19
Lamda: 1000  | Cross val mean: 0.762 | Features: 15
Lamda: 1200  | Cross val mean: 0.760 | Features: 12
Lamda: 1400  | Cross val mean: 0.761 | Features: 11
Lamda: 1600  | Cross val mean: 0.760 | Features: 10
Lamda: 1800  | Cross val mean: 0.760 | Features: 10
Lamda: 2000  | Cross val mean: 0.759 | Features: 9
Lamda: 2200  | Cross val mean: 0.760 | Features: 9
Lamda: 2400  | Cross val mean: 0.759 | Features: 7
Lamda: 2600  | Cross val mean: 0.759 | Features: 6
Lamda: 2800  | Cross val mean: 0.759 | Features: 6
Lamda: 3000  | Cross val mean: 0.759 | Features: 6
Lamda: 3200  | Cross val mean: 0.760 | Features: 6
Lamda: 3400  | Cross val mean: 0.760 | Features: 6
Lamda: 3600  | Cross val mean: 0.759 | Features: 6
Lamda: 3800  | Cross val mean: 0.758 | Features: 6

In [61]:
fig, ax1 = plt.subplots()

ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')

ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')

ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()



Evaluation using sklearn Lasso

Sklearn's Lasso works the same, although way faster, and a lambda 0 < λ < 1 is more practical


In [62]:
#used to cross-val on lasso and elastic-net
def scorer(estimator, X, y):
    pred = estimator.predict(X)
    cpt=0.0
    for idx,i in enumerate(pred):
        if i<0:
            cpt += 1 if y[idx]==-1 else 0
        else:
            cpt += 1 if y[idx]==1 else 0
    return cpt/len(y)

In [71]:
lass = Lasso(alpha = 0.2)
lass.fit(X,y)

scores = cross_validation.cross_val_score(lass, X, y, cv=5,scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print(lass.coef_)
print("Feature used: %d"%np.count_nonzero(lass.coef_))


Cross validation scores: [ 0.832  0.86   0.846  0.862  0.872], mean: 0.85
[-0.         -0.         -0.          0.          0.         -0.03322676
 -0.          0.13436393 -0.          0.         -0.25845565 -0.          0.
 -0.07689383 -0.          0.         -0.         -0.         -0.03722968
 -0.          0.          0.          0.         -0.          0.          0.
  0.          0.         -0.          0.        ]
Feature used: 5

In [72]:
eps=0.00001
la = []
cross_sc = []
used_features = []

for lamb in np.arange(0.05,1.05,0.05):
    theta = copy.deepcopy(X[0])
    gd = Lasso(alpha = lamb)
    nbIterations = 4000
    gd.fit(X,y)
    scoresSvm = cross_validation.cross_val_score(gd, X, y, cv=5,scoring=scorer)
    print("Lamda: %s | Cross val mean: %.03f | Features: %d"%(str(lamb).ljust(5),np.mean(scoresSvm),np.count_nonzero(gd.coef_)))
    #print("Lamda: %.02f | Cross val mean: %.02f | Features: %d"%(lamb,gd.score(X,y),gd.used_features))
    cross_sc.append(np.mean(scoresSvm))
    la.append(lamb)
    used_features.append(np.count_nonzero(gd.coef_))


Lamda: 0.05  | Cross val mean: 0.920 | Features: 7
Lamda: 0.1   | Cross val mean: 0.910 | Features: 6
Lamda: 0.15  | Cross val mean: 0.885 | Features: 7
Lamda: 0.2   | Cross val mean: 0.854 | Features: 5
Lamda: 0.25  | Cross val mean: 0.816 | Features: 4
Lamda: 0.3   | Cross val mean: 0.812 | Features: 2
Lamda: 0.35  | Cross val mean: 0.811 | Features: 2
Lamda: 0.4   | Cross val mean: 0.809 | Features: 2
Lamda: 0.45  | Cross val mean: 0.808 | Features: 3
Lamda: 0.5   | Cross val mean: 0.808 | Features: 3
Lamda: 0.55  | Cross val mean: 0.802 | Features: 2
Lamda: 0.6   | Cross val mean: 0.798 | Features: 3
Lamda: 0.65  | Cross val mean: 0.776 | Features: 3
Lamda: 0.7   | Cross val mean: 0.750 | Features: 1
Lamda: 0.75  | Cross val mean: 0.735 | Features: 2
Lamda: 0.8   | Cross val mean: 0.486 | Features: 0
Lamda: 0.85  | Cross val mean: 0.486 | Features: 0
Lamda: 0.9   | Cross val mean: 0.486 | Features: 0
Lamda: 0.95  | Cross val mean: 0.486 | Features: 0
Lamda: 1.0   | Cross val mean: 0.486 | Features: 0

In [73]:
fig, ax1 = plt.subplots()

ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')

ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')

ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()




Comparaison of L1 and L2 using sklearn ElasticNet


In [168]:
lass = ElasticNet(alpha = 0.2, l1_ratio=0)
lass.fit(X,y)
scores = cross_validation.cross_val_score(lass, X, y, cv=5,scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print("Feature used: %d"%np.count_nonzero(lass.coef_))

lass = ElasticNet(alpha = 0.2, l1_ratio=0.5)
lass.fit(X,y)
scores = cross_validation.cross_val_score(lass, X, y, cv=5,scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print("Feature used: %d"%np.count_nonzero(lass.coef_))

lass = ElasticNet(alpha = 0.2, l1_ratio=1)
lass.fit(X,y)
scores = cross_validation.cross_val_score(lass, X, y, cv=5,scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print("Feature used: %d"%np.count_nonzero(lass.coef_))


Cross validation scores: [ 0.844  0.844  0.868  0.832  0.84 ], mean: 0.85
Feature used: 30
Cross validation scores: [ 0.848  0.852  0.852  0.806  0.812], mean: 0.83
Feature used: 7
Cross validation scores: [ 0.844  0.848  0.846  0.81   0.818], mean: 0.83
Feature used: 4

We observe that, as expected, the more we take L1 into account the less features are used.


In [74]:
eps=0.00001
la = []
cross_sc = []
used_features = []

for lamb in np.arange(0.05,1.05,0.05):
    theta = copy.deepcopy(X[0])
    gd = ElasticNet(alpha = 0.2, l1_ratio=lamb)
    nbIterations = 4000
    gd.fit(X,y)
    scoresSvm = cross_validation.cross_val_score(gd, X, y, cv=5,scoring=scorer)
    print("Lamda: %s | Cross val mean: %.03f | Features: %d"%(str(lamb).ljust(5),np.mean(scoresSvm),np.count_nonzero(gd.coef_)))
    #print("Lamda: %.02f | Cross val mean: %.02f | Features: %d"%(lamb,gd.score(X,y),gd.used_features))
    cross_sc.append(np.mean(scoresSvm))
    la.append(lamb)
    used_features.append(np.count_nonzero(gd.coef_))


Lamda: 0.05  | Cross val mean: 0.922 | Features: 20
Lamda: 0.1   | Cross val mean: 0.917 | Features: 13
Lamda: 0.15  | Cross val mean: 0.917 | Features: 8
Lamda: 0.2   | Cross val mean: 0.916 | Features: 8
Lamda: 0.25  | Cross val mean: 0.914 | Features: 8
Lamda: 0.3   | Cross val mean: 0.913 | Features: 8
Lamda: 0.35  | Cross val mean: 0.910 | Features: 8
Lamda: 0.4   | Cross val mean: 0.908 | Features: 8
Lamda: 0.45  | Cross val mean: 0.905 | Features: 8
Lamda: 0.5   | Cross val mean: 0.902 | Features: 8
Lamda: 0.55  | Cross val mean: 0.898 | Features: 8
Lamda: 0.6   | Cross val mean: 0.895 | Features: 8
Lamda: 0.65  | Cross val mean: 0.894 | Features: 8
Lamda: 0.7   | Cross val mean: 0.889 | Features: 8
Lamda: 0.75  | Cross val mean: 0.879 | Features: 8
Lamda: 0.8   | Cross val mean: 0.874 | Features: 8
Lamda: 0.85  | Cross val mean: 0.869 | Features: 7
Lamda: 0.9   | Cross val mean: 0.865 | Features: 7
Lamda: 0.95  | Cross val mean: 0.859 | Features: 7
Lamda: 1.0   | Cross val mean: 0.854 | Features: 5

In [76]:
fig, ax1 = plt.subplots()

ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#FF9900')
ax2.plot(la, used_features, '#9933FF')

ax1.set_xlabel('L1 L2 ratio')
ax1.set_ylabel('Cross val score', color='#FF9900')
ax2.set_ylabel('Nb features used', color='#9933FF')

ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()