In [2]:
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import copy
from sklearn.datasets import fetch_mldata
from sklearn import cross_validation
from sklearn import base
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
import matplotlib.pyplot as plt
In [3]:
ds = sklearn.datasets.make_classification(n_samples=2500,
n_features=30, # 30 features
n_informative=8, # only 8 informatives ones
n_redundant=0,
n_repeated=2, # and 2 duplicate
n_classes=2,
n_clusters_per_class=1,
weights=None,
flip_y=0.03,
class_sep=0.8,
hypercube=True,
shift=0.0,
scale=1.0,
shuffle=True,
random_state=None)
X= ds[0]
y= ds[1]
# labels: [0,1] -> [-1,1]
for idx,i in enumerate(y):
if (i==0):
y[idx]=-1
In [4]:
print(X[0])
print(y[0])
In [5]:
class GradientDescent(base.BaseEstimator):
def __init__(self,theta,lamb,eps):
self.theta=theta
self.eps=eps
self.lamb=lamb
self.used_features=len(theta)
def fit(self,X,y,nbIt=1000,printevery=-1):
l=len(X)
xTrans = X.transpose()
for i in xrange(0,nbIt):
#index = np.random.randint(l)
loss = np.dot(X, self.theta) - y
cost = np.sum(loss ** 2) * (1 / l) + (self.lamb*np.linalg.norm(self.theta))
gradient = np.dot(xTrans,(np.dot(self.theta,xTrans)-y))
if i%(nbIt/100)==0:
thetaprime = self.theta - self.eps * (np.sign(theta)*self.lamb)
else:
thetaprime = self.theta - self.eps * gradient
for k in xrange(0,len(theta)):
self.theta[k] = 0 if thetaprime[k]*theta[k]<0 else thetaprime[k]
if printevery!=-1 and i%printevery==0:
print("Iteration %s | Cost: %f | Score: %.03f" % (str(i).ljust(6), cost,self.score(X,y)))
ttt = self.nb_used_features()
print("%d features used"%(ttt))
self.used_features=ttt
elif i%1000==0:
ttt = self.nb_used_features()
self.used_features=ttt
def predict(self,x):
ret=[]
for i in x:
ret.append(1 if np.dot(i,self.theta)>0 else -1)
return ret
def score(self,X,y):
cpt=0.0
allpred = self.predict(X)
for idx,i in enumerate(allpred):
cpt += 1 if i==y[idx] else 0
return cpt/len(X)
def nb_used_features(self):
cpt=0
for ii in self.theta:
if ii==0:
cpt+=1
return len(self.theta)-cpt
In [6]:
theta = copy.deepcopy(X[0])
lamb=500
eps=0.00001
gd = GradientDescent(theta,lamb,eps)
In [7]:
nbIterations = 5000
gd.fit(X,y,nbIterations,printevery=nbIterations/10)
scores = cross_validation.cross_val_score(gd, X, y, cv=5,scoring="accuracy")
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
In [8]:
eps=0.00001
la = []
cross_sc = []
used_features = []
for lamb in np.arange(0,5000,200):
theta = copy.deepcopy(X[0])
gd = GradientDescent(theta,lamb,eps)
nbIterations = 4000
gd.fit(X,y,nbIterations)
scoresSvm = cross_validation.cross_val_score(gd, X, y, cv=5,scoring="accuracy")
print("Lamda: %s | Cross val mean: %.03f | Features: %d"%(str(lamb).ljust(5),np.mean(scoresSvm),gd.used_features))
#print("Lamda: %.02f | Cross val mean: %.02f | Features: %d"%(lamb,gd.score(X,y),gd.used_features))
cross_sc.append(np.mean(scoresSvm))
la.append(lamb)
used_features.append(gd.used_features)
In [9]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')
ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()
We can see that only 8 features are usefull (score 87%), with more features we can make overfitting and scores are only 2% better.
The difference between the L1 and L2 regularization is that L1 work on the sum of the weights, and L2 work on the sum of the square of the weights, and is therefor more sensitive to outliers.
Advantage: good predictions with significants constraints
In [11]:
class GradientDescentL2(base.BaseEstimator):
def __init__(self,theta,lamb,eps):
self.theta=theta
self.eps=eps
self.lamb=lamb
self.used_features=len(theta)
def fit(self,X,y,nbIt=1000,printevery=-1):
l=len(X)
xTrans = X.transpose()
for i in xrange(0,nbIt):
index = np.random.randint(l)
loss = np.dot(X, self.theta) - y
cost = np.sum(loss ** 2) * (1 / l) + (self.lamb*np.linalg.norm(self.theta))**2
gradient = np.dot(xTrans,(np.dot(self.theta,xTrans)-y))
if i%(nbIt/100)==0:
thetaprime = self.theta - self.eps * (np.sign(theta)*self.lamb)
else:
thetaprime = self.theta - self.eps * gradient
for k in xrange(0,len(theta)):
self.theta[k] = 0 if thetaprime[k]*theta[k]<0 else thetaprime[k]
if printevery!=-1 and i%printevery==0:
print("Iteration %s | Cost: %f | Score: %.03f" % (str(i).ljust(6), cost,self.score(X,y)))
ttt = self.nb_used_features()
print("%d features used"%(ttt))
self.used_features=ttt
elif i%1000==0:
ttt = self.nb_used_features()
self.used_features=ttt
def predict(self,x):
ret=[]
for i in x:
ret.append(1 if np.dot(i,self.theta)>0 else -1)
return ret
def score(self,X,y):
cpt=0.0
allpred = self.predict(X)
for idx,i in enumerate(allpred):
cpt += 1 if i==y[idx] else 0
return cpt/len(X)
def nb_used_features(self):
cpt=0
for ii in self.theta:
if ii==0:
cpt+=1
return len(self.theta)-cpt
In [12]:
ds = sklearn.datasets.make_classification(n_samples=200,
n_features=30, # 30 features
n_informative=8, # only 8 informatives ones
n_redundant=0,
n_repeated=2, # and 2 duplicate
n_classes=2,
n_clusters_per_class=1,
weights=None,
flip_y=0.01,
class_sep=0.8,
hypercube=True,
shift=0.0,
scale=1.0,
shuffle=True,
random_state=None)
X= ds[0]
y= ds[1]
# labels: [0,1] -> [-1,1]
for idx,i in enumerate(y):
if (i==0):
y[idx]=-1
In [13]:
theta = copy.deepcopy(X[0])
lamb=1000
eps=0.00001
gd = GradientDescentL2(theta,lamb,eps)
#gd.tmp
In [14]:
nbIterations = 5000
gd.fit(X,y,nbIterations,printevery=nbIterations/10)
scores = cross_validation.cross_val_score(gd, X, y, cv=5,scoring="accuracy")
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
We can see that L2 is more sensitive to outliers because it chooses 7 usefull features against 8 for L1
In [18]:
eps=0.00001
la = []
cross_sc = []
used_features = []
for lamb in np.arange(0,4000,200):
theta = copy.deepcopy(X[0])
gd = GradientDescentL2(theta,lamb,eps)
nbIterations = 5000
gd.fit(X,y,nbIterations)
scoresSvm = cross_validation.cross_val_score(gd, X, y, cv=5,scoring="accuracy")
print("Lamda: %s | Cross val mean: %.03f | Features: %d"%(str(lamb).ljust(5),np.mean(scoresSvm),gd.used_features))
cross_sc.append(np.mean(scoresSvm))
la.append(lamb)
used_features.append(gd.used_features)
In [20]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')
ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()
L2 chooses less features than L1, and keeps a good score.
In [48]:
#used to cross-val on lasso and elastic-net
def scorer(estimator, X, y):
pred = estimator.predict(X)
cpt=0.0
for idx,i in enumerate(pred):
if i<0:
cpt += 1 if y[idx]==-1 else 0
else:
cpt += 1 if y[idx]==1 else 0
return cpt/len(y)
In [49]:
lass = Lasso(alpha = 0.2)
lass.fit(X,y)
scores = cross_validation.cross_val_score(lass, X, y, cv=5,scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print(lass.coef_)
print("Feature used: %d"%np.count_nonzero(lass.coef_))
In [50]:
eps=0.00001
la = []
cross_sc = []
used_features = []
for lamb in np.arange(0.05,1.05,0.05):
theta = copy.deepcopy(X[0])
gd = Lasso(alpha = lamb)
nbIterations = 4000
gd.fit(X,y)
scoresSvm = cross_validation.cross_val_score(gd, X, y, cv=5,scoring=scorer)
print("Lamda: %s | Cross val mean: %.03f | Features: %d"%(str(lamb).ljust(5),np.mean(scoresSvm),np.count_nonzero(gd.coef_)))
#print("Lamda: %.02f | Cross val mean: %.02f | Features: %d"%(lamb,gd.score(X,y),gd.used_features))
cross_sc.append(np.mean(scoresSvm))
la.append(lamb)
used_features.append(np.count_nonzero(gd.coef_))
In [51]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')
ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()
In [52]:
lass = ElasticNet(alpha = 0.2, l1_ratio=0)
lass.fit(X,y)
scores = cross_validation.cross_val_score(lass, X, y, cv=5,scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print("Feature used: %d"%np.count_nonzero(lass.coef_))
lass = ElasticNet(alpha = 0.2, l1_ratio=0.5)
lass.fit(X,y)
scores = cross_validation.cross_val_score(lass, X, y, cv=5,scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print("Feature used: %d"%np.count_nonzero(lass.coef_))
lass = ElasticNet(alpha = 0.2, l1_ratio=1)
lass.fit(X,y)
scores = cross_validation.cross_val_score(lass, X, y, cv=5,scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print("Feature used: %d"%np.count_nonzero(lass.coef_))
We observe that, as expected, the more we take L1 into account the less features are used.
In [53]:
eps=0.00001
la = []
cross_sc = []
used_features = []
for lamb in np.arange(0.05,1.05,0.05):
theta = copy.deepcopy(X[0])
gd = ElasticNet(alpha = 0.2, l1_ratio=lamb)
nbIterations = 4000
gd.fit(X,y)
scoresSvm = cross_validation.cross_val_score(gd, X, y, cv=5,scoring=scorer)
print("Lamda: %s | Cross val mean: %.03f | Features: %d"%(str(lamb).ljust(5),np.mean(scoresSvm),np.count_nonzero(gd.coef_)))
#print("Lamda: %.02f | Cross val mean: %.02f | Features: %d"%(lamb,gd.score(X,y),gd.used_features))
cross_sc.append(np.mean(scoresSvm))
la.append(lamb)
used_features.append(np.count_nonzero(gd.coef_))
In [54]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#FF9900')
ax2.plot(la, used_features, '#9933FF')
ax1.set_xlabel('L1 L2 ratio')
ax1.set_ylabel('Cross val score', color='#FF9900')
ax2.set_ylabel('Nb features used', color='#9933FF')
ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()