In [1]:
%matplotlib inline
import sklearn.datasets
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import copy
from sklearn import model_selection
from sklearn import base
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
In [3]:
ds = sklearn.datasets.make_classification(n_samples=20000,
n_features=30, # 30 features
n_informative=5, # only 5 informative ones
n_redundant=0,
n_repeated=3, # and 3 duplicated features
n_classes=2,
n_clusters_per_class=1,
weights=None,
flip_y=0.03,
class_sep=0.8,
hypercube=True,
shift=0.0,
scale=1.0,
shuffle=True,
random_state=None)
X= ds[0]
y= ds[1]
# labels: [0,1] -> [-1,1]
y[y == 0] = -1
In [4]:
print(X[0])
print(y[0])
In [5]:
class GradientDescent(base.BaseEstimator):
def __init__(self,theta,lamb,eps):
self.theta=theta
self.eps=eps
self.lamb=lamb
self.used_features=len(theta)
    def fit(self, X, y, nbIt=1000, printevery=-1):
        l = len(X)
        xTrans = X.transpose()
        for i in range(0, nbIt):
            loss = np.dot(X, self.theta) - y
            # squared loss plus the L1 penalty term lamb * ||theta||_1
            cost = np.sum(loss ** 2) / l + self.lamb * np.linalg.norm(self.theta, 1)
            gradient = np.dot(xTrans, loss)
            if i % (nbIt // 100) == 0:
                # every 1% of the iterations, take a pure L1 (subgradient) step
                thetaprime = self.theta - self.eps * (np.sign(self.theta) * self.lamb)
            else:
                thetaprime = self.theta - self.eps * gradient
            # weights whose sign flipped during the update are clipped to 0 (this is what creates sparsity)
            for k in range(0, len(self.theta)):
                self.theta[k] = 0 if thetaprime[k] * self.theta[k] < 0 else thetaprime[k]
            if printevery != -1 and i % printevery == 0:
                print("Iteration %s | Cost: %f | Score: %.03f" % (str(i).ljust(6), cost, self.score(X, y)))
                ttt = self.nb_used_features()
                print("%d features used" % (ttt))
                self.used_features = ttt
            elif i % 1000 == 0:
                ttt = self.nb_used_features()
                self.used_features = ttt
def predict(self,x):
ret=[]
for i in x:
ret.append(1 if np.dot(i,self.theta)>0 else -1)
return ret
def score(self,X,y):
cpt=0.0
allpred = self.predict(X)
for idx,i in enumerate(allpred):
cpt += 1 if i==y[idx] else 0
return cpt/len(X)
def nb_used_features(self):
cpt=0
for ii in self.theta:
if ii==0:
cpt+=1
return len(self.theta)-cpt
In [6]:
theta = copy.deepcopy(X[0])
lamb=500
eps=0.00001
gd = GradientDescent(theta,lamb,eps)
In [7]:
nbIterations = 5000
gd.fit(X, y, nbIterations, printevery=nbIterations // 10)
scores = model_selection.cross_val_score(gd, X, y, cv=5, scoring="accuracy")
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
In [10]:
eps=0.00001
la = []
cross_sc = []
used_features = []
for lamb in np.arange(0,4000,200):
theta = copy.deepcopy(X[0])
gd = GradientDescent(theta,lamb,eps)
nbIterations = 4000
gd.fit(X,y,nbIterations)
    scoresSvm = model_selection.cross_val_score(gd, X, y, cv=5, scoring="accuracy")
    print("Lambda: %s | Cross val mean: %.03f | Features: %d" % (str(lamb).ljust(5), np.mean(scoresSvm), gd.used_features))
cross_sc.append(np.mean(scoresSvm))
la.append(lamb)
used_features.append(gd.used_features)
In [11]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')
ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()
The difference between L1 and L2 regularization is that L1 penalizes the sum of the absolute values of the weights, while L2 penalizes the sum of the squared weights and is therefore more sensitive to large weights (outliers).
Advantage: good predictions despite significant constraints on the weights.
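To make the difference concrete, here is a minimal sketch of the two penalty terms (the weight vector w and the strength lamb below are made-up illustrative values, not taken from the experiments in this notebook):
In [ ]:
import numpy as np

w = np.array([0.0, -2.0, 0.5, 3.0])    # hypothetical weight vector
lamb = 0.1                             # hypothetical regularization strength
l1_penalty = lamb * np.sum(np.abs(w))  # L1: sum of absolute weights, tends to drive weights to exactly 0
l2_penalty = lamb * np.sum(w ** 2)     # L2: sum of squared weights, penalizes large weights more strongly
print(l1_penalty, l2_penalty)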
In [22]:
class GradientDescentL2(base.BaseEstimator):
def __init__(self,theta,lamb,eps):
self.theta=theta
self.eps=eps
self.lamb=lamb
self.used_features=len(theta)
    def fit(self, X, y, nbIt=1000, printevery=-1):
        l = len(X)
        xTrans = X.transpose()
        for i in range(0, nbIt):
            loss = np.dot(X, self.theta) - y
            # squared loss plus the L2 penalty term lamb * ||theta||_2^2
            cost = np.sum(loss ** 2) / l + self.lamb * np.linalg.norm(self.theta) ** 2
            gradient = np.dot(xTrans, loss)
            if i % (nbIt // 100) == 0:
                # every 1% of the iterations, take a step along the L2 penalty gradient (2 * lamb * theta)
                thetaprime = self.theta - self.eps * (2 * self.lamb * self.theta)
            else:
                thetaprime = self.theta - self.eps * gradient
            # weights whose sign flipped during the update are clipped to 0
            for k in range(0, len(self.theta)):
                self.theta[k] = 0 if thetaprime[k] * self.theta[k] < 0 else thetaprime[k]
            if printevery != -1 and i % printevery == 0:
                print("Iteration %s | Cost: %f | Score: %.03f" % (str(i).ljust(6), cost, self.score(X, y)))
                ttt = self.nb_used_features()
                print("%d features used" % (ttt))
                self.used_features = ttt
            elif i % 1000 == 0:
                ttt = self.nb_used_features()
                self.used_features = ttt
def predict(self,x):
ret=[]
for i in x:
ret.append(1 if np.dot(i,self.theta)>0 else -1)
return ret
def score(self,X,y):
cpt=0.0
allpred = self.predict(X)
for idx,i in enumerate(allpred):
cpt += 1 if i==y[idx] else 0
return cpt/len(X)
def nb_used_features(self):
cpt=0
for ii in self.theta:
if ii==0:
cpt+=1
return len(self.theta)-cpt
In [57]:
ds = sklearn.datasets.make_classification(n_samples=200,
n_features=30, # 30 features
n_informative=5, # only 5 informative ones
n_redundant=0,
n_repeated=3, # and 3 duplicated features
n_classes=2,
n_clusters_per_class=1,
weights=None,
flip_y=0.01,
class_sep=0.8,
hypercube=True,
shift=0.0,
scale=1.0,
shuffle=True,
random_state=None)
X= ds[0]
y= ds[1]
# labels: [0,1] -> [-1,1]
y[y == 0] = -1
In [58]:
theta = copy.deepcopy(X[0])
lamb=2000
eps=0.00001
gd = GradientDescentL2(theta,lamb,eps)
In [59]:
nbIterations = 5000
gd.fit(X, y, nbIterations, printevery=nbIterations // 10)
scores = model_selection.cross_val_score(gd, X, y, cv=5, scoring="accuracy")
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
In [60]:
eps=0.00001
la = []
cross_sc = []
used_features = []
for lamb in np.arange(0,4000,200):
theta = copy.deepcopy(X[0])
gd = GradientDescentL2(theta,lamb,eps)
nbIterations = 5000
gd.fit(X,y,nbIterations)
    scoresSvm = model_selection.cross_val_score(gd, X, y, cv=5, scoring="accuracy")
    print("Lambda: %s | Cross val mean: %.03f | Features: %d" % (str(lamb).ljust(5), np.mean(scoresSvm), gd.used_features))
cross_sc.append(np.mean(scoresSvm))
la.append(lamb)
used_features.append(gd.used_features)
In [61]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')
ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()
In [62]:
# scorer used to cross-validate Lasso and ElasticNet: their predictions are real-valued, so we threshold them at 0
def scorer(estimator, X, y):
pred = estimator.predict(X)
cpt=0.0
for idx,i in enumerate(pred):
if i<0:
cpt += 1 if y[idx]==-1 else 0
else:
cpt += 1 if y[idx]==1 else 0
return cpt/len(y)
In [71]:
lass = Lasso(alpha = 0.2)
lass.fit(X,y)
scores = model_selection.cross_val_score(lass, X, y, cv=5, scoring=scorer)
print("Cross validation scores: %s, mean: %.02f"%(scores,np.mean(scores)))
print(lass.coef_)
print("Feature used: %d"%np.count_nonzero(lass.coef_))
In [72]:
la = []
cross_sc = []
used_features = []
for lamb in np.arange(0.05, 1.05, 0.05):
    gd = Lasso(alpha=lamb)
    gd.fit(X, y)
    scoresSvm = model_selection.cross_val_score(gd, X, y, cv=5, scoring=scorer)
    print("Lambda: %s | Cross val mean: %.03f | Features: %d" % (str(lamb).ljust(5), np.mean(scoresSvm), np.count_nonzero(gd.coef_)))
cross_sc.append(np.mean(scoresSvm))
la.append(lamb)
used_features.append(np.count_nonzero(gd.coef_))
In [73]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#6DC433')
ax2.plot(la, used_features, '#5AC8ED')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Cross val score', color='#6DC433')
ax2.set_ylabel('Nb features used', color='#5AC8ED')
ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()
In [168]:
lass = ElasticNet(alpha = 0.2, l1_ratio=0)
lass.fit(X,y)
scores = model_selection.cross_val_score(lass, X, y, cv=5, scoring=scorer)
print("Cross validation scores: %s, mean: %.02f" % (scores, np.mean(scores)))
print("Features used: %d" % np.count_nonzero(lass.coef_))
lass = ElasticNet(alpha = 0.2, l1_ratio=0.5)
lass.fit(X,y)
scores = model_selection.cross_val_score(lass, X, y, cv=5, scoring=scorer)
print("Cross validation scores: %s, mean: %.02f" % (scores, np.mean(scores)))
print("Features used: %d" % np.count_nonzero(lass.coef_))
lass = ElasticNet(alpha = 0.2, l1_ratio=1)
lass.fit(X,y)
scores = model_selection.cross_val_score(lass, X, y, cv=5, scoring=scorer)
print("Cross validation scores: %s, mean: %.02f" % (scores, np.mean(scores)))
print("Features used: %d" % np.count_nonzero(lass.coef_))
We observe that, as expected, the more we take L1 into account, the fewer features are used.
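For reference, the quantity that l1_ratio controls can be sketched as follows (this mirrors the elastic-net penalty form described in the scikit-learn documentation; the helper function is illustrative and not part of the original experiments):
In [ ]:
import numpy as np

def elastic_net_penalty(w, alpha, l1_ratio):
    # l1_ratio = 1 -> pure L1 (Lasso), l1_ratio = 0 -> pure L2 (ridge-like)
    l1_term = np.sum(np.abs(w))
    l2_term = np.sum(w ** 2)
    return alpha * (l1_ratio * l1_term + 0.5 * (1 - l1_ratio) * l2_term)

Increasing l1_ratio puts more weight on the sparsity-inducing L1 term, which is what the sweep below illustrates.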
In [74]:
la = []
cross_sc = []
used_features = []
for lamb in np.arange(0.05, 1.05, 0.05):
    gd = ElasticNet(alpha=0.2, l1_ratio=lamb)
    gd.fit(X, y)
    scoresSvm = model_selection.cross_val_score(gd, X, y, cv=5, scoring=scorer)
    print("L1 ratio: %s | Cross val mean: %.03f | Features: %d" % (str(lamb).ljust(5), np.mean(scoresSvm), np.count_nonzero(gd.coef_)))
cross_sc.append(np.mean(scoresSvm))
la.append(lamb)
used_features.append(np.count_nonzero(gd.coef_))
In [76]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(la, cross_sc, '#FF9900')
ax2.plot(la, used_features, '#9933FF')
ax1.set_xlabel('L1 L2 ratio')
ax1.set_ylabel('Cross val score', color='#FF9900')
ax2.set_ylabel('Nb features used', color='#9933FF')
ax1.yaxis.grid(False)
ax2.grid(False)
plt.show()