In [84]:
import pandas as pd
import numpy as np
from random import randint
from pandas import Series,DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import math
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 10)
plt.rcParams['font.size'] = 14
In [85]:
%%html
<style>
table {float:left}
</style>
In [86]:
from sklearn.datasets import load_iris
iris = load_iris()
type(iris)
Out[86]:
Load and return the iris dataset (classification).
Description | Value |
---|---|
Classes | 3 |
Samples per class | 50 |
Samples total | 150 |
Dimensionality | 4 |
Features | real, positive |
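The table above mirrors the summary bundled with the dataset. A quick way to inspect the returned Bunch (these are standard sklearn attributes; this cell is an added sketch, not part of the original run):
In [ ]:
# minimal sketch: poke at the Bunch returned by load_iris
print(iris.feature_names)    # the 4 measured features
print(iris.target_names)     # the 3 class names
print(iris.DESCR[:160])      # start of the bundled description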
In [87]:
print type(iris.data)
print iris.data.shape
print iris.target.shape
print iris.data[0:5]
print np.unique(iris.target)
In [88]:
#make it a binary classification problem instead
X = np.copy(iris.data)
X = (X - np.average(X,axis=0)) / np.std(X, axis=0)
Y = np.copy(iris.target)
np.place(Y, Y==2, [0])
print np.unique(Y)
For logistic regression, the hypothesis is the sigmoid (logistic) function:
$$ f(\beta,X^{(i)}) = \dfrac{1}{1+e^{-(\beta \cdot X^{(i)})}} $$
For prediction, push the probability to either 0 or 1:
$$ d = \begin{cases} 1, & \text{if} \quad f(\beta,X^{(i)}) \ge 0.5 \\ 0, & \text{if} \quad f(\beta,X^{(i)}) \lt 0.5 \end{cases} $$
We need a convex cost function, so we use the "cost" derived from the log-likelihood:
$$ cost(f,y) = \begin{cases} -\log(f) & \text{if } y = 1 \\ -\log(1-f) & \text{if } y = 0 \end{cases} $$
Both cases can be folded into a single expression:
$$ cost(f,y) = -\log\big((1-y)+(2y-1) \cdot f\big) $$
For the loss function we aggregate the per-sample costs and minimize the average:
$$ L = \dfrac{1}{m}\sum_{i=1}^m cost\left(f(\beta,X^{(i)}),y^{(i)}\right) $$
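A quick sanity check of the folded form: with $y=1$ the argument becomes $(1-1)+(2\cdot 1-1)\cdot f = f$, giving $-\log(f)$; with $y=0$ it becomes $(1-0)+(2\cdot 0-1)\cdot f = 1-f$, giving $-\log(1-f)$, so the single expression reproduces both cases.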
In [166]:
def f_func(beta,X):
    # sigmoid of the linear combination beta . x for every row of X
    return 1.0/(1.0 + np.exp(-beta.dot(X.transpose())))
def d_func(f,cutoff=0.5):
    # np.where returns a new array, so the input probabilities are left untouched
    return np.where(f>=cutoff,1,0)
def cost_ifunc(f,y):
    # cost(f,y) = -log((1-y)+(2y-1)*f), i.e. -log(f) if y=1 and -log(1-f) if y=0
    return -math.log((1-y)+(2*y-1)*f)
cost_func = np.vectorize(cost_ifunc)
def grad_func(beta,X,Y):
    # gradient of the summed cost: X^T (f - y); the 1/m factor is folded into the learning rate
    a = f_func(beta,X)-Y
    b = a.dot(X)
    return b
In [167]:
#test
test_x = f_func(np.array([[0.1,0.2,0.2,-0.5]]), np.array([[1,3,1,1],[2,3,4,5]]))
print test_x
print d_func(test_x,0.5)
print cost_func(test_x,[1,0])
print np.average(cost_func(test_x,[1,0]))
t = grad_func(beta,X,Y)
t
Out[167]:
Using gradient descent from this: https://en.wikipedia.org/wiki/Gradient_descent
Stepping (where $\gamma$ is the learning rate): $$\beta_{n+1} \leftarrow \beta_n - {\gamma \cdot \nabla L}$$
Working out the derivative of $L$ with respect to each component $\beta_j$ gives the final per-component update:
$$\beta_{j,n+1} \leftarrow \beta_{j,n} - {\gamma \cdot \dfrac{1}{m}\sum_{i=1}^m\left(f(\beta,x^{(i)})-y^{(i)}\right)\cdot x_j^{(i)}}$$
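In vectorized form over the whole design matrix, the same update uses
$$\nabla L = \dfrac{1}{m}\,X^T\left(f(\beta,X)-y\right)$$
which is what grad_func above computes (it omits the $\frac{1}{m}$, so that constant factor is effectively absorbed into the learning rate).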
In [117]:
beta = np.array([0.5,0.5,0.5,0.5])
y_prob = f_func(beta,X[0])
print y_prob
Run it!
In [179]:
# Gradient Descent method
# init beta with a constant guess of 0.5 for every coefficient
learn_rate = 0.01
beta = np.array([0.5,0.5,0.5,0.5])
y_prob = f_func(beta,X)
y_pred = d_func(y_prob)
curr_loss = np.average(cost_func(y_prob,Y))
loss = []
loss.append(curr_loss)
print curr_loss
for i in range(0,1000):
    beta = beta - learn_rate*grad_func(beta,X,Y)
    y_prob = f_func(beta,X)
    y_pred = d_func(y_prob)
    curr_loss = np.average(cost_func(y_prob,Y))
    loss.append(curr_loss)
    #print 'curr={},prev={},diff={}'.format(curr_loss,loss[-2],loss[-2]-curr_loss)
    if (i>10) and ((loss[-2] - curr_loss) < 10**-5): #stopping criterion: improvement below 1e-5
        print 'stop at {}'.format(i)
        break
unique, counts = np.unique(Y, return_counts=True) #class balance (not used below)
print beta
plt.figure()
plt.xlabel('no. of iterations')
plt.ylabel('loss function')
sns.tsplot(loss)
Out[179]:
In [177]:
from sklearn.metrics import confusion_matrix
cm_mat = confusion_matrix(Y,d_func(f_func(beta,X),0.6))
print cm_mat.T
df_temp = pd.DataFrame(cm_mat.flatten()[np.newaxis].T,columns = ['values'])
plt.figure(figsize = (6,4),dpi=600)
sns.heatmap(cm_mat.T, cbar=True ,annot=True, fmt=',.0f')
plt.xlabel('Truth')
plt.ylabel('Predicted')
Out[177]:
In [154]:
print X[0]
print Y
print y_pred
print y_prob[0]
In [102]:
print X[0:5]
print np.gradient(X[0:5],axis=0)
print beta
Using Newton's optimization method from this: http://www.stat.cmu.edu/~cshalizi/402/lectures/14-logistic-regression/lecture-14.pdf
Also this, page 4, cubic iteration: http://www.sztaki.hu/~bozoki/oktatas/nemlinearis/SebahGourdon-Newton.pdf
For implementation: https://ipvs.informatik.uni-stuttgart.de/mlr/marc/teaching/13-Optimization/04-secondOrderOpt.pdf
For finding the optimum of $f(x)$ in 1D: $$x_{n+1} \leftarrow x_n - \dfrac{f'(x_n)}{f''(x_n)}$$ For the matrix version, Hessians all the way: $$\beta_{n+1} \leftarrow \beta_n - H^{-1}(\beta_n)\,\nabla L(\beta_n)$$
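The notebook stops short of implementing this, but a minimal sketch of one Newton step for the same logistic loss could look like the cell below (reusing f_func; the Hessian of the logistic loss is $X^T\,\mathrm{diag}\!\left(f(1-f)\right)\,X$; newton_step is a hypothetical helper added here, not part of the original code):
In [ ]:
def newton_step(beta, X, Y):
    # one Newton / IRLS update: beta <- beta - H^{-1} grad
    f = f_func(beta, X)                          # predicted probabilities, shape (m,)
    grad = X.T.dot(f - Y) / len(Y)               # gradient of the average loss
    W = f * (1.0 - f)                            # diagonal of the weight matrix
    H = X.T.dot(X * W[:, np.newaxis]) / len(Y)   # Hessian: X^T diag(W) X / m
    return beta - np.linalg.solve(H, grad)       # solve instead of forming H^{-1}

beta_n = np.array([0.5,0.5,0.5,0.5])
for _ in range(10):                              # Newton typically needs only a few steps
    beta_n = newton_step(beta_n, X, Y)
print(beta_n)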
In [122]:
# Compute R squared (explained-variance form, using the hard 0/1 predictions as fitted values)
ssreg = np.sum((y_pred-Y.mean())**2)
sstot = np.sum((Y-Y.mean())**2)
R_sq = ssreg/sstot
print 'R Squared = {:.2f}'.format(R_sq)
In [123]:
#standardized residual plot
res = Y-y_pred
res_d = ((1.0/(len(res)-1))*np.sum(res**2))**0.5 #sample std of the residuals
res_stu = res/res_d
plt.figure()
grid = sns.JointGrid(x=Y, y=res_stu, space=0, size=6, ratio=50)
grid.plot_joint(plt.scatter, color="g")
grid.plot_marginals(sns.rugplot, height=1, color="g")
grid.set_axis_labels(xlabel="true class label (Y)",ylabel="standardized residuals")
Out[123]:
In [47]:
res_d
Out[47]:
In [124]:
res_stu[0:10]
Out[124]:
In [ ]: