In [84]:
import pandas as pd
import numpy as np
from random import randint
from pandas import Series,DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import math
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 10)
plt.rcParams['font.size'] = 14

In [85]:
%%html
<style>
table {float:left}
</style>



In [86]:
from sklearn.datasets import load_iris
iris = load_iris()
type(iris)


Out[86]:
sklearn.datasets.base.Bunch

Load and return the boston house-prices dataset (regression).

Description Value
Classes 3
Samples per class 50
Samples total 150
Dimensionality 4
Features real, positive

In [87]:
print type(iris.data)
print iris.data.shape
print iris.target.shape
print iris.data[0:5]
print np.unique(iris.target)


<type 'numpy.ndarray'>
(150L, 4L)
(150L,)
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]
[0 1 2]

In [88]:
#make it a binary classification problem instead
X = np.copy(iris.data)
X = (X - np.average(X,axis=0)) / np.std(X, axis=0)
Y = np.copy(iris.target)
np.place(Y, Y==2, [0])
print np.unique(Y)


[0 1]

For Logistic regression, the normal function is this:

$$ f(\beta,X^{(i)}) = \dfrac{1}{1+e^{-(\beta \cdot X^{(i)})}} $$

For prediction, push to either 0 or 1

$$ d = \begin{cases} 1, & \text{if} \quad f(\beta,X^{(i)}) \ge 0.5 \\ 0, & \text{if} \quad f(\beta,X^{(i)}) \lt 0.5 \end{cases} $$

We need a convex cost function.. enter "cost" function computed from log-likelyhood $$ cost(f,y) = \begin{cases} -log(f) & \text{if y = 1} \\ -log(1-f) & \text{if y = 0} \end{cases} $$

$$ cost(f,y) = -log((1-y)+(2y-1) \cdot f) $$

For the Loss function we can use aggregate errors we need to minimize: $$ L = \dfrac{1}{m}\sum_{i=1}^m cost $$


In [166]:
def f_func(beta,X):
    return 1.0/(1.0 + np.exp(-beta.dot(X.transpose())))

def d_func(f,cutoff=0.5):
    #python function always pass obj by ref_name, array is mutable
    return np.where(f>=cutoff,1,0)

def cost_ifunc(f,y):
    return -math.log1p((1-y)+(2*y-1)*f)

cost_func = np.vectorize(cost_ifunc)

def grad_func(beta,X,Y):
    a = f_func(beta,X)-Y
    b = a.dot(X)
    return b

In [167]:
#test
test_x = f_func(np.array([[0.1,0.2,0.2,-0.5]]), np.array([[1,3,1,1],[2,3,4,5]]))
print test_x
print d_func(test_x,0.5)
print cost_func(test_x,[1,0])
print np.average(cost_func(test_x,[1,0]))
t = grad_func(beta,X,Y)
t


[[ 0.59868766  0.2890505 ]]
[[1 0]]
[[-0.46918308 -0.53704848]]
-0.503115780681
Out[167]:
array([ 0.09767129, -0.04648568, -0.30848977,  0.20314009])

Using gradient descent from this: https://en.wikipedia.org/wiki/Gradient_descent

Stepping (where gamma is learning rate): $$\beta_{n+1} \leftarrow \beta_n - {\gamma \cdot \nabla L}$$

With a complicated derivation here, we arrive at the final equation:

$$\beta_{n+1} \leftarrow \beta_n - {\gamma \cdot \dfrac{1}{m}\sum_{i=1}^m\left(f(\beta,x^{(i)})-y^{(i)}\right)\cdot x_j^{(i)}}$$

In [117]:
beta = np.array([0.5,0.5,0.5,0.5])
y_prob = f_func(beta,X[0])
print y_prob


0.220726699171

Run it!


In [179]:
# Gradient Descent method
# init beta as an average of Y/Xi
learn_rate = 0.01
beta = np.array([0.5,0.5,0.5,0.5])
y_prob = f_func(beta,X)
y_pred = d_func(y_prob)

curr_loss = np.average(cost_func(y_prob,Y))
loss = []
loss.append(curr_loss)
print curr_loss

for i in range(0,1000):
    beta = beta - learn_rate*grad_func(beta,X,Y)
    y_prob = f_func(beta,X)
    y_pred = d_func(y_prob)
    curr_loss = np.average(cost_func(y_prob,Y))
    loss.append(curr_loss)
    #print 'curr={},prev={},diff={}'.format(curr_loss,loss[-2],loss[-2]-curr_loss)
    if (i>10) and ((loss[-2] - curr_loss) < 10**-5): #stopping criterion
        print 'stop at {}'.format(i)
        break

unique, counts = np.unique(Y, return_counts=True)

print beta
plt.figure()
plt.xlabel('no. of run')
plt.ylabel('loss function')
sns.tsplot(loss)


-0.388231045571
stop at 205
[ 0.08103381 -1.06981926  1.30688047 -1.41558835]
Out[179]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a8da048>

In [177]:
from sklearn.metrics import confusion_matrix
cm_mat = confusion_matrix(Y,d_func(f_func(beta,X),0.6))
print cm_mat.T
df_temp = pd.DataFrame(cm_mat.flatten()[np.newaxis].T,columns = ['values'])
df_temp = pd.DataFrame(cm_mat.flatten()[np.newaxis].T,columns = ['values'])
plt.figure(figsize = (6,4),dpi=600)
sns.heatmap(cm_mat.T, cbar=True ,annot=True, fmt=',.0f')
plt.xlabel('Truth')
plt.ylabel('Predicted')


[[81 16]
 [19 34]]
Out[177]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a558898>

In [154]:
print X[0]
print Y
print y_pred
print y_prob[0]


[-0.90068117  1.03205722 -1.3412724  -1.31297673]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
[0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0
 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0
 1 0 1 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0
 0 1]
0.255146724964

In [102]:
print X[0:5]
print np.gradient(X[0:5],axis=0)
print beta


[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]
[[-0.2  -0.5   0.    0.  ]
 [-0.2  -0.15 -0.05  0.  ]
 [-0.15  0.05  0.05  0.  ]
 [ 0.15  0.2   0.05  0.  ]
 [ 0.4   0.5  -0.1   0.  ]]
[1 2 3 4]

In [26]:

Using Newton's optimization method from this: http://www.stat.cmu.edu/~cshalizi/402/lectures/14-logistic-regression/lecture-14.pdf

Also this, page 4, cubic iteration: http://www.sztaki.hu/~bozoki/oktatas/nemlinearis/SebahGourdon-Newton.pdf

For implementation: https://ipvs.informatik.uni-stuttgart.de/mlr/marc/teaching/13-Optimization/04-secondOrderOpt.pdf

For finding optima of f(x) in 1D: $$x_{n+1} \leftarrow x_n - \dfrac{f'(x)}{f''(x)}$$ For matrix version, hessians all the way: $$x_{n+1} \leftarrow x_n - \dfrac{f'(x)}{f''(x)}$$


In [122]:
# Computer R Squared
ssreg = np.sum((y_pred-Y.mean())**2)
sstot = np.sum((Y-Y.mean())**2)
R_sq = ssreg/sstot
print 'R Squared  = {:.2f}'.format(R_sq)


R Squared  = 1.33

In [123]:
#standardized residual plot
res = Y-y_pred
res_d = ((1.0/(len(res)-1))*np.sum(res**2))**0.5
res_stu = res/res_d

plt.figure()
grid = sns.JointGrid(x=Y, y=res_stu, space=0, size=6, ratio=50)
grid.plot_joint(plt.scatter, color="g")
grid.plot_marginals(sns.rugplot, height=1, color="g")
grid.set_axis_labels(xlabel="Boston house price",ylabel="studentized residues")


Out[123]:
<seaborn.axisgrid.JointGrid at 0x10a08c50>
<matplotlib.figure.Figure at 0x10a08f98>

In [47]:
res_d


Out[47]:
4.9214055276647812

In [124]:
res_stu[0:10]


Out[124]:
array([ 0.        , -1.78050913,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -1.78050913,  0.        ])

In [ ]: