So far we have been looking at regression problems, where the target is a continuous variable. Let us now look at a binary classification problem, where we need to predict the correct class, 0 or 1.
In [1]:
import numpy as np
import pandas as pd
In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 6)
In [3]:
pop = pd.read_csv('data/cars_small.csv')
In [4]:
pop.head()
Out[4]:
Let's say we want to classify the vehicles as 'Hatchback' or 'Sedan'.
In [5]:
class_mapping = {'Hatchback': 0, 'Sedan': 1}
In [6]:
pop['types'] = pop['type'].map(class_mapping)
In [7]:
plt.scatter(pop.price, pop.types, c=pop.types, s = 150, alpha = 0.8 )
plt.xlabel('price')
plt.ylabel('types')
Out[7]:
In [8]:
def ols(df, xlabel, ylabel):
    # Build the design matrix with an intercept column
    n = df.shape[0]
    x0 = np.ones(n)
    x1 = df[xlabel]
    X = np.c_[x0, x1]
    X = np.asmatrix(X)
    y = np.transpose(np.asmatrix(df[ylabel]))
    # Normal equations: beta = (X^T X)^-1 X^T y
    X_T = np.transpose(X)
    X_pseudo = np.linalg.inv(X_T * X) * X_T
    beta = X_pseudo * y
    return beta
In [9]:
def plot_ols(df, xlabel, ylabel):
    # Fit the linear model and unpack the coefficients
    beta = ols(df, xlabel, ylabel)
    beta_0 = beta.item(0)
    beta_1 = beta.item(1)
    # Scatter the data and overlay the fitted line
    plt.scatter(df[xlabel], df[ylabel], c=df[ylabel], s=150, alpha=0.8)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    y = beta_0 + beta_1 * df[xlabel]
    plt.plot(df[xlabel], y, '-')
    # Vertical line where the fitted value crosses 0.5
    cutoff = (0.5 - beta_0) / beta_1
    plt.vlines(cutoff, -0.4, 1.4)
In [10]:
ols(pop, 'price', 'types')
Out[10]:
In [11]:
plot_ols(pop, 'price', 'types')
However, there are two problems with this approach: the fitted line is very sensitive to outliers, and its predicted values are not bounded between 0 and 1, so they cannot be read as probabilities.
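To see the second problem, here is a minimal sketch using the ols fit from above: the fitted values at the extremes of price are not constrained to lie between 0 and 1.

# Sketch: linear regression "probabilities" can leave the [0, 1] range
beta = ols(pop, 'price', 'types')
beta_0, beta_1 = beta.item(0), beta.item(1)
print(beta_0 + beta_1 * pop.price.min())   # can fall below 0
print(beta_0 + beta_1 * pop.price.max())   # can exceed 1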
In [12]:
pop1 = pop.copy()
In [13]:
pop1.tail()
Out[13]:
In [14]:
# Let's create a couple of outliers in price
pop1.loc[37,'price'] = 1500
pop1.loc[41,'price'] = 2000
In [15]:
plot_ols(pop1, 'price', 'types')
In [16]:
z = np.linspace(-10, 10, 100)
In [17]:
p = 1/(1+np.exp(-z))
In [19]:
plt.plot(z,p)
plt.hlines(0.5, -20,20)
plt.vlines(0, 0,1)
plt.xlabel('z')
plt.ylabel('P(z)')
Out[19]:
This S-shaped curve is the logistic (sigmoid) function, $P(z) = \frac{1}{1 + e^{-z}}$, which maps any real number to a value between 0 and 1. We can now transform our linear regression problem with it:
$$ y = X\beta $$
becomes
$$ P(y) = P(X\beta) $$
and we predict
$$ y = 1 \text{ when } P(X\beta) > 0.5, $$
which is equivalent to
$$ X\beta > 0 $$
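A quick numerical check of this rule (a small sketch): $P(z)$ crosses 0.5 exactly at $z = 0$, so the sign of $X\beta$ decides the class.

# Sketch: P(z) > 0.5 exactly when z > 0
for z_val in [-2.0, -0.1, 0.0, 0.1, 2.0]:
    print(z_val, 1/(1 + np.exp(-z_val)) > 0.5)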
Let's see this in our example, with the 'price' and 'kmpl' variables as predictors of the class $types$:
$$ \beta_0 + \beta_1 \cdot price + \beta_2 \cdot kmpl > 0 $$
So we are looking for a line in the (kmpl, price) plane that fulfills the above condition and separates the two classes.
In [20]:
plt.scatter(pop['kmpl'], pop['price'], c=pop['types'], s = 150, alpha = 0.8 )
plt.xlabel('kmpl')
plt.ylabel('price')
Out[20]:
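On this scatter, any candidate decision boundary is the straight line where $\beta_0 + \beta_1 \cdot price + \beta_2 \cdot kmpl = 0$. A small sketch with hand-picked, purely illustrative coefficients (not fitted to this data) shows how such a line would be drawn:

# Hypothetical coefficients, chosen only to illustrate a boundary line
b0, b1, b2 = -10, 0.01, 0.3
kmpl_range = np.linspace(pop.kmpl.min(), pop.kmpl.max(), 100)
# Boundary: b0 + b1*price + b2*kmpl = 0  =>  price = -(b0 + b2*kmpl) / b1
price_boundary = -(b0 + b2 * kmpl_range) / b1
plt.scatter(pop['kmpl'], pop['price'], c=pop['types'], s=150, alpha=0.8)
plt.plot(kmpl_range, price_boundary, '-')
plt.xlabel('kmpl')
plt.ylabel('price')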
In [21]:
z = np.linspace(0.001, 0.999, 1000)
In [22]:
c1 = -np.log(z)
c2 = -np.log(1-z)
In [23]:
plt.plot(z,c1)
plt.plot(z,c2)
#plt.hlines(0.5, -10,10)
#plt.vlines(0, 0,1)
plt.xlabel('z')
plt.ylabel('Cost')
Out[23]:
The two curves above are the per-observation costs: $-\log(P)$ when the true class is $y = 1$ and $-\log(1 - P)$ when $y = 0$. To make this easier to work with, we can combine them into a single expression, the (negative) log-likelihood or cross-entropy cost:
$$ C(\beta) = \frac{1}{n} \left( - y^T \log(P(X\beta)) - (1 - y)^T \log(1 - P(X\beta)) \right) $$
Differentiating this gives a gradient of the same form as the linear regression one (up to a constant factor, which only rescales the learning rate):
$$ \nabla C(\beta) = \frac{2}{n} X^T (P(X\beta) - y) $$
and our gradient descent update is
$$ \beta_{i+1} = \beta_{i} - \eta \, \nabla C(\beta) $$
In [24]:
n = pop.shape[0]
x0 = np.ones(n)
x1 = pop.kmpl
x2 = pop.price
X_actual = np.c_[x1, x2]
In [25]:
X_norm = (X_actual - np.mean(X_actual, axis=0)) / np.std(X_actual, axis=0)
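Scaling each feature to zero mean and unit variance keeps price and kmpl on comparable ranges, so a single learning rate works for both coefficients. A quick sanity check (sketch):

# Each column of X_norm should now have mean ~0 and standard deviation ~1
print(np.mean(X_norm, axis=0))
print(np.std(X_norm, axis=0))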
In [26]:
X = np.c_[x0, X_norm]
In [27]:
X = np.asmatrix(X)
y = np.asmatrix(pop.types.values.reshape(-1,1))
b = np.asmatrix([[0],[0],[0]])
In [28]:
def P(z):
    return 1.0/(1+np.exp(-z))
In [29]:
def cost(X, y, b, n):
    # Cross-entropy cost averaged over the n observations
    C = (- y.T*np.log(P(X*b)) - (1-y.T)*np.log(1-P(X*b)))/n
    return C[0,0]
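As a sanity check, at $\beta = 0$ every predicted probability is 0.5, so the cost should equal $-\log(0.5) = \log 2 \approx 0.693$ regardless of the labels:

# With b = 0, P(X*b) = 0.5 for every row, so the cost is -log(0.5) ≈ 0.6931
print(cost(X, y, b, n))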
In [30]:
def gradient(X, y, b, n):
    # Gradient used for the descent step (scaled by 2/n, matching the linear regression form)
    g = (2/n)*X.T*(P(X*b) - y)
    return g
In [31]:
def gradient_descent_logistic(eta, epochs, X, y, n):
    # Set Initial Values
    b = np.asmatrix([[0],[0],[0]])
    c = cost(X, y, b, n)
    c_all = []
    c_all.append(c)
    # Run the calculation for those many epochs
    for i in range(epochs):
        g = gradient(X, y, b, n)
        b = b - eta * g
        c = cost(X, y, b, n)
        c_all.append(c)
    return c_all, b
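Before visualising the result, a quick run of this function (a sketch; the exact numbers depend on the data) prints the final cost and the learned coefficients:

# Run 1000 epochs with a learning rate of 0.05 and inspect the result
c_all, b_fit = gradient_descent_logistic(0.05, 1000, X, y, n)
print(c_all[-1])   # final value of the cost
print(b_fit)       # learned intercept and coefficients (on the normalised features)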
In [32]:
x1_min, x1_max = -3, 3
x2_min, x2_max = -3, 3
xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, (x1_max - x1_min)/100),
np.arange(x2_min, x2_max, (x2_max - x2_min)/100))
xx = np.c_[np.ones(xx1.ravel().shape[0]), xx1.ravel(), xx2.ravel()]
In [33]:
def plot_gradient_descent(eta, epoch, gradient_func):
    es, bs = gradient_func(eta, epoch, X, y, n)
    # Left panel: probability contours with the data points overlaid
    plt.subplot(1, 2, 1)
    Z = P(xx*bs)
    Z = Z.reshape(xx1.shape)
    cs = plt.contourf(xx1, xx2, Z, cmap=plt.cm.viridis, alpha=0.5)
    plt.colorbar(cs)
    plt.scatter(X[:,1], X[:,2], c=pop.types, s=150, alpha=0.8)
    plt.xlabel('kmpl')
    plt.ylabel('price')
    # Right panel: cost against epochs
    plt.subplot(1, 2, 2)
    plt.plot(es)
    plt.xlabel('Epochs')
    plt.ylabel('Error')
In [34]:
plot_gradient_descent(0.05, 1000, gradient_descent_logistic)
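For comparison, scikit-learn's LogisticRegression can be fitted on the same normalised features. This is a sketch for cross-checking, assuming scikit-learn is installed; its solver applies L2 regularisation by default, so a large C is used to keep it close to an unregularised fit, and the coefficients may still differ slightly from the gradient descent result.

# Sketch: cross-check against scikit-learn
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1e6)   # large C ≈ effectively no regularisation
clf.fit(X_norm, pop.types)
print(clf.intercept_, clf.coef_)  # compare with the beta learned by gradient descent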
Try with learning rate = 0.1 and epochs = 500.
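One way to do this with the helper defined above:

plot_gradient_descent(0.1, 500, gradient_descent_logistic)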