In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import sys
sys.path.append('..')
from helper import logistic_regression as lr # my own module
from helper import general
from sklearn.metrics import classification_report
In [2]:
# prepare data
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
data.head()
Out[2]:
In [3]:
X = general.get_X(data)
print(X.shape)
y = general.get_y(data)
print(y.shape)
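get_X and get_y come from my own helper module, so here is a hypothetical sketch of what they presumably do: get_X prepends a column of ones (the intercept feature) and returns the design matrix, while get_y pulls out the label column. The exact implementation lives in the helper package.

def get_X(df):
    # hypothetical sketch: prepend an intercept column of ones, drop the label
    ones = pd.DataFrame({'ones': np.ones(len(df))})
    return pd.concat([ones, df], axis=1).iloc[:, :-1].values

def get_y(df):
    # hypothetical sketch: the last column is the label
    return np.array(df.iloc[:, -1])

With this data, X is (m, 3) (two exam scores plus the intercept) and y is (m,).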
In [4]:
fig, ax = plt.subplots(figsize=(8, 6))
z = np.arange(-10, 10, step=0.01)
ax.plot(z, lr.sigmoid(z))
ax.set_ylim((-0.1,1.1))
ax.set_xlabel('z', fontsize=18)
ax.set_ylabel('g(z)', fontsize=18)
ax.set_title('sigmoid function', fontsize=18)
Out[4]:
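For reference, the sigmoid is $g(z) = \frac{1}{1 + e^{-z}}$; it squashes any real input into $(0, 1)$. A one-line sketch of what lr.sigmoid presumably computes:

def sigmoid(z):
    # hypothetical sketch of lr.sigmoid: maps any real z into (0, 1)
    return 1 / (1 + np.exp(-z))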
In [5]:
theta = np.zeros(3)  # X is (m, n), so theta has shape (n,)
theta
Out[5]:
In [6]:
lr.cost(theta, X, y)
Out[6]:
Looking good; just be careful with the data shapes.
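The cost being evaluated is $J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\big[-y^{(i)}\log(h_\theta(x^{(i)})) - (1-y^{(i)})\log(1-h_\theta(x^{(i)}))\big]$, where $h_\theta(x) = g(\theta^T x)$. A hedged, vectorized sketch of what lr.cost presumably does (using the sigmoid sketch above):

def cost(theta, X, y):
    # hypothetical sketch of lr.cost: mean cross-entropy over all m examples
    h = sigmoid(X @ theta)  # (m,) predicted probabilities
    return np.mean(-y * np.log(h) - (1 - y) * np.log(1 - h))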
In [7]:
lr.gradient(theta, X, y)
Out[7]:
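The gradient works out to $\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)}$, which vectorizes to a single matrix product. A sketch of what lr.gradient presumably computes:

def gradient(theta, X, y):
    # hypothetical sketch of lr.gradient: (1/m) * X^T (h - y); shape matches theta
    return X.T @ (sigmoid(X @ theta) - y) / len(X)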
- Here I'm using scipy.optimize.minimize to find the parameters.
- I first used this solver without understanding what a Jacobian is: for a scalar cost like $J(\theta)$, the Jacobian is just the gradient vector $\partial J / \partial \theta$, which is why lr.gradient is passed as jac below.
In [8]:
import scipy.optimize as opt
In [9]:
res = opt.minimize(fun=lr.cost, x0=theta, args=(X, y), method='Newton-CG', jac=lr.gradient)
In [10]:
print(res)
In [11]:
final_theta = res.x
y_pred = lr.predict(X, final_theta)
print(classification_report(y, y_pred))
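lr.predict presumably thresholds the predicted probability at 0.5; a minimal sketch:

def predict(X, theta):
    # hypothetical sketch of lr.predict: class 1 iff P(y=1 | x) >= 0.5
    return (sigmoid(X @ theta) >= 0.5).astype(int)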
See http://stats.stackexchange.com/questions/93569/why-is-logistic-regression-a-linear-classifier for why logistic regression is a linear classifier: the decision boundary is the set of points where $X \theta = 0$ (this is the line), equivalently where $g(X\theta) = 0.5$.
In [12]:
print(res.x)  # the final theta
In [13]:
coef = -(res.x / res.x[2])  # rescale so the exam2 coefficient becomes -1
print(coef)
x = np.arange(130, step=0.1)
y = coef[0] + coef[1] * x   # the boundary: exam2 as a function of exam1
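To see why: on the boundary $\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0$, and dividing through by $-\theta_2$ gives $x_2 = -\frac{\theta_0}{\theta_2} - \frac{\theta_1}{\theta_2} x_1$, which is exactly y = coef[0] + coef[1] * x above.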
In [14]:
data.describe()  # check the score ranges to pick sensible plot limits
Out[14]:
From coef, the boundary line crosses both axes at around 125, so axis limits of 0 to 130 will contain both the data and the line.
In [15]:
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 25}
           )
plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
Out[15]: