In [1]:
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

# set plotting options
%matplotlib inline
plt.style.use('bmh')

In [2]:
# load synthetic data: design matrix X and binary labels y
X = np.genfromtxt('../data/synthetic_data.txt', delimiter=' ')
y = np.genfromtxt('../data/label.txt')
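
Judging from the fitted summary below (which reports a const term) and from the plotting cells (which use columns 1 and 2 as the two features), column 0 of X appears to be a constant intercept column. A quick sanity check of that assumed layout:

In [ ]:
# sanity check: assumes column 0 of X is a column of ones (the intercept)
print(X.shape, y.shape)    # expect (1000, 3) and (1000,)
print(np.unique(X[:, 0]))  # expect [1.] if column 0 is the intercept
print(np.unique(y))        # expect [0. 1.] for binary labels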

In [3]:
# fit model by maximum likelihood (statsmodels' default Newton-Raphson, which
# coincides with Fisher scoring for the canonical logit link), display results
lm = sm.Logit(y, X).fit()
results = lm.summary()
print(results)


Optimization terminated successfully.
         Current function value: 0.047510
         Iterations 11
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                      y   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      997
Method:                           MLE   Df Model:                            2
Date:                Wed, 13 Jul 2016   Pseudo R-squ.:                  0.9315
Time:                        09:37:02   Log-Likelihood:                -47.510
converged:                       True   LL-Null:                       -693.15
                                        LLR p-value:                4.011e-281
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         -8.0686      0.944     -8.550      0.000        -9.918    -6.219
x1             4.2066      0.517      8.132      0.000         3.193     5.220
x2             0.3780      0.147      2.564      0.010         0.089     0.667
==============================================================================

Possibly complete quasi-separation: A fraction 0.48 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
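
The warning can be checked directly: under quasi-complete separation, the fitted probabilities of many observations are pushed numerically to 0 or 1, and the coefficients driving them are not identified. A minimal sketch of that check, using an arbitrary 1e-8 cutoff for "perfectly predicted" (statsmodels' internal criterion may differ):

In [ ]:
# fraction of observations whose fitted probability is numerically 0 or 1;
# the 1e-8 tolerance is an arbitrary choice for this sketch
p = lm.predict()
print(np.mean((p < 1e-8) | (p > 1 - 1e-8)))  # should be near the reported 0.48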

In [4]:
# plot data: the first 500 rows are class 0, the last 500 are class 1
figsize(12, 6)
plt.scatter(X[:500, 1], X[:500, 2], color='red', label='class 0')
plt.scatter(X[500:, 1], X[500:, 2], color='blue', label='class 1')
plt.legend(loc='best');

[figure: scatter of class 0 (red) and class 1 (blue) in the (x1, x2) plane]
In [5]:
# decision boundary: points where the log-odds are zero,
# i.e. b0 + b1*x1 + b2*x2 = 0, solved for x2
xs = np.linspace(0, 3, 100)
ys = -(lm.params[0] + lm.params[1] * xs) / lm.params[2]

# plot data with the fitted boundary overlaid
figsize(12, 6)
plt.scatter(X[:500, 1], X[:500, 2], color='red', label='class 0')
plt.scatter(X[500:, 1], X[500:, 2], color='blue', label='class 1')
plt.plot(xs, ys, color='purple', linestyle='--', label='decision boundary')
plt.ylim(-8, 8)
plt.title('Quasi-Complete Separation')
plt.xlabel('x1')
plt.ylabel('x2')
plt.legend(loc='best');

[figure: the same scatter with the fitted decision boundary (purple dashed line)]
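
As a sanity check, every point on that line should receive a predicted probability of 0.5 (zero log-odds). A minimal verification, again assuming column 0 of X is the intercept column:

In [ ]:
# points on the boundary have zero log-odds, hence fitted probability 0.5
boundary = np.column_stack([np.ones_like(xs), xs, ys])
print(np.allclose(lm.predict(boundary), 0.5))  # expect True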