In [1]:
%pylab inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
In [12]:
# Load the credit-card default data set (path is relative to the notebook).
default = pd.read_csv("Data/Default.csv")

# A cell only renders its *last* expression, so the preview has to be shown
# explicitly; a bare `default.head()` here would be computed and thrown away.
# `display` is provided by IPython (and by the `%pylab inline` call above).
display(default.head())

# Summary statistics of the numeric columns (rendered as the cell output).
default.describe()
Out[12]:
In [13]:
# `sklearn.cross_validation` was deprecated and later removed from scikit-learn;
# `train_test_split` now lives in `sklearn.model_selection`. Fall back to the
# old location only so that very old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [15]:
# Hold out 33% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(
    default,
    test_size=0.33,
    random_state=1,
)
In [21]:
# Re-wrap both splits as DataFrames carrying the original column labels.
column_names = default.columns
train = pd.DataFrame(train, columns=column_names)
test = pd.DataFrame(test, columns=column_names)
In [20]:
# Quick look at the distributions: one histogram per numeric column of the
# training split.
train.hist()
Out[20]:
In [27]:
# Income (x-axis) against balance (y-axis) for the training split.
plt.scatter(train['income'], train['balance'])
plt.xlim([0, 75000])
plt.ylim([0, 3000])
plt.xlabel('Income')
plt.ylabel('Balance')
plt.title('Income vs Balance')
Out[27]:
In [60]:
# Same relationship through the pandas plotting API, this time with balance
# on the x-axis; alpha makes the dense regions visible.
train.plot(kind='scatter', x='balance', y='income', alpha=0.3)
plt.ylim(0, 80000)
plt.xlim(0, 3000)
Out[60]:
In [61]:
# Partition the training rows by outcome using explicit boolean masks.
defaulted_mask = train['default'] == 1
not_defaulted_mask = train['default'] == 0
train_default = train[defaulted_mask]
train_no_default = train[not_defaulted_mask]
In [62]:
# Open a new, empty Matplotlib figure.
# NOTE(review): with the inline backend each cell renders its own figure, so
# this empty figure is presumably not the one the next cell draws into —
# confirm whether this cell is still needed.
plt.figure()
Out[62]:
In [63]:
# Balance (x) vs. income (y) on the training split, coloured by outcome:
# green '+' for customers who did not default, hollow red circles for those
# who did.
plt.scatter(train_no_default.balance, train_no_default.income, alpha=0.5, marker='+', c='g')
plt.scatter(train_default.balance, train_default.income, marker='o', edgecolors='r', facecolors='none')
# Legend entries follow the order in which the two scatters were drawn.
plt.legend(('No Default', 'Default'), loc='upper right')
plt.ylim([0, 80000])
plt.xlim([0, 3000])
# Balance is plotted on the x-axis and income on the y-axis, so the labels
# must be in that order (they were previously swapped).
plt.xlabel('Balance')
plt.ylabel('Income')
Out[63]:
Plot the same graph differently
In [64]:
# Same balance (x) vs. income (y) plot with the styling reversed: filled green
# circles for defaulters, red '+' for non-defaulters.
plt.scatter(train_default.balance, train_default.income, alpha=0.5, marker='o', c='g')
# '+' is an unfilled marker: Matplotlib takes its colour from `c`/face colour
# and ignores `edgecolors`, so set the colour directly instead of combining
# edgecolors='r' with facecolors='none'.
plt.scatter(train_no_default.balance, train_no_default.income, marker='+', c='r')
# Legend entries follow the order in which the two scatters were drawn.
plt.legend(('Default', 'No Default'), loc='upper right')
plt.ylim([0, 80000])
plt.xlim([0, 3000])
# Balance is plotted on the x-axis and income on the y-axis, so the labels
# must be in that order (they were previously swapped).
plt.xlabel('Balance')
plt.ylabel('Income')
Out[64]:
=============================================
Logistic Regression
=============================================
In [68]:
# Fit a one-predictor logistic regression: default as a function of balance,
# estimated on the training split only.
balance_model = smf.logit(formula='default ~ balance', data=train)
bal = balance_model.fit()
bal.summary()
Out[68]:
In [134]:
# Confidence intervals for the fitted coefficients (intercept and balance),
# at statsmodels' default level (alpha=0.05, i.e. 95%).
bal.conf_int()
Out[134]:
In [122]:
# Odds ratio for balance, i.e. exp(beta_balance): the multiplicative change in
# the odds of default for a one-unit increase in balance.
# http://www.ats.ucla.edu/stat/mult_pkg/faq/general/odds_ratio.htm
balance_coef = bal.params['balance']
np.exp(balance_coef)
Out[122]:
In [123]:
# Predicted probability of default at two example balances (1200 and 2000).
example_balances = {'balance': [1200, 2000]}
prob = bal.predict(example_balances)
print(prob)
In [124]:
# Predicted probability of default for a single balance of 1900.
bal.predict(dict(balance=1900))
Out[124]:
In [125]:
# 500 evenly spaced balances spanning the range observed in the test split.
balance_low = test.balance.min()
balance_high = test.balance.max()
x = np.linspace(balance_low, balance_high, 500)

# Fitted coefficients as [intercept, slope] for the manual curves below.
betas = [bal.params['Intercept'], bal.params['balance']]
In [126]:
# Build the three views of the fitted model from the linear predictor outward:
# log-odds -> odds -> probability.
intercept, slope = betas
log_odds = intercept + slope*x
odds = np.exp(log_odds)
y = odds / (1 + odds)
In [ ]:
In [132]:
# Fitted probability of default across the balance grid, annotated with the
# logistic function that produced it.
logistic_label = r'$\frac{e^{\beta_o + \beta_1x}}{1+e^{\beta_o + \beta_1x}}$'
plt.plot(x, y, 'r', lw=2)
plt.ylabel('Probability')
plt.text(500, 0.7, logistic_label, fontsize=25)
Out[132]:
In [129]:
# Odds of default across the balance grid (exponential in balance).
odds_label = r'$e^{\beta_o + \beta_1x}$'
plt.plot(x, odds, 'k', lw=2)
plt.ylabel('Odds')
plt.text(500, 30, odds_label, fontsize=20)
Out[129]:
In [133]:
# Log-odds of default across the balance grid (linear in balance).
log_odds_label = r'$\beta_o + \beta_1x$'
plt.plot(x, log_odds, 'c', lw=2)
plt.ylabel('Log(Odds)')
plt.xlabel('x')
plt.text(500, 2, log_odds_label, fontsize=15)
Out[133]:
In [90]:
# Score the fitted model on 500 evenly spaced balances covering the test
# split's range; the frame's column name must match the formula term.
balance_grid = np.linspace(test.balance.min(), test.balance.max(), 500)
x_data = pd.DataFrame({'balance': balance_grid})
y_data = bal.predict(x_data)
In [131]:
# Overlay the fitted probability curve on the observed test outcomes.
plt.figure()
plt.scatter(test.balance, test.default, alpha=0.5)
# Plot against the balance column explicitly rather than passing the whole
# one-column DataFrame as the x argument.
plt.plot(x_data.balance, y_data, 'r', linewidth=2)
plt.xlabel("Balance")
plt.ylabel("Probability of Default")
plt.ylim([-0.05,1.05])
plt.xlim([0, 2800])
# Draw the segment between the two example predictions. `prob` was computed
# for balances 1200 and 2000, so those are the x positions to use; plotting
# the second value at 1500 put it off the fitted curve.
plt.plot([1200, 2000], prob, 'y')
Out[131]:
In [ ]:
In [ ]: