In [ ]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import scipy.sparse.linalg as linalg
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist, squareform
import sklearn.datasets as datasets
import sklearn.metrics as metrics
import sklearn.utils as utils
import sklearn.linear_model as linear_model
import sklearn.cross_validation as cross_validation
import sklearn.cluster as cluster
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from patsy import dmatrices
import seaborn as sns
%matplotlib inline
In [ ]:
# read the data in
df = pd.read_csv("http://www.ats.ucla.edu/stat/data/binary.csv")
df.groupby('rank').mean()
train_cols = df.columns[1:]
In [ ]:
df1 = df[df['rank']==1]
df2 = df[df['rank']==2]
df3 = df[df['rank']==3]
df4 = df[df['rank']==4]
plt.scatter(df1['gre'],df1['admit'])
Logistic regression fits probability of the following form: $$\pi(x) = P(y=1\mid x) = \frac{e^{\alpha+\beta x}}{1+e^{\alpha+\beta x}}$$
This is a sigmoid function; when $x\rightarrow \infty$, then $\pi(x)\rightarrow 1$ and when $x\rightarrow -\infty$, then $\pi(x)\rightarrow 0$.
In [ ]:
alphas = [-4, -8,-12,-20]
betas = [0.4,0.4,0.6,1]
x = np.arange(40)
fig = plt.figure(figsize=(8, 6))
ax = plt.subplot(111)
for i in range(len(alphas)):
a = alphas[i]
b = betas[i]
y = np.exp(a+b*x)/(1+np.exp(a+b*x))
plt.plot(x,y, label="$\\frac{e^{ %d + %.fx}}{1+e^{%d + %.fx}}$" %(a,b,a,b))
plt.xlabel('x')
plt.ylabel('$\pi(x)$')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 20})
Parameter $\beta$ controls how fast $\pi(x)$ raises from $0$ to $1$
Parameter $\alpha$ shows the value of $x$ for which $\pi(x)=0.5$
For variable $y_i\in\{0,1\}$, the expected valye of $y_i$ given $x_i$ is $\text{logit}\left(\pi(x_i)\right)$
Linear regression: $y_i = \alpha +\beta x_i$
$y_i$ comes from a normal distribution with standard deviation $\sigma$
$y_i=\alpha + \beta x_i$ is the linear predictor
Logistic regression: The expected value of $E[y_i]$ given $x_i$ is $\pi_i=\pi(x_i)$ and $$\text{logit}\left(E[y_i]\right)=\text{logit}\left(\pi(x_i)\right)=\alpha + \beta x_i$$
$y_i\in\{0,1\}$ with $\text{Pr}(y_i=1)=\pi(x_i)$
Input pairs $(x_i,y_i)$
Output parameters $\widehat{\alpha}$ and $\widehat{\beta}$ that maximize the likelihood of the data given these parameters for the logistic regression model.
Method Maximum likelihood estimation
In [ ]:
logit = sm.Logit(df['admit'], df[train_cols])
# fit the model
result = logit.fit()
In [ ]:
print result.summary()
Some more information on performing logistic regression using the statmodels package can be found here:
http://blog.yhathq.com/posts/logistic-regression-and-python.html
Extramarital affair data used to explain the allocation of an individual’s time among work, time spent with a spouse, and time spent with a paramour. The data is used as an example of regression with censored data.
In [ ]:
# load dataset
dta = sm.datasets.fair.load_pandas().data
print dta.head(2)
In [ ]:
# add "affair" column: 1 represents having affairs, 0 represents not
dta['affair'] = (dta.affairs > 0).astype(int)
Understanding our data
In [ ]:
dta.groupby('affair').mean()
We can see that on average, marriages with affairs are rated lower, which is to be expected.
In [ ]:
dta.groupby('rate_marriage').mean()
An increase in age, yrs_married, and children appears to correlate with a declining marriage rating.
In [ ]:
# histogram of education
dta.educ.value_counts().sort_index().plot(kind='bar')
plt.title('Histogram of Education')
plt.xlabel('Education Level')
plt.ylabel('Frequency')
plt.show()
In [ ]:
# histogram of marriage rating
dta.rate_marriage.value_counts().sort_index().plot(kind='bar', by='affair')
plt.title('Histogram of Marriage Rating')
plt.xlabel('Marriage Rating')
plt.ylabel('Frequency')
plt.show()
In [ ]:
# barplot of marriage rating grouped by affair (True or False)
pd.crosstab(dta.rate_marriage, dta.affair.astype(bool)).plot(kind='bar')
plt.title('Marriage Rating Distribution by Affair Status')
plt.xlabel('Marriage Rating')
plt.ylabel('Frequency')
plt.legend(loc='center left', title="Affair", bbox_to_anchor=(1, 0.5))
plt.show()
Stacked barplot to look at the percentage of women having affairs by number of years of marriage.
In [ ]:
affair_yrs_married = pd.crosstab(dta.yrs_married, dta.affair.astype(bool))
affair_yrs_married.div(affair_yrs_married.sum(1).astype(float), axis=0).plot(kind='bar',
stacked=True
)
plt.title('Affair Percentage by Years Married')
plt.xlabel('Years Married')
plt.ylabel('Percentage')
plt.legend(loc='center left', title="Affair", bbox_to_anchor=(1, 0.5))
plt.show()
Treat affair as a target variable and occupation and occupation_husb as boolean variables:
In [ ]:
# create dataframes with an intercept column and dummy variables for
# occupation and occupation_husb
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + \
religious + educ + C(occupation) + C(occupation_husb)',
dta, return_type="dataframe")
print X.columns
Renaming the occupation-related binary variables
In [ ]:
# fix column names of X
X = X.rename(columns = {'C(occupation)[T.2.0]':'occ_2',
'C(occupation)[T.3.0]':'occ_3',
'C(occupation)[T.4.0]':'occ_4',
'C(occupation)[T.5.0]':'occ_5',
'C(occupation)[T.6.0]':'occ_6',
'C(occupation_husb)[T.2.0]':'occ_husb_2',
'C(occupation_husb)[T.3.0]':'occ_husb_3',
'C(occupation_husb)[T.4.0]':'occ_husb_4',
'C(occupation_husb)[T.5.0]':'occ_husb_5',
'C(occupation_husb)[T.6.0]':'occ_husb_6'})
print X.columns
In [ ]:
print y.shape,
y = np.ravel(y)
print y.shape
In [ ]:
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,
y,
test_size=0.3,
random_state=0
)
logistic_regr = linear_model.LogisticRegression()
logistic_regr.fit(X_train, y_train)
In [ ]:
# predict class labels for the test set
y_predicted = logistic_regr.predict(X_test)
print y_predicted
In [ ]:
#prediction probabilities
probs = logistic_regr.predict_proba(X_test)
print probs
In [ ]:
# generate evaluation metrics
print metrics.accuracy_score(y_test, y_predicted)
In [ ]:
print metrics.confusion_matrix(y_test, y_predicted)
print metrics.classification_report(y_test, y_predicted)
Looking at the coefficients
In [ ]:
# examine the coefficients
print pd.DataFrame(zip(X.columns, np.transpose(logistic_regr.coef_)))
In [ ]:
# evaluate the model using 10-fold cross-validation
scores = cross_validation.cross_val_score(linear_model.LogisticRegression(),
X,
y,
scoring='accuracy',
cv=10)
print scores
print scores.mean()
Probability of an affair for a random woman not present in the dataset. She's a 25-year-old teacher who graduated college, has been married for 3 years, has 1 child, rates herself as strongly religious, rates her marriage as fair, and her husband is a farmer.
In [ ]:
logistic_regr.predict_proba(np.array([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 3, 25, 3, 1, 4,
16]))
In [76]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
styles = open("../theme/custom.css", "r").read()
return HTML(styles)
css_styling()
Out[76]: