In [3]:
# Notebook setup: plotting backend/style plus the core data libraries.
# NOTE(review): `pylab` is never imported explicitly in this notebook, so the
# name presumably comes from the %pylab magic below — confirm no other cell
# relies on names injected by the magic before swapping it for
# `%matplotlib inline` with explicit imports.
%pylab inline
# Use the 'ggplot' style for the figures drawn in later cells.
pylab.style.use('ggplot')
import pandas as pd
import numpy as np
In [4]:
import requests

# Location of the CSV that the next cell parses into `data`.
# NOTE(review): confirm this host still serves the file; the URL is kept as-is.
url = 'http://www.ats.ucla.edu/stat/data/binary.csv'

# Bound the request: without a timeout, requests.get() can block indefinitely
# if the server never responds, which stalls "Restart & Run All".
response = requests.get(url, timeout=30)

# Fail here on 4xx/5xx instead of letting an HTML error page be parsed as CSV
# by the following cell.
response.raise_for_status()

# Displayed as the cell output (status of the successful request).
response.status_code
Out[4]:
In [6]:
from io import StringIO

# Parse the downloaded response body as CSV; StringIO presents the text
# as a file-like object so read_csv can consume it without touching disk.
data = pd.read_csv(filepath_or_buffer=StringIO(response.text))
In [7]:
# Show the first five rows as a quick sanity check of the parsed frame.
data.head(n=5)
Out[7]:
In [15]:
# Number of rows for each distinct value of `admit`.
data['admit'].groupby(data['admit']).count()
Out[15]:
In [34]:
# Within each `rank` group, count how often each `admit` value occurs.
data.groupby('rank')['admit'].value_counts()
Out[34]:
In [16]:
# Pairwise correlations between all columns except `rank`.
data.drop('rank', axis='columns').corr()
Out[16]:
In [17]:
# Scatter of `gre` against `admit`.
data.plot.scatter(x='admit', y='gre')
Out[17]:
In [18]:
# Scatter of `gpa` against `admit`.
data.plot.scatter(x='admit', y='gpa')
Out[18]:
In [55]:
import statsmodels.formula.api as sm

# Logistic regression of `admit` on `gpa`, with `rank` entered as a
# categorical term via C(...).
# NOTE(review): `gre` is explored in earlier cells but is not part of this
# formula — confirm that omission is intentional.
model = sm.logit('admit ~ gpa + C(rank)', data)

# Fit the model; `result` is reused by the evaluation cell.
result = model.fit()

# Displayed as the cell output.
result.summary()
Out[55]:
In [58]:
# NOTE(review): add_constant is not used in this cell; the import is kept only
# so the notebook namespace is unchanged for any cell that might rely on it.
from statsmodels.api import add_constant
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Fitted probability for every row, predicted from all columns except the target.
predicted_probs = result.predict(data.drop('admit', axis=1))

# Threshold the probabilities at 0.5 to get 0/1 labels. Despite the name
# (kept for compatibility), these are predicted `admit` labels, not ranks.
predicted_ranks = np.where(predicted_probs > 0.5, 1, 0)

# confusion_matrix lays out TRUE labels on the rows and PREDICTED labels on
# the columns.
conf_matrix = confusion_matrix(y_true=data['admit'], y_pred=predicted_ranks)
print(conf_matrix)

# Rows (index) = true class, columns = predicted class.
conf_df = pd.DataFrame(
    data=conf_matrix,
    columns=['rejected', 'accepted'],
    index=['rejected', 'accepted'],
)

# fmt='d' prints the integer counts verbatim; the default float format would
# render values such as 253 as "2.5e+02".
sns.heatmap(conf_df, annot=True, fmt='d')

# heatmap draws DataFrame columns along x and rows along y, so x is the
# predicted class and y is the true class (the labels were previously swapped).
pylab.xlabel('predicted label')
pylab.ylabel('true label')
Out[58]:
In [ ]: