Traditionally, educational institutions use rule-based models to generate risk scores, which then inform resource allocation (for example, Hiller et al., 1999).
Instead, we'll build a simple model using basic ML techniques and demonstrate why the risk scores it generates are better.
In [184]:
## Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(color_codes=True)
In [77]:
# Gen Data
# Runs the simulation script via the IPython %run magic. Later cells use
# `stud_df` without defining it, so it is presumably created by sim.py --
# TODO confirm against the script.
%run sim.py
In [131]:
# Coerce the academic measures to numeric dtypes so they can be averaged,
# compared against thresholds, and fed to the model further down.
for numeric_col in ('gpa', 'honors', 'psat'):
    stud_df[numeric_col] = pd.to_numeric(stud_df[numeric_col])
In [125]:
# Mean GPA per college, indexed by college; isUndermatched looks students up here.
avg_gpas = stud_df.groupby('college')['gpa'].mean()
def isUndermatched(student, avg_gpas_by_college=None, margin=.50):
    """Flag a student whose GPA is well above the average at their college.

    Parameters
    ----------
    student : object with ``gpa`` and ``college`` attributes
        Typically a row handed over by ``DataFrame.apply(..., axis=1)``.
    avg_gpas_by_college : mapping of college -> mean GPA, optional
        Lookup used for the comparison. When omitted, the notebook-level
        ``avg_gpas`` is used, which is what the original code always did.
    margin : float, optional
        How far above the college average the GPA must be. Defaults to the
        original hard-coded 0.50.

    Returns
    -------
    bool
        True when ``student.gpa >= college average + margin`` (inclusive).

    Raises
    ------
    KeyError
        If the student's college is missing from the lookup.
    """
    if avg_gpas_by_college is None:
        # Fall back to the notebook-level lookup so existing
        # `stud_df.apply(isUndermatched, axis=1)` calls keep working.
        avg_gpas_by_college = avg_gpas
    threshold = avg_gpas_by_college[student.college] + margin
    # bool() keeps the return a plain Python bool even for numpy scalars.
    return bool(student.gpa >= threshold)
In [133]:
# Label every student row-by-row, then store the flags as a new column.
undermatch_flags = stud_df.apply(isUndermatched, axis=1)
stud_df['undermatch_status'] = undermatch_flags
# Optional sanity check (left disabled):
#stud_df.groupby('race').undermatch_status.value_counts()
In [155]:
# Random ~80/20 train/test split of the student frame.
SPLIT_SEED = 42        # fixed so a fresh kernel reproduces the same partition
TRAIN_FRACTION = 0.8   # expected share of rows that land in the training set

# A dedicated RandomState keeps the split reproducible without touching
# numpy's global random state.
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(stud_df)) < TRAIN_FRACTION

# .copy() makes both sets independent frames, so columns can be added to them
# later without pandas raising SettingWithCopyWarning.
train = stud_df[msk].copy()
test = stud_df[~msk].copy()
print("Training Set Length: ", len(train))
print("Testing Set Length: ", len(test))
The Rules
In [156]:
# Distribution of PSAT scores across all students.
stud_df['psat'].hist()
Out[156]:
In [157]:
def rule_based_model(student_r):
    """Return a hand-written risk score for one student.

    ``student_r`` must expose ``race``, ``psat`` and ``honors`` attributes
    (for example a row passed by ``DataFrame.apply(..., axis=1)``).

    Scoring:
      * race ``'aa'`` adds 1, race ``'latino'`` adds 0.5
      * a PSAT of at least 170 combined with at most 3 honors adds 1
    """
    race_points = {'aa': 1, 'latino': .5}

    score = 0
    score += race_points.get(student_r.race, 0)

    high_psat_few_honors = student_r.psat >= 170 and student_r.honors <= 3
    if high_psat_few_honors:
        score += 1
    return score
In [158]:
# Score every held-out student with the rule-based model.
# `assign` returns a new frame instead of writing a column into `test` in
# place, so this runs without SettingWithCopyWarning even when `test` was
# produced by filtering another frame. Re-running the cell is also safe.
test = test.assign(risk_score=test.apply(rule_based_model, axis=1))
In [192]:
from sklearn import linear_model

# Predictors and target, both taken from the training split.
feature_cols = ['psat', 'gpa', 'honors']
X, y = train[feature_cols], train['undermatch_status']

# Create the logistic-regression classifier, then fit it on the training data.
lm = linear_model.LogisticRegression()
lm.fit(X, y)
Out[192]:
In [194]:
# The coefficients (one per entry in feature_cols)
print('Coefficients: \n', lm.coef_)

# Mean squared error on the held-out set. Predictions and labels are both
# booleans, so they are cast to int before subtracting: numpy refuses `-` on
# boolean arrays. For 0/1 values this mean equals the misclassification rate.
test_pred = np.asarray(lm.predict(test[feature_cols])).astype(int)
test_actual = test['undermatch_status'].to_numpy().astype(int)
print("Mean squared error: %.2f"
      % np.mean((test_pred - test_actual) ** 2))

# In-sample predictions for the training set (shown as the cell output)
lm.predict(train[feature_cols])
Out[194]:
In [201]:
# Logistic fit of undermatch status against PSAT score on the held-out set.
sns.lmplot(
    data=test,
    x='psat',
    y='undermatch_status',
    logistic=True,
)
Out[201]:
In [200]:
In [ ]: