Predicting student admissions to graduate school at UCLA based on GRE scores, GPA, and class rank
Supervised Learning. Classification
Dataset from http://www.ats.ucla.edu/
Based on the Predicting Student Admissions mini-project from Udacity's Artificial Intelligence Nanodegree
In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import keras
import helper  # local module with project utilities
helper.reproducible(seed=0)  # fix the random seeds for reproducible results
sns.set()
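The `helper` module is a small local utility file shipped with the project, not a PyPI package. A minimal sketch of what its `reproducible` function presumably does, assuming a TensorFlow backend (hypothetical; the real implementation may differ):

# Hypothetical sketch of helper.reproducible (not the actual project code)
import random
import numpy as np
import tensorflow as tf

def reproducible(seed=0):
    """Fix the Python, NumPy and TensorFlow random seeds for repeatable runs."""
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)  # tf.random.set_seed(seed) on TensorFlow 2.x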
In [2]:
data_path = 'data/student_admissions.csv'
df = pd.read_csv(data_path)
df.head()
Out[2]:
In [3]:
df.describe()
Out[3]:
In [4]:
targets = ['admit']
features = ['gre', 'gpa', 'rank']
categorical = ['admit', 'rank']
numerical = ['gre', 'gpa']
# NaN values
df.fillna(df[numerical].median(), inplace=True)   # NaN in a numerical feature: replace with the column median
df.dropna(axis='index', how='any', inplace=True)  # NaN in a categorical feature: delete the row
df_visualize = df.copy()  # untransformed copy for model visualization
df.shape
Out[4]:
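The two rules above can be seen on a toy frame: numerical NaNs are imputed with the column median, and any row still containing a NaN (i.e. in a categorical column) is dropped. A quick illustration, not part of the pipeline:

# Toy illustration of the NaN strategy
toy = pd.DataFrame({'gre': [500.0, np.nan, 700.0], 'rank': [1.0, 2.0, np.nan]})
toy.fillna(toy[['gre']].median(), inplace=True)    # 'gre' NaN -> 600.0 (median)
toy.dropna(axis='index', how='any', inplace=True)  # third row dropped ('rank' is NaN)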
In [5]:
def plot_data(dataf, hue="admit"):
    """Scatter plot of GPA vs. GRE, one panel per class rank."""
    g = sns.FacetGrid(dataf, col="rank", hue=hue)
    g = g.map(plt.scatter, "gre", "gpa", edgecolor="w").add_legend()
    return g
plot_data(df)
Out[5]:
In [6]:
dummies = pd.get_dummies(df['rank'], prefix='rank', drop_first=False)
df = pd.concat([df, dummies], axis=1)
df = df.drop('rank', axis='columns')
df.head()
Out[6]:
In [7]:
# Store scalings in a dictionary so we can convert back later
scaled_features = {}
for f in numerical:
    mean, std = df[f].mean(), df[f].std()
    scaled_features[f] = [mean, std]
    df.loc[:, f] = (df[f] - mean) / std
df.head()
Out[7]:
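Because the mean and standard deviation of each numerical feature are kept in `scaled_features`, the standardization can be undone later, for example to report values in the original units:

# Recover the original (unscaled) values from the stored statistics
mean, std = scaled_features['gre']
gre_original = df['gre'] * std + mean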
In [8]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state=9)
# Separate the data into features and targets (x=features, y=targets)
x_train, y_train = train.drop(targets, axis=1).values, train[targets].values
x_test, y_test = test.drop(targets, axis=1).values, test[targets].values
In [9]:
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print("Training set: \t x-shape = {} \t y-shape = {}".format(x_train.shape ,y_train.shape))
print("Test set: \t x-shape = {} \t y-shape = {}".format(x_test.shape ,y_test.shape))
In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

input_nodes = x_train.shape[1] * 8  # hidden layer size: 8 units per input feature
weights = keras.initializers.RandomNormal(stddev=0.1)

model = Sequential()
model.add(Dense(input_nodes, input_dim=x_train.shape[1], activation='relu',
                kernel_initializer=weights))  # use the RandomNormal initializer defined above
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))
model.summary()

# categorical_crossentropy matches the 2-unit softmax output with one-hot targets
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print('\nTraining ....')
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=0)]
%time history = model.fit(x_train, y_train, epochs=1000, batch_size=64, verbose=0, validation_split=0.25, callbacks=callbacks)

helper.show_training(history)

model_path = os.path.join("models", "student_admissions.h5")
os.makedirs("models", exist_ok=True)  # make sure the target folder exists
model.save(model_path)
print("\nModel saved at", model_path)
In [11]:
model = keras.models.load_model(model_path)
print("Model loaded:", model_path)
score = model.evaluate(x_test, y_test, verbose=0)  # returns [loss, accuracy]
print("\nTest Accuracy: {:.2f}".format(score[1]))
In [12]:
predictions = model.predict(df.drop(targets, axis=1).values)
predictions = np.argmax(predictions, axis=1)
df_visualize['predicted'] = predictions
plot_data(df_visualize).fig.suptitle('Actual')
plot_data(df_visualize, hue='predicted').fig.suptitle('Model')
Out[12]:
Some overfitting can occasionally be seen in the rank-2 panel. More information can be extracted by looking at the predicted probabilities instead of the binary accepted/rejected result shown here.
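For example, something along these lines attaches the raw softmax probability of the admit class to the visualization frame:

# Keep the predicted probability of admission instead of its argmax
probabilities = model.predict(df.drop(targets, axis=1).values)
df_visualize['p_admit'] = probabilities[:, 1]
df_visualize.sort_values('p_admit', ascending=False).head()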
In [13]:
def predict_admission(student):
    """Print the admission probability for each entry in a dict
    {id: [gre, gpa, rank_1, rank_2, rank_3, rank_4]}."""
    print('Admission Probabilities: \n')
    for p_name, single_data in student.items():
        # normalize the numerical features with the stored statistics
        for idx, f in enumerate(numerical):
            single_data[idx] = (single_data[idx] - scaled_features[f][0]) / scaled_features[f][1]
        # make the prediction; index 1 of the softmax output is the admission probability
        single_pred = model.predict(np.array([single_data]))
        print('{}: \t {:.0f}%\n'.format(p_name, single_pred[0, 1] * 100))

df_visualize.describe()
Out[13]:
In [14]:
# {id: [gre, gpa, rank_1, rank_2, rank_3, rank_4]}
new_students = {'High scores rank-1': [730, 3.83, 1, 0, 0, 0],
'High scores rank-2': [730, 3.83, 0, 1, 0, 0],
'High scores rank-3': [730, 3.83, 0, 0, 1, 0],
'High scores rank-4': [730, 3.83, 0, 0, 0, 1],
'Avg scores rank-1': [588, 3.4, 1, 0, 0, 0],
'Avg scores rank-2': [588, 3.4, 0, 1, 0, 0],
'Avg scores rank-3': [588, 3.4, 0, 0, 1, 0],
'Avg scores rank-4': [588, 3.4, 0, 0, 0, 1],
}
predict_admission(new_students)
The predictions confirm that rank is the most influential feature in determining admission, which seems reasonable. The students' absolute scores are most relevant for rank-1 applicants.
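One quick, informal way to back this observation with a number is to average the predicted admission probability per rank, reusing the `p_admit` column computed above:

# Mean predicted admission probability per class rank
df_visualize.groupby('rank')['p_admit'].mean()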