Predicting the probability that a set of blight tickets will be paid on time
Supervised Learning. Classification
Source: Applied Machine Learning in Python | Coursera. Solved here with classical machine-learning classifiers and a small neural network
Data provided by the Michigan Data Science Team (MDST), the Michigan Student Symposium for Interdisciplinary Statistical Sciences (MSSISS), and the City of Detroit Open Data Portal.
Each row of the dataset corresponds to a single blight ticket and includes information about when, why, and to whom each ticket was issued. The target variable is compliance, which is True if the ticket was paid early, on time, or within one month of the hearing date, False if the ticket was paid after the hearing date or not at all, and Null if the violator was found not responsible (a sketch of this labeling rule follows the label list below).
Features
ticket_id - unique identifier for tickets
agency_name - Agency that issued the ticket
inspector_name - Name of inspector that issued the ticket
violator_name - Name of the person/organization that the ticket was issued to
violation_street_number, violation_street_name, violation_zip_code - Address where the violation occurred
mailing_address_str_number, mailing_address_str_name, city, state, zip_code, non_us_str_code, country - Mailing address of the violator
ticket_issued_date - Date and time the ticket was issued
hearing_date - Date and time the violator's hearing was scheduled
violation_code, violation_description - Type of violation
disposition - Judgment and judgment type
fine_amount - Violation fine amount, excluding fees
admin_fee - $20 fee assigned to responsible judgments
state_fee - $10 fee assigned to responsible judgments
late_fee - 10% fee assigned to responsible judgments
discount_amount - discount applied, if any
clean_up_cost - DPW clean-up or graffiti removal cost
judgment_amount - Sum of all fines and fees
grafitti_status - Flag for graffiti violations
Labels
payment_amount - Amount paid, if any
payment_date - Date payment was made, if it was received
payment_status - Current payment status as of Feb 1 2017
balance_due - Fines and fees still owed
collection_status - Flag for payments in collections
compliance [target variable for prediction]
Null = Not responsible
0 = Responsible, non-compliant
1 = Responsible, compliant
compliance_detail - More information on why each ticket was marked compliant or non-compliant
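The dataset ships with compliance precomputed, but the rule above is easy to state in code. A minimal sketch of the labeling logic, assuming responsibility can be read from the disposition text and taking "one month" as 31 days (both are illustrative assumptions, not MDST's exact definition):

import pandas as pd

def label_compliance(row):
    """Return 1 (compliant), 0 (non-compliant) or None (not responsible)."""
    if 'Not responsible' in str(row['disposition']):   # assumption: responsibility read from disposition
        return None
    if pd.isnull(row['payment_date']):                 # never paid
        return 0
    deadline = pd.Timestamp(row['hearing_date']) + pd.Timedelta(days=31)
    return int(pd.Timestamp(row['payment_date']) <= deadline)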
In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helper
import keras
helper.info_gpu()
#sns.set_palette("GnBu_d")
#helper.reproducible(seed=0) # Setup reproducible results from run to run using Keras
%matplotlib inline
In [2]:
data_path = 'data/property_maintenance_fines_data.csv'
target = ['compliance']
df_original = pd.read_csv(data_path, encoding='iso-8859-1', dtype='unicode')
print("{} rows \n{} columns \ntarget: {}".format(*df_original.shape, target))
In [3]:
print(df_original[target].squeeze().value_counts(dropna=False))
In [4]:
# Remove rows with NULL targets
df_original = df_original.dropna(subset=target)
print(df_original[target].squeeze().value_counts())
print(df_original.shape)
The target is imbalanced: plain accuracy would be misleading, so the evaluation metric used in this problem is the Area Under the ROC Curve (ROC AUC)
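As a quick reference, scikit-learn computes this metric directly from predicted probabilities; a toy example:

from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1, 0]                 # toy labels
y_score = [0.1, 0.4, 0.35, 0.8, 0.2]     # toy predicted probabilities for class 1
print(roc_auc_score(y_true, y_score))    # ~0.83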
In [5]:
from sklearn.model_selection import train_test_split
df, df_test = train_test_split(
    df_original, test_size=0.2, stratify=df_original[target], random_state=0)
To avoid data leakage, only the training dataframe, df, will be explored and processed here
In [6]:
df.head(2)
Out[6]:
In [7]:
helper.missing(df)
In [8]:
def remove_features(df):
    relevant_col = ['agency_name', 'violation_street_name', 'city', 'state', 'violator_name',
                    'violation_code', 'late_fee', 'discount_amount', 'judgment_amount',
                    'disposition', 'fine_amount', 'compliance']
    df = df[relevant_col]
    return df
df = remove_features(df)
print(df.shape)
In [9]:
num = ['late_fee', 'discount_amount', 'judgment_amount', 'fine_amount']
df = helper.classify_data(df, target, numerical=num)
pd.DataFrame(dict(df.dtypes), index=["Type"])[df.columns].head() # show data types
Out[9]:
In [10]:
df, dict_categories = helper.remove_categories(df, target=target, ratio=0.001, show=False)
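helper.remove_categories is defined in the accompanying helper module, which is not shown here. A rough sketch of the idea it implements — keeping only category levels above a frequency ratio and remembering them so the test set can be filtered identically — might look like this (names and signature are illustrative, not the helper's actual code):

def remove_rare_categories(df, target, ratio=0.001):
    # Keep category levels whose relative frequency >= ratio;
    # rarer levels become NaN and are filled with 'Other' in a later step
    kept = {}
    for col in df.select_dtypes(include='category').columns:
        if col in target:
            continue
        freq = df[col].value_counts(normalize=True)
        kept[col] = freq[freq >= ratio].index.tolist()
        df[col] = df[col].where(df[col].isin(kept[col]))
    return df, kept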
In [11]:
df = helper.fill_simple(df, target, missing_categorical='Other')
In [12]:
helper.missing(df);
In [13]:
for i in ['state', 'disposition']:
    helper.show_categorical(df[[i]])
In [14]:
for i in ['state', 'disposition']:
    helper.show_target_vs_categorical(df[[i, target[0]]], target)
In [15]:
helper.show_numerical(df, kde=True)
In [16]:
helper.show_target_vs_numerical(df, target, point_size=10, jitter=0.3, fit_reg=True)
plt.ylim(-0.2, 1.2)
Out[16]:
In [17]:
helper.show_correlation(df, target, figsize=(6,3))
In [18]:
droplist = []  # features to drop from the model

# Work on a copy called 'data' and keep 'df' for exploration
data = df.copy()
data.drop(droplist, axis='columns', inplace=True)
data.head(2)
Out[18]:
In [19]:
data, scale_param = helper.scale(data)
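helper.scale returns the fitted scaling parameters alongside the data so that exactly the same transform can be replayed on the test set in cell In [31]. A minimal sketch of that pattern with z-score scaling (the actual scaler used by the helper is not shown and may differ):

def scale_numerical(df, num_cols, params=None):
    # Fit mean/std on the training data only; reuse them for the test set
    if params is None:
        params = {c: (df[c].mean(), df[c].std()) for c in num_cols}
    for c in num_cols:
        mean, std = params[c]
        df[c] = (df[c] - mean) / std
    return df, params

Fitting the parameters on df alone and reusing them on df_test is what keeps the held-out data leak-free.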
In [20]:
data, dict_dummies = helper.replace_by_dummies(data, target)
model_features = [f for f in data if f not in target]  # model input features, in training column order
data.head(3)
Out[20]:
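helper.replace_by_dummies plays the same trick for categoricals: the dummy columns produced from the training data are recorded so the test set can later be aligned to the identical column layout. With plain pandas the idea is roughly (a sketch, not the helper's code):

def to_dummies(df, target, train_columns=None):
    # One-hot encode categoricals; align to the training column layout if given
    cat_cols = df.select_dtypes(include='category').columns.difference(target)
    df = pd.get_dummies(df, columns=cat_cols)
    if train_columns is not None:
        df = df.reindex(columns=train_columns, fill_value=0)
    return df, list(df.columns)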
In [21]:
val_size = 0.2
random_state = 0

def validation_split(data, val_size=0.25):
    train, test = train_test_split(
        data, test_size=val_size, random_state=random_state, stratify=data[target])
    # Separate each split into features (x) and target (y)
    x_train, y_train = train.drop(target, axis=1).values, train[target].values
    x_val, y_val = test.drop(target, axis=1).values, test[target].values
    return x_train, y_train, x_val, y_val

x_train, y_train, x_val, y_val = validation_split(data, val_size=val_size)

# x_train = x_train.astype(np.float16)
y_train = y_train.astype(np.float16)
# x_val = x_val.astype(np.float16)
y_val = y_val.astype(np.float16)
In [22]:
def one_hot_output(y_train, y_val):
    num_classes = len(np.unique(y_train))
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_val = keras.utils.to_categorical(y_val, num_classes)
    return y_train, y_val
y_train, y_val = one_hot_output(y_train, y_val)
In [23]:
print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("val size \t X:{} \t Y:{}".format(x_val.shape, y_val.shape))
In [24]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent').fit(x_train, y_train[:, 1])
# The dummy 'most_frequent' classifier always predicts class 0
y_pred = clf.predict(x_val).reshape([-1, 1])
helper.binary_classification_scores(y_val[:, 1], y_pred);
In [25]:
from sklearn.ensemble import RandomForestClassifier
%time clf_random_forest_opt = RandomForestClassifier(n_estimators=30, max_features=150, \
    max_depth=13, class_weight='balanced', n_jobs=-1, \
    random_state=0).fit(x_train, np.ravel(y_train[:, 1]))
In [26]:
y_pred = clf_random_forest_opt.predict(x_val).reshape([-1, 1])
helper.binary_classification_scores(y_val[:, 1], y_pred);
In [27]:
cw = helper.get_class_weight(y_train[:, 1])  # class weights for the imbalanced target

import keras
from keras.models import Sequential
from keras.layers import Dense

def build_nn(input_size, output_size, summary=False):
    input_nodes = input_size // 8
    model = Sequential()
    model.add(Dense(input_nodes, input_dim=input_size, activation='relu'))
    model.add(Dense(output_size, activation='softmax'))
    if summary:
        model.summary()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = build_nn(x_train.shape[1], y_train.shape[1], summary=True)
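helper.get_class_weight is likewise opaque here; scikit-learn can produce equivalent inverse-frequency weights in the dictionary form Keras expects (a sketch, assuming 'balanced' weighting):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

labels = y_train[:, 1]
classes = np.unique(labels)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
cw_sketch = {int(c): w for c, w in zip(classes, weights)}  # e.g. {0: w0, 1: w1}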
In [28]:
import os
from time import time
model_path = os.path.join("models", "detroit.h5")
def train_nn(model, x_train, y_train, validation_data=None, path=None, show=True):
    """
    Train the neural network model. If no validation_data is provided,
    a fraction of the training set is split off for validation.
    """
    if show:
        print('Training ....')
    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, verbose=1)]
    t0 = time()
    history = model.fit(
        x_train,
        y_train,
        epochs=100,
        batch_size=2048,
        class_weight=cw,
        verbose=1,
        validation_split=0.3,
        validation_data=validation_data,
        callbacks=callbacks)
    if show:
        print("time: \t {:.1f} s".format(time() - t0))
        helper.show_training(history)
    if path:
        model.save(path)
        print("\nModel saved at", path)
    return history
model = None
model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
train_nn(model, x_train, y_train, path=None);
from sklearn.metrics import roc_auc_score
y_pred_train = model.predict(x_train, verbose=1)
print('\n\n ROC_AUC train:\t{:.2f} \n'.format(roc_auc_score(y_train, y_pred_train)))
y_pred_val = model.predict(x_val, verbose=1)
print('\n\n ROC_AUC val:\t{:.2f}'.format(roc_auc_score(y_val, y_pred_val)))
In [29]:
helper.binary_classification_scores(y_val[:, 1], y_pred_val[:, 1]);
In [30]:
df_test.head(2)
Out[30]:
In [31]:
df_test = remove_features(df_test)
df_test = helper.classify_data(df_test, target, numerical=num)
df_test, _ = helper.remove_categories(
    df_test, target=target, show=False, dict_categories=dict_categories)
df_test = helper.fill_simple(df_test, target, missing_categorical='Other')
df_test, _ = helper.scale(df_test, scale_param)
df_test, _ = helper.replace_by_dummies(df_test, target, dict_dummies)
df_test = df_test[model_features + target]  # reorder columns to match the training feature order
In [32]:
def separate_x_y(data):
    """ Separate the data into features and target (x=features, y=target) """
    x, y = data.drop(target, axis=1).values, data[target].values
    x = x.astype(np.float16)
    y = y.astype(np.float16)
    return x, y
x_test, y_test = separate_x_y(df_test)
y_test = keras.utils.to_categorical(y_test, 2)
In [33]:
y_pred = clf_random_forest_opt.predict_proba(x_test)[:,1]
helper.binary_classification_scores(y_test[:,1], y_pred);
In [34]:
helper.show_feature_importances(model_features, clf_random_forest_opt)
In [35]:
y_pred = model.predict(x_test, verbose=1)[:,1]
helper.binary_classification_scores(y_test[:,1], y_pred);
In [36]:
helper.ml_classification(x_train, y_train[:, 1], x_test, y_test[:, 1])
Out[36]:
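helper.ml_classification presumably fits a battery of classical scikit-learn classifiers on the same split and reports their scores. A minimal version of such a comparison loop (the classifier list here is an illustrative choice, not the helper's actual roster):

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

for clf in (LogisticRegression(max_iter=1000),
            GradientBoostingClassifier(random_state=0)):
    clf.fit(x_train, y_train[:, 1])
    auc = roc_auc_score(y_test[:, 1], clf.predict_proba(x_test)[:, 1])
    print("{}: ROC AUC = {:.2f}".format(type(clf).__name__, auc))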