Kaggle San Francisco Crime Classification

Berkeley MIDS W207 Final Project: Sam Goodgame, Sarah Cha, Kalvin Kao, Bryan Moore

Environment and Data


In [1]:
# Additional Libraries
%matplotlib inline
import matplotlib.pyplot as plt

# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV

# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)



Load the updated data set (with weather data integrated) from a local copy and split it into training, development, calibration, and test subsets.


In [2]:
# Path to your local copy of Kalvin's "x_data.csv", produced by the commented-out cell above
data_path = "./data/x_data_3.csv"
df = pd.read_csv(data_path, header=0)
x_data = df.drop('category', axis=1)
y = df.category.values

# Impute missing values with mean values:
#x_complete = df.fillna(df.mean())
x_complete = x_data.fillna(x_data.mean())
X_raw = x_complete.values

# Scale the data between 0 and 1:
X = MinMaxScaler().fit_transform(X_raw)

####
#X = np.around(X, decimals=2)
####

# Shuffle the data to remove any ordering pattern. Re-set the random seed first so the permutation is reproducible:
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]

# Because log loss requires set(y_pred) to match set(labels), remove the extremely rare
# crime categories (TREA and PORNOGRAPHY/OBSCENE MAT) from the data:
X_minus_trea = X[np.where(y != 'TREA')]
y_minus_trea = y[np.where(y != 'TREA')]
X_final = X_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]
y_final = y_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]

# Separate training, dev, and test data:
test_data, test_labels = X_final[800000:], y_final[800000:]
dev_data, dev_labels = X_final[700000:800000], y_final[700000:800000]
train_data, train_labels = X_final[100000:700000], y_final[100000:700000]
calibrate_data, calibrate_labels = X_final[:100000], y_final[:100000]

# Create mini versions of the above sets
mini_train_data, mini_train_labels = X_final[:20000], y_final[:20000]
mini_calibrate_data, mini_calibrate_labels = X_final[19000:28000], y_final[19000:28000]
mini_dev_data, mini_dev_labels = X_final[49000:60000], y_final[49000:60000]

# Create list of the crime type labels.  This will act as the "labels" parameter for the log loss functions that follow
crime_labels = list(set(y_final))
crime_labels_mini_train = list(set(mini_train_labels))
crime_labels_mini_dev = list(set(mini_dev_labels))
crime_labels_mini_calibrate = list(set(mini_calibrate_labels))
print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev),len(crime_labels_mini_calibrate))

#print(len(train_data),len(train_labels))
#print(len(dev_data),len(dev_labels))
print(len(mini_train_data),len(mini_train_labels))
print(len(mini_dev_data),len(mini_dev_labels))
#print(len(test_data),len(test_labels))
print(len(mini_calibrate_data),len(mini_calibrate_labels))
#print(len(calibrate_data),len(calibrate_labels))


37 37 37 37
20000 20000
11000 11000
9000 9000
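
As a quick sanity check on the split (a sketch, not in the original notebook), we can confirm that every crime category appears in each subset, since the log loss calls below require the prediction columns to cover the full label set:


In [ ]:
# Hypothetical sanity check (not part of the original pipeline): confirm that every
# crime category in crime_labels also appears in each subset, so that log_loss and
# the calibration step never encounter an unseen class.
for name, subset_labels in [('train', train_labels), ('calibrate', calibrate_labels),
                            ('dev', dev_labels), ('mini_train', mini_train_labels),
                            ('mini_dev', mini_dev_labels)]:
    missing = set(crime_labels) - set(subset_labels)
    print("%-10s classes: %2d  missing: %s" % (name, len(set(subset_labels)), missing if missing else '{}'))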

The Best RF Classifier


In [ ]:
# Fit the tuned random forest on the training split, then calibrate its posteriors
# with isotonic regression on the held-out calibration split:
tuned_RF_calibrate_isotonic = RandomForestClassifier(min_impurity_split=1,
                                       n_estimators=100,
                                       bootstrap=True,
                                       max_features=15,
                                       criterion='entropy',
                                       min_samples_leaf=10,
                                       max_depth=None
                                      ).fit(train_data, train_labels)
ccv_isotonic = CalibratedClassifierCV(tuned_RF_calibrate_isotonic, method='isotonic', cv='prefit')
ccv_isotonic.fit(calibrate_data, calibrate_labels)
ccv_predictions = ccv_isotonic.predict(dev_data)
ccv_prediction_probabilities_isotonic = ccv_isotonic.predict_proba(dev_data)
working_log_loss_isotonic = log_loss(y_true=dev_labels, y_pred=ccv_prediction_probabilities_isotonic, labels=crime_labels)
print("Multi-class log loss with RF and isotonic calibration:", working_log_loss_isotonic)
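
For reference, the log loss printed by the cell above can be reproduced by hand. The following sketch (assuming the cell above has run, and that the probability matrix columns follow ccv_isotonic.classes_) mirrors what sklearn's log_loss computes: the mean negative log of the probability assigned to each true class.


In [ ]:
# Hand-rolled multi-class log loss, for comparison with sklearn's log_loss.
# Assumes ccv_isotonic.classes_ gives the column order of the probability matrix.
def manual_log_loss(y_true, probas, class_order, eps=1e-15):
    probas = np.clip(probas, eps, 1 - eps)            # avoid log(0)
    col = {c: i for i, c in enumerate(class_order)}   # class label -> column index
    true_idx = np.array([col[label] for label in y_true])
    return -np.mean(np.log(probas[np.arange(len(y_true)), true_idx]))

print(manual_log_loss(dev_labels, ccv_prediction_probabilities_isotonic, ccv_isotonic.classes_))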

In [69]:
pd.DataFrame(np.amax(ccv_prediction_probabilities_isotonic, axis=1)).hist()


Out[69]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000024682C052B0>]], dtype=object)

Error Analysis: Calibration


In [74]:
#clf_probabilities, clf_predictions, labels
def error_analysis_calibration(buckets, clf_probabilities, clf_predictions, labels):
    """inputs:
    clf_probabilities = clf.predict_proba(dev_data)
    clf_predictions = clf.predict(dev_data)
    labels = dev_labels"""
    
    #buckets = [0.05, 0.15, 0.3, 0.5, 0.8]
    #buckets = [0.15, 0.25, 0.3, 1.0]
    correct = [0 for i in buckets]
    total = [0 for i in buckets]

    lLimit = 0
    uLimit = 0
    for i in range(len(buckets)):
        uLimit = buckets[i]
        for j in range(clf_probabilities.shape[0]):
            if (np.amax(clf_probabilities[j]) > lLimit) and (np.amax(clf_probabilities[j]) <= uLimit):
                if clf_predictions[j] == labels[j]:
                    correct[i] += 1
                total[i] += 1
        lLimit = uLimit
        
    print(sum(correct))
    print(sum(total))
    print(correct)
    print(total)

    # Report the classifier accuracy for each posterior-probability bucket:
    accuracies = []
    for k in range(len(buckets)):
        print(1.0*correct[k]/total[k])
        accuracies.append(1.0*correct[k]/total[k])
        print('p(pred) <= %.13f    total = %3d    correct = %3d    accuracy = %.3f' \
              %(buckets[k], total[k], correct[k], 1.0*correct[k]/total[k]))
    plt.plot(buckets,accuracies)
    plt.title("Calibration Analysis")
    plt.xlabel("Posterior Probability")
    plt.ylabel("Classifier Accuracy")
    
    return buckets, accuracies

In [ ]:
# Look at how the posteriors are distributed in order to choose good bin edges for 'buckets'.
pd.DataFrame(np.amax(bestLRPredictionProbabilities, axis=1)).hist()
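
One way to pick those bins (a sketch, not from the original notebook) is to place the bucket edges at quantiles of the maximum posterior probabilities, so each bucket holds a similar number of predictions:


In [ ]:
# Hypothetical helper: derive bucket edges from quantiles of the maximum posterior
# probabilities, so that each calibration bucket contains roughly the same number
# of predictions.
max_posteriors = np.amax(bestLRPredictionProbabilities, axis=1)
quantile_buckets = list(np.percentile(max_posteriors, [25, 50, 75])) + [1.0]
print(quantile_buckets)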

In [75]:
buckets = [0.15, 0.25, 0.3, 1.0]
calibration_buckets, calibration_accuracies = error_analysis_calibration(buckets, clf_probabilities=bestLRPredictionProbabilities, \
                                                                         clf_predictions=bestLRPredictions, \
                                                                         labels=mini_dev_labels)


11246
50000
[199, 6513, 2458, 2076]
[1351, 33941, 8338, 6370]
0.14729829755736493
p(pred) <= 0.1500000000000    total = 1351    correct = 199    accuracy = 0.147
0.19189181226245544
p(pred) <= 0.2500000000000    total = 33941    correct = 6513    accuracy = 0.192
0.2947949148476853
p(pred) <= 0.3000000000000    total = 8338    correct = 2458    accuracy = 0.295
0.3259026687598116
p(pred) <= 1.0000000000000    total = 6370    correct = 2076    accuracy = 0.326

Error Analysis: Classification Report


In [77]:
def error_analysis_classification_report(clf_predictions, labels):
    """inputs:
    clf_predictions = clf.predict(dev_data)
    labels = dev_labels"""
    print('Classification Report:')
    report = classification_report(labels, clf_predictions)
    print(report)
    return report

In [78]:
# Use a distinct variable name so we don't shadow sklearn's classification_report:
lr_classification_report = error_analysis_classification_report(clf_predictions=bestLRPredictions, \
                                                                 labels=mini_dev_labels)


Classification Report:
                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00        94
                    ASSAULT       0.23      0.00      0.00      4427
                 BAD CHECKS       0.00      0.00      0.00        27
                    BRIBERY       0.00      0.00      0.00        14
                   BURGLARY       0.00      0.00      0.00      2047
         DISORDERLY CONDUCT       0.00      0.00      0.00       243
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       157
              DRUG/NARCOTIC       0.24      0.31      0.27      3009
                DRUNKENNESS       0.00      0.00      0.00       254
               EMBEZZLEMENT       0.00      0.00      0.00        61
                  EXTORTION       0.00      0.00      0.00        14
            FAMILY OFFENSES       0.00      0.00      0.00        25
     FORGERY/COUNTERFEITING       0.00      0.00      0.00       594
                      FRAUD       0.00      0.00      0.00       953
                   GAMBLING       0.00      0.00      0.00        10
                 KIDNAPPING       0.00      0.00      0.00       132
              LARCENY/THEFT       0.24      0.77      0.37     10076
                LIQUOR LAWS       0.00      0.00      0.00       111
                  LOITERING       0.00      0.00      0.00        69
             MISSING PERSON       0.00      0.00      0.00      1483
               NON-CRIMINAL       0.30      0.00      0.00      5210
             OTHER OFFENSES       0.18      0.35      0.24      7117
    PORNOGRAPHY/OBSCENE MAT       0.00      0.00      0.00         1
               PROSTITUTION       0.00      0.00      0.00       450
          RECOVERED VEHICLE       0.00      0.00      0.00       181
                    ROBBERY       0.00      0.00      0.00      1310
                    RUNAWAY       0.00      0.00      0.00        92
            SECONDARY CODES       0.00      0.00      0.00       571
      SEX OFFENSES FORCIBLE       0.00      0.00      0.00       266
  SEX OFFENSES NON FORCIBLE       0.00      0.00      0.00         8
            STOLEN PROPERTY       0.00      0.00      0.00       234
                    SUICIDE       0.00      0.00      0.00        31
             SUSPICIOUS OCC       0.00      0.00      0.00      1785
                       TREA       0.00      0.00      0.00         2
                   TRESPASS       0.00      0.00      0.00       432
                  VANDALISM       0.00      0.00      0.00      2502
              VEHICLE THEFT       0.18      0.03      0.05      3109
                   WARRANTS       0.00      0.00      0.00      2434
                WEAPON LAWS       0.00      0.00      0.00       465

                avg / total       0.15      0.22      0.13     50000

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
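
The same per-class numbers can also be pulled into a DataFrame for sorting and filtering; a minimal sketch using sklearn's precision_recall_fscore_support (assuming bestLRPredictions and mini_dev_labels from above):


In [ ]:
from sklearn.metrics import precision_recall_fscore_support

# Per-class precision, recall, F1, and support as a sortable DataFrame.
p, r, f, s = precision_recall_fscore_support(mini_dev_labels, bestLRPredictions,
                                             labels=crime_labels_mini_dev)
metrics_df = pd.DataFrame({'precision': p, 'recall': r, 'f1': f, 'support': s},
                          index=crime_labels_mini_dev)
metrics_df.sort_values('support', ascending=False).head(10)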

Error Analysis: Confusion Matrix


In [99]:
crime_labels_mini_dev


Out[99]:
['TREA',
 'DISORDERLY CONDUCT',
 'BAD CHECKS',
 'SECONDARY CODES',
 'FORGERY/COUNTERFEITING',
 'SEX OFFENSES NON FORCIBLE',
 'RUNAWAY',
 'BURGLARY',
 'DRIVING UNDER THE INFLUENCE',
 'DRUG/NARCOTIC',
 'LOITERING',
 'VANDALISM',
 'STOLEN PROPERTY',
 'OTHER OFFENSES',
 'NON-CRIMINAL',
 'GAMBLING',
 'FAMILY OFFENSES',
 'VEHICLE THEFT',
 'ARSON',
 'PORNOGRAPHY/OBSCENE MAT',
 'ROBBERY',
 'SUICIDE',
 'SEX OFFENSES FORCIBLE',
 'EMBEZZLEMENT',
 'EXTORTION',
 'WEAPON LAWS',
 'KIDNAPPING',
 'MISSING PERSON',
 'SUSPICIOUS OCC',
 'TRESPASS',
 'WARRANTS',
 'ASSAULT',
 'LARCENY/THEFT',
 'PROSTITUTION',
 'BRIBERY',
 'FRAUD',
 'LIQUOR LAWS',
 'DRUNKENNESS',
 'RECOVERED VEHICLE']

In [104]:
def error_analysis_confusion_matrix(label_names, clf_predictions, labels):
    """inputs:
    clf_predictions = clf.predict(dev_data)
    labels = dev_labels"""
    cm = pd.DataFrame(confusion_matrix(labels, clf_predictions, labels=label_names))
    cm.columns=label_names
    cm.index=label_names
    cm.to_csv(path_or_buf="./confusion_matrix.csv")
    #print(cm)
    return cm

In [105]:
error_analysis_confusion_matrix(label_names=crime_labels_mini_dev, clf_predictions=bestLRPredictions, \
                                                            labels=mini_dev_labels)


Out[105]:
TREA DISORDERLY CONDUCT BAD CHECKS SECONDARY CODES FORGERY/COUNTERFEITING SEX OFFENSES NON FORCIBLE RUNAWAY BURGLARY DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC ... TRESPASS WARRANTS ASSAULT LARCENY/THEFT PROSTITUTION BRIBERY FRAUD LIQUOR LAWS DRUNKENNESS RECOVERED VEHICLE
TREA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 2 0 0 0 0 0 0
DISORDERLY CONDUCT 0 0 0 0 0 0 0 0 0 28 ... 0 0 0 136 0 0 0 0 0 0
BAD CHECKS 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 21 0 0 0 0 0 0
SECONDARY CODES 0 0 0 0 0 0 0 0 0 42 ... 0 0 0 324 0 0 0 0 0 0
FORGERY/COUNTERFEITING 0 0 0 0 0 0 0 0 0 28 ... 0 0 1 387 0 0 0 0 0 0
SEX OFFENSES NON FORCIBLE 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 2 0 0 0 0 0 0
RUNAWAY 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 67 0 0 0 0 0 0
BURGLARY 0 0 0 0 0 0 0 0 0 58 ... 0 0 0 1399 0 0 0 0 0 0
DRIVING UNDER THE INFLUENCE 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 93 0 0 0 0 0 0
DRUG/NARCOTIC 0 0 0 0 0 0 0 0 0 943 ... 0 0 1 1341 0 0 0 0 0 0
LOITERING 0 0 0 0 0 0 0 0 0 6 ... 0 0 0 45 0 0 0 0 0 0
VANDALISM 0 0 0 0 0 0 0 0 0 73 ... 0 0 2 1640 0 0 0 0 0 0
STOLEN PROPERTY 0 0 0 0 0 0 0 0 0 17 ... 0 0 0 161 0 0 0 0 0 0
OTHER OFFENSES 0 0 0 0 0 0 0 0 0 660 ... 0 0 2 3886 0 0 0 0 0 0
NON-CRIMINAL 0 0 0 0 0 0 0 0 0 380 ... 0 0 5 3591 0 0 0 0 0 0
GAMBLING 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 5 0 0 0 0 0 0
FAMILY OFFENSES 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 13 0 0 0 0 0 0
VEHICLE THEFT 0 0 0 0 0 0 0 0 0 55 ... 0 0 4 1964 0 0 0 0 0 0
ARSON 0 0 0 0 0 0 0 0 0 2 ... 0 0 1 54 0 0 0 0 0 0
PORNOGRAPHY/OBSCENE MAT 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
ROBBERY 0 0 0 0 0 0 0 0 0 90 ... 0 0 0 765 0 0 0 0 0 0
SUICIDE 0 0 0 0 0 0 0 0 0 4 ... 0 0 0 15 0 0 0 0 0 0
SEX OFFENSES FORCIBLE 0 0 0 0 0 0 0 0 0 12 ... 0 0 0 159 0 0 0 0 0 0
EMBEZZLEMENT 0 0 0 0 0 0 0 0 0 3 ... 0 0 0 38 0 0 0 0 0 0
EXTORTION 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 8 0 0 0 0 0 0
WEAPON LAWS 0 0 0 0 0 0 0 0 0 40 ... 0 0 1 232 0 0 0 0 0 0
KIDNAPPING 0 0 0 0 0 0 0 0 0 5 ... 0 0 0 74 0 0 0 0 0 0
MISSING PERSON 0 0 0 0 0 0 0 0 0 40 ... 0 0 0 845 0 0 0 0 0 0
SUSPICIOUS OCC 0 0 0 0 0 0 0 0 0 129 ... 0 0 2 1075 0 0 0 0 0 0
TRESPASS 0 0 0 0 0 0 0 0 0 54 ... 0 0 0 264 0 0 0 0 0 0
WARRANTS 0 0 0 0 0 0 0 0 0 342 ... 0 0 0 1414 0 0 0 0 0 0
ASSAULT 0 0 0 0 0 0 0 0 0 341 ... 0 0 7 2511 0 0 0 0 0 0
LARCENY/THEFT 0 0 0 0 0 0 0 0 0 531 ... 0 0 3 7710 0 0 0 0 0 0
PROSTITUTION 0 0 0 0 0 0 0 0 0 34 ... 0 0 0 276 0 0 0 0 0 0
BRIBERY 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 6 0 0 0 0 0 0
FRAUD 0 0 0 0 0 0 0 0 0 52 ... 0 0 0 652 0 0 0 0 0 0
LIQUOR LAWS 0 0 0 0 0 0 0 0 0 11 ... 0 0 1 72 0 0 0 0 0 0
DRUNKENNESS 0 0 0 0 0 0 0 0 0 17 ... 0 0 0 178 0 0 0 0 0 0
RECOVERED VEHICLE 0 0 0 0 0 0 0 0 0 7 ... 0 0 0 73 0 0 0 0 0 0

39 rows × 39 columns
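
To see where each true category's predictions end up, the confusion matrix can be normalized by row (a sketch reusing the function and variables above; each row then sums to 1 for classes with any support):


In [ ]:
# Row-normalize the confusion matrix so each row shows the fraction of that true
# class assigned to each predicted class. Rows with zero support would divide by
# zero, so replace zero row sums before dividing.
cm = error_analysis_confusion_matrix(label_names=crime_labels_mini_dev,
                                     clf_predictions=bestLRPredictions,
                                     labels=mini_dev_labels)
row_sums = cm.sum(axis=1).replace(0, 1)
cm_normalized = cm.div(row_sums, axis=0)
cm_normalized.round(2)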