Kaggle San Francisco Crime Classification

Berkeley MIDS W207 Final Project: Sam Goodgame, Sarah Cha, Kalvin Kao, Bryan Moore

Environment and Data


In [1]:
# Additional Libraries
%matplotlib inline
import matplotlib.pyplot as plt

# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV

# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)



Load the updated data set (with weather data integrated) from a local copy and split it into training, development, calibration, and test subsets.


In [2]:
# Path to your local copy of Kalvin's "x_data.csv", produced by the commented-out cell above
data_path = "./data/x_data_3.csv"
df = pd.read_csv(data_path, header=0)
x_data = df.drop('category', axis=1)
y = df.category.values

# Impute missing values with mean values:
#x_complete = df.fillna(df.mean())
x_complete = x_data.fillna(x_data.mean())
X_raw = x_complete.values

# Scale the data between 0 and 1:
X = MinMaxScaler().fit_transform(X_raw)

####
#X = np.around(X, decimals=2)
####

# Shuffle the data to remove any ordering pattern. Re-set the random seed first so the permutation is reproducible:
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]

# Because log loss requires set(y_pred) to match set(labels), remove the extremely rare
# crime categories (TREA and PORNOGRAPHY/OBSCENE MAT) from the data:
X_minus_trea = X[np.where(y != 'TREA')]
y_minus_trea = y[np.where(y != 'TREA')]
X_final = X_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]
y_final = y_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]

# Separate training, dev, and test data:
test_data, test_labels = X_final[800000:], y_final[800000:]
dev_data, dev_labels = X_final[700000:800000], y_final[700000:800000]
train_data, train_labels = X_final[100000:700000], y_final[100000:700000]
calibrate_data, calibrate_labels = X_final[:100000], y_final[:100000]

# Create mini versions of the above sets
mini_train_data, mini_train_labels = X_final[:20000], y_final[:20000]
mini_calibrate_data, mini_calibrate_labels = X_final[19000:28000], y_final[19000:28000]
mini_dev_data, mini_dev_labels = X_final[49000:60000], y_final[49000:60000]

# Create list of the crime type labels.  This will act as the "labels" parameter for the log loss functions that follow
crime_labels = list(set(y_final))
crime_labels_mini_train = list(set(mini_train_labels))
crime_labels_mini_dev = list(set(mini_dev_labels))
crime_labels_mini_calibrate = list(set(mini_calibrate_labels))
print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev),len(crime_labels_mini_calibrate))

#print(len(train_data),len(train_labels))
#print(len(dev_data),len(dev_labels))
print(len(mini_train_data),len(mini_train_labels))
print(len(mini_dev_data),len(mini_dev_labels))
#print(len(test_data),len(test_labels))
print(len(mini_calibrate_data),len(mini_calibrate_labels))
#print(len(calibrate_data),len(calibrate_labels))


37 37 37 37
20000 20000
11000 11000
9000 9000
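
As a quick sanity check on the split (a sketch, not in the original notebook), we can confirm that every crime category appears in each subset, since the log loss calls below require the prediction columns to cover the full label set:


In [ ]:
# Hypothetical sanity check (not part of the original pipeline): confirm that every
# crime category in crime_labels also appears in each subset, so that log_loss and
# the calibration step never encounter an unseen class.
for name, subset_labels in [('train', train_labels), ('calibrate', calibrate_labels),
                            ('dev', dev_labels), ('mini_train', mini_train_labels),
                            ('mini_dev', mini_dev_labels)]:
    missing = set(crime_labels) - set(subset_labels)
    print("%-10s classes: %2d  missing: %s" % (name, len(set(subset_labels)), missing if missing else '{}'))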

The Best RF Classifier


In [ ]:
# Fit the tuned random forest on the training split, then calibrate its posteriors
# with isotonic regression on the held-out calibration split:
tuned_RF_calibrate_isotonic = RandomForestClassifier(min_impurity_split=1,
                                       n_estimators=100,
                                       bootstrap=True,
                                       max_features=15,
                                       criterion='entropy',
                                       min_samples_leaf=10,
                                       max_depth=None
                                      ).fit(train_data, train_labels)
ccv_isotonic = CalibratedClassifierCV(tuned_RF_calibrate_isotonic, method='isotonic', cv='prefit')
ccv_isotonic.fit(calibrate_data, calibrate_labels)
ccv_predictions = ccv_isotonic.predict(dev_data)
ccv_prediction_probabilities_isotonic = ccv_isotonic.predict_proba(dev_data)
working_log_loss_isotonic = log_loss(y_true=dev_labels, y_pred=ccv_prediction_probabilities_isotonic, labels=crime_labels)
print("Multi-class log loss with RF and isotonic calibration:", working_log_loss_isotonic)
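
For reference, the log loss printed by the cell above can be reproduced by hand. The following sketch (assuming the cell above has run, and that the probability matrix columns follow ccv_isotonic.classes_) mirrors what sklearn's log_loss computes: the mean negative log of the probability assigned to each true class.


In [ ]:
# Hand-rolled multi-class log loss, for comparison with sklearn's log_loss.
# Assumes ccv_isotonic.classes_ gives the column order of the probability matrix.
def manual_log_loss(y_true, probas, class_order, eps=1e-15):
    probas = np.clip(probas, eps, 1 - eps)            # avoid log(0)
    col = {c: i for i, c in enumerate(class_order)}   # class label -> column index
    true_idx = np.array([col[label] for label in y_true])
    return -np.mean(np.log(probas[np.arange(len(y_true)), true_idx]))

print(manual_log_loss(dev_labels, ccv_prediction_probabilities_isotonic, ccv_isotonic.classes_))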

In [69]:
pd.DataFrame(np.amax(ccv_prediction_probabilities_isotonic, axis=1)).hist()


Out[69]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000024682C052B0>]], dtype=object)

Error Analysis: Calibration


In [74]:
#clf_probabilities, clf_predictions, labels
def error_analysis_calibration(buckets, clf_probabilities, clf_predictions, labels):
    """inputs:
    clf_probabilities = clf.predict_proba(dev_data)
    clf_predictions = clf.predict(dev_data)
    labels = dev_labels"""
    
    #buckets = [0.05, 0.15, 0.3, 0.5, 0.8]
    #buckets = [0.15, 0.25, 0.3, 1.0]
    correct = [0 for i in buckets]
    total = [0 for i in buckets]

    lLimit = 0
    uLimit = 0
    for i in range(len(buckets)):
        uLimit = buckets[i]
        for j in range(clf_probabilities.shape[0]):
            if (np.amax(clf_probabilities[j]) > lLimit) and (np.amax(clf_probabilities[j]) <= uLimit):
                if clf_predictions[j] == labels[j]:
                    correct[i] += 1
                total[i] += 1
        lLimit = uLimit
        
    print(sum(correct))
    print(sum(total))
    print(correct)
    print(total)

    # Report the classifier accuracy for each posterior-probability bucket:
    accuracies = []
    for k in range(len(buckets)):
        print(1.0*correct[k]/total[k])
        accuracies.append(1.0*correct[k]/total[k])
        print('p(pred) <= %.13f    total = %3d    correct = %3d    accuracy = %.3f' \
              %(buckets[k], total[k], correct[k], 1.0*correct[k]/total[k]))
    plt.plot(buckets,accuracies)
    plt.title("Calibration Analysis")
    plt.xlabel("Posterior Probability")
    plt.ylabel("Classifier Accuracy")
    
    return buckets, accuracies

In [ ]:
# Look at how the posteriors are distributed in order to choose good bin edges for 'buckets'.
pd.DataFrame(np.amax(bestLRPredictionProbabilities, axis=1)).hist()
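
One way to pick those bins (a sketch, not from the original notebook) is to place the bucket edges at quantiles of the maximum posterior probabilities, so each bucket holds a similar number of predictions:


In [ ]:
# Hypothetical helper: derive bucket edges from quantiles of the maximum posterior
# probabilities, so that each calibration bucket contains roughly the same number
# of predictions.
max_posteriors = np.amax(bestLRPredictionProbabilities, axis=1)
quantile_buckets = list(np.percentile(max_posteriors, [25, 50, 75])) + [1.0]
print(quantile_buckets)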

In [75]:
buckets = [0.15, 0.25, 0.3, 1.0]
calibration_buckets, calibration_accuracies = error_analysis_calibration(buckets, clf_probabilities=bestLRPredictionProbabilities, \
                                                                         clf_predictions=bestLRPredictions, \
                                                                         labels=mini_dev_labels)


11246
50000
[199, 6513, 2458, 2076]
[1351, 33941, 8338, 6370]
0.14729829755736493
p(pred) <= 0.1500000000000    total = 1351    correct = 199    accuracy = 0.147
0.19189181226245544
p(pred) <= 0.2500000000000    total = 33941    correct = 6513    accuracy = 0.192
0.2947949148476853
p(pred) <= 0.3000000000000    total = 8338    correct = 2458    accuracy = 0.295
0.3259026687598116
p(pred) <= 1.0000000000000    total = 6370    correct = 2076    accuracy = 0.326

Error Analysis: Classification Report


In [77]:
def error_analysis_classification_report(clf_predictions, labels):
    """inputs:
    clf_predictions = clf.predict(dev_data)
    labels = dev_labels"""
    print('Classification Report:')
    report = classification_report(labels, clf_predictions)
    print(report)
    return report

In [78]:
# Use a distinct variable name so we don't shadow sklearn's classification_report:
lr_classification_report = error_analysis_classification_report(clf_predictions=bestLRPredictions, \
                                                                 labels=mini_dev_labels)


Classification Report:
                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00        94
                    ASSAULT       0.23      0.00      0.00      4427
                 BAD CHECKS       0.00      0.00      0.00        27
                    BRIBERY       0.00      0.00      0.00        14
                   BURGLARY       0.00      0.00      0.00      2047
         DISORDERLY CONDUCT       0.00      0.00      0.00       243
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       157
              DRUG/NARCOTIC       0.24      0.31      0.27      3009
                DRUNKENNESS       0.00      0.00      0.00       254
               EMBEZZLEMENT       0.00      0.00      0.00        61
                  EXTORTION       0.00      0.00      0.00        14
            FAMILY OFFENSES       0.00      0.00      0.00        25
     FORGERY/COUNTERFEITING       0.00      0.00      0.00       594
                      FRAUD       0.00      0.00      0.00       953
                   GAMBLING       0.00      0.00      0.00        10
                 KIDNAPPING       0.00      0.00      0.00       132
              LARCENY/THEFT       0.24      0.77      0.37     10076
                LIQUOR LAWS       0.00      0.00      0.00       111
                  LOITERING       0.00      0.00      0.00        69
             MISSING PERSON       0.00      0.00      0.00      1483
               NON-CRIMINAL       0.30      0.00      0.00      5210
             OTHER OFFENSES       0.18      0.35      0.24      7117
    PORNOGRAPHY/OBSCENE MAT       0.00      0.00      0.00         1
               PROSTITUTION       0.00      0.00      0.00       450
          RECOVERED VEHICLE       0.00      0.00      0.00       181
                    ROBBERY       0.00      0.00      0.00      1310
                    RUNAWAY       0.00      0.00      0.00        92
            SECONDARY CODES       0.00      0.00      0.00       571
      SEX OFFENSES FORCIBLE       0.00      0.00      0.00       266
  SEX OFFENSES NON FORCIBLE       0.00      0.00      0.00         8
            STOLEN PROPERTY       0.00      0.00      0.00       234
                    SUICIDE       0.00      0.00      0.00        31
             SUSPICIOUS OCC       0.00      0.00      0.00      1785
                       TREA       0.00      0.00      0.00         2
                   TRESPASS       0.00      0.00      0.00       432
                  VANDALISM       0.00      0.00      0.00      2502
              VEHICLE THEFT       0.18      0.03      0.05      3109
                   WARRANTS       0.00      0.00      0.00      2434
                WEAPON LAWS       0.00      0.00      0.00       465

                avg / total       0.15      0.22      0.13     50000

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
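
The same per-class numbers can also be pulled into a DataFrame for sorting and filtering; a minimal sketch using sklearn's precision_recall_fscore_support (assuming bestLRPredictions and mini_dev_labels from above):


In [ ]:
from sklearn.metrics import precision_recall_fscore_support

# Per-class precision, recall, F1, and support as a sortable DataFrame.
p, r, f, s = precision_recall_fscore_support(mini_dev_labels, bestLRPredictions,
                                             labels=crime_labels_mini_dev)
metrics_df = pd.DataFrame({'precision': p, 'recall': r, 'f1': f, 'support': s},
                          index=crime_labels_mini_dev)
metrics_df.sort_values('support', ascending=False).head(10)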

Error Analysis: Confusion Matrix


In [99]:
crime_labels_mini_dev


Out[99]:
['TREA',
 'DISORDERLY CONDUCT',
 'BAD CHECKS',
 'SECONDARY CODES',
 'FORGERY/COUNTERFEITING',
 'SEX OFFENSES NON FORCIBLE',
 'RUNAWAY',
 'BURGLARY',
 'DRIVING UNDER THE INFLUENCE',
 'DRUG/NARCOTIC',
 'LOITERING',
 'VANDALISM',
 'STOLEN PROPERTY',
 'OTHER OFFENSES',
 'NON-CRIMINAL',
 'GAMBLING',
 'FAMILY OFFENSES',
 'VEHICLE THEFT',
 'ARSON',
 'PORNOGRAPHY/OBSCENE MAT',
 'ROBBERY',
 'SUICIDE',
 'SEX OFFENSES FORCIBLE',
 'EMBEZZLEMENT',
 'EXTORTION',
 'WEAPON LAWS',
 'KIDNAPPING',
 'MISSING PERSON',
 'SUSPICIOUS OCC',
 'TRESPASS',
 'WARRANTS',
 'ASSAULT',
 'LARCENY/THEFT',
 'PROSTITUTION',
 'BRIBERY',
 'FRAUD',
 'LIQUOR LAWS',
 'DRUNKENNESS',
 'RECOVERED VEHICLE']

In [104]:
def error_analysis_confusion_matrix(label_names, clf_predictions, labels):
    """inputs:
    clf_predictions = clf.predict(dev_data)
    labels = dev_labels"""
    cm = pd.DataFrame(confusion_matrix(labels, clf_predictions, labels=label_names))
    cm.columns=label_names
    cm.index=label_names
    cm.to_csv(path_or_buf="./confusion_matrix.csv")
    #print(cm)
    return cm

In [105]:
error_analysis_confusion_matrix(label_names=crime_labels_mini_dev, clf_predictions=bestLRPredictions, \
                                                            labels=mini_dev_labels)


Out[105]:
TREA DISORDERLY CONDUCT BAD CHECKS SECONDARY CODES FORGERY/COUNTERFEITING SEX OFFENSES NON FORCIBLE RUNAWAY BURGLARY DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC ... TRESPASS WARRANTS ASSAULT LARCENY/THEFT PROSTITUTION BRIBERY FRAUD LIQUOR LAWS DRUNKENNESS RECOVERED VEHICLE
TREA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 2 0 0 0 0 0 0
DISORDERLY CONDUCT 0 0 0 0 0 0 0 0 0 28 ... 0 0 0 136 0 0 0 0 0 0
BAD CHECKS 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 21 0 0 0 0 0 0
SECONDARY CODES 0 0 0 0 0 0 0 0 0 42 ... 0 0 0 324 0 0 0 0 0 0
FORGERY/COUNTERFEITING 0 0 0 0 0 0 0 0 0 28 ... 0 0 1 387 0 0 0 0 0 0
SEX OFFENSES NON FORCIBLE 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 2 0 0 0 0 0 0
RUNAWAY 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 67 0 0 0 0 0 0
BURGLARY 0 0 0 0 0 0 0 0 0 58 ... 0 0 0 1399 0 0 0 0 0 0
DRIVING UNDER THE INFLUENCE 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 93 0 0 0 0 0 0
DRUG/NARCOTIC 0 0 0 0 0 0 0 0 0 943 ... 0 0 1 1341 0 0 0 0 0 0
LOITERING 0 0 0 0 0 0 0 0 0 6 ... 0 0 0 45 0 0 0 0 0 0
VANDALISM 0 0 0 0 0 0 0 0 0 73 ... 0 0 2 1640 0 0 0 0 0 0
STOLEN PROPERTY 0 0 0 0 0 0 0 0 0 17 ... 0 0 0 161 0 0 0 0 0 0
OTHER OFFENSES 0 0 0 0 0 0 0 0 0 660 ... 0 0 2 3886 0 0 0 0 0 0
NON-CRIMINAL 0 0 0 0 0 0 0 0 0 380 ... 0 0 5 3591 0 0 0 0 0 0
GAMBLING 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 5 0 0 0 0 0 0
FAMILY OFFENSES 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 13 0 0 0 0 0 0
VEHICLE THEFT 0 0 0 0 0 0 0 0 0 55 ... 0 0 4 1964 0 0 0 0 0 0
ARSON 0 0 0 0 0 0 0 0 0 2 ... 0 0 1 54 0 0 0 0 0 0
PORNOGRAPHY/OBSCENE MAT 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
ROBBERY 0 0 0 0 0 0 0 0 0 90 ... 0 0 0 765 0 0 0 0 0 0
SUICIDE 0 0 0 0 0 0 0 0 0 4 ... 0 0 0 15 0 0 0 0 0 0
SEX OFFENSES FORCIBLE 0 0 0 0 0 0 0 0 0 12 ... 0 0 0 159 0 0 0 0 0 0
EMBEZZLEMENT 0 0 0 0 0 0 0 0 0 3 ... 0 0 0 38 0 0 0 0 0 0
EXTORTION 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 8 0 0 0 0 0 0
WEAPON LAWS 0 0 0 0 0 0 0 0 0 40 ... 0 0 1 232 0 0 0 0 0 0
KIDNAPPING 0 0 0 0 0 0 0 0 0 5 ... 0 0 0 74 0 0 0 0 0 0
MISSING PERSON 0 0 0 0 0 0 0 0 0 40 ... 0 0 0 845 0 0 0 0 0 0
SUSPICIOUS OCC 0 0 0 0 0 0 0 0 0 129 ... 0 0 2 1075 0 0 0 0 0 0
TRESPASS 0 0 0 0 0 0 0 0 0 54 ... 0 0 0 264 0 0 0 0 0 0
WARRANTS 0 0 0 0 0 0 0 0 0 342 ... 0 0 0 1414 0 0 0 0 0 0
ASSAULT 0 0 0 0 0 0 0 0 0 341 ... 0 0 7 2511 0 0 0 0 0 0
LARCENY/THEFT 0 0 0 0 0 0 0 0 0 531 ... 0 0 3 7710 0 0 0 0 0 0
PROSTITUTION 0 0 0 0 0 0 0 0 0 34 ... 0 0 0 276 0 0 0 0 0 0
BRIBERY 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 6 0 0 0 0 0 0
FRAUD 0 0 0 0 0 0 0 0 0 52 ... 0 0 0 652 0 0 0 0 0 0
LIQUOR LAWS 0 0 0 0 0 0 0 0 0 11 ... 0 0 1 72 0 0 0 0 0 0
DRUNKENNESS 0 0 0 0 0 0 0 0 0 17 ... 0 0 0 178 0 0 0 0 0 0
RECOVERED VEHICLE 0 0 0 0 0 0 0 0 0 7 ... 0 0 0 73 0 0 0 0 0 0

39 rows × 39 columns
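
To see where each true category's predictions end up, the confusion matrix can be normalized by row (a sketch reusing the function and variables above; each row then sums to 1 for classes with any support):


In [ ]:
# Row-normalize the confusion matrix so each row shows the fraction of that true
# class assigned to each predicted class. Rows with zero support would divide by
# zero, so replace zero row sums before dividing.
cm = error_analysis_confusion_matrix(label_names=crime_labels_mini_dev,
                                     clf_predictions=bestLRPredictions,
                                     labels=mini_dev_labels)
row_sums = cm.sum(axis=1).replace(0, 1)
cm_normalized = cm.div(row_sums, axis=0)
cm_normalized.round(2)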