In [2]:
# Additional Libraries
%matplotlib inline
import matplotlib.pyplot as plt

# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV  # moved here from sklearn.grid_search (removed in sklearn 0.20)
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV

# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)


In [3]:
# Data path to your local copy of Kalvin's "x_data.csv", produced by the commented-out cell above:
data_path = "./data/x_data_3.csv"
df = pd.read_csv(data_path, header=0)
x_data = df.drop('category', axis=1)
y = df.category.values

# Impute missing values with the column means:
x_complete = x_data.fillna(x_data.mean())
X_raw = x_complete.values

# Scale the data between 0 and 1:
X = MinMaxScaler().fit_transform(X_raw)

# Round the scaled features to 2 decimal places:
X = np.around(X, decimals=2)

# Shuffle the data to remove any ordering in the underlying pattern. Re-set the random seed first so the permutation is reproducible:
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]

print(np.where(y == 'TREA'))
print(np.where(y == 'PORNOGRAPHY/OBSCENE MAT'))

## log_loss requires set(y_pred) to match set(labels), so the extremely rare crime classes ('TREA' and
## 'PORNOGRAPHY/OBSCENE MAT') could be filtered out for data-quality reasons. The filtering below is left
## commented out because, as the printout confirms, every split used here still covers all 39 classes.
#X_minus_trea = X[np.where(y != 'TREA')]
#y_minus_trea = y[np.where(y != 'TREA')]
#X_final = X_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]
#y_final = y_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]

## Separate training, dev, and test data:
#test_data, test_labels = X_final[800000:], y_final[800000:]
#dev_data, dev_labels = X_final[700000:800000], y_final[700000:800000]
#train_data, train_labels = X_final[100000:700000], y_final[100000:700000]
#calibrate_data, calibrate_labels = X_final[:100000], y_final[:100000]

test_data, test_labels = X[800000:], y[800000:]
dev_data, dev_labels = X[700000:800000], y[700000:800000]
#train_data, train_labels = X[100000:700000], y[100000:700000]
train_data, train_labels = X[:700000], y[:700000]
#calibrate_data, calibrate_labels = X[:100000], y[:100000]

# Create mini versions of the above sets for faster experimentation:
#mini_train_data, mini_train_labels = X_final[:20000], y_final[:20000]
#mini_calibrate_data, mini_calibrate_labels = X_final[19000:28000], y_final[19000:28000]
#mini_dev_data, mini_dev_labels = X_final[49000:60000], y_final[49000:60000]

#mini_train_data, mini_train_labels = X[:20000], y[:20000]
mini_train_data, mini_train_labels = X[:200000], y[:200000]
#mini_calibrate_data, mini_calibrate_labels = X[19000:28000], y[19000:28000]
mini_dev_data, mini_dev_labels = X[430000:480000], y[430000:480000]

## Create a list of the crime-type labels. This acts as the "labels" parameter for the log loss calls that follow.
#crime_labels = list(set(y_final))
#crime_labels_mini_train = list(set(mini_train_labels))
#crime_labels_mini_dev = list(set(mini_dev_labels))
#crime_labels_mini_calibrate = list(set(mini_calibrate_labels))
#print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev),len(crime_labels_mini_calibrate))

crime_labels = list(set(y))
crime_labels_mini_train = list(set(mini_train_labels))
crime_labels_mini_dev = list(set(mini_dev_labels))
#crime_labels_mini_calibrate = list(set(mini_calibrate_labels))
#print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev),len(crime_labels_mini_calibrate))
print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev))

print(len(train_data),len(train_labels))
print(len(dev_data),len(dev_labels))
print(len(mini_train_data),len(mini_train_labels))
print(len(mini_dev_data),len(mini_dev_labels))
print(len(test_data),len(test_labels))
#print(len(mini_calibrate_data),len(mini_calibrate_labels))
#print(len(calibrate_data),len(calibrate_labels))


(array([ 39195,  69757, 347976, 450980, 467488, 777371], dtype=int64),)
(array([  6630, 173114, 181454, 239577, 256637, 256880, 300524, 371552,
       378220, 433978, 491311, 537117, 542223, 614805, 622486, 733540,
       777032, 787625, 801668, 806897, 813648, 817548], dtype=int64),)
39 39 39
700000 700000
100000 100000
200000 200000
50000 50000
78049 78049
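
In [ ]:
# Optional sanity check (added sketch, not part of the original run): log_loss is
# called below with an explicit `labels` list, so the dev slice must cover exactly
# the classes seen in training. The 39/39/39 counts above suggest they match;
# this verifies it directly.
assert set(crime_labels_mini_train) == set(crime_labels_mini_dev), \
    "mini train/dev label sets differ"
print("mini train/dev label sets match:", len(crime_labels_mini_train), "classes")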

In [4]:
# GradientBoostingClassifier hyperparameters (defaults per the sklearn docstring):
#learning_rate : float, optional (default=0.1)
#n_estimators : int (default=100)
#max_depth : integer, optional (default=3)
#criterion : string, optional (default="friedman_mse")
#min_samples_split : int, float, optional (default=2)
#min_samples_leaf : int, float, optional (default=1)
#min_weight_fraction_leaf : float, optional (default=0.)
#subsample : float, optional (default=1.0)
#max_features : int, float, string or None, optional (default=None)
#max_leaf_nodes : int or None, optional (default=None)

# Hyperparameter values to sweep in the grid search below:
nList = [75, 125]
depthList = [1, 5]
leafList = [2, 7]
featuresList = [8, 17]
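
In [ ]:
# Side note (sketch, not in the original notebook): the four two-value lists above
# define a 2 x 2 x 2 x 2 grid, i.e. 16 parameter combinations. The nested loops in
# the sweep below could equivalently be flattened with itertools.product:
from itertools import product

param_grid = list(product(nList, depthList, leafList, featuresList))
print(len(param_grid), "combinations to evaluate")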

In [ ]:
#gb_param_grid = {'n_estimators':[25, 75, 250, 750], 'max_depth': [1, 5, 9], \
#                 'min_samples_leaf': [2, 7, 12], 'max_features': [3, 8, 17]}
#GB = GridSearchCV(GradientBoostingClassifier(), param_grid=gb_param_grid, scoring='neg_log_loss')
#GB.fit(train_data, train_labels)

In [ ]:
# Manual sweep over the hyperparameter grid, scoring multi-class log loss on the mini dev set:
for n_estimators in nList:
    for max_depth in depthList:
        for min_samples_leaf in leafList:
            for max_features in featuresList:
                gbTest = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                    min_samples_leaf=min_samples_leaf, max_features=max_features)
                gbTest.fit(mini_train_data, mini_train_labels)
                gbTestPredictionProbabilities = gbTest.predict_proba(mini_dev_data)
                print("Parameters:")
                print("n_estimators:", str(n_estimators)+";", " max_depth:", str(max_depth)+";",
                      " min_samples_leaf:", str(min_samples_leaf)+";", " max_features:", str(max_features))
                print("Multi-class Log Loss:", log_loss(y_true=mini_dev_labels, y_pred=gbTestPredictionProbabilities,
                                                        labels=crime_labels_mini_dev), "\n\n")
                print()


Parameters:
n_estimators: 75;  max_depth: 1;  min_samples_leaf: 2;  max_features: 8
Multi-class Log Loss: 2.58843401767 



Parameters:
n_estimators: 75;  max_depth: 1;  min_samples_leaf: 2;  max_features: 17
Multi-class Log Loss: 2.58836661326 



Parameters:
n_estimators: 75;  max_depth: 1;  min_samples_leaf: 7;  max_features: 8
Multi-class Log Loss: 2.58858672498 



Parameters:
n_estimators: 75;  max_depth: 1;  min_samples_leaf: 7;  max_features: 17
Multi-class Log Loss: 2.58815404629 



Parameters:
n_estimators: 75;  max_depth: 5;  min_samples_leaf: 2;  max_features: 8
Multi-class Log Loss: 2.5658087233 



Parameters:
n_estimators: 75;  max_depth: 5;  min_samples_leaf: 2;  max_features: 17
Multi-class Log Loss: 2.5749636863
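

In [ ]:
# Sketch (assumption, not from the original run): instead of reading the printout
# above to find the best setting, the same sweep can accumulate (params, loss)
# pairs and take the minimum programmatically:
results = []
for n_estimators in nList:
    for max_depth in depthList:
        for min_samples_leaf in leafList:
            for max_features in featuresList:
                gb = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                min_samples_leaf=min_samples_leaf, max_features=max_features)
                gb.fit(mini_train_data, mini_train_labels)
                loss = log_loss(y_true=mini_dev_labels, y_pred=gb.predict_proba(mini_dev_data),
                                labels=crime_labels_mini_dev)
                results.append(((n_estimators, max_depth, min_samples_leaf, max_features), loss))
best_params, best_loss = min(results, key=lambda r: r[1])
print("Best (n_estimators, max_depth, min_samples_leaf, max_features):", best_params)
print("Best multi-class log loss:", best_loss)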