In [2]:
# Additional Libraries
%matplotlib inline
import matplotlib.pyplot as plt
# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in sklearn 0.20
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV
# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)
In [3]:
# Path to your local copy of Kalvin's x_data CSV, produced by the commented-out cell above
data_path = "./data/x_data_3.csv"
df = pd.read_csv(data_path, header=0)
x_data = df.drop('category', axis=1)
y = df.category.values
# Impute missing values with mean values:
#x_complete = df.fillna(df.mean())
x_complete = x_data.fillna(x_data.mean())
X_raw = x_complete.values
# Scale the data between 0 and 1:
X = MinMaxScaler().fit_transform(X_raw)
# Round the scaled features to two decimal places:
X = np.around(X, decimals=2)
# Shuffle the data to remove any ordering in the original file. Re-seed immediately before shuffling so the permutation is reproducible:
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]
print(np.where(y == 'TREA'))
print(np.where(y == 'PORNOGRAPHY/OBSCENE MAT'))
## log_loss requires the classes in the predictions to match its "labels" parameter, so the extremely
## rare crimes ('TREA' and 'PORNOGRAPHY/OBSCENE MAT') could be dropped; that removal is commented out below.
#X_minus_trea = X[np.where(y != 'TREA')]
#y_minus_trea = y[np.where(y != 'TREA')]
#X_final = X_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]
#y_final = y_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]
## Separate training, dev, and test data:
#test_data, test_labels = X_final[800000:], y_final[800000:]
#dev_data, dev_labels = X_final[700000:800000], y_final[700000:800000]
#train_data, train_labels = X_final[100000:700000], y_final[100000:700000]
#calibrate_data, calibrate_labels = X_final[:100000], y_final[:100000]
test_data, test_labels = X[800000:], y[800000:]
dev_data, dev_labels = X[700000:800000], y[700000:800000]
#train_data, train_labels = X[100000:700000], y[100000:700000]
train_data, train_labels = X[:700000], y[:700000]
#calibrate_data, calibrate_labels = X[:100000], y[:100000]
# Create mini versions of the above sets
#mini_train_data, mini_train_labels = X_final[:20000], y_final[:20000]
#mini_calibrate_data, mini_calibrate_labels = X_final[19000:28000], y_final[19000:28000]
#mini_dev_data, mini_dev_labels = X_final[49000:60000], y_final[49000:60000]
#mini_train_data, mini_train_labels = X[:20000], y[:20000]
mini_train_data, mini_train_labels = X[:200000], y[:200000]
#mini_calibrate_data, mini_calibrate_labels = X[19000:28000], y[19000:28000]
mini_dev_data, mini_dev_labels = X[430000:480000], y[430000:480000]
## Create list of the crime type labels. This will act as the "labels" parameter for the log loss functions that follow
#crime_labels = list(set(y_final))
#crime_labels_mini_train = list(set(mini_train_labels))
#crime_labels_mini_dev = list(set(mini_dev_labels))
#crime_labels_mini_calibrate = list(set(mini_calibrate_labels))
#print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev),len(crime_labels_mini_calibrate))
crime_labels = list(set(y))
crime_labels_mini_train = list(set(mini_train_labels))
crime_labels_mini_dev = list(set(mini_dev_labels))
#crime_labels_mini_calibrate = list(set(mini_calibrate_labels))
#print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev),len(crime_labels_mini_calibrate))
print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev))
print(len(train_data),len(train_labels))
print(len(dev_data),len(dev_labels))
print(len(mini_train_data),len(mini_train_labels))
print(len(mini_dev_data),len(mini_dev_labels))
print(len(test_data),len(test_labels))
#print(len(mini_calibrate_data),len(mini_calibrate_labels))
#print(len(calibrate_data),len(calibrate_labels))
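Because log_loss fails when the prediction columns and the labels list disagree, it is worth confirming that every class present in a dev split also appears in the corresponding training split. A minimal sanity check, assuming the split variables defined above:
In [ ]:
# Verify that the dev splits contain no classes missing from their training splits.
# Any class that appears only in the dev data would break the log_loss scoring below.
missing_full = set(dev_labels) - set(train_labels)
missing_mini = set(mini_dev_labels) - set(mini_train_labels)
print("Classes in dev but not train:", missing_full)
print("Classes in mini dev but not mini train:", missing_mini)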
In [4]:
#learning_rate : float, optional (default=0.1)
#n_estimators : int (default=100)
#max_depth : integer, optional (default=3)
#criterion : string, optional (default="friedman_mse")
#min_samples_split : int, float, optional (default=2)
#min_samples_leaf : int, float, optional (default=1)
#min_weight_fraction_leaf : float, optional (default=0.)
#subsample : float, optional (default=1.0)
#max_features : int, float, string or None, optional (default=None)
#max_leaf_nodes : int or None, optional (default=None)
nList = [75, 125]
depthList = [1, 5]
leafList = [2, 7]
featuresList = [8, 17]
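These lists define a 2 x 2 x 2 x 2 grid (16 combinations). As an aside, the nested loops further below could be flattened with itertools.product; a sketch of the equivalent iteration:
In [ ]:
from itertools import product

# Equivalent flat iteration over the same 16 hyperparameter combinations
for n_estimators, max_depth, min_samples_leaf, max_features in \
        product(nList, depthList, leafList, featuresList):
    print(n_estimators, max_depth, min_samples_leaf, max_features)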
In [ ]:
#gb_param_grid = {'n_estimators':[25, 75, 250, 750], 'max_depth': [1, 5, 9], \
# 'min_samples_leaf': [2, 7, 12], 'max_features': [3, 8, 17]}
#GB = GridSearchCV(GradientBoostingClassifier(), param_grid=gb_param_grid, scoring='neg_log_loss')
#GB.fit(train_data, train_labels)
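If the full grid search above proves too slow on 700,000 rows, a scaled-down version on the mini training set might look like the following. This is a sketch, assuming the mini_train variables above; cv=3 and the reuse of the four small lists as the grid are arbitrary choices:
In [ ]:
# Scaled-down grid search on the mini training set (sketch; cv=3 is arbitrary)
gb_mini_grid = {'n_estimators': nList, 'max_depth': depthList,
                'min_samples_leaf': leafList, 'max_features': featuresList}
GB_mini = GridSearchCV(GradientBoostingClassifier(), param_grid=gb_mini_grid,
                       scoring='neg_log_loss', cv=3)
GB_mini.fit(mini_train_data, mini_train_labels)
print("Best parameters:", GB_mini.best_params_)
print("Best CV log loss:", -GB_mini.best_score_)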
In [ ]:
for n_estimators in nList:
    for max_depth in depthList:
        for min_samples_leaf in leafList:
            for max_features in featuresList:
                gbTest = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth, \
                                                    min_samples_leaf=min_samples_leaf, max_features=max_features)
                gbTest.fit(mini_train_data, mini_train_labels)
                gbTestPredictionProbabilities = gbTest.predict_proba(mini_dev_data)
                print("Parameters:")
                print("n_estimators:", str(n_estimators)+";", " max_depth:", str(max_depth)+";", \
                      " min_samples_leaf:", str(min_samples_leaf)+";", " max_features:", str(max_features))
                # Score against the fitted model's own class ordering, which matches the columns of predict_proba:
                print("Multi-class Log Loss:", log_loss(y_true=mini_dev_labels, y_pred=gbTestPredictionProbabilities, \
                                                        labels=gbTest.classes_), "\n\n")
                print()
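The loop above only prints each score, so the winning combination still has to be read off by eye. A variant that also records the best result might look like this (a sketch that re-runs the same grid; the results list and variable names are illustrative):
In [ ]:
# Sketch: collect (log_loss, params) pairs instead of only printing them,
# then report the combination with the lowest multi-class log loss.
from itertools import product

results = []
for n_estimators, max_depth, min_samples_leaf, max_features in \
        product(nList, depthList, leafList, featuresList):
    gb = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                    min_samples_leaf=min_samples_leaf, max_features=max_features)
    gb.fit(mini_train_data, mini_train_labels)
    probs = gb.predict_proba(mini_dev_data)
    loss = log_loss(y_true=mini_dev_labels, y_pred=probs, labels=gb.classes_)
    results.append((loss, (n_estimators, max_depth, min_samples_leaf, max_features)))

best_loss, best_params = min(results)
print("Best (n_estimators, max_depth, min_samples_leaf, max_features):", best_params)
print("Best dev log loss:", best_loss)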