In [1]:
%matplotlib inline
# Import relevant libraries:
import matplotlib.pyplot as plt
import csv
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed; model_selection is its replacement
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV
# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)
In [2]:
# Path to your local copy of Kalvin's preprocessed data ("x_data_3.csv"), produced by the commented-out cell above:
data_path = "./data/x_data_3.csv"
df = pd.read_csv(data_path, header=0)
x_data = df.drop('category', axis=1)
y = df.category.values
# Impute missing values with column means:
x_complete = x_data.fillna(x_data.mean())
X_raw = x_complete.values
# Scale the data between 0 and 1 (MultinomialNB requires non-negative features):
X = MinMaxScaler().fit_transform(X_raw)
# Round the scaled features to two decimal places:
X = np.around(X, decimals=2)
# Shuffle the data to remove any ordering in the original file; re-seed first so the permutation is reproducible:
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]
# Carve out train/dev/test splits, plus smaller "mini" subsets for faster iteration:
test_data, test_labels = X[800000:], y[800000:]
dev_data, dev_labels = X[700000:800000], y[700000:800000]
train_data, train_labels = X[:700000], y[:700000]
mini_train_data, mini_train_labels = X[:200000], y[:200000]
mini_dev_data, mini_dev_labels = X[430000:480000], y[430000:480000]
crime_labels = list(set(y))
crime_labels_mini_train = list(set(mini_train_labels))
crime_labels_mini_dev = list(set(mini_dev_labels))
print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev))
print(len(train_data),len(train_labels))
print(len(dev_data),len(dev_labels))
print(len(mini_train_data),len(mini_train_labels))
print(len(mini_dev_data),len(mini_dev_labels))
print(len(test_data),len(test_labels))
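Because the log_loss calls below are scored with labels=crime_labels, a quick sanity check (a sketch using the split names defined above) can confirm that no split contains a category missing from the full label list:
# Sanity check (sketch): every label in each split should appear in crime_labels,
# since log_loss below is called with labels=crime_labels.
for name, labels in [("train", train_labels), ("dev", dev_labels),
                     ("mini_train", mini_train_labels), ("mini_dev", mini_dev_labels),
                     ("test", test_labels)]:
    assert set(labels) <= set(crime_labels), name + " contains an unseen category"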
In [4]:
mnb_param_grid = {'alpha': [0.01, 1.0, 10.0, 50.0]}
MNB = GridSearchCV(MultinomialNB(), param_grid=mnb_param_grid, scoring='neg_log_loss')
MNB.fit(train_data, train_labels)
print("The best alpha value is:", MNB.best_params_['alpha'])
In [6]:
MNBPredictionProbabilities = MNB.predict_proba(dev_data)
In [7]:
print("Multi-class Log Loss:", log_loss(y_true = dev_labels, y_pred = MNBPredictionProbabilities, labels = crime_labels), "\n\n")
In [8]:
mnb_param_grid = {'alpha': [0.5, 1.0, 2.5, 5.0]}
MNB = GridSearchCV(MultinomialNB(), param_grid=mnb_param_grid, scoring='neg_log_loss')
MNB.fit(train_data, train_labels)
print("The best alpha value is:", MNB.best_params_['alpha'])
In [9]:
MNBPredictionProbabilities = MNB.predict_proba(dev_data)
print("Multi-class Log Loss:", log_loss(y_true = dev_labels, y_pred = MNBPredictionProbabilities, labels = crime_labels), "\n\n")
In [10]:
mnb_param_grid = {'alpha': [0.0001, 0.01, 0.1, 0.5]}
MNB = GridSearchCV(MultinomialNB(), param_grid=mnb_param_grid, scoring='neg_log_loss')
MNB.fit(train_data, train_labels)
print("The best alpha value is:", MNB.best_params_['alpha'])
In [11]:
mnb_param_grid = {'alpha': [0.2, 0.4, 0.6, 0.8]}
MNB = GridSearchCV(MultinomialNB(), param_grid=mnb_param_grid, scoring='neg_log_loss')
MNB.fit(train_data, train_labels)
print("The best alpha value is:", MNB.best_params_['alpha'])
In [12]:
mnb_param_grid = {'alpha': [0.3, 0.35, 0.4, 0.45, 0.5]}
MNB = GridSearchCV(MultinomialNB(), param_grid=mnb_param_grid, scoring='neg_log_loss')
MNB.fit(train_data, train_labels)
print("The best alpha value is:", MNB.best_params_['alpha'])
In [15]:
mnb_param_grid = {'alpha': [0.31, 0.33, 0.35, 0.37, 0.39]}
MNB = GridSearchCV(MultinomialNB(), param_grid=mnb_param_grid, scoring='neg_log_loss')
MNB.fit(train_data, train_labels)
print("The best alpha value is:", MNB.best_params_['alpha'])
In [16]:
MNBPredictionProbabilities = MNB.predict_proba(dev_data)
print("Multi-class Log Loss:", log_loss(y_true = dev_labels, y_pred = MNBPredictionProbabilities, labels = crime_labels), "\n\n")
In [17]:
mnb_param_grid = {'alpha': [0.340, 0.345, 0.35, 0.355, 0.360]}
MNB = GridSearchCV(MultinomialNB(), param_grid=mnb_param_grid, scoring='neg_log_loss')
MNB.fit(train_data, train_labels)
print("The best alpha value is:", MNB.best_params_['alpha'])
In [18]:
MNBPredictionProbabilities = MNB.predict_proba(dev_data)
print("Multi-class Log Loss:", log_loss(y_true = dev_labels, y_pred = MNBPredictionProbabilities, labels = crime_labels), "\n\n")