In [1]:
# Additional Libraries
%matplotlib inline
import matplotlib.pyplot as plt
# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV
# Show printed arrays with three decimals and pin NumPy's global RNG so results repeat:
np.set_printoptions(precision=3)
np.random.seed(0)
In [2]:
# Data path to your local copy of Kalvin's feature file (per the original note, produced by
# the negated cell above).
data_path = "./data/x_data_3.csv"
df = pd.read_csv(data_path, header=0)

# Split the frame into the feature columns and the 'category' target.
# `axis=1` is passed by keyword and `.values` replaces `as_matrix()` so the cell runs on
# both old and current pandas releases (positional axis and as_matrix() were removed).
x_data = df.drop('category', axis=1)
y = df.category.values

# Impute missing feature values with the per-column mean:
x_complete = x_data.fillna(x_data.mean())
X_raw = x_complete.values

# Scale every feature into [0, 1], then round to two decimals:
X = MinMaxScaler().fit_transform(X_raw)
X = np.around(X, decimals=2)

# Shuffle data to remove any underlying pattern that may exist.
# The seed is reset here so the permutation is the same every time this cell is re-run:
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]

# Fixed row-index splits of the shuffled data. Note that the mini dev slice lies inside the
# full training range, so it is only disjoint from the *mini* training slice.
test_data, test_labels = X[800000:], y[800000:]
dev_data, dev_labels = X[700000:800000], y[700000:800000]
train_data, train_labels = X[:700000], y[:700000]
mini_train_data, mini_train_labels = X[:200000], y[:200000]
mini_dev_data, mini_dev_labels = X[430000:480000], y[430000:480000]

# Distinct labels overall and per mini split. They are sorted so that position i in these
# lists matches position i in a fitted scikit-learn classifier's `classes_` (which is sorted);
# a bare list(set(...)) has arbitrary order.
crime_labels = sorted(set(y))
crime_labels_mini_train = sorted(set(mini_train_labels))
crime_labels_mini_dev = sorted(set(mini_dev_labels))

# Sanity-check the number of classes and the size of each split:
print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev))
print(len(train_data),len(train_labels))
print(len(dev_data),len(dev_labels))
print(len(mini_train_data),len(mini_train_labels))
print(len(mini_dev_data),len(mini_dev_labels))
print(len(test_data),len(test_labels))
In [4]:
# Try each (C, calibration method) pair for an L1-penalized logistic regression wrapped in
# CalibratedClassifierCV: fit on the mini training set and report the multi-class log loss
# on the mini dev set.
cValsL1 = [7.5, 10.0, 12.5, 20.0]
methods = ['sigmoid', 'isotonic']
cv = 2
tol = 0.01
for c in cValsL1:
    for m in methods:
        # The solver is named explicitly: liblinear supports penalty='l1', whereas the
        # default solver of newer scikit-learn releases (lbfgs) rejects it.
        ccvL1 = CalibratedClassifierCV(LogisticRegression(penalty='l1', solver='liblinear', C=c, tol=tol), method=m, cv=cv)
        ccvL1.fit(mini_train_data, mini_train_labels)
        # Call get_params() so the parameter dict is printed, not the bound-method repr.
        print(ccvL1.get_params())
        ccvL1_prediction_probabilities = ccvL1.predict_proba(mini_dev_data)
        ccvL1_predictions = ccvL1.predict(mini_dev_data)
        # The probability columns follow ccvL1.classes_, so those are the labels log_loss must
        # be given; the set of labels seen in the dev slice may lack a class the model knows.
        print("L1 Multi-class Log Loss:", log_loss(y_true = mini_dev_labels, y_pred = ccvL1_prediction_probabilities, labels = ccvL1.classes_), "\n\n")
        print()
In [5]:
# Second pass over larger C values for the L1-penalized model, using sigmoid calibration only:
# fit on the mini training set and report the multi-class log loss on the mini dev set.
cValsL1 = [15.0, 20.0, 25.0, 50.0]
method = 'sigmoid'
cv = 2
tol = 0.01
for c in cValsL1:
    # The solver is named explicitly: liblinear supports penalty='l1', whereas the
    # default solver of newer scikit-learn releases (lbfgs) rejects it.
    ccvL1 = CalibratedClassifierCV(LogisticRegression(penalty='l1', solver='liblinear', C=c, tol=tol), method=method, cv=cv)
    ccvL1.fit(mini_train_data, mini_train_labels)
    # Call get_params() so the parameter dict is printed, not the bound-method repr.
    print(ccvL1.get_params())
    ccvL1_prediction_probabilities = ccvL1.predict_proba(mini_dev_data)
    ccvL1_predictions = ccvL1.predict(mini_dev_data)
    # The probability columns follow ccvL1.classes_, so those are the labels log_loss must
    # be given; the set of labels seen in the dev slice may lack a class the model knows.
    print("L1 Multi-class Log Loss:", log_loss(y_true = mini_dev_labels, y_pred = ccvL1_prediction_probabilities, labels = ccvL1.classes_), "\n\n")
    print()
In [ ]:
In [ ]:
# Feature names used as the row index of the coefficient table.
# NOTE(review): assumed to list the feature columns in the same order as the training
# matrix the model was fitted on — confirm against the data file.
columns = ['hour_of_day','dayofweek',
           'x','y','bayview','ingleside','northern',
           'central','mission','southern','tenderloin',
           'park','richmond','taraval','HOURLYDRYBULBTEMPF',
           'HOURLYRelativeHumidity','HOURLYWindSpeed',
           'HOURLYSeaLevelPressure','HOURLYVISIBILITY',
           'Daylight']

# Build a features x classes table of the L1 model's coefficients.
# NOTE(review): `bestL1` is not defined in the visible cells; it is assumed to be a fitted
# linear classifier exposing `coef_` and `classes_` — confirm where it is created.
# Each coef_ row is paired with the model's own classes_ entry so the column headers are
# guaranteed to name the class that row belongs to.
allCoefsL1 = pd.DataFrame(index=columns)
for label, coefs in zip(bestL1.classes_, bestL1.coef_):
    allCoefsL1[label] = coefs
allCoefsL1
In [ ]:
# Grouped bar chart of the L1 coefficient table (one group per feature, one bar per class).
# The frame is drawn onto an explicitly created Axes; calling plt.figure() and then
# DataFrame.plot() without `ax` leaves an extra, empty figure in the output.
f, ax = plt.subplots(figsize=(15,8))
allCoefsL1.plot(kind='bar', ax=ax)
ax.set(title='L1 logistic regression coefficients', xlabel='feature', ylabel='coefficient')
# Park the legend outside the plot area, to the right of the axes.
ax.legend(loc='center left', bbox_to_anchor=(1.0,0.5))
plt.show()
In [6]:
# Try each (C, calibration method) pair for an L2-penalized logistic regression (newton-cg
# solver) wrapped in CalibratedClassifierCV: fit on the mini training set and report the
# multi-class log loss on the mini dev set.
cValsL2 = [75.0, 100.0, 150.0, 250.0]
methods = ['sigmoid', 'isotonic']
cv = 2
tol = 0.01
for c in cValsL2:
    for m in methods:
        ccvL2 = CalibratedClassifierCV(LogisticRegression(penalty='l2', solver='newton-cg', C=c, tol=tol), method=m, cv=cv)
        ccvL2.fit(mini_train_data, mini_train_labels)
        # Call get_params() so the parameter dict is printed, not the bound-method repr.
        print(ccvL2.get_params())
        ccvL2_prediction_probabilities = ccvL2.predict_proba(mini_dev_data)
        ccvL2_predictions = ccvL2.predict(mini_dev_data)
        # The probability columns follow ccvL2.classes_, so those are the labels log_loss must
        # be given; the set of labels seen in the dev slice may lack a class the model knows.
        print("L2 Multi-class Log Loss:", log_loss(y_true = mini_dev_labels, y_pred = ccvL2_prediction_probabilities, labels = ccvL2.classes_), "\n\n")
        print()
In [7]:
# Second pass over larger C values for the L2-penalized model, using isotonic calibration only:
# fit on the mini training set and report the multi-class log loss on the mini dev set.
cValsL2 = [200.0, 250.0, 300.0, 500.0]
method = 'isotonic'
cv = 2
tol = 0.01
# A single loop over C is enough: the calibration method is fixed above. The previous inner
# `for m in methods` loop never used `m`, relied on `methods` left over from an earlier cell,
# and simply fitted and printed each configuration twice.
for c in cValsL2:
    ccvL2 = CalibratedClassifierCV(LogisticRegression(penalty='l2', solver='newton-cg', C=c, tol=tol), method=method, cv=cv)
    ccvL2.fit(mini_train_data, mini_train_labels)
    # Call get_params() so the parameter dict is printed, not the bound-method repr.
    print(ccvL2.get_params())
    ccvL2_prediction_probabilities = ccvL2.predict_proba(mini_dev_data)
    ccvL2_predictions = ccvL2.predict(mini_dev_data)
    # The probability columns follow ccvL2.classes_, so those are the labels log_loss must
    # be given; the set of labels seen in the dev slice may lack a class the model knows.
    print("L2 Multi-class Log Loss:", log_loss(y_true = mini_dev_labels, y_pred = ccvL2_prediction_probabilities, labels = ccvL2.classes_), "\n\n")
    print()
In [11]:
# Third pass over still larger C values for the L2-penalized model, isotonic calibration only:
# fit on the mini training set and report the multi-class log loss on the mini dev set.
cValsL2 = [400.0, 500.0, 750.0, 1000.0]
method = 'isotonic'
cv = 2
tol = 0.01
# A single loop over C is enough: the calibration method is fixed above. The previous inner
# `for m in methods` loop never used `m`, relied on `methods` left over from an earlier cell,
# and simply fitted and printed each configuration twice.
for c in cValsL2:
    ccvL2 = CalibratedClassifierCV(LogisticRegression(penalty='l2', solver='newton-cg', C=c, tol=tol), method=method, cv=cv)
    ccvL2.fit(mini_train_data, mini_train_labels)
    # Call get_params() so the parameter dict is printed, not the bound-method repr.
    print(ccvL2.get_params())
    ccvL2_prediction_probabilities = ccvL2.predict_proba(mini_dev_data)
    ccvL2_predictions = ccvL2.predict(mini_dev_data)
    # The probability columns follow ccvL2.classes_, so those are the labels log_loss must
    # be given; the set of labels seen in the dev slice may lack a class the model knows.
    print("L2 Multi-class Log Loss:", log_loss(y_true = mini_dev_labels, y_pred = ccvL2_prediction_probabilities, labels = ccvL2.classes_), "\n\n")
    print()
In [ ]:
In [ ]:
# Feature names used as the row index of the coefficient table.
# NOTE(review): assumed to list the feature columns in the same order as the training
# matrix the model was fitted on — confirm against the data file.
columns = ['hour_of_day','dayofweek',
           'x','y','bayview','ingleside','northern',
           'central','mission','southern','tenderloin',
           'park','richmond','taraval','HOURLYDRYBULBTEMPF',
           'HOURLYRelativeHumidity','HOURLYWindSpeed',
           'HOURLYSeaLevelPressure','HOURLYVISIBILITY',
           'Daylight']

# Build a features x classes table of the L2 model's coefficients.
# NOTE(review): `bestL2` is not defined in the visible cells; it is assumed to be a fitted
# linear classifier exposing `coef_` and `classes_` — confirm where it is created.
# Each coef_ row is paired with the model's own classes_ entry so the column headers are
# guaranteed to name the class that row belongs to.
allCoefsL2 = pd.DataFrame(index=columns)
for label, coefs in zip(bestL2.classes_, bestL2.coef_):
    allCoefsL2[label] = coefs
allCoefsL2
In [ ]:
# Grouped bar chart of the L2 coefficient table (one group per feature, one bar per class).
# The frame is drawn onto an explicitly created Axes; calling plt.figure() and then
# DataFrame.plot() without `ax` leaves an extra, empty figure in the output.
f, ax = plt.subplots(figsize=(15,8))
allCoefsL2.plot(kind='bar', ax=ax)
ax.set(title='L2 logistic regression coefficients', xlabel='feature', ylabel='coefficient')
# Park the legend outside the plot area, to the right of the axes.
ax.legend(loc='center left', bbox_to_anchor=(1.0,0.5))
plt.show()