In [83]:
# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV
# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)
CREATE TABLE kaggle_sf_crime (
dates TIMESTAMP,
category VARCHAR,
descript VARCHAR,
dayofweek VARCHAR,
pd_district VARCHAR,
resolution VARCHAR,
addr VARCHAR,
X FLOAT,
Y FLOAT);
\copy kaggle_sf_crime FROM '/Users/Goodgame/Desktop/MIDS/207/final/sf_crime_train.csv' DELIMITER ',' CSV HEADER;
SELECT
category,
date_part('hour', dates) AS hour_of_day,
CASE
WHEN dayofweek = 'Monday' then 1
WHEN dayofweek = 'Tuesday' THEN 2
WHEN dayofweek = 'Wednesday' THEN 3
WHEN dayofweek = 'Thursday' THEN 4
WHEN dayofweek = 'Friday' THEN 5
WHEN dayofweek = 'Saturday' THEN 6
WHEN dayofweek = 'Sunday' THEN 7
END AS dayofweek_numeric,
X,
Y,
CASE
WHEN pd_district = 'BAYVIEW' THEN 1
ELSE 0
END AS bayview_binary,
CASE
WHEN pd_district = 'INGLESIDE' THEN 1
ELSE 0
END AS ingleside_binary,
CASE
WHEN pd_district = 'NORTHERN' THEN 1
ELSE 0
END AS northern_binary,
CASE
WHEN pd_district = 'CENTRAL' THEN 1
ELSE 0
END AS central_binary,
CASE
WHEN pd_district = 'BAYVIEW' THEN 1
ELSE 0
END AS pd_bayview_binary,
CASE
WHEN pd_district = 'MISSION' THEN 1
ELSE 0
END AS mission_binary,
CASE
WHEN pd_district = 'SOUTHERN' THEN 1
ELSE 0
END AS southern_binary,
CASE
WHEN pd_district = 'TENDERLOIN' THEN 1
ELSE 0
END AS tenderloin_binary,
CASE
WHEN pd_district = 'PARK' THEN 1
ELSE 0
END AS park_binary,
CASE
WHEN pd_district = 'RICHMOND' THEN 1
ELSE 0
END AS richmond_binary,
CASE
WHEN pd_district = 'TARAVAL' THEN 1
ELSE 0
END AS taraval_binary
FROM kaggle_sf_crime;
We seek to add features to our models that will improve performance with respect to out desired performance metric. There is evidence that there is a correlation between weather patterns and crime, with some experts even arguing for a causal relationship between weather and crime [1]. More specifically, a 2013 paper published in Science showed that higher temperatures and extreme rainfall led to large increases in conflict. In the setting of strong evidence that weather influences crime, we see it as a candidate for additional features to improve the performance of our classifiers. Weather data was gathered from (insert source). Certain features from this data set were incorporated into the original crime data set in order to add features that were hypothesizzed to improve performance. These features included (insert what we eventually include).
In [ ]:
#data_path = "./data/train_transformed.csv"
#df = pd.read_csv(data_path, header=0)
#x_data = df.drop('category', 1)
#y = df.category.as_matrix()
########## Adding the date back into the data
#import csv
#import time
#import calendar
#data_path = "./data/train.csv"
#dataCSV = open(data_path, 'rt')
#csvData = list(csv.reader(dataCSV))
#csvFields = csvData[0] #['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#allData = csvData[1:]
#dataCSV.close()
#df2 = pd.DataFrame(allData)
#df2.columns = csvFields
#dates = df2['Dates']
#dates = dates.apply(time.strptime, args=("%Y-%m-%d %H:%M:%S",))
#dates = dates.apply(calendar.timegm)
#print(dates.head())
#x_data['secondsFromEpoch'] = dates
#colnames = x_data.columns.tolist()
#colnames = colnames[-1:] + colnames[:-1]
#x_data = x_data[colnames]
##########
########## Adding the weather data into the original crime data
#weatherData1 = "./data/1027175.csv"
#weatherData2 = "./data/1027176.csv"
#dataCSV = open(weatherData1, 'rt')
#csvData = list(csv.reader(dataCSV))
#csvFields = csvData[0] #['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#allWeatherData1 = csvData[1:]
#dataCSV.close()
#dataCSV = open(weatherData2, 'rt')
#csvData = list(csv.reader(dataCSV))
#csvFields = csvData[0] #['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#allWeatherData2 = csvData[1:]
#dataCSV.close()
#weatherDF1 = pd.DataFrame(allWeatherData1)
#weatherDF1.columns = csvFields
#dates1 = weatherDF1['DATE']
#sunrise1 = weatherDF1['DAILYSunrise']
#sunset1 = weatherDF1['DAILYSunset']
#weatherDF2 = pd.DataFrame(allWeatherData2)
#weatherDF2.columns = csvFields
#dates2 = weatherDF2['DATE']
#sunrise2 = weatherDF2['DAILYSunrise']
#sunset2 = weatherDF2['DAILYSunset']
#functions for processing the sunrise and sunset times of each day
#def get_hour_and_minute(milTime):
# hour = int(milTime[:-2])
# minute = int(milTime[-2:])
# return [hour, minute]
#def get_date_only(date):
# return time.struct_time(tuple([date[0], date[1], date[2], 0, 0, 0, date[6], date[7], date[8]]))
#def structure_sun_time(timeSeries, dateSeries):
# sunTimes = timeSeries.copy()
# for index in range(len(dateSeries)):
# sunTimes[index] = time.struct_time(tuple([dateSeries[index][0], dateSeries[index][1], dateSeries[index][2], timeSeries[index][0], timeSeries[index][1], dateSeries[index][5], dateSeries[index][6], dateSeries[index][7], dateSeries[index][8]]))
# return sunTimes
#dates1 = dates1.apply(time.strptime, args=("%Y-%m-%d %H:%M",))
#sunrise1 = sunrise1.apply(get_hour_and_minute)
#sunrise1 = structure_sun_time(sunrise1, dates1)
#sunrise1 = sunrise1.apply(calendar.timegm)
#sunset1 = sunset1.apply(get_hour_and_minute)
#sunset1 = structure_sun_time(sunset1, dates1)
#sunset1 = sunset1.apply(calendar.timegm)
#dates1 = dates1.apply(calendar.timegm)
#dates2 = dates2.apply(time.strptime, args=("%Y-%m-%d %H:%M",))
#sunrise2 = sunrise2.apply(get_hour_and_minute)
#sunrise2 = structure_sun_time(sunrise2, dates2)
#sunrise2 = sunrise2.apply(calendar.timegm)
#sunset2 = sunset2.apply(get_hour_and_minute)
#sunset2 = structure_sun_time(sunset2, dates2)
#sunset2 = sunset2.apply(calendar.timegm)
#dates2 = dates2.apply(calendar.timegm)
#weatherDF1['DATE'] = dates1
#weatherDF1['DAILYSunrise'] = sunrise1
#weatherDF1['DAILYSunset'] = sunset1
#weatherDF2['DATE'] = dates2
#weatherDF2['DAILYSunrise'] = sunrise2
#weatherDF2['DAILYSunset'] = sunset2
#weatherDF = pd.concat([weatherDF1,weatherDF2[32:]],ignore_index=True)
# Starting off with some of the easier features to work with-- more to come here . . . still in beta
#weatherMetrics = weatherDF[['DATE','HOURLYDRYBULBTEMPF','HOURLYRelativeHumidity', 'HOURLYWindSpeed', \
# 'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY', 'DAILYSunrise', 'DAILYSunset']]
#weatherMetrics = weatherMetrics.convert_objects(convert_numeric=True)
#weatherDates = weatherMetrics['DATE']
#'DATE','HOURLYDRYBULBTEMPF','HOURLYRelativeHumidity', 'HOURLYWindSpeed',
#'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY'
#timeWindow = 10800 #3 hours
#hourlyDryBulbTemp = []
#hourlyRelativeHumidity = []
#hourlyWindSpeed = []
#hourlySeaLevelPressure = []
#hourlyVisibility = []
#dailySunrise = []
#dailySunset = []
#daylight = []
#test = 0
#for timePoint in dates:#dates is the epoch time from the kaggle data
# relevantWeather = weatherMetrics[(weatherDates <= timePoint) & (weatherDates > timePoint - timeWindow)]
# hourlyDryBulbTemp.append(relevantWeather['HOURLYDRYBULBTEMPF'].mean())
# hourlyRelativeHumidity.append(relevantWeather['HOURLYRelativeHumidity'].mean())
# hourlyWindSpeed.append(relevantWeather['HOURLYWindSpeed'].mean())
# hourlySeaLevelPressure.append(relevantWeather['HOURLYSeaLevelPressure'].mean())
# hourlyVisibility.append(relevantWeather['HOURLYVISIBILITY'].mean())
# dailySunrise.append(relevantWeather['DAILYSunrise'].iloc[-1])
# dailySunset.append(relevantWeather['DAILYSunset'].iloc[-1])
# daylight.append(1.0*((timePoint >= relevantWeather['DAILYSunrise'].iloc[-1]) and (timePoint < relevantWeather['DAILYSunset'].iloc[-1])))
#if timePoint < relevantWeather['DAILYSunset'][-1]:
#daylight.append(1)
#else:
#daylight.append(0)
# if test%100000 == 0:
# print(relevantWeather)
# test += 1
#hourlyDryBulbTemp = pd.Series.from_array(np.array(hourlyDryBulbTemp))
#hourlyRelativeHumidity = pd.Series.from_array(np.array(hourlyRelativeHumidity))
#hourlyWindSpeed = pd.Series.from_array(np.array(hourlyWindSpeed))
#hourlySeaLevelPressure = pd.Series.from_array(np.array(hourlySeaLevelPressure))
#hourlyVisibility = pd.Series.from_array(np.array(hourlyVisibility))
#dailySunrise = pd.Series.from_array(np.array(dailySunrise))
#dailySunset = pd.Series.from_array(np.array(dailySunset))
#daylight = pd.Series.from_array(np.array(daylight))
#x_data['HOURLYDRYBULBTEMPF'] = hourlyDryBulbTemp
#x_data['HOURLYRelativeHumidity'] = hourlyRelativeHumidity
#x_data['HOURLYWindSpeed'] = hourlyWindSpeed
#x_data['HOURLYSeaLevelPressure'] = hourlySeaLevelPressure
#x_data['HOURLYVISIBILITY'] = hourlyVisibility
#x_data['DAILYSunrise'] = dailySunrise
#x_data['DAILYSunset'] = dailySunset
#x_data['Daylight'] = daylight
#x_data.to_csv(path_or_buf="C:/MIDS/W207 final project/x_data.csv")
##########
# Impute missing values with mean values:
#x_complete = x_data.fillna(x_data.mean())
#X_raw = x_complete.as_matrix()
# Scale the data between 0 and 1:
#X = MinMaxScaler().fit_transform(X_raw)
# Shuffle data to remove any underlying pattern that may exist:
#shuffle = np.random.permutation(np.arange(X.shape[0]))
#X, y = X[shuffle], y[shuffle]
# Separate training, dev, and test data:
#test_data, test_labels = X[800000:], y[800000:]
#dev_data, dev_labels = X[700000:800000], y[700000:800000]
#train_data, train_labels = X[:700000], y[:700000]
#mini_train_data, mini_train_labels = X[:75000], y[:75000]
#mini_dev_data, mini_dev_labels = X[75000:100000], y[75000:100000]
#labels_set = set(mini_dev_labels)
#print(labels_set)
#print(len(labels_set))
#print(train_data[:10])
In [85]:
# Data path to your local copy of Sam's "train_transformed.csv", which was produced by ?separate Python script?
data_path_for_labels_only = "/Users/Bryan/Desktop/UC_Berkeley_MIDS_files/Courses/W207_Intro_To_Machine_Learning/Final_Project/sf_crime-master/data/train_transformed.csv"
df = pd.read_csv(data_path_for_labels_only, header=0)
y = df.category.as_matrix()
# Data path to your local copy of Kalvin's "x_data.csv", which was produced by the negated cell above
data_path = "/Users/Bryan/Desktop/UC_Berkeley_MIDS_files/Courses/W207_Intro_To_Machine_Learning/Final_Project/x_data_08_15.csv"
df = pd.read_csv(data_path, header=0)
# Impute missing values with mean values:
x_complete = df.fillna(df.mean())
X_raw = x_complete.as_matrix()
# Scale the data between 0 and 1:
X = MinMaxScaler().fit_transform(X_raw)
# Shuffle data to remove any underlying pattern that may exist:
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]
# Separate training, dev, and test data:
test_data, test_labels = X[800000:], y[800000:]
dev_data, dev_labels = X[700000:800000], y[700000:800000]
train_data, train_labels = X[:700000], y[:700000]
mini_train_data, mini_train_labels = X[:5000], y[:5000]
mini_dev_data, mini_dev_labels = X[5000:7000], y[5000:7000]
# Create list of the crime type labels. This will act as the "labels" parameter for the log loss functions that follow
crime_labels = ['DRUG/NARCOTIC', 'RUNAWAY', 'DRUNKENNESS', 'LOITERING', 'STOLEN PROPERTY', 'MISSING PERSON', 'ARSON', 'FRAUD', 'SEX OFFENSES NON FORCIBLE', 'NON-CRIMINAL', 'RECOVERED VEHICLE', 'WEAPON LAWS', 'ASSAULT', 'TRESPASS', 'GAMBLING', 'SUSPICIOUS OCC', 'BAD CHECKS', 'VANDALISM', 'FAMILY OFFENSES', 'DRIVING UNDER THE INFLUENCE', 'PROSTITUTION', 'WARRANTS', 'SEX OFFENSES FORCIBLE', 'DISORDERLY CONDUCT', 'LIQUOR LAWS', 'FORGERY/COUNTERFEITING', 'ROBBERY', 'OTHER OFFENSES', 'EXTORTION', 'VEHICLE THEFT', 'SUICIDE', 'LARCENY/THEFT', 'BRIBERY', 'EMBEZZLEMENT', 'SECONDARY CODES', 'KIDNAPPING', 'BURGLARY']
print(len(train_data),len(train_labels))
print(len(dev_data),len(dev_labels))
print(len(mini_train_data),len(mini_train_labels))
print(len(mini_dev_data),len(mini_dev_labels))
In [ ]:
### Read in zip code data
#data_path_zip = "./data/2016_zips.csv"
#zips = pd.read_csv(data_path_zip, header=0, sep ='\t', usecols = [0,5,6], names = ["GEOID", "INTPTLAT", "INTPTLONG"], dtype ={'GEOID': int, 'INTPTLAT': float, 'INTPTLONG': float})
#sf_zips = zips[(zips['GEOID'] > 94000) & (zips['GEOID'] < 94189)]
### Mapping longitude/latitude to zipcodes
#def dist(lat1, long1, lat2, long2):
# return np.sqrt((lat1-lat2)**2+(long1-long2)**2)
# return abs(lat1-lat2)+abs(long1-long2)
#def find_zipcode(lat, long):
# distances = sf_zips.apply(lambda row: dist(lat, long, row["INTPTLAT"], row["INTPTLONG"]), axis=1)
# return sf_zips.loc[distances.idxmin(), "GEOID"]
#x_data['zipcode'] = 0
#for i in range(0, 1):
# x_data['zipcode'][i] = x_data.apply(lambda row: find_zipcode(row['x'], row['y']), axis=1)
#x_data['zipcode']= x_data.apply(lambda row: find_zipcode(row['x'], row['y']), axis=1)
### Read in school data
#data_path_schools = "./data/pubschls.csv"
#schools = pd.read_csv(data_path_schools,header=0, sep ='\t', usecols = ["CDSCode","StatusType", "School", "EILCode", "EILName", "Zip", "Latitude", "Longitude"], dtype ={'CDSCode': str, 'StatusType': str, 'School': str, 'EILCode': str,'EILName': str,'Zip': str, 'Latitude': float, 'Longitude': float})
#schools = schools[(schools["StatusType"] == 'Active')]
### Find the closest school
#def dist(lat1, long1, lat2, long2):
# return np.sqrt((lat1-lat2)**2+(long1-long2)**2)
#def find_closest_school(lat, long):
# distances = schools.apply(lambda row: dist(lat, long, row["Latitude"], row["Longitude"]), axis=1)
# return min(distances)
#x_data['closest_school'] = x_data_sub.apply(lambda row: find_closest_school(row['y'], row['x']), axis=1)
In [86]:
# The Kaggle submission format requires listing the ID of each example.
# This is to remember the order of the IDs after shuffling
#allIDs = np.array(list(df.axes[0]))
#allIDs = allIDs[shuffle]
#testIDs = allIDs[800000:]
#devIDs = allIDs[700000:800000]
#trainIDs = allIDs[:700000]
# Extract the column names for the required submission format
#sampleSubmission_path = "./data/sampleSubmission.csv"
#sampleDF = pd.read_csv(sampleSubmission_path)
#allColumns = list(sampleDF.columns)
#featureColumns = allColumns[1:]
# Extracting the test data for a baseline submission
#real_test_path = "./data/test_transformed.csv"
#testDF = pd.read_csv(real_test_path, header=0)
#real_test_data = testDF
#test_complete = real_test_data.fillna(real_test_data.mean())
#Test_raw = test_complete.as_matrix()
#TestData = MinMaxScaler().fit_transform(Test_raw)
# Here we remember the ID of each test data point, in case we ever decide to shuffle the test data for some reason
#testIDs = list(testDF.axes[0])
In [87]:
# Generate a baseline MNB classifier and make it return prediction probabilities for the actual test data
#def MNB():
# mnb = MultinomialNB(alpha = 0.0000001)
# mnb.fit(train_data, train_labels)
# print("\n\nMultinomialNB accuracy on dev data:", mnb.score(dev_data, dev_labels))
# return mnb.predict_proba(dev_data)
#MNB()
#baselinePredictionProbabilities = MNB()
# Place the resulting prediction probabilities in a .csv file in the required format
# First, turn the prediction probabilties into a data frame
#resultDF = pd.DataFrame(baselinePredictionProbabilities,columns=featureColumns)
# Add the IDs as a final column
#resultDF.loc[:,'Id'] = pd.Series(testIDs,index=resultDF.index)
# Make the 'Id' column the first column
#colnames = resultDF.columns.tolist()
#colnames = colnames[-1:] + colnames[:-1]
#resultDF = resultDF[colnames]
# Output to a .csv file
# resultDF.to_csv('result.csv',index=False)
Note: the code above will shuffle data differently every time it's run, so model accuracies will vary accordingly.
In [26]:
## Data sub-setting quality check-point
print(train_data[:1])
print(train_labels[:1])
In [27]:
# Modeling quality check-point with MNB--fast model
def MNB():
mnb = MultinomialNB(alpha = 0.0000001)
mnb.fit(train_data, train_labels)
print("\n\nMultinomialNB accuracy on dev data:", mnb.score(dev_data, dev_labels))
MNB()
As determined by the Kaggle submission guidelines, the performance criteria metric for the San Francisco Crime Classification competition is Multi-class Logarithmic Loss (also known as cross-entropy). There are various other performance metrics that are appropriate for different domains: accuracy, F-score, Lift, ROC Area, average precision, precision/recall break-even point, and squared error.
(Describe each performance metric and a domain in which it is preferred. Give Pros/Cons if able)
Multi-class Log Loss:
Accuracy:
F-score:
Lift:
ROC Area:
Average precision
Precision/Recall break-even point:
Squared-error:
In [71]:
def model_prototype(train_data, train_labels, eval_data, eval_labels):
knn = KNeighborsClassifier(n_neighbors=5).fit(train_data, train_labels)
bnb = BernoulliNB(alpha=1, binarize = 0.5).fit(train_data, train_labels)
mnb = MultinomialNB().fit(train_data, train_labels)
log_reg = LogisticRegression().fit(train_data, train_labels)
neural_net = MLPClassifier().fit(train_data, train_labels)
random_forest = RandomForestClassifier().fit(train_data, train_labels)
decision_tree = DecisionTreeClassifier().fit(train_data, train_labels)
support_vm_step_one = svm.SVC(probability = True)
support_vm = support_vm_step_one.fit(train_data, train_labels)
models = [knn, bnb, mnb, log_reg, neural_net, random_forest, decision_tree, support_vm]
for model in models:
eval_prediction_probabilities = model.predict_proba(eval_data)
eval_predictions = model.predict(eval_data)
print(model, "Multi-class Log Loss:", log_loss(y_true = eval_labels, y_pred = eval_prediction_probabilities, labels = crime_labels), "\n\n")
model_prototype(mini_train_data, mini_train_labels, mini_dev_data, mini_dev_labels)
Here we seek to optimize the performance of our classifiers in a three-step, dynamnic engineering process.
We previously added components from the weather data into the original SF crime data as new features. We will not repeat work done in our initial submission, where our training dataset did not include these features. For comparision with respoect to how the added features improved our performance with respect to log loss, please refer back to our initial submission.
We can have Kalvin expand on exactly what he did here.
Each classifier has parameters that we can engineer to further optimize performance, as opposed to using the default parameter values as we did above in the model prototyping cell. This will be specific to each classifier as detailed below.
We can calibrate the models via Platt Scaling or Isotonic Regression to attempt to improve their performance.
Platt Scaling: (brief explanation of how it works)
Isotonic Regression: ((brief explanation of how it works))
In [82]:
list_for_ks = []
list_for_ws = []
list_for_ps = []
list_for_log_loss = []
def k_neighbors_tuned(k,w,p):
tuned_KNN = KNeighborsClassifier(n_neighbors=k, weights=w, p=p).fit(mini_train_data, mini_train_labels)
dev_prediction_probabilities = tuned_KNN.predict_proba(mini_dev_data)
list_for_ks.append(this_k)
list_for_ws.append(this_w)
list_for_ps.append(this_p)
working_log_loss = log_loss(y_true = mini_dev_labels, y_pred = dev_prediction_probabilities, labels = crime_labels)
list_for_log_loss.append(working_log_loss)
#print("Multi-class Log Loss with KNN and k,w,p =", k,",",w,",", p, "is:", working_log_loss)
k_value_tuning = [i for i in range(1,1000,3)]
weight_tuning = ['uniform', 'distance']
power_parameter_tuning = [1,2]
start = time.clock()
for this_k in k_value_tuning:
for this_w in weight_tuning:
for this_p in power_parameter_tuning:
k_neighbors_tuned(this_k, this_w, this_p)
index_best_logloss = np.argmin(list_for_log_loss)
print('For KNN the best log loss with hyperparameter tuning is',list_for_log_loss[index_best_logloss], 'with k =', list_for_ks[index_best_logloss], 'w =', list_for_ws[index_best_logloss], 'p =', list_for_ps[index_best_logloss])
end = time.clock()
print("Computation time for this step is %.2f" % (end-start), 'seconds')
We will consider embeding this step within the for loop for the hyperparameter tuning. More likely we will pipeline it along with the hyperparameter tuning steps. We will then use GridSearchCV top find the optimized parameters based on our performance metric of Mutli-Class Log Loss.
In [ ]:
# def k_calibrated():
# Here we will calibrate the KNN classifier with both Platt Scaling and with Isotonic Regression using CalibratedClassifierCV
# with various parameter settings. The "method" parameter can be set to "sigmoid" or to "isotonic",
# corresponding to Platt Scaling and to Isotonic Regression respectively.
methods = ['signoid', 'isotonic']
for m in methods:
CCV_for_KNN = CalibratedClassifierCV()
# Will likely embed this step within the for loop for the hyperparameter tuning as that makes more sense. #
# Or will pipeline it along with the hyperparameter tuning steps #
# k_calibrated()
In [ ]:
def GNB():
gnb = GaussianNB()
gnb.fit(train_data, train_labels)
print("GaussianNB accuracy on dev data:",
gnb.score(dev_data, dev_labels))
# Gaussian Naive Bayes requires the data to have a relative normal distribution. Sometimes
# adding noise can improve performance by making the data more normal:
train_data_noise = np.random.rand(train_data.shape[0],train_data.shape[1])
modified_train_data = np.multiply(train_data,train_data_noise)
gnb_noise = GaussianNB()
gnb.fit(modified_train_data, train_labels)
print("GaussianNB accuracy with added noise:",
gnb.score(dev_data, dev_labels))
# Going slightly deeper with hyperparameter tuning and model calibration:
def BNB(alphas):
bnb_one = BernoulliNB(binarize = 0.5)
bnb_one.fit(train_data, train_labels)
print("\n\nBernoulli Naive Bayes accuracy when alpha = 1 (the default value):",
bnb_one.score(dev_data, dev_labels))
bnb_zero = BernoulliNB(binarize = 0.5, alpha=0)
bnb_zero.fit(train_data, train_labels)
print("BNB accuracy when alpha = 0:", bnb_zero.score(dev_data, dev_labels))
bnb = BernoulliNB(binarize=0.5)
clf = GridSearchCV(bnb, param_grid = alphas)
clf.fit(train_data, train_labels)
print("Best parameter for BNB on the dev data:", clf.best_params_)
clf_tuned = BernoulliNB(binarize = 0.5, alpha=0.00000000000000000000001)
clf_tuned.fit(train_data, train_labels)
print("Accuracy using the tuned Laplace smoothing parameter:",
clf_tuned.score(dev_data, dev_labels), "\n\n")
def investigate_model_calibration(buckets, correct, total):
clf_tuned = BernoulliNB(binarize = 0.5, alpha=0.00000000000000000000001)
clf_tuned.fit(train_data, train_labels)
# Establish data sets
pred_probs = clf_tuned.predict_proba(dev_data)
max_pred_probs = np.array(pred_probs.max(axis=1))
preds = clf_tuned.predict(dev_data)
# For each bucket, look at the predictions that the model yields.
# Keep track of total & correct predictions within each bucket.
bucket_bottom = 0
bucket_top = 0
for bucket_index, bucket in enumerate(buckets):
bucket_top = bucket
for pred_index, pred in enumerate(preds):
if (max_pred_probs[pred_index] <= bucket_top) and (max_pred_probs[pred_index] > bucket_bottom):
total[bucket_index] += 1
if preds[pred_index] == dev_labels[pred_index]:
correct[bucket_index] += 1
bucket_bottom = bucket_top
def MNB():
mnb = MultinomialNB(alpha = 0.0000001)
mnb.fit(train_data, train_labels)
print("\n\nMultinomialNB accuracy on dev data:", mnb.score(dev_data, dev_labels))
alphas = {'alpha': [0.00000000000000000000001, 0.0000001, 0.0001, 0.001,
0.01, 0.1, 0.0, 0.5, 1.0, 2.0, 10.0]}
buckets = [0.5, 0.9, 0.99, 0.999, .9999, 0.99999, 1.0]
correct = [0 for i in buckets]
total = [0 for i in buckets]
MNB()
GNB()
BNB(alphas)
investigate_model_calibration(buckets, correct, total)
for i in range(len(buckets)):
accuracy = 0.0
if (total[i] > 0): accuracy = correct[i] / total[i]
print('p(pred) <= %.13f total = %3d accuracy = %.3f' %(buckets[i], total[i], accuracy))
The Bernoulli Naive Bayes and Multinomial Naive Bayes models can predict whether a loan will be good or bad with XXX% accuracy.
We will prune the work above. Will seek to optimize the alpha parameter (Laplace smoothing parameter) for MNB and BNB classifiers.
Here we will calibrate the MNB, BNB and GNB classifiers with both Platt Scaling and with Isotonic Regression using CalibratedClassifierCV with various parameter settings. The "method" parameter can be set to "sigmoid" or to "isotonic", corresponding to Platt Scaling and to Isotonic Regression respectively. Will likely embed this step within the for loop for the hyperparameter tuning as that makes more sense. Or will pipeline it along with the hyperparameter tuning steps. We will then use GridSearchCV top find the optimized parameters based on our performance metric of Mutli-Class Log Loss.
THE REST OF THE MODEL CALIBRATION SECTIONS ARE SIMILAR AND THE OUTLINE WILL NOT BE REPEATED AS WOULD BE REDUNDANT.
For the Decision Tree classifier, we can seek to optimize the following classifier parameters: min_samples_leaf (the minimum number of samples required to be at a leaf node), max_depth
From readings, setting min_samples_leaf to approximately 1% of the data points can stop the tree from inappropriately classifying outliers, which can help to improve accuracy (unsure if significantly improves MCLL).
See above
For the SVM classifier, we can seek to optimize the following classifier parameters: C (penalty parameter C of the error term), kernel ('linear', 'poly', 'rbf', sigmoid', or 'precomputed')
See source [2] for parameter optimization in SVM
See above
In [91]:
### All the work from Sarah's notebook:
import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
print (theano.config.device) # We're using CPUs (for now)
print (theano.config.floatX )# Should be 64 bit for CPUs
np.random.seed(0)
from IPython.display import display, clear_output
In [92]:
numFeatures = train_data[1].size
numTrainExamples = train_data.shape[0]
numTestExamples = test_data.shape[0]
print ('Features = %d' %(numFeatures))
print ('Train set = %d' %(numTrainExamples))
print ('Test set = %d' %(numTestExamples))
class_labels = list(set(train_labels))
print(class_labels)
numClasses = len(class_labels)
In [93]:
### Binarize the class labels
def binarizeY(data):
binarized_data = np.zeros((data.size,39))
for j in range(0,data.size):
feature = data[j]
i = class_labels.index(feature)
binarized_data[j,i]=1
return binarized_data
train_labels_b = binarizeY(train_labels)
test_labels_b = binarizeY(test_labels)
numClasses = train_labels_b[1].size
print ('Classes = %d' %(numClasses))
print ('\n', train_labels_b[:5, :], '\n')
print (train_labels[:10], '\n')
In [94]:
###1) Parameters
numFeatures = train_data.shape[1]
numHiddenNodeslayer1 = 50
numHiddenNodeslayer2 = 30
w_1 = theano.shared(np.asarray((np.random.randn(*(numFeatures, numHiddenNodeslayer1))*0.01)))
w_2 = theano.shared(np.asarray((np.random.randn(*(numHiddenNodeslayer1, numHiddenNodeslayer2))*0.01)))
w_3 = theano.shared(np.asarray((np.random.randn(*(numHiddenNodeslayer2, numClasses))*0.01)))
params = [w_1, w_2, w_3]
###2) Model
X = T.matrix()
Y = T.matrix()
srng = RandomStreams()
def dropout(X, p=0.):
if p > 0:
X *= srng.binomial(X.shape, p=1 - p)
X /= 1 - p
return X
def model(X, w_1, w_2, w_3, p_1, p_2, p_3):
return T.nnet.softmax(T.dot(dropout(T.nnet.sigmoid(T.dot(dropout(T.nnet.sigmoid(T.dot(dropout(X, p_1), w_1)),p_2), w_2)),p_3),w_3))
y_hat_train = model(X, w_1, w_2, w_3, 0.2, 0.5,0.5)
y_hat_predict = model(X, w_1, w_2, w_3, 0., 0., 0.)
### (3) Cost function
cost = T.mean(T.sqr(y_hat - Y))
cost = T.mean(T.nnet.categorical_crossentropy(y_hat_train, Y))
In [95]:
### (4) Objective (and solver)
alpha = 0.01
def backprop(cost, w):
grads = T.grad(cost=cost, wrt=w)
updates = []
for wi, grad in zip(w, grads):
updates.append([wi, wi - grad * alpha])
return updates
update = backprop(cost, params)
train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
y_pred = T.argmax(y_hat_predict, axis=1)
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)
miniBatchSize = 10
def gradientDescent(epochs):
for i in range(epochs):
for start, end in zip(range(0, len(train_data), miniBatchSize), range(miniBatchSize, len(train_data), miniBatchSize)):
cc = train(train_data[start:end], train_labels_b[start:end])
clear_output(wait=True)
print ('%d) accuracy = %.4f' %(i+1, np.mean(np.argmax(test_labels_b, axis=1) == predict(test_data))) )
gradientDescent(50)
### How to decide what # to use for epochs? epochs in this case are how many rounds?
### plot costs for each of the 50 iterations and see how much it decline.. if its still very decreasing, you should
### do more iterations; otherwise if its looking like its flattening, you can stop
In [ ]:
For the Random Forest classifier, we can seek to optimize the following classifier parameters: n_estimators (the number of trees in the forsest), max_features, max_depth, min_samples_leaf, bootstrap (whether or not bootstrap samples are used when building trees), oob_score (whether or not out-of-bag samples are used to estimate the generalization accuracy)
See above
There are no major changes that we seek to make in the AdaBoostClassifier with respect to default parameter values.
We will run the AdaBoostClassifier on each different classifier from above, using the classifier settings with optimized Multi-class Log Loss after hyperparameter tuning and calibration.
For the Bagging meta classifier, we can seek to optimize the following classifier parameters: n_estimators (the number of trees in the forsest), max_samples, max_features, bootstrap (whether or not bootstrap samples are used when building trees), bootstrap_features (whether features are drawn with replacement), and oob_score (whether or not out-of-bag samples are used to estimate the generalization accuracy)
We will run the BaggingClassifier on each different classifier from above, using the classifier settings with optimized Multi-class Log Loss after hyperparameter tuning and calibration.
For the Gradient Boosting meta classifier, we can seek to optimize the following classifier parameters: n_estimators (the number of trees in the forsest), max_depth, min_samples_leaf, and max_features
We will run the GradientBoostingClassifier with loss = 'deviance' (as loss = 'exponential' uses the AdaBoost algorithm) on each different classifier from above, using the classifier settings with optimized Multi-class Log Loss after hyperparameter tuning and calibration.
In [ ]:
# Here we will likely use Pipeline and GridSearchCV in order to find the overall classifier with optimized Multi-class Log Loss.
# This will be the last step after all attempts at feature addition, hyperparameter tuning, and calibration are completed
# and the corresponding performance metrics are gathered.
1) Hsiang, Solomon M. and Burke, Marshall and Miguel, Edward. "Quantifying the Influence of Climate on Human Conflict". Science, Vol 341, Issue 6151, 2013
2) Huang, Cheng-Lung. Wang, Chieh-Jen. "A GA-based feature selection and parameters optimization for support vector machines". Expert Systems with Applications, Vol 31, 2006, p 231-240
3) More to come
In [ ]: