In [1]:
# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV
# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)
CREATE TABLE kaggle_sf_crime (
  dates TIMESTAMP,
  category VARCHAR,
  descript VARCHAR,
  dayofweek VARCHAR,
  pd_district VARCHAR,
  resolution VARCHAR,
  addr VARCHAR,
  X FLOAT,
  Y FLOAT);
\copy kaggle_sf_crime FROM '/Users/Goodgame/Desktop/MIDS/207/final/sf_crime_train.csv' DELIMITER ',' CSV HEADER;
SELECT
  category,
  date_part('hour', dates) AS hour_of_day,
  CASE
    WHEN dayofweek = 'Monday' THEN 1
    WHEN dayofweek = 'Tuesday' THEN 2
    WHEN dayofweek = 'Wednesday' THEN 3
    WHEN dayofweek = 'Thursday' THEN 4
    WHEN dayofweek = 'Friday' THEN 5
    WHEN dayofweek = 'Saturday' THEN 6
    WHEN dayofweek = 'Sunday' THEN 7
  END AS dayofweek_numeric,
  X,
  Y,
  CASE WHEN pd_district = 'BAYVIEW' THEN 1 ELSE 0 END AS bayview_binary,
  CASE WHEN pd_district = 'INGLESIDE' THEN 1 ELSE 0 END AS ingleside_binary,
  CASE WHEN pd_district = 'NORTHERN' THEN 1 ELSE 0 END AS northern_binary,
  CASE WHEN pd_district = 'CENTRAL' THEN 1 ELSE 0 END AS central_binary,
  CASE WHEN pd_district = 'MISSION' THEN 1 ELSE 0 END AS mission_binary,
  CASE WHEN pd_district = 'SOUTHERN' THEN 1 ELSE 0 END AS southern_binary,
  CASE WHEN pd_district = 'TENDERLOIN' THEN 1 ELSE 0 END AS tenderloin_binary,
  CASE WHEN pd_district = 'PARK' THEN 1 ELSE 0 END AS park_binary,
  CASE WHEN pd_district = 'RICHMOND' THEN 1 ELSE 0 END AS richmond_binary,
  CASE WHEN pd_district = 'TARAVAL' THEN 1 ELSE 0 END AS taraval_binary
FROM kaggle_sf_crime;
We seek to add features to our models that will improve performance with respect to our desired performance metric. There is evidence of a correlation between weather patterns and crime, and some experts even argue for a causal relationship between weather and crime [1]. More specifically, a 2013 paper published in Science showed that higher temperatures and extreme rainfall led to large increases in conflict. Given this evidence that weather influences crime, we see weather data as a candidate source of additional features to improve the performance of our classifiers. Weather data was gathered from (insert source). Certain features from this data set were incorporated into the original crime data set in order to add features that were hypothesized to improve performance. These features included (insert what we eventually include).
In [ ]:
#data_path = "./data/train_transformed.csv"
#df = pd.read_csv(data_path, header=0)
#x_data = df.drop('category', 1)
#y = df.category.as_matrix()
########## Adding the date back into the data
#import csv
#import time
#import calendar
#data_path = "./data/train.csv"
#dataCSV = open(data_path, 'rt')
#csvData = list(csv.reader(dataCSV))
#csvFields = csvData[0] #['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#allData = csvData[1:]
#dataCSV.close()
#df2 = pd.DataFrame(allData)
#df2.columns = csvFields
#dates = df2['Dates']
#dates = dates.apply(time.strptime, args=("%Y-%m-%d %H:%M:%S",))
#dates = dates.apply(calendar.timegm)
#print(dates.head())
#x_data['secondsFromEpoch'] = dates
#colnames = x_data.columns.tolist()
#colnames = colnames[-1:] + colnames[:-1]
#x_data = x_data[colnames]
##########
########## Adding the weather data into the original crime data
#weatherData1 = "./data/1027175.csv"
#weatherData2 = "./data/1027176.csv"
#dataCSV = open(weatherData1, 'rt')
#csvData = list(csv.reader(dataCSV))
#csvFields = csvData[0] #['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#allWeatherData1 = csvData[1:]
#dataCSV.close()
#dataCSV = open(weatherData2, 'rt')
#csvData = list(csv.reader(dataCSV))
#csvFields = csvData[0] #['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#allWeatherData2 = csvData[1:]
#dataCSV.close()
#weatherDF1 = pd.DataFrame(allWeatherData1)
#weatherDF1.columns = csvFields
#dates1 = weatherDF1['DATE']
#sunrise1 = weatherDF1['DAILYSunrise']
#sunset1 = weatherDF1['DAILYSunset']
#weatherDF2 = pd.DataFrame(allWeatherData2)
#weatherDF2.columns = csvFields
#dates2 = weatherDF2['DATE']
#sunrise2 = weatherDF2['DAILYSunrise']
#sunset2 = weatherDF2['DAILYSunset']
#functions for processing the sunrise and sunset times of each day
#def get_hour_and_minute(milTime):
# hour = int(milTime[:-2])
# minute = int(milTime[-2:])
# return [hour, minute]
#def get_date_only(date):
# return time.struct_time(tuple([date[0], date[1], date[2], 0, 0, 0, date[6], date[7], date[8]]))
#def structure_sun_time(timeSeries, dateSeries):
# sunTimes = timeSeries.copy()
# for index in range(len(dateSeries)):
# sunTimes[index] = time.struct_time(tuple([dateSeries[index][0], dateSeries[index][1], dateSeries[index][2], timeSeries[index][0], timeSeries[index][1], dateSeries[index][5], dateSeries[index][6], dateSeries[index][7], dateSeries[index][8]]))
# return sunTimes
#dates1 = dates1.apply(time.strptime, args=("%Y-%m-%d %H:%M",))
#sunrise1 = sunrise1.apply(get_hour_and_minute)
#sunrise1 = structure_sun_time(sunrise1, dates1)
#sunrise1 = sunrise1.apply(calendar.timegm)
#sunset1 = sunset1.apply(get_hour_and_minute)
#sunset1 = structure_sun_time(sunset1, dates1)
#sunset1 = sunset1.apply(calendar.timegm)
#dates1 = dates1.apply(calendar.timegm)
#dates2 = dates2.apply(time.strptime, args=("%Y-%m-%d %H:%M",))
#sunrise2 = sunrise2.apply(get_hour_and_minute)
#sunrise2 = structure_sun_time(sunrise2, dates2)
#sunrise2 = sunrise2.apply(calendar.timegm)
#sunset2 = sunset2.apply(get_hour_and_minute)
#sunset2 = structure_sun_time(sunset2, dates2)
#sunset2 = sunset2.apply(calendar.timegm)
#dates2 = dates2.apply(calendar.timegm)
#weatherDF1['DATE'] = dates1
#weatherDF1['DAILYSunrise'] = sunrise1
#weatherDF1['DAILYSunset'] = sunset1
#weatherDF2['DATE'] = dates2
#weatherDF2['DAILYSunrise'] = sunrise2
#weatherDF2['DAILYSunset'] = sunset2
#weatherDF = pd.concat([weatherDF1,weatherDF2[32:]],ignore_index=True)
# Starting off with some of the easier features to work with-- more to come here . . . still in beta
#weatherMetrics = weatherDF[['DATE','HOURLYDRYBULBTEMPF','HOURLYRelativeHumidity', 'HOURLYWindSpeed', \
# 'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY', 'DAILYSunrise', 'DAILYSunset']]
#weatherMetrics = weatherMetrics.convert_objects(convert_numeric=True)
#weatherDates = weatherMetrics['DATE']
#'DATE','HOURLYDRYBULBTEMPF','HOURLYRelativeHumidity', 'HOURLYWindSpeed',
#'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY'
#timeWindow = 10800 #3 hours
#hourlyDryBulbTemp = []
#hourlyRelativeHumidity = []
#hourlyWindSpeed = []
#hourlySeaLevelPressure = []
#hourlyVisibility = []
#dailySunrise = []
#dailySunset = []
#daylight = []
#test = 0
#for timePoint in dates:#dates is the epoch time from the kaggle data
# relevantWeather = weatherMetrics[(weatherDates <= timePoint) & (weatherDates > timePoint - timeWindow)]
# hourlyDryBulbTemp.append(relevantWeather['HOURLYDRYBULBTEMPF'].mean())
# hourlyRelativeHumidity.append(relevantWeather['HOURLYRelativeHumidity'].mean())
# hourlyWindSpeed.append(relevantWeather['HOURLYWindSpeed'].mean())
# hourlySeaLevelPressure.append(relevantWeather['HOURLYSeaLevelPressure'].mean())
# hourlyVisibility.append(relevantWeather['HOURLYVISIBILITY'].mean())
# dailySunrise.append(relevantWeather['DAILYSunrise'].iloc[-1])
# dailySunset.append(relevantWeather['DAILYSunset'].iloc[-1])
# daylight.append(1.0*((timePoint >= relevantWeather['DAILYSunrise'].iloc[-1]) and (timePoint < relevantWeather['DAILYSunset'].iloc[-1])))
# if test%100000 == 0:
# print(relevantWeather)
# test += 1
#hourlyDryBulbTemp = pd.Series.from_array(np.array(hourlyDryBulbTemp))
#hourlyRelativeHumidity = pd.Series.from_array(np.array(hourlyRelativeHumidity))
#hourlyWindSpeed = pd.Series.from_array(np.array(hourlyWindSpeed))
#hourlySeaLevelPressure = pd.Series.from_array(np.array(hourlySeaLevelPressure))
#hourlyVisibility = pd.Series.from_array(np.array(hourlyVisibility))
#dailySunrise = pd.Series.from_array(np.array(dailySunrise))
#dailySunset = pd.Series.from_array(np.array(dailySunset))
#daylight = pd.Series.from_array(np.array(daylight))
#x_data['HOURLYDRYBULBTEMPF'] = hourlyDryBulbTemp
#x_data['HOURLYRelativeHumidity'] = hourlyRelativeHumidity
#x_data['HOURLYWindSpeed'] = hourlyWindSpeed
#x_data['HOURLYSeaLevelPressure'] = hourlySeaLevelPressure
#x_data['HOURLYVISIBILITY'] = hourlyVisibility
#x_data['DAILYSunrise'] = dailySunrise
#x_data['DAILYSunset'] = dailySunset
#x_data['Daylight'] = daylight
#x_data.to_csv(path_or_buf="C:/MIDS/W207 final project/x_data.csv")
##########
# Impute missing values with mean values:
#x_complete = x_data.fillna(x_data.mean())
#X_raw = x_complete.as_matrix()
# Scale the data between 0 and 1:
#X = MinMaxScaler().fit_transform(X_raw)
# Shuffle data to remove any underlying pattern that may exist:
#shuffle = np.random.permutation(np.arange(X.shape[0]))
#X, y = X[shuffle], y[shuffle]
# Separate training, dev, and test data:
#test_data, test_labels = X[800000:], y[800000:]
#dev_data, dev_labels = X[700000:800000], y[700000:800000]
#train_data, train_labels = X[:700000], y[:700000]
#mini_train_data, mini_train_labels = X[:75000], y[:75000]
#mini_dev_data, mini_dev_labels = X[75000:100000], y[75000:100000]
#labels_set = set(mini_dev_labels)
#print(labels_set)
#print(len(labels_set))
#print(train_data[:10])
In [2]:
data_path = "/Users/sarahcha/Documents/sf_crime/data/x_data_3.csv"
df = pd.read_csv(data_path, header=0)
x_data = df.drop('category', 1)
y = df.category.as_matrix()
# Impute missing values with mean values:
x_complete = x_data.fillna(x_data.mean())
X_raw = x_complete.as_matrix()
# Scale the data between 0 and 1:
X = MinMaxScaler().fit_transform(X_raw)
# Shuffle data to remove any underlying pattern that may exist. Must re-run random seed step each time:
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]
# Because log loss requires set(y_pred) to match set(labels), we remove the extremely rare
# crime categories (TREA and PORNOGRAPHY/OBSCENE MAT) from the data for quality reasons.
X_minus_trea = X[np.where(y != 'TREA')]
y_minus_trea = y[np.where(y != 'TREA')]
X_final = X_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]
y_final = y_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]
# Separate training, dev, and test data:
test_data, test_labels = X_final[800000:], y_final[800000:]
dev_data, dev_labels = X_final[700000:800000], y_final[700000:800000]
train_data, train_labels = X_final[100000:700000], y_final[100000:700000]
calibrate_data, calibrate_labels = X_final[:100000], y_final[:100000]
# Create mini versions of the above sets
mini_train_data, mini_train_labels = X_final[:20000], y_final[:20000]
mini_calibrate_data, mini_calibrate_labels = X_final[19000:28000], y_final[19000:28000]
mini_dev_data, mini_dev_labels = X_final[49000:60000], y_final[49000:60000]
# Create list of the crime type labels. This will act as the "labels" parameter for the log loss functions that follow
crime_labels = list(set(y_final))
crime_labels_mini_train = list(set(mini_train_labels))
crime_labels_mini_dev = list(set(mini_dev_labels))
crime_labels_mini_calibrate = list(set(mini_calibrate_labels))
print(len(crime_labels), len(crime_labels_mini_train), len(crime_labels_mini_dev),len(crime_labels_mini_calibrate))
#print(len(train_data),len(train_labels))
#print(len(dev_data),len(dev_labels))
#print(len(mini_train_data),len(mini_train_labels))
#print(len(mini_dev_data),len(mini_dev_labels))
#print(len(test_data),len(test_labels))
#print(len(mini_calibrate_data),len(mini_calibrate_labels))
#print(len(calibrate_data),len(calibrate_labels))
In [8]:
len(crime_labels_mini_dev)
Out[8]:
The benefit of neural networks is that they can learn a non-linear function for classification. Relative to logistic regression, there are one or more "hidden" layers between the input and output layers.
The disadvantages, however, are that 1) the loss function is non-convex and can have more than one local minimum, so depending on where the weights initialize you can end up with very different results, and 2) the number of hyperparameters to tune is high, which requires a lot of computing power.
MLP trains using backpropagation: it updates weights with gradient descent and, for classification, minimizes the cross-entropy loss function, giving a vector of probability estimates.
For the Neural Networks MLP classifier, the following classifier parameters are subject to optimization: 1) hidden_layer_sizes, which includes the number of hidden layers as well as the number of hidden nodes; 2) activation function: 'identity', 'logistic', 'tanh', 'relu'; 3) alpha, the L2 regularization term to avoid overfitting; 4) learning_rate: 'constant', 'invscaling', 'adaptive'; 5) solver: 'lbfgs', 'sgd', 'adam'; and 6) batch_size.
Given limited computing power, we took a targeted approach to training our MLP model. We ran GridSearchCV several times over subsets of the parameters, which is not ideal given the number of parameters. We also tested the potential impact of the scikit-neuralnetwork (sknn) classifier, which can additionally incorporate dropout.
In [ ]:
## one of the iterations of GridSearchCV we ran
import itertools
hidden_layers = [2, 3, 4]
for i in hidden_layers:
    hidden_layer_sizes = [x for x in itertools.product((4, 8, 12, 16, 20), repeat=i)]
    parameters = {'activation': ('logistic', 'tanh', 'relu'),
                  'batch_size': (50, 100, 200),
                  'alpha': (0.001, 0.01, 0.1, 0.25),
                  'hidden_layer_sizes': hidden_layer_sizes}
    clf = GridSearchCV(MLPClassifier(tol=0.01), parameters, scoring="log_loss")
    clf.fit(mini_train_data, mini_train_labels)
    print("optimal parameters: ", clf.best_params_)
In [108]:
## the best parameters returned by the grid search
clf = MLPClassifier(tol=0.01, activation='tanh', hidden_layer_sizes=(20,), batch_size=10, alpha=0.01).fit(mini_train_data, mini_train_labels)
predicted = clf.predict(mini_dev_data)
predicted_prob = clf.predict_proba(mini_dev_data)
score = log_loss(mini_dev_labels, predicted_prob)
print(score)
In [107]:
## further calibration using CalibratedClassifierCV
list_for_log_loss = []
def NN_calibrated(m):
    # Fit the tuned MLP, then calibrate its probabilities on the held-out calibration set
    tuned_NN = MLPClassifier(tol=0.01, activation='tanh', hidden_layer_sizes=(20,), batch_size=10, alpha=0.001).fit(mini_train_data, mini_train_labels)
    ccv = CalibratedClassifierCV(tuned_NN, method=m, cv='prefit')
    ccv.fit(mini_calibrate_data, mini_calibrate_labels)
    ccv_prediction_probabilities = ccv.predict_proba(mini_dev_data)
    working_log_loss = log_loss(y_true=mini_dev_labels, y_pred=ccv_prediction_probabilities, labels=crime_labels_mini_dev)
    list_for_log_loss.append(working_log_loss)
    print("Multi-class Log Loss with NN is:", working_log_loss)
methods = ['sigmoid', 'isotonic']
for m in methods:
    NN_calibrated(m)
index_best_logloss = np.argmin(list_for_log_loss)
print('For the neural network the best log loss with hyperparameter tuning and calibration is', list_for_log_loss[index_best_logloss])
We take the best-performing parameters from the tuning procedures above and add dropout to illustrate its potential impact. Dropout would not have materially improved the score had we used it.
In [3]:
from sknn.mlp import Classifier, Layer
nn = Classifier(
    layers=[
        Layer("Tanh", units=20, dropout=0.25),
        Layer("Softmax")],
    batch_size=10, learning_rate=0.001)
In [4]:
clf = nn.fit(mini_train_data, mini_train_labels)
predicted = clf.predict(mini_dev_data)
predicted_prob = clf.predict_proba(mini_dev_data)
score = log_loss(mini_dev_labels, predicted_prob)
print(score)
In [50]:
## We attempted to run a more comprehensive hyperparameter tuning procedure, including the number of
## iterations and ranges for the dropout rate, but it was too computationally intensive: even on a
## remote AWS instance running pyspark it kept stalling out.
from sklearn.grid_search import GridSearchCV
# Note: this grid assumes a network with three hidden layers (hidden0 through hidden2).
gs = GridSearchCV(nn, param_grid={
    'learning_rate': [0.05, 0.01, 0.005, 0.001],
    'n_iter': [25, 50, 100, 200],
    'hidden0__units': [4, 8, 12, 16, 20],
    'hidden0__type': ["Rectifier", "Sigmoid", "Tanh"],
    'hidden0__dropout': [0.2, 0.3, 0.4],
    'hidden1__units': [4, 8, 12, 16, 20],
    'hidden1__type': ["Rectifier", "Sigmoid", "Tanh"],
    'hidden1__dropout': [0.2, 0.3, 0.4],
    'hidden2__units': [4, 8, 12, 16, 20],
    'hidden2__type': ["Rectifier", "Sigmoid", "Tanh"],
    'hidden2__dropout': [0.2, 0.3, 0.4]})
gs.fit(mini_train_data, mini_train_labels)
predicted = gs.predict(mini_dev_data)
predicted_prob = gs.predict_proba(mini_dev_data)
score = log_loss(mini_dev_labels, predicted_prob)
print(score)
For the SVM classifier, we can seek to optimize the following classifier parameters: C (penalty parameter of the error term) and kernel ('linear', 'poly', 'rbf', 'sigmoid', or 'precomputed').
See source [2] for parameter optimization in SVM
See above
In [ ]:
from sklearn import svm
from sklearn.metrics import log_loss
class_labels = list(set(train_labels))
# Map each string crime label to its integer index in class_labels
def enumerateY(data):
    enumerated_data = np.zeros(data.size)
    for j in range(0, data.size):
        feature = data[j]
        i = class_labels.index(feature)
        enumerated_data[j] += i
    return np.array(enumerated_data)
train_labels_enum = enumerateY(mini_train_labels)
dev_labels_enum = enumerateY(mini_dev_labels)
In [ ]:
## GridSearchCV to find optimal parameters
parameters = {'C': (0.001, 0.01, 0.1, 0.5, 1, 5, 10, 25, 100),
              'decision_function_shape': ('ovo', 'ovr'),
              'kernel': ('linear', 'sigmoid', 'rbf', 'poly')}
clf = GridSearchCV(svm.SVC(tol=0.01, probability=True), parameters, scoring="log_loss")
clf.fit(mini_train_data, mini_train_labels)
print("optimal parameters:", clf.best_params_)
In [ ]:
## the most optimal parameters returned a log loss of 2.62651014795
clf_svc = svm.SVC(kernel='rbf', C=100, decision_function_shape='ovr', probability=True).fit(mini_train_data, train_labels_enum)
predicted = clf_svc.predict_proba(mini_dev_data)
print(log_loss(y_true=dev_labels_enum, y_pred=predicted, labels=dev_labels_enum))
For the Random Forest classifier, we can seek to optimize the following classifier parameters: n_estimators (the number of trees in the forest), max_features, max_depth, min_samples_leaf, bootstrap (whether or not bootstrap samples are used when building trees), and oob_score (whether or not out-of-bag samples are used to estimate the generalization accuracy).
See above
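A sketch of what such a Random Forest search could look like, following the same GridSearchCV pattern used for the MLP above (the candidate values here are illustrative assumptions, not tuned settings):
In [ ]:
## Sketch only: Random Forest hyperparameter search in the style of the MLP search above.
## The candidate values are illustrative assumptions, not tuned results.
rf_parameters = {'n_estimators': (10, 50, 100),
                 'max_features': ('sqrt', 'log2'),
                 'max_depth': (10, 20, None),
                 'min_samples_leaf': (1, 5, 10),
                 'bootstrap': (True, False)}
rf_clf = GridSearchCV(RandomForestClassifier(), rf_parameters, scoring="log_loss")
rf_clf.fit(mini_train_data, mini_train_labels)
print("optimal parameters:", rf_clf.best_params_)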
There are no major changes that we seek to make in the AdaBoostClassifier with respect to default parameter values.
We will run the AdaBoostClassifier on each different classifier from above, using the classifier settings with optimized Multi-class Log Loss after hyperparameter tuning and calibration.
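A minimal sketch of that step, assuming a shallow decision tree as the (hypothetically tuned) base estimator; the settings shown are placeholders, not our tuned values:
In [ ]:
## Sketch: AdaBoost around a placeholder base classifier, scored with multi-class log loss
ada_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10), n_estimators=50)
ada_clf.fit(mini_train_data, mini_train_labels)
ada_probabilities = ada_clf.predict_proba(mini_dev_data)
print(log_loss(y_true=mini_dev_labels, y_pred=ada_probabilities, labels=crime_labels_mini_dev))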
For the Bagging meta classifier, we can seek to optimize the following classifier parameters: n_estimators (the number of base estimators in the ensemble), max_samples, max_features, bootstrap (whether or not samples are drawn with replacement), bootstrap_features (whether features are drawn with replacement), and oob_score (whether or not out-of-bag samples are used to estimate the generalization accuracy).
We will run the BaggingClassifier on each different classifier from above, using the classifier settings with optimized Multi-class Log Loss after hyperparameter tuning and calibration.
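A comparable sketch for bagging, again with a decision tree base estimator and illustrative (untuned) parameter values:
In [ ]:
## Sketch: BaggingClassifier around a placeholder base classifier, with illustrative parameter values
bag_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=20,
                            max_samples=0.5, max_features=0.8, bootstrap=True)
bag_clf.fit(mini_train_data, mini_train_labels)
bag_probabilities = bag_clf.predict_proba(mini_dev_data)
print(log_loss(y_true=mini_dev_labels, y_pred=bag_probabilities, labels=crime_labels_mini_dev))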
For the Gradient Boosting meta classifier, we can seek to optimize the following classifier parameters: n_estimators (the number of boosting stages), max_depth, min_samples_leaf, and max_features.
We will run the GradientBoostingClassifier with loss = 'deviance' (as loss = 'exponential' uses the AdaBoost algorithm) on each different classifier from above, using the classifier settings with optimized Multi-class Log Loss after hyperparameter tuning and calibration.
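A minimal sketch of that step with illustrative (untuned) parameter values:
In [ ]:
## Sketch: GradientBoostingClassifier with loss='deviance' and placeholder parameter values
gb_clf = GradientBoostingClassifier(loss='deviance', n_estimators=100, max_depth=3,
                                    min_samples_leaf=5, max_features='sqrt')
gb_clf.fit(mini_train_data, mini_train_labels)
gb_probabilities = gb_clf.predict_proba(mini_dev_data)
print(log_loss(y_true=mini_dev_labels, y_pred=gb_probabilities, labels=crime_labels_mini_dev))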
In [ ]:
# Here we will likely use Pipeline and GridSearchCV in order to find the overall classifier with optimized Multi-class Log Loss.
# This will be the last step after all attempts at feature addition, hyperparameter tuning, and calibration are completed
# and the corresponding performance metrics are gathered.
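As a rough sketch of what that final step could look like (the pipeline and parameter grid below are illustrative placeholders, not our tuned settings):
In [ ]:
## Sketch only: a Pipeline whose classifier step is tuned with GridSearchCV on the mini training set.
## The estimator and grid are illustrative assumptions; the full search would compare all tuned classifiers.
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('scaler', MinMaxScaler()), ('clf', LogisticRegression())])
param_grid = {'clf__C': [0.01, 0.1, 1, 10]}
final_search = GridSearchCV(pipeline, param_grid, scoring="log_loss")
final_search.fit(mini_train_data, mini_train_labels)
print(final_search.best_params_)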
1) Hsiang, Solomon M., Marshall Burke, and Edward Miguel. "Quantifying the Influence of Climate on Human Conflict." Science, Vol. 341, Issue 6151, 2013.
2) Huang, Cheng-Lung, and Chieh-Jen Wang. "A GA-based feature selection and parameters optimization for support vector machines." Expert Systems with Applications, Vol. 31, 2006, pp. 231-240.
3) More to come
In [ ]: