In [2]:
# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)
-- Feature engineering is done in PostgreSQL on the raw Kaggle CSV:
CREATE TABLE kaggle_sf_crime (
  dates TIMESTAMP,
  category VARCHAR,
  descript VARCHAR,
  dayofweek VARCHAR,
  pd_district VARCHAR,
  resolution VARCHAR,
  addr VARCHAR,
  X FLOAT,
  Y FLOAT);

\copy kaggle_sf_crime FROM '/Users/Goodgame/Desktop/MIDS/207/final/sf_crime_train.csv' DELIMITER ',' CSV HEADER;

SELECT
  category,
  date_part('hour', dates) AS hour_of_day,
  CASE
    WHEN dayofweek = 'Monday' THEN 1
    WHEN dayofweek = 'Tuesday' THEN 2
    WHEN dayofweek = 'Wednesday' THEN 3
    WHEN dayofweek = 'Thursday' THEN 4
    WHEN dayofweek = 'Friday' THEN 5
    WHEN dayofweek = 'Saturday' THEN 6
    WHEN dayofweek = 'Sunday' THEN 7
  END AS dayofweek_numeric,
  X,
  Y,
  -- one 0/1 indicator column per police district:
  CASE WHEN pd_district = 'BAYVIEW' THEN 1 ELSE 0 END AS bayview_binary,
  CASE WHEN pd_district = 'INGLESIDE' THEN 1 ELSE 0 END AS ingleside_binary,
  CASE WHEN pd_district = 'NORTHERN' THEN 1 ELSE 0 END AS northern_binary,
  CASE WHEN pd_district = 'CENTRAL' THEN 1 ELSE 0 END AS central_binary,
  CASE WHEN pd_district = 'MISSION' THEN 1 ELSE 0 END AS mission_binary,
  CASE WHEN pd_district = 'SOUTHERN' THEN 1 ELSE 0 END AS southern_binary,
  CASE WHEN pd_district = 'TENDERLOIN' THEN 1 ELSE 0 END AS tenderloin_binary,
  CASE WHEN pd_district = 'PARK' THEN 1 ELSE 0 END AS park_binary,
  CASE WHEN pd_district = 'RICHMOND' THEN 1 ELSE 0 END AS richmond_binary,
  CASE WHEN pd_district = 'TARAVAL' THEN 1 ELSE 0 END AS taraval_binary
FROM kaggle_sf_crime;
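For reference, the same feature engineering could be sketched in pandas rather than SQL. This is only a sketch: the input path and the raw Kaggle column names (Dates, Category, DayOfWeek, PdDistrict, X, Y) are assumptions, and it is not the pipeline actually used below.
# pandas sketch of the SQL transformation above (assumed path and raw column names):
raw = pd.read_csv("./data/sf_crime_train.csv", parse_dates=["Dates"])
transformed = pd.DataFrame({
    "category": raw["Category"],
    "hour_of_day": raw["Dates"].dt.hour,
    "dayofweek_numeric": raw["Dates"].dt.dayofweek + 1,  # Monday=1 ... Sunday=7
    "x": raw["X"],
    "y": raw["Y"],
})
# One 0/1 column per police district, mirroring the CASE expressions above:
district_dummies = pd.get_dummies(raw["PdDistrict"]).astype(int).rename(columns=str.lower).add_suffix("_binary")
transformed = pd.concat([transformed, district_dummies], axis=1)
transformed.to_csv("./data/train_transformed.csv", index=False)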
In [4]:
data_path = "./data/train_transformed.csv"
df = pd.read_csv(data_path, header=0)
x_data = df.drop('category', axis=1)
y = df.category.values
In [5]:
## read in zip code data
data_path_zip = "./data/2016_zips.csv"
zips = pd.read_csv(data_path_zip, header=0, sep='\t', usecols=[0, 5, 6],
                   names=["GEOID", "INTPTLAT", "INTPTLONG"],
                   dtype={'GEOID': int, 'INTPTLAT': float, 'INTPTLONG': float})
sf_zips = zips[(zips['GEOID'] > 94000) & (zips['GEOID'] < 94189)]
In [32]:
len(sf_zips)
Out[32]:
In [6]:
### mapping longitude/latitude to zip codes
def dist(lat1, long1, lat2, long2):
    # Manhattan distance in degrees; cheap and good enough for ranking nearby centroids
    #return np.sqrt((lat1-lat2)**2+(long1-long2)**2)
    return abs(lat1 - lat2) + abs(long1 - long2)

def find_zipcode(lat, long):
    distances = sf_zips.apply(lambda row: dist(lat, long, row["INTPTLAT"], row["INTPTLONG"]), axis=1)
    return sf_zips.loc[distances.idxmin(), "GEOID"]

#x_data['zipcode'] = 0
#for i in range(0, 1):
#    x_data['zipcode'][i] = x_data.apply(lambda row: find_zipcode(row['x'], row['y']), axis=1)
x_data['zipcode'] = x_data.apply(lambda row: find_zipcode(row['x'], row['y']), axis=1)
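As a quick spot check, an illustrative downtown-San Francisco coordinate (chosen only for this example, not from the dataset) should map to a GEOID in the 941xx range:
# spot check with an illustrative coordinate near downtown SF:
# find_zipcode(37.7749, -122.4194)   # expect a GEOID between 94000 and 94189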
In [10]:
x_data[:10]
Out[10]:
In [7]:
### read in school data
data_path_schools = "./data/pubschls.csv"
schools = pd.read_csv(data_path_schools, header=0, sep='\t',
                      usecols=["CDSCode", "StatusType", "School", "EILCode", "EILName", "Zip", "Latitude", "Longitude"],
                      dtype={'CDSCode': str, 'StatusType': str, 'School': str, 'EILCode': str,
                             'EILName': str, 'Zip': str, 'Latitude': float, 'Longitude': float})
schools = schools[(schools["StatusType"] == 'Active')]
In [8]:
x_data_sub = x_data[0:5]  # small subset for testing the (slow) row-wise school lookup below
In [11]:
### find closest school
def dist(lat1, long1, lat2, long2):
    # Euclidean distance in degrees
    return np.sqrt((lat1 - lat2)**2 + (long1 - long2)**2)

def find_closest_school(lat, long):
    distances = schools.apply(lambda row: dist(lat, long, row["Latitude"], row["Longitude"]), axis=1)
    return min(distances)

# Computed only on the 5-row subset for now; rows outside it stay NaN
# and are later filled with the column mean during imputation.
x_data['closest_school'] = x_data_sub.apply(lambda row: find_closest_school(row['y'], row['x']), axis=1)
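The row-wise apply above computes every crime-to-school distance in pure Python, which is why it is only run on the five-row subset here. A vectorized alternative, sketched below under the assumption that scipy is available, builds a KD-tree over the school coordinates once and queries all crime locations in a single call; like the dist() helper above, it treats degrees as planar coordinates, a reasonable approximation within one city.
# Sketch: vectorized nearest-school distance with a KD-tree (assumes scipy is installed).
from scipy.spatial import cKDTree
school_coords = schools[["Latitude", "Longitude"]].dropna().values
tree = cKDTree(school_coords)
# Query every crime location at once; distances come back in degree units.
closest_dist, _ = tree.query(x_data[["y", "x"]].values, k=1)
# x_data["closest_school"] = closest_dist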
In [ ]:
# Impute missing values with mean values:
x_complete = x_data.fillna(x_data.mean())
X_raw = x_complete.values
X = X_raw
# Scale the data between 0 and 1:
#X = MinMaxScaler().fit_transform(X_raw)
# Shuffle data to remove any underlying pattern that may exist:
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]
# Separate training, dev, and test data:
test_data, test_labels = X[800000:], y[800000:]
dev_data, dev_labels = X[700000:800000], y[700000:800000]
train_data, train_labels = X[:700000], y[:700000]
mini_train_data, mini_train_labels = X[:75000], y[:75000]
mini_dev_data, mini_dev_labels = X[75000:100000], y[75000:100000]
In [ ]:
# The submission format requires that we list the ID of each example;
# this records the order of the IDs after shuffling
# (not used for anything right now).
allIDs = np.array(list(df.axes[0]))
allIDs = allIDs[shuffle]
testIDs = allIDs[800000:]
devIDs = allIDs[700000:800000]
trainIDs = allIDs[:700000]
#this is for extracting the column names for the required submission format
sampleSubmission_path = "./data/sampleSubmission.csv"
sampleDF = pd.read_csv(sampleSubmission_path)
allColumns = list(sampleDF.columns)
featureColumns = allColumns[1:]
#this is for extracting the test data for our baseline submission
real_test_path = "./data/test_transformed.csv"
testDF = pd.read_csv(real_test_path, header=0)
real_test_data = testDF
test_complete = real_test_data.fillna(real_test_data.mean())
Test_raw = test_complete.values
TestData = MinMaxScaler().fit_transform(Test_raw)  # note: should match whatever scaling is applied to the training data
#here we remember the ID of each test data point
#(in case we ever decide to shuffle the test data for some reason)
testIDs = list(testDF.axes[0])
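Once a model that outputs class probabilities has been fit, the submission file can be assembled by aligning its predict_proba output with the column order from sampleSubmission.csv. A rough sketch; the classifier argument is hypothetical and stands for any fitted sklearn-style model whose class labels match the category names.
# Sketch of writing a Kaggle-format submission; `clf` is a hypothetical fitted classifier.
def write_submission(clf, out_path="./data/baseline_submission.csv"):
    probs = clf.predict_proba(TestData)
    submission = pd.DataFrame(probs, columns=clf.classes_)
    submission = submission.reindex(columns=featureColumns, fill_value=0.0)  # match sampleSubmission column order
    submission.insert(0, allColumns[0], testIDs)                             # ID column comes first
    submission.to_csv(out_path, index=False)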
In [ ]:
In [ ]:
train_data[:5]
Note: unless the cell that sets the random seed is re-run first, the shuffle above will differ each time it is executed, so model accuracies will vary accordingly.
In [ ]:
## Data sanity checks
print(train_data[:1])
print(train_labels[:1])
In [ ]:
##Neural Network
import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
print(theano.config.device)  # We're using CPUs (for now)
print(theano.config.floatX)  # Should be 64 bit for CPUs
np.random.seed(0)
from IPython.display import display, clear_output
In [ ]:
numFeatures = train_data[1].size
numTrainExamples = train_data.shape[0]
numTestExamples = test_data.shape[0]
print ('Features = %d' %(numFeatures))
print ('Train set = %d' %(numTrainExamples))
print ('Test set = %d' %(numTestExamples))
In [ ]:
class_labels = list(set(train_labels))
print(class_labels)
numClasses = len(class_labels)
In [ ]:
print(train_labels[:5])
In [ ]:
## binarize the class labels (one-hot encoding)
def binarizeY(data):
    binarized_data = np.zeros((data.size, len(class_labels)))
    for j in range(0, data.size):
        feature = data[j]
        i = class_labels.index(feature)
        binarized_data[j, i] = 1
    return binarized_data
train_labels_b = binarizeY(train_labels)
test_labels_b = binarizeY(test_labels)
numClasses = train_labels_b[1].size
print ('Classes = %d' %(numClasses))
print ('\n', train_labels_b[:5, :], '\n')
print (train_labels[:10], '\n')
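The same one-hot encoding can also be produced with scikit-learn's LabelBinarizer, which keeps the label-to-column mapping in a single object. A sketch for comparison; the hand-rolled binarizeY above is what the rest of the notebook uses.
# Equivalent one-hot encoding via sklearn (sketch; not used below):
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
train_labels_b_alt = lb.fit_transform(train_labels)  # columns follow lb.classes_ (sorted labels)
test_labels_b_alt = lb.transform(test_labels)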
In [ ]:
#1) Parameters
numFeatures = train_data.shape[1]
numHiddenNodeslayer1 = 50
numHiddenNodeslayer2 = 30
w_1 = theano.shared(np.asarray((np.random.randn(*(numFeatures, numHiddenNodeslayer1))*0.01)))
w_2 = theano.shared(np.asarray((np.random.randn(*(numHiddenNodeslayer1, numHiddenNodeslayer2))*0.01)))
w_3 = theano.shared(np.asarray((np.random.randn(*(numHiddenNodeslayer2, numClasses))*0.01)))
params = [w_1, w_2, w_3]
In [ ]:
#2) Model
X = T.matrix()
Y = T.matrix()
srng = RandomStreams()
def dropout(X, p=0.):
    if p > 0:
        X *= srng.binomial(X.shape, p=1 - p)
        X /= 1 - p
    return X

def model(X, w_1, w_2, w_3, p_1, p_2, p_3):
    # Two sigmoid hidden layers with dropout, followed by a softmax output layer
    h1 = T.nnet.sigmoid(T.dot(dropout(X, p_1), w_1))
    h2 = T.nnet.sigmoid(T.dot(dropout(h1, p_2), w_2))
    return T.nnet.softmax(T.dot(dropout(h2, p_3), w_3))

y_hat_train = model(X, w_1, w_2, w_3, 0.2, 0.5, 0.5)
y_hat_predict = model(X, w_1, w_2, w_3, 0., 0., 0.)
In [ ]:
## (3) Cost function
#cost = T.mean(T.sqr(y_hat - Y))
cost = T.mean(T.nnet.categorical_crossentropy(y_hat_train, Y))
In [ ]:
## (4) Objective (and solver)
alpha = 0.01
def backprop(cost, w):
    grads = T.grad(cost=cost, wrt=w)
    updates = []
    for wi, grad in zip(w, grads):
        updates.append([wi, wi - grad * alpha])
    return updates
update = backprop(cost, params)
train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
y_pred = T.argmax(y_hat_predict, axis=1)
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)
miniBatchSize = 10
def gradientDescent(epochs):
    for i in range(epochs):
        for start, end in zip(range(0, len(train_data), miniBatchSize),
                              range(miniBatchSize, len(train_data), miniBatchSize)):
            cc = train(train_data[start:end], train_labels_b[start:end])
        clear_output(wait=True)
        print('%d) accuracy = %.4f' % (i + 1, np.mean(np.argmax(test_labels_b, axis=1) == predict(test_data))))

gradientDescent(50)
### How do we decide how many epochs to use? An epoch here is one full pass over the training data.
### Plot the cost for each of the 50 epochs: if the curve is still clearly decreasing, run more epochs;
### if it looks like it is flattening out, you can stop.
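One way to act on that: record the mean cost per epoch inside the training loop and plot it, stopping once the curve flattens. A sketch, assuming matplotlib is available:
# Sketch: track mean training cost per epoch and plot it to judge when to stop (assumes matplotlib).
import matplotlib.pyplot as plt

def gradientDescentWithCosts(epochs):
    epoch_costs = []
    for i in range(epochs):
        batch_costs = []
        for start, end in zip(range(0, len(train_data), miniBatchSize),
                              range(miniBatchSize, len(train_data), miniBatchSize)):
            batch_costs.append(train(train_data[start:end], train_labels_b[start:end]))
        epoch_costs.append(np.mean(batch_costs))
    plt.plot(range(1, epochs + 1), epoch_costs)
    plt.xlabel("epoch")
    plt.ylabel("mean training cost")
    plt.show()
    return epoch_costs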