SF Crime

W207 Final Project

Basic Modeling

Environment and Data


In [2]:
# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)



DDL to construct table for SQL transformations:

CREATE TABLE kaggle_sf_crime (
  dates TIMESTAMP,
  category VARCHAR,
  descript VARCHAR,
  dayofweek VARCHAR,
  pd_district VARCHAR,
  resolution VARCHAR,
  addr VARCHAR,
  X FLOAT,
  Y FLOAT);

Getting training data into a locally hosted PostgreSQL database:

\copy kaggle_sf_crime FROM '/Users/Goodgame/Desktop/MIDS/207/final/sf_crime_train.csv' DELIMITER ',' CSV HEADER;

SQL Query used for transformations:

SELECT
  category,
  date_part('hour', dates) AS hour_of_day,
  CASE
    WHEN dayofweek = 'Monday' THEN 1
    WHEN dayofweek = 'Tuesday' THEN 2
    WHEN dayofweek = 'Wednesday' THEN 3
    WHEN dayofweek = 'Thursday' THEN 4
    WHEN dayofweek = 'Friday' THEN 5
    WHEN dayofweek = 'Saturday' THEN 6
    WHEN dayofweek = 'Sunday' THEN 7
  END AS dayofweek_numeric,
  X,
  Y,
  CASE WHEN pd_district = 'BAYVIEW' THEN 1 ELSE 0 END AS bayview_binary,
  CASE WHEN pd_district = 'INGLESIDE' THEN 1 ELSE 0 END AS ingleside_binary,
  CASE WHEN pd_district = 'NORTHERN' THEN 1 ELSE 0 END AS northern_binary,
  CASE WHEN pd_district = 'CENTRAL' THEN 1 ELSE 0 END AS central_binary,
  -- NOTE: duplicates bayview_binary; kept because the transformed CSV used
  -- below contains both columns.
  CASE WHEN pd_district = 'BAYVIEW' THEN 1 ELSE 0 END AS pd_bayview_binary,
  CASE WHEN pd_district = 'MISSION' THEN 1 ELSE 0 END AS mission_binary,
  CASE WHEN pd_district = 'SOUTHERN' THEN 1 ELSE 0 END AS southern_binary,
  CASE WHEN pd_district = 'TENDERLOIN' THEN 1 ELSE 0 END AS tenderloin_binary,
  CASE WHEN pd_district = 'PARK' THEN 1 ELSE 0 END AS park_binary,
  CASE WHEN pd_district = 'RICHMOND' THEN 1 ELSE 0 END AS richmond_binary,
  CASE WHEN pd_district = 'TARAVAL' THEN 1 ELSE 0 END AS taraval_binary
FROM kaggle_sf_crime;
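
The same transformation can also be sketched in pandas without a database round-trip. A minimal sketch (not the pipeline actually used; assumes the raw Kaggle CSV with columns Dates, Category, DayOfWeek, PdDistrict, X, Y):


In [ ]:
# Hypothetical pandas equivalent of the SQL transformation above:
raw = pd.read_csv('./data/sf_crime_train.csv', parse_dates=['Dates'])
day_map = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
           'Friday': 5, 'Saturday': 6, 'Sunday': 7}
transformed = pd.DataFrame({
    'category': raw['Category'],
    'hour_of_day': raw['Dates'].dt.hour,
    'dayofweek_numeric': raw['DayOfWeek'].map(day_map),
    'x': raw['X'],
    'y': raw['Y'],
})
# One-hot district indicators, mirroring the CASE expressions:
dummies = pd.get_dummies(raw['PdDistrict'].str.lower()).add_suffix('_binary')
transformed = pd.concat([transformed, dummies], axis=1)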

Load the data into training, development, and test:


In [4]:
data_path = "./data/train_transformed.csv"

df = pd.read_csv(data_path, header=0)
x_data = df.drop('category', axis=1)
y = df.category.values

In [5]:
## read in zip code data

data_path_zip = "./data/2016_zips.csv"
zips = pd.read_csv(data_path_zip, header=0, sep='\t', usecols=[0, 5, 6],
                   names=["GEOID", "INTPTLAT", "INTPTLONG"],
                   dtype={'GEOID': int, 'INTPTLAT': float, 'INTPTLONG': float})

sf_zips = zips[(zips['GEOID'] > 94000) & (zips['GEOID'] < 94189)]

In [32]:
len(sf_zips)


Out[32]:
61

In [6]:
### map longitude/latitude to the nearest zip code (by centroid)

def dist(lat1, long1, lat2, long2):
    # Manhattan distance in degrees: a cheap proxy for closeness at city scale
    return abs(lat1 - lat2) + abs(long1 - long2)

def find_zipcode(lat, long):
    distances = sf_zips.apply(lambda row: dist(lat, long, row["INTPTLAT"], row["INTPTLONG"]), axis=1)
    return sf_zips.loc[distances.idxmin(), "GEOID"]

# Note: x is longitude and y is latitude, so pass (y, x) here.
x_data['zipcode'] = x_data.apply(lambda row: find_zipcode(row['y'], row['x']), axis=1)


(Execution interrupted with a KeyboardInterrupt: applying find_zipcode row by row over the full training set (800k+ rows), with each call scanning all 61 zip centroids via a second apply, is far too slow to finish.)
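
Because the nested apply is the bottleneck, a vectorized version is worth sketching. A minimal alternative using NumPy broadcasting (a sketch, not what was run above):


In [ ]:
# Vectorized nearest-zip lookup: build the full (n_rows, n_zips) distance
# matrix in one shot instead of calling apply once per row. For very large
# frames this can be done in chunks to bound memory.
lats = sf_zips['INTPTLAT'].values
longs = sf_zips['INTPTLONG'].values
d = (np.abs(x_data['y'].values[:, None] - lats[None, :])
     + np.abs(x_data['x'].values[:, None] - longs[None, :]))
x_data['zipcode'] = sf_zips['GEOID'].values[d.argmin(axis=1)]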

In [10]:
x_data[:10]


Out[10]:
hour_of_day dayofweek_numeric x y bayview_binary ingleside_binary northern_binary central_binary pd_bayview_binary mission_binary southern_binary tenderloin_binary park_binary richmond_binary taraval_binary zipcode
0 23 3 -122.425892 37.774599 0 0 1 0 0 0 0 0 0 0 0 0
1 23 3 -122.425892 37.774599 0 0 1 0 0 0 0 0 0 0 0 1
2 23 3 -122.424363 37.800414 0 0 1 0 0 0 0 0 0 0 0 0
3 23 3 -122.426995 37.800873 0 0 1 0 0 0 0 0 0 0 0 0
4 23 3 -122.438738 37.771541 0 0 0 0 0 0 0 0 1 0 0 0
5 23 3 -122.403252 37.713431 0 1 0 0 0 0 0 0 0 0 0 0
6 23 3 -122.423327 37.725138 0 1 0 0 0 0 0 0 0 0 0 0
7 23 3 -122.371274 37.727564 1 0 0 0 1 0 0 0 0 0 0 0
8 23 3 -122.508194 37.776601 0 0 0 0 0 0 0 0 0 1 0 0
9 23 3 -122.419088 37.807802 0 0 0 1 0 0 0 0 0 0 0 0

In [7]:
### read in school data
data_path_schools = "./data/pubschls.csv"
schools = pd.read_csv(data_path_schools, header=0, sep='\t',
                      usecols=["CDSCode", "StatusType", "School", "EILCode",
                               "EILName", "Zip", "Latitude", "Longitude"],
                      dtype={'CDSCode': str, 'StatusType': str, 'School': str,
                             'EILCode': str, 'EILName': str, 'Zip': str,
                             'Latitude': float, 'Longitude': float})
schools = schools[schools["StatusType"] == 'Active']

In [8]:
x_data_sub = x_data[0:5].copy()  # small subset for prototyping the slow per-row lookup

In [11]:
### distance to the closest school

def dist(lat1, long1, lat2, long2):
    # Euclidean distance in degrees
    return np.sqrt((lat1 - lat2)**2 + (long1 - long2)**2)

def find_closest_school(lat, long):
    # Returns the distance to the nearest school, not the school itself.
    distances = schools.apply(lambda row: dist(lat, long, row["Latitude"], row["Longitude"]), axis=1)
    return min(distances)

# Prototype on the 5-row subset only; the remaining rows stay NaN and are
# filled by the mean-imputation step below.
x_data['closest_school'] = x_data_sub.apply(lambda row: find_closest_school(row['y'], row['x']), axis=1)


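
For the full dataset, the per-row apply above would again be the bottleneck; a k-d tree makes the nearest-school lookup fast. A minimal sketch (assumes scipy is available; not what was run above):


In [ ]:
from scipy.spatial import cKDTree

# Build a k-d tree over (lat, long) school coordinates once, then query the
# nearest school for every crime location in a single call.
school_coords = schools[['Latitude', 'Longitude']].dropna().values
school_tree = cKDTree(school_coords)
dists, idxs = school_tree.query(x_data[['y', 'x']].values, k=1)
x_data['closest_school'] = dists  # Euclidean distance in degrees, as above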

In [ ]:
# Impute missing values with column means:
x_complete = x_data.fillna(x_data.mean())
X = x_complete.values

# Optionally scale features to [0, 1] (currently disabled):
#X = MinMaxScaler().fit_transform(X)

# Shuffle data to remove any underlying pattern that may exist:
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]

# Separate training, dev, and test data:
test_data, test_labels = X[800000:], y[800000:]
dev_data, dev_labels = X[700000:800000], y[700000:800000]
train_data, train_labels = X[:700000], y[:700000]

mini_train_data, mini_train_labels = X[:75000], y[:75000]
mini_dev_data, mini_dev_labels = X[75000:100000], y[75000:100000]
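
For reference, scikit-learn can produce the same kind of split in one call. A minimal sketch (not used below); fixing random_state makes the shuffle, and therefore downstream accuracies, reproducible:


In [ ]:
from sklearn.model_selection import train_test_split

X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.1,
                                                  random_state=0)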

In [ ]:
#the submission format requires listing the ID of each example, so we record
#the order of the IDs after shuffling
#(not used for anything right now)
allIDs = np.array(list(df.axes[0]))
allIDs = allIDs[shuffle]

testIDs = allIDs[800000:]
devIDs = allIDs[700000:800000]
trainIDs = allIDs[:700000]

#extract the column names required by the submission format
sampleSubmission_path = "./data/sampleSubmission.csv"
sampleDF = pd.read_csv(sampleSubmission_path)
allColumns = list(sampleDF.columns)
featureColumns = allColumns[1:]

#extract the test data for our baseline submission
real_test_path = "./data/test_transformed.csv"
testDF = pd.read_csv(real_test_path, header=0)

test_complete = testDF.fillna(testDF.mean())
Test_raw = test_complete.values

#apply the same preprocessing as the training data: since MinMaxScaler is
#disabled for training above, leave the test features unscaled too
TestData = Test_raw
#TestData = MinMaxScaler().fit_transform(Test_raw)

#here we remember the ID of each test data point
#(in case we ever decide to shuffle the test data for some reason)
testIDs = list(testDF.axes[0])
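
With TestData, the IDs, and the required columns in hand, a baseline submission can be sketched end to end. A hypothetical example using a plain logistic-regression baseline (not a model fit elsewhere in this notebook):


In [ ]:
# Hypothetical baseline: fit a logistic regression and write a Kaggle-format
# submission with one probability column per crime category.
clf = LogisticRegression()
clf.fit(train_data, train_labels)
probs = clf.predict_proba(TestData)  # columns ordered by clf.classes_
submission = pd.DataFrame(probs, columns=clf.classes_)
submission.insert(0, 'Id', testIDs)
submission.to_csv('baseline_submission.csv', index=False)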

In [ ]:
train_data[:5]

Note: the shuffle above draws from the global NumPy RNG, so re-running cells out of order changes the split, and model accuracies will vary accordingly.


In [ ]:
## Data sanity checks
print(train_data[:1])
print(train_labels[:1])

Model Prototyping

Rapidly assessing the viability of different model forms:


In [ ]:
## Neural Network

import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
print(theano.config.device)  # we're using CPUs (for now)
print(theano.config.floatX)  # should be float64 for CPUs

np.random.seed(0)

from IPython.display import display, clear_output

In [ ]:
numFeatures = train_data.shape[1]
numTrainExamples = train_data.shape[0]
numTestExamples = test_data.shape[0]
print('Features = %d' % numFeatures)
print('Train set = %d' % numTrainExamples)
print('Test set = %d' % numTestExamples)

In [ ]:
class_labels = sorted(set(train_labels))  # sorted for a deterministic class order
print(class_labels)
numClasses = len(class_labels)

In [ ]:
print(train_labels[:5])

In [ ]:
## binarize (one-hot encode) the class labels

def binarizeY(data):
    binarized_data = np.zeros((data.size, len(class_labels)))
    for j in range(data.size):
        i = class_labels.index(data[j])
        binarized_data[j, i] = 1
    return binarized_data

train_labels_b = binarizeY(train_labels)
test_labels_b = binarizeY(test_labels)
numClasses = train_labels_b.shape[1]

print('Classes = %d' % numClasses)

print('\n', train_labels_b[:5, :], '\n')
print(train_labels[:10], '\n')
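
scikit-learn's LabelBinarizer does the same encoding out of the box. A minimal equivalent sketch (binarizeY above is what the rest of the notebook uses):


In [ ]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
train_labels_alt = lb.fit_transform(train_labels)  # shape (n, numClasses)
test_labels_alt = lb.transform(test_labels)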

In [ ]:
## (1) Parameters
numFeatures = train_data.shape[1]

numHiddenNodeslayer1 = 50
numHiddenNodeslayer2 = 30

w_1 = theano.shared(np.asarray((np.random.randn(*(numFeatures, numHiddenNodeslayer1))*0.01)))
w_2 = theano.shared(np.asarray((np.random.randn(*(numHiddenNodeslayer1, numHiddenNodeslayer2))*0.01)))
w_3 = theano.shared(np.asarray((np.random.randn(*(numHiddenNodeslayer2, numClasses))*0.01)))
params = [w_1, w_2, w_3]

In [ ]:
## (2) Model
X = T.matrix()
Y = T.matrix()

srng = RandomStreams()
def dropout(X, p=0.):
    # "Inverted" dropout: zero each unit with probability p and scale the
    # survivors by 1/(1-p), so no rescaling is needed at predict time.
    if p > 0:
        X *= srng.binomial(X.shape, p=1 - p)
        X /= 1 - p
    return X

def model(X, w_1, w_2, w_3, p_1, p_2, p_3):
    h1 = T.nnet.sigmoid(T.dot(dropout(X, p_1), w_1))
    h2 = T.nnet.sigmoid(T.dot(dropout(h1, p_2), w_2))
    return T.nnet.softmax(T.dot(dropout(h2, p_3), w_3))

y_hat_train = model(X, w_1, w_2, w_3, 0.2, 0.5, 0.5)
y_hat_predict = model(X, w_1, w_2, w_3, 0., 0., 0.)
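
A quick sanity check of the inverted-dropout scaling (a sketch; assumes the cell above has run): because survivors are scaled by 1/(1-p), activations keep their expected value, so y_hat_predict needs no rescaling at predict time.


In [ ]:
x_check = T.vector()
f = theano.function([x_check], dropout(x_check, p=0.5))
# Averages to ~1.0 over many draws, matching the all-ones input:
print(np.mean([f(np.ones(1000)).mean() for _ in range(100)]))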

In [ ]:
## (3) Cost function
#cost = T.mean(T.sqr(y_hat - Y))
cost = T.mean(T.nnet.categorical_crossentropy(y_hat_train, Y))
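
A worked numeric example of this cost: with a one-hot label, the categorical cross-entropy reduces to -log of the probability the model assigns to the true class.


In [ ]:
p = np.array([0.1, 0.2, 0.7])       # predicted class probabilities
y_true = np.array([0., 0., 1.])     # one-hot true label
print(-np.sum(y_true * np.log(p)))  # == -log(0.7) ≈ 0.357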

In [ ]:
## (4) Objective (and solver)

alpha = 0.01
def backprop(cost, w):
    grads = T.grad(cost=cost, wrt=w)
    updates = []
    for wi, grad in zip(w, grads):
        updates.append([wi, wi - grad * alpha])
    return updates

update = backprop(cost, params)
train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
y_pred = T.argmax(y_hat_predict, axis=1)
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)

miniBatchSize = 10 

def gradientDescent(epochs):
    for i in range(epochs):
        for start, end in zip(range(0, len(train_data), miniBatchSize), range(miniBatchSize, len(train_data), miniBatchSize)):
            cc = train(train_data[start:end], train_labels_b[start:end])
        clear_output(wait=True)
        # Note: this reports accuracy on the test split after every epoch; the
        # dev split would be the more conventional choice for monitoring.
        print('%d) accuracy = %.4f' % (i + 1, np.mean(np.argmax(test_labels_b, axis=1) == predict(test_data))))

gradientDescent(50)

### How many epochs should we use? An epoch is one full pass over the training
### data. Plot the cost after each epoch: if it is still dropping steeply,
### train for more epochs; once the curve flattens out, stop.
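
Following that advice, here is a sketch that tracks and plots the mean minibatch cost per epoch (assumes matplotlib is available):


In [ ]:
import matplotlib.pyplot as plt

epoch_costs = []
def gradientDescentTracked(epochs):
    for i in range(epochs):
        batch_costs = []
        for start, end in zip(range(0, len(train_data), miniBatchSize),
                              range(miniBatchSize, len(train_data), miniBatchSize)):
            batch_costs.append(train(train_data[start:end], train_labels_b[start:end]))
        epoch_costs.append(np.mean(batch_costs))

gradientDescentTracked(50)
plt.plot(range(1, len(epoch_costs) + 1), epoch_costs)
plt.xlabel('epoch')
plt.ylabel('mean minibatch cost')
plt.show()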