Kaggle San Francisco Crime Classification

Berkeley MIDS W207 Final Project: Sam Goodgame, Sarah Cha, Kalvin Kao, Bryan Moore

Basic Modeling

Environment and Data


In [1]:
# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV

# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)



DDL to construct table for SQL transformations:

CREATE TABLE kaggle_sf_crime (
dates TIMESTAMP,                                
category VARCHAR,
descript VARCHAR,
dayofweek VARCHAR,
pd_district VARCHAR,
resolution VARCHAR,
addr VARCHAR,
X FLOAT,
Y FLOAT);

Getting training data into a locally hosted PostgreSQL database:

\copy kaggle_sf_crime FROM '/Users/Goodgame/Desktop/MIDS/207/final/sf_crime_train.csv' DELIMITER ',' CSV HEADER;

SQL Query used for transformations:

SELECT
  category,
  date_part('hour', dates) AS hour_of_day,
  CASE
    WHEN dayofweek = 'Monday' THEN 1
    WHEN dayofweek = 'Tuesday' THEN 2
    WHEN dayofweek = 'Wednesday' THEN 3
    WHEN dayofweek = 'Thursday' THEN 4
    WHEN dayofweek = 'Friday' THEN 5
    WHEN dayofweek = 'Saturday' THEN 6
    WHEN dayofweek = 'Sunday' THEN 7
  END AS dayofweek_numeric,
  X,
  Y,
  CASE WHEN pd_district = 'BAYVIEW' THEN 1 ELSE 0 END AS bayview_binary,
  CASE WHEN pd_district = 'INGLESIDE' THEN 1 ELSE 0 END AS ingleside_binary,
  CASE WHEN pd_district = 'NORTHERN' THEN 1 ELSE 0 END AS northern_binary,
  CASE WHEN pd_district = 'CENTRAL' THEN 1 ELSE 0 END AS central_binary,
  -- NOTE: duplicate of bayview_binary; retained because downstream code expects both columns
  CASE WHEN pd_district = 'BAYVIEW' THEN 1 ELSE 0 END AS pd_bayview_binary,
  CASE WHEN pd_district = 'MISSION' THEN 1 ELSE 0 END AS mission_binary,
  CASE WHEN pd_district = 'SOUTHERN' THEN 1 ELSE 0 END AS southern_binary,
  CASE WHEN pd_district = 'TENDERLOIN' THEN 1 ELSE 0 END AS tenderloin_binary,
  CASE WHEN pd_district = 'PARK' THEN 1 ELSE 0 END AS park_binary,
  CASE WHEN pd_district = 'RICHMOND' THEN 1 ELSE 0 END AS richmond_binary,
  CASE WHEN pd_district = 'TARAVAL' THEN 1 ELSE 0 END AS taraval_binary
FROM kaggle_sf_crime;
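
For reference, the same transformations can be reproduced in pandas instead of SQL. This is a sketch, not part of the pipeline above; it assumes the raw Kaggle column names ('Dates', 'DayOfWeek', 'PdDistrict'):

import pandas as pd

raw = pd.read_csv('./data/train.csv')  # raw Kaggle training file

# hour_of_day, mirroring date_part('hour', dates)
raw['hour_of_day'] = pd.to_datetime(raw['Dates']).dt.hour

# dayofweek_numeric, mirroring the day-of-week CASE expression
day_map = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
           'Friday': 5, 'Saturday': 6, 'Sunday': 7}
raw['dayofweek_numeric'] = raw['DayOfWeek'].map(day_map)

# One indicator column per police district, mirroring the *_binary CASE columns
district_dummies = pd.get_dummies(raw['PdDistrict'], prefix='pd').astype(int)
raw = pd.concat([raw, district_dummies], axis=1)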

Load the data into training, development, and test:


In [2]:
#data_path = "./data/train_transformed.csv"

#df = pd.read_csv(data_path, header=0)
#x_data = df.drop('category', 1)
#y = df.category.as_matrix()

## Impute missing values with mean values:
#x_complete = x_data.fillna(x_data.mean())
#X_raw = x_complete.as_matrix()

## Scale the data between 0 and 1:
#X = MinMaxScaler().fit_transform(X_raw)

## Shuffle data to remove any underlying pattern that may exist:
#shuffle = np.random.permutation(np.arange(X.shape[0]))
#X, y = X[shuffle], y[shuffle]

## Separate training, dev, and test data:
#test_data, test_labels = X[800000:], y[800000:]
#dev_data, dev_labels = X[700000:800000], y[700000:800000]
#train_data, train_labels = X[:700000], y[:700000]

#mini_train_data, mini_train_labels = X[:75000], y[:75000]
#mini_dev_data, mini_dev_labels = X[75000:100000], y[75000:100000]
#labels_set = set(mini_dev_labels)
#print(labels_set)
#print(len(labels_set))


{'ASSAULT', 'NON-CRIMINAL', 'SECONDARY CODES', 'BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'SEX OFFENSES FORCIBLE', 'WEAPON LAWS', 'SUSPICIOUS OCC', 'TRESPASS', 'DRUNKENNESS', 'BAD CHECKS', 'VANDALISM', 'FORGERY/COUNTERFEITING', 'PROSTITUTION', 'EMBEZZLEMENT', 'WARRANTS', 'KIDNAPPING', 'OTHER OFFENSES', 'LIQUOR LAWS', 'GAMBLING', 'DISORDERLY CONDUCT', 'ARSON', 'EXTORTION', 'VEHICLE THEFT', 'STOLEN PROPERTY', 'ROBBERY', 'FAMILY OFFENSES', 'LOITERING', 'RUNAWAY', 'DRUG/NARCOTIC', 'SEX OFFENSES NON FORCIBLE', 'LARCENY/THEFT', 'BRIBERY', 'RECOVERED VEHICLE', 'MISSING PERSON', 'FRAUD', 'SUICIDE'}
37

Loading the data, version 2, with weather features to improve performance:

We seek to add features that improve performance with respect to our evaluation metric (multi-class logarithmic loss). There is evidence of a correlation between weather patterns and crime, and some experts even argue for a causal relationship [1]. More specifically, a 2013 paper published in Science showed that higher temperatures and extreme rainfall led to large increases in conflict. Given this evidence, weather is a natural candidate for additional features to improve the performance of our classifiers. Weather data was gathered from (insert source). Features from this data set that were hypothesized to improve performance were merged into the original crime data set. These features included (insert what we eventually include).
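
As one concrete example, the weather data's sunrise and sunset times yield a binary daylight feature (built in full in the next cell). A minimal sketch of the idea, using hypothetical epoch-second values:

import pandas as pd

# Two hypothetical incident timestamps and one day's sunrise/sunset (epoch seconds)
crimes = pd.DataFrame({'secondsFromEpoch': [1431540000, 1431560000]})
sunrise, sunset = 1431493260, 1431544260

# 1.0 if the incident occurred between sunrise and sunset, else 0.0
crimes['Daylight'] = ((crimes['secondsFromEpoch'] >= sunrise) &
                      (crimes['secondsFromEpoch'] < sunset)).astype(float)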


In [2]:
data_path = "./data/train_transformed.csv"

#df = pd.read_csv(data_path, header=0)
#x_data = df.drop('category', 1)
#y = df.category.as_matrix()
x_data = pd.read_csv(data_path, header=0)

########## Adding the date back into the data
import csv
import time
import calendar
data_path = "./data/train.csv"
dataCSV = open(data_path, 'rt')
csvData = list(csv.reader(dataCSV))
csvFields = csvData[0] #['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
allData = csvData[1:]
dataCSV.close()

df2 = pd.DataFrame(allData)
df2.columns = csvFields
dates = df2['Dates']
dates = dates.apply(time.strptime, args=("%Y-%m-%d %H:%M:%S",))
dates = dates.apply(calendar.timegm)
#print(dates.head())

x_data['secondsFromEpoch'] = dates
colnames = x_data.columns.tolist()
colnames = colnames[-1:] + colnames[:-1]
x_data = x_data[colnames]
##########

########## Adding the weather data into the original crime data
weatherData1 = "./data/1027175.csv"
weatherData2 = "./data/1027176.csv"
dataCSV = open(weatherData1, 'rt')
csvData = list(csv.reader(dataCSV))
csvFields = csvData[0]  # header row of the weather file
allWeatherData1 = csvData[1:]
dataCSV.close()

dataCSV = open(weatherData2, 'rt')
csvData = list(csv.reader(dataCSV))
csvFields = csvData[0]  # header row of the weather file
allWeatherData2 = csvData[1:]
dataCSV.close()

weatherDF1 = pd.DataFrame(allWeatherData1)
weatherDF1.columns = csvFields
dates1 = weatherDF1['DATE']
sunrise1 = weatherDF1['DAILYSunrise']
sunset1 = weatherDF1['DAILYSunset']

weatherDF2 = pd.DataFrame(allWeatherData2)
weatherDF2.columns = csvFields
dates2 = weatherDF2['DATE']
sunrise2 = weatherDF2['DAILYSunrise']
sunset2 = weatherDF2['DAILYSunset']

#functions for processing the sunrise and sunset times of each day
def get_hour_and_minute(milTime):
    hour = int(milTime[:-2])
    minute = int(milTime[-2:])
    return [hour, minute]

def get_date_only(date):
    return time.struct_time(tuple([date[0], date[1], date[2], 0, 0, 0, date[6], date[7], date[8]]))

def structure_sun_time(timeSeries, dateSeries):
    sunTimes = timeSeries.copy()
    for index in range(len(dateSeries)):
        sunTimes[index] = time.struct_time(tuple([dateSeries[index][0], dateSeries[index][1], dateSeries[index][2], timeSeries[index][0], timeSeries[index][1], dateSeries[index][5], dateSeries[index][6], dateSeries[index][7], dateSeries[index][8]]))
    return sunTimes

dates1 = dates1.apply(time.strptime, args=("%Y-%m-%d %H:%M",))
sunrise1 = sunrise1.apply(get_hour_and_minute)
sunrise1 = structure_sun_time(sunrise1, dates1)
sunrise1 = sunrise1.apply(calendar.timegm)
sunset1 = sunset1.apply(get_hour_and_minute)
sunset1 = structure_sun_time(sunset1, dates1)
sunset1 = sunset1.apply(calendar.timegm)
dates1 = dates1.apply(calendar.timegm)

dates2 = dates2.apply(time.strptime, args=("%Y-%m-%d %H:%M",))
sunrise2 = sunrise2.apply(get_hour_and_minute)
sunrise2 = structure_sun_time(sunrise2, dates2)
sunrise2 = sunrise2.apply(calendar.timegm)
sunset2 = sunset2.apply(get_hour_and_minute)
sunset2 = structure_sun_time(sunset2, dates2)
sunset2 = sunset2.apply(calendar.timegm)
dates2 = dates2.apply(calendar.timegm)

weatherDF1['DATE'] = dates1
weatherDF1['DAILYSunrise'] = sunrise1
weatherDF1['DAILYSunset'] = sunset1
weatherDF2['DATE'] = dates2
weatherDF2['DAILYSunrise'] = sunrise2
weatherDF2['DAILYSunset'] = sunset2

weatherDF = pd.concat([weatherDF1, weatherDF2[32:]], ignore_index=True)  # [32:] skips the second file's leading rows, which appear to overlap the first

# Starting off with some of the easier weather features to work with; more to come (still in beta)
weatherMetrics = weatherDF[['DATE','HOURLYDRYBULBTEMPF','HOURLYRelativeHumidity', 'HOURLYWindSpeed', \
                            'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY', 'DAILYSunrise', 'DAILYSunset']]
weatherMetrics = weatherMetrics.apply(pd.to_numeric, errors='coerce')  # convert_objects is deprecated; coerce non-numeric entries to NaN
weatherDates = weatherMetrics['DATE']
#'DATE','HOURLYDRYBULBTEMPF','HOURLYRelativeHumidity', 'HOURLYWindSpeed',
#'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY'
timeWindow = 10800 #3 hours
hourlyDryBulbTemp = []
hourlyRelativeHumidity = []
hourlyWindSpeed = []
hourlySeaLevelPressure = []
hourlyVisibility = []
dailySunrise = []
dailySunset = []
daylight = []
test = 0
for timePoint in dates:  # dates holds the epoch times from the Kaggle crime data
    relevantWeather = weatherMetrics[(weatherDates <= timePoint) & (weatherDates > timePoint - timeWindow)]
    hourlyDryBulbTemp.append(relevantWeather['HOURLYDRYBULBTEMPF'].mean())
    hourlyRelativeHumidity.append(relevantWeather['HOURLYRelativeHumidity'].mean())
    hourlyWindSpeed.append(relevantWeather['HOURLYWindSpeed'].mean())
    hourlySeaLevelPressure.append(relevantWeather['HOURLYSeaLevelPressure'].mean())
    hourlyVisibility.append(relevantWeather['HOURLYVISIBILITY'].mean())
    dailySunrise.append(relevantWeather['DAILYSunrise'].iloc[-1])
    dailySunset.append(relevantWeather['DAILYSunset'].iloc[-1])
    daylight.append(1.0*((timePoint >= relevantWeather['DAILYSunrise'].iloc[-1]) and (timePoint < relevantWeather['DAILYSunset'].iloc[-1])))
    #if timePoint < relevantWeather['DAILYSunset'][-1]:
        #daylight.append(1)
    #else:
        #daylight.append(0)
    
    if test%100000 == 0:
        print(relevantWeather)
    test += 1

# pd.Series.from_array is deprecated; the plain constructor is equivalent here
hourlyDryBulbTemp = pd.Series(hourlyDryBulbTemp)
hourlyRelativeHumidity = pd.Series(hourlyRelativeHumidity)
hourlyWindSpeed = pd.Series(hourlyWindSpeed)
hourlySeaLevelPressure = pd.Series(hourlySeaLevelPressure)
hourlyVisibility = pd.Series(hourlyVisibility)
dailySunrise = pd.Series(dailySunrise)
dailySunset = pd.Series(dailySunset)
daylight = pd.Series(daylight)

x_data['HOURLYDRYBULBTEMPF'] = hourlyDryBulbTemp
x_data['HOURLYRelativeHumidity'] = hourlyRelativeHumidity
x_data['HOURLYWindSpeed'] = hourlyWindSpeed
x_data['HOURLYSeaLevelPressure'] = hourlySeaLevelPressure
x_data['HOURLYVISIBILITY'] = hourlyVisibility
x_data['DAILYSunrise'] = dailySunrise
x_data['DAILYSunset'] = dailySunset
x_data['Daylight'] = daylight


              DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
140303  1431550560                56.0                    70.0   
140304  1431554040                57.0                    67.0   
140305  1431554160                57.0                    69.0   
140306  1431554400                57.0                    69.0   
140307  1431557760                57.0                    74.0   
140308  1431558900                57.0                    72.0   

        HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
140303              0.0                   29.93              10.0   
140304              0.0                     NaN              10.0   
140305              8.0                   29.92              10.0   
140306              8.0                   29.92               NaN   
140307              9.0                   29.91              10.0   
140308              8.0                     NaN              10.0   

        DAILYSunrise  DAILYSunset  
140303    1431493260   1431544260  
140304    1431493260   1431544260  
140305    1431493260   1431544260  
140306    1431493260   1431544260  
140307    1431493260   1431544260  
140308    1431493260   1431544260  
              DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
124567  1389459360                54.0                    77.0   
124568  1389462960                53.0                    80.0   
124569  1389464640                54.0                    77.0   
124570  1389465540                52.0                    82.0   
124571  1389466560                52.0                    83.0   

        HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
124567             14.0                   30.22              10.0   
124568              9.0                   30.23              10.0   
124569             10.0                     NaN              10.0   
124570             11.0                     NaN              10.0   
124571              7.0                   30.25              10.0   

        DAILYSunrise  DAILYSunset  
124567    1389425040   1389460320  
124568    1389425040   1389460320  
124569    1389425040   1389460320  
124570    1389425040   1389460320  
124571    1389425040   1389460320  
              DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
109394  1348037760                56.0                    77.0   
109395  1348041360                57.0                    74.0   
109396  1348044960                57.0                    74.0   

        HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
109394             14.0                   30.12              10.0   
109395              9.0                   30.13              10.0   
109396             15.0                   30.14              10.0   

        DAILYSunrise  DAILYSunset  
109394    1348034100   1348078200  
109395    1348034100   1348078200  
109396    1348034100   1348078200  
             DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
92913  1301896560                50.0                    66.0   
92914  1301900160                52.0                    64.0   
92915  1301903760                56.0                    55.0   

       HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
92913              0.0                   30.17              10.0   
92914              0.0                   30.17              10.0   
92915              0.0                   30.17              10.0   

       DAILYSunrise  DAILYSunset  
92913    1301896260   1301942100  
92914    1301896260   1301942100  
92915    1301896260   1301942100  
             DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
77034  1254243360                60.0                    60.0   
77035  1254246960                58.0                    67.0   
77036  1254250560                57.0                    72.0   

       HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
77034             39.0                   29.99              10.0   
77035             37.0                   29.99              10.0   
77036             25.0                   30.01              10.0   

       DAILYSunrise  DAILYSunset  
77034    1254204240   1254246840  
77035    1254204240   1254246840  
77036    1254204240   1254246840  
             DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
61113  1209480960                58.0                    60.0   
61114  1209484560                57.0                    58.0   
61115  1209484800                57.0                    58.0   
61116  1209488160                55.0                    64.0   

       HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
61113             32.0                   30.07             10.00   
61114             33.0                   30.06             10.00   
61115             33.0                   30.06              9.94   
61116             34.0                   30.05             10.00   

       DAILYSunrise  DAILYSunset  
61113    1209446100   1209495540  
61114    1209446100   1209495540  
61115    1209446040   1209495540  
61116    1209446040   1209495540  
             DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
43577  1162742160                65.0                    73.0   
43578  1162742400                65.0                    73.0   
43579  1162745760                63.0                    75.0   
43580  1162749360                62.0                    78.0   

       HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
43577             15.0                   30.20             10.00   
43578             15.0                   30.20              9.94   
43579             14.0                   30.19             10.00   
43580             14.0                   30.21             10.00   

       DAILYSunrise  DAILYSunset  
43577    1162708740   1162746420  
43578    1162708800   1162746360  
43579    1162708800   1162746360  
43580    1162708800   1162746360  
             DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
27369  1118696400                56.0                    81.0   
27370  1118699760                55.0                    82.0   
27371  1118700000                55.0                    83.0   
27372  1118703600                55.0                    83.0   

       HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
27369             17.0                   29.86              8.95   
27370             13.0                   29.85             10.00   
27371             13.0                   29.85              9.94   
27372             10.0                   29.85              9.94   

       DAILYSunrise  DAILYSunset  
27369    1118638020   1118691120  
27370    1118638020   1118691120  
27371    1118638020   1118691120  
27372    1118638020   1118691120  
             DATE  HOURLYDRYBULBTEMPF  HOURLYRelativeHumidity  \
11993  1074722400                51.0                    77.0   
11994  1074726000                48.0                    68.0   
11995  1074729600                49.0                    71.0   

       HOURLYWindSpeed  HOURLYSeaLevelPressure  HOURLYVISIBILITY  \
11993              0.0                   30.29               NaN   
11994              3.0                   30.30              9.94   
11995              0.0                   30.31               NaN   

       DAILYSunrise  DAILYSunset  
11993    1074669600   1074705720  
11994    1074669600   1074705720  
11995    1074756000   1074792120  
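
A note on the loop above: it rescans the weather table once per crime record, which is slow over the full data set. A sketch of a faster near-equivalent (an alternative under the assumption that both tables are keyed by epoch seconds, not the code actually used here): precompute trailing 3-hour means on the weather table, then perform a single as-of join with pd.merge_asof.

cols = ['HOURLYDRYBULBTEMPF', 'HOURLYRelativeHumidity', 'HOURLYWindSpeed',
        'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY']

# Trailing 3-hour means, computed once over the time-indexed weather table
ws = weatherMetrics.sort_values('DATE').reset_index(drop=True)
ws.index = pd.to_datetime(ws['DATE'], unit='s')
rolled = ws[cols].rolling(pd.Timedelta(hours=3)).mean()
rolled['DATE'] = ws['DATE'].values

# As-of join: for each crime time, take the most recent weather row at most
# 3 hours old (older matches become NaN). merge_asof requires sorted keys, so
# the result must be re-aligned to x_data's original row order before assignment.
crime_times = pd.DataFrame({'DATE': np.sort(dates.values)})
joined = pd.merge_asof(crime_times, rolled.reset_index(drop=True),
                       on='DATE', tolerance=10800)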

In [3]:
## read in zip code data

data_path_zip = "./data/2016_zips.csv"
zips = pd.read_csv(data_path_zip, header=0, sep ='\t', usecols = [0,5,6], names = ["GEOID", "INTPTLAT", "INTPTLONG"], dtype ={'GEOID': str, 'INTPTLAT': float, 'INTPTLONG': float})
zips_cali = zips[(zips['INTPTLAT'] > 36) & (zips['INTPTLAT'] < 42) & (zips['INTPTLONG'] > -125) & (zips['INTPTLONG'] < -118)]

In [4]:
### mapping longitude/latitude to zipcodes

def dist(lat1, long1, lat2, long2):
    # Planar (Euclidean) distance on degrees: a rough approximation, but adequate
    # for picking the nearest centroid at city scale
    return np.sqrt((lat1-lat2)**2+(long1-long2)**2)

def find_zipcode(lat, long):
    # Distance from this point to every California ZIP centroid; keep the closest
    distances = zips_cali.apply(lambda row: dist(lat, long, row["INTPTLAT"], row["INTPTLONG"]), axis=1)
    return zips_cali.loc[distances.idxmin(), "GEOID"]

x_data['zipcode'] = x_data.apply(lambda row: find_zipcode(row['y'], row['x']), axis=1)
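
The row-wise apply above recomputes the distance to every ZIP centroid for each record. A k-d tree performs the same nearest-centroid lookup in a single vectorized query; a sketch with scipy, using the same planar-distance approximation:

from scipy.spatial import cKDTree

# Build the tree over the ZIP centroids once, then query all crime points at once
tree = cKDTree(zips_cali[['INTPTLAT', 'INTPTLONG']].values)
_, nearest = tree.query(x_data[['y', 'x']].values, k=1)
x_data['zipcode'] = zips_cali['GEOID'].values[nearest]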

In [5]:
x_data.columns


Out[5]:
Index(['secondsFromEpoch', 'hour_of_day', 'dayofweek_numeric', 'x', 'y',
       'bayview_binary', 'ingleside_binary', 'northern_binary',
       'central_binary', 'pd_bayview_binary', 'mission_binary',
       'southern_binary', 'tenderloin_binary', 'park_binary',
       'richmond_binary', 'taraval_binary', 'HOURLYDRYBULBTEMPF',
       'HOURLYRelativeHumidity', 'HOURLYWindSpeed', 'HOURLYSeaLevelPressure',
       'HOURLYVISIBILITY', 'DAILYSunrise', 'DAILYSunset', 'Daylight',
       'zipcode'],
      dtype='object')

In [6]:
type(x_data)


Out[6]:
pandas.core.frame.DataFrame

In [7]:
### read in school data
data_path_schools = "./data/pubschls.csv"
schools = pd.read_csv(data_path_schools,header=0, sep ='\t', usecols = ["StatusType", "School", "EILCode", "EILName", "Zip", "Latitude", "Longitude"], dtype ={'StatusType': str, 'School': str, 'EILCode': str,'EILName': str,'Zip': str, 'Latitude': float, 'Longitude': float})
schools = schools[(schools["StatusType"] == 'Active')]

In [ ]:
schools.head()


Out[ ]:
StatusType School Zip EILCode EILName Latitude Longitude
0 Active NaN 94544-1136 NaN NaN 37.658212 -122.09713
2 Active Envision Academy for Arts & Technology 94612-3355 HS High School 37.804520 -122.26815
4 Active Community School for Creative Education 94606-4903 ELEM Elementary 37.784648 -122.23863
5 Active Yu Ming Charter 94608-1265 ELEM Elementary 37.847375 -122.28356
6 Active Urban Montessori Charter 94619-3312 ELEM Elementary 37.778352 -122.18950

In [ ]:
### find closest school, get the distance, and the type

def dist(lat1, long1, lat2, long2):
    # Same planar approximation as above (redefined so this cell runs standalone)
    return np.sqrt((lat1-lat2)**2+(long1-long2)**2)

def find_closest_school(lat, long):
    # Distance from this point to every active school; keep the closest
    distances = schools.apply(lambda row: dist(lat, long, row["Latitude"], row["Longitude"]), axis=1)
    return schools.loc[distances.idxmin(), "School"]

def get_school_distance(lat, long, schoolName):
    # .iloc[0] extracts scalars; without it these lookups return one-element Series
    lat2 = schools[schools['School'] == schoolName]['Latitude'].iloc[0]
    long2 = schools[schools['School'] == schoolName]['Longitude'].iloc[0]
    return dist(lat, long, lat2, long2)

def get_school_type(schoolName):
    return schools[schools['School'] == schoolName]['EILCode'].iloc[0]

x_data['closest_school'] = x_data.apply(lambda row: find_closest_school(row['y'], row['x']), axis=1)
x_data['school_distance'] = x_data.apply(lambda row: get_school_distance(row['y'], row['x'], row['closest_school']), axis=1)
x_data['school_type'] = x_data.apply(lambda row: get_school_type(row['closest_school']), axis=1)

In [ ]:
x_data[0:10]

In [ ]:
x_data.to_csv(path_or_buf="C:/MIDS/W207 final project/x_data_2.csv")

In [ ]:
## Impute missing values with mean values:
#x_complete = x_data.fillna(x_data.mean())
#X_raw = x_complete.as_matrix()

## Scale the data between 0 and 1:
#X = MinMaxScaler().fit_transform(X_raw)

## Shuffle data to remove any underlying pattern that may exist:
#shuffle = np.random.permutation(np.arange(X.shape[0]))
#X, y = X[shuffle], y[shuffle]

## Separate training, dev, and test data:
#test_data, test_labels = X[800000:], y[800000:]
#dev_data, dev_labels = X[700000:800000], y[700000:800000]
#train_data, train_labels = X[:700000], y[:700000]

#mini_train_data, mini_train_labels = X[:75000], y[:75000]
#mini_dev_data, mini_dev_labels = X[75000:100000], y[75000:100000]
#labels_set = set(mini_dev_labels)
#print(labels_set)
#print(len(labels_set))
#print(train_data[:10])

In [ ]:
### Data sub-setting quality check-point
#print(train_data[:1])
#print(train_labels[:1])

In [ ]: