In [1]:
# Import relevant libraries:
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Import Meta-estimators
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Import Calibration tools
from sklearn.calibration import CalibratedClassifierCV
# Set random seed and format print output:
np.random.seed(0)
np.set_printoptions(precision=3)
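The PostgreSQL statements below transform the raw Kaggle data: \copy is a psql meta-command that bulk-loads the CSV, and the SELECT that follows flattens the categorical fields into numeric features. Presumably the output of this query was exported as the train_transformed.csv file read later in the notebook.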
CREATE TABLE kaggle_sf_crime (
dates TIMESTAMP,
category VARCHAR,
descript VARCHAR,
dayofweek VARCHAR,
pd_district VARCHAR,
resolution VARCHAR,
addr VARCHAR,
X FLOAT,
Y FLOAT);
\copy kaggle_sf_crime FROM '/Users/Goodgame/Desktop/MIDS/207/final/sf_crime_train.csv' DELIMITER ',' CSV HEADER;
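-- Extract the hour of day, encode the day of week numerically, and one-hot encode pd_district into binary columns.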
SELECT
category,
date_part('hour', dates) AS hour_of_day,
CASE
WHEN dayofweek = 'Monday' THEN 1
WHEN dayofweek = 'Tuesday' THEN 2
WHEN dayofweek = 'Wednesday' THEN 3
WHEN dayofweek = 'Thursday' THEN 4
WHEN dayofweek = 'Friday' THEN 5
WHEN dayofweek = 'Saturday' THEN 6
WHEN dayofweek = 'Sunday' THEN 7
END AS dayofweek_numeric,
X,
Y,
CASE
WHEN pd_district = 'BAYVIEW' THEN 1
ELSE 0
END AS bayview_binary,
CASE
WHEN pd_district = 'INGLESIDE' THEN 1
ELSE 0
END AS ingleside_binary,
CASE
WHEN pd_district = 'NORTHERN' THEN 1
ELSE 0
END AS northern_binary,
CASE
WHEN pd_district = 'CENTRAL' THEN 1
ELSE 0
END AS central_binary,
CASE
WHEN pd_district = 'MISSION' THEN 1
ELSE 0
END AS mission_binary,
CASE
WHEN pd_district = 'SOUTHERN' THEN 1
ELSE 0
END AS southern_binary,
CASE
WHEN pd_district = 'TENDERLOIN' THEN 1
ELSE 0
END AS tenderloin_binary,
CASE
WHEN pd_district = 'PARK' THEN 1
ELSE 0
END AS park_binary,
CASE
WHEN pd_district = 'RICHMOND' THEN 1
ELSE 0
END AS richmond_binary,
CASE
WHEN pd_district = 'TARAVAL' THEN 1
ELSE 0
END AS taraval_binary
FROM kaggle_sf_crime;
We seek to add features to our models that will improve performance with respect to our desired performance metric. There is evidence of a correlation between weather patterns and crime, and some experts even argue for a causal relationship between weather and crime [1]. More specifically, a 2013 paper published in Science showed that higher temperatures and extreme rainfall led to large increases in conflict. Given this strong evidence that weather influences crime, we see weather as a promising source of additional features for our classifiers. Weather data was gathered from (insert source). Certain features from this data set were incorporated into the original crime data set because they were hypothesized to improve performance. These features included (insert what we eventually include).
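As a minimal sketch of the join logic (not the notebook's exact method), pandas' merge_asof can attach the most recent weather observation within three hours of each crime; the cell below instead computes a windowed average by hand over the same 3-hour span. The sketch assumes the x_data and weatherMetrics frames built in that cell, with integer epoch-second keys:
# Sketch only: nearest-preceding join rather than the 3-hour windowed average
# computed by the loop below. Assumes x_data['secondsFromEpoch'] and
# weatherMetrics['DATE'] are integer epoch seconds (built in the next cell).
crime_sorted = x_data.sort_values('secondsFromEpoch')
weather_sorted = weatherMetrics.sort_values('DATE')
joined = pd.merge_asof(crime_sorted, weather_sorted,
                       left_on='secondsFromEpoch', right_on='DATE',
                       direction='backward', tolerance=10800)  # 3-hour cutoff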
In [2]:
data_path = "./data/train_transformed.csv"
#df = pd.read_csv(data_path, header=0)
#x_data = df.drop('category', 1)
#y = df.category.as_matrix()
x_data = pd.read_csv(data_path, header=0)
########## Adding the date back into the data
import csv
import time
import calendar
data_path = "./data/train.csv"
dataCSV = open(data_path, 'rt')
csvData = list(csv.reader(dataCSV))
csvFields = csvData[0] #['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
allData = csvData[1:]
dataCSV.close()
df2 = pd.DataFrame(allData)
df2.columns = csvFields
dates = df2['Dates']
dates = dates.apply(time.strptime, args=("%Y-%m-%d %H:%M:%S",))
dates = dates.apply(calendar.timegm)
#print(dates.head())
x_data['secondsFromEpoch'] = dates
colnames = x_data.columns.tolist()
colnames = colnames[-1:] + colnames[:-1]
x_data = x_data[colnames]
##########
########## Adding the weather data into the original crime data
weatherData1 = "./data/1027175.csv"
weatherData2 = "./data/1027176.csv"
dataCSV = open(weatherData1, 'rt')
csvData = list(csv.reader(dataCSV))
csvFields = csvData[0] # header row of the first weather file
allWeatherData1 = csvData[1:]
dataCSV.close()
dataCSV = open(weatherData2, 'rt')
csvData = list(csv.reader(dataCSV))
csvFields = csvData[0] # header row of the second weather file
allWeatherData2 = csvData[1:]
dataCSV.close()
weatherDF1 = pd.DataFrame(allWeatherData1)
weatherDF1.columns = csvFields
dates1 = weatherDF1['DATE']
sunrise1 = weatherDF1['DAILYSunrise']
sunset1 = weatherDF1['DAILYSunset']
weatherDF2 = pd.DataFrame(allWeatherData2)
weatherDF2.columns = csvFields
dates2 = weatherDF2['DATE']
sunrise2 = weatherDF2['DAILYSunrise']
sunset2 = weatherDF2['DAILYSunset']
# Functions for processing the sunrise and sunset times of each day
def get_hour_and_minute(milTime):
    # e.g. '0642' -> [6, 42] (times are given in military HHMM format)
    hour = int(milTime[:-2])
    minute = int(milTime[-2:])
    return [hour, minute]
def get_date_only(date):
return time.struct_time(tuple([date[0], date[1], date[2], 0, 0, 0, date[6], date[7], date[8]]))
def structure_sun_time(timeSeries, dateSeries):
sunTimes = timeSeries.copy()
for index in range(len(dateSeries)):
sunTimes[index] = time.struct_time(tuple([dateSeries[index][0], dateSeries[index][1], dateSeries[index][2], timeSeries[index][0], timeSeries[index][1], dateSeries[index][5], dateSeries[index][6], dateSeries[index][7], dateSeries[index][8]]))
return sunTimes
dates1 = dates1.apply(time.strptime, args=("%Y-%m-%d %H:%M",))
sunrise1 = sunrise1.apply(get_hour_and_minute)
sunrise1 = structure_sun_time(sunrise1, dates1)
sunrise1 = sunrise1.apply(calendar.timegm)
sunset1 = sunset1.apply(get_hour_and_minute)
sunset1 = structure_sun_time(sunset1, dates1)
sunset1 = sunset1.apply(calendar.timegm)
dates1 = dates1.apply(calendar.timegm)
dates2 = dates2.apply(time.strptime, args=("%Y-%m-%d %H:%M",))
sunrise2 = sunrise2.apply(get_hour_and_minute)
sunrise2 = structure_sun_time(sunrise2, dates2)
sunrise2 = sunrise2.apply(calendar.timegm)
sunset2 = sunset2.apply(get_hour_and_minute)
sunset2 = structure_sun_time(sunset2, dates2)
sunset2 = sunset2.apply(calendar.timegm)
dates2 = dates2.apply(calendar.timegm)
weatherDF1['DATE'] = dates1
weatherDF1['DAILYSunrise'] = sunrise1
weatherDF1['DAILYSunset'] = sunset1
weatherDF2['DATE'] = dates2
weatherDF2['DAILYSunrise'] = sunrise2
weatherDF2['DAILYSunset'] = sunset2
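# Concatenate the two weather files; slicing weatherDF2[32:] presumably drops rows that overlap weatherDF1.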
weatherDF = pd.concat([weatherDF1, weatherDF2[32:]], ignore_index=True)
# Starting off with some of the easier features to work with; more to come here.
weatherMetrics = weatherDF[['DATE','HOURLYDRYBULBTEMPF','HOURLYRelativeHumidity', 'HOURLYWindSpeed', \
'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY', 'DAILYSunrise', 'DAILYSunset']]
weatherMetrics = weatherMetrics.apply(pd.to_numeric, errors='coerce')
weatherDates = weatherMetrics['DATE']
#'DATE','HOURLYDRYBULBTEMPF','HOURLYRelativeHumidity', 'HOURLYWindSpeed',
#'HOURLYSeaLevelPressure', 'HOURLYVISIBILITY'
timeWindow = 10800  # 3 hours, in seconds
hourlyDryBulbTemp = []
hourlyRelativeHumidity = []
hourlyWindSpeed = []
hourlySeaLevelPressure = []
hourlyVisibility = []
dailySunrise = []
dailySunset = []
daylight = []
test = 0
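# For each crime, average the weather observations from the preceding 3-hour window.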
for timePoint in dates:  # dates holds the epoch times from the Kaggle data
relevantWeather = weatherMetrics[(weatherDates <= timePoint) & (weatherDates > timePoint - timeWindow)]
hourlyDryBulbTemp.append(relevantWeather['HOURLYDRYBULBTEMPF'].mean())
hourlyRelativeHumidity.append(relevantWeather['HOURLYRelativeHumidity'].mean())
hourlyWindSpeed.append(relevantWeather['HOURLYWindSpeed'].mean())
hourlySeaLevelPressure.append(relevantWeather['HOURLYSeaLevelPressure'].mean())
hourlyVisibility.append(relevantWeather['HOURLYVISIBILITY'].mean())
dailySunrise.append(relevantWeather['DAILYSunrise'].iloc[-1])
dailySunset.append(relevantWeather['DAILYSunset'].iloc[-1])
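    # Daylight flag: 1.0 if the crime occurred between the most recent sunrise and sunset, else 0.0.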
daylight.append(1.0*((timePoint >= relevantWeather['DAILYSunrise'].iloc[-1]) and (timePoint < relevantWeather['DAILYSunset'].iloc[-1])))
    if test % 100000 == 0:
        # Periodic progress check: print the current weather window.
        print(relevantWeather)
    test += 1
# Convert the accumulated lists to Series (aligned by position with x_data's rows)
hourlyDryBulbTemp = pd.Series(hourlyDryBulbTemp)
hourlyRelativeHumidity = pd.Series(hourlyRelativeHumidity)
hourlyWindSpeed = pd.Series(hourlyWindSpeed)
hourlySeaLevelPressure = pd.Series(hourlySeaLevelPressure)
hourlyVisibility = pd.Series(hourlyVisibility)
dailySunrise = pd.Series(dailySunrise)
dailySunset = pd.Series(dailySunset)
daylight = pd.Series(daylight)
x_data['HOURLYDRYBULBTEMPF'] = hourlyDryBulbTemp
x_data['HOURLYRelativeHumidity'] = hourlyRelativeHumidity
x_data['HOURLYWindSpeed'] = hourlyWindSpeed
x_data['HOURLYSeaLevelPressure'] = hourlySeaLevelPressure
x_data['HOURLYVISIBILITY'] = hourlyVisibility
x_data['DAILYSunrise'] = dailySunrise
x_data['DAILYSunset'] = dailySunset
x_data['Daylight'] = daylight
In [3]:
## read in zip code data
data_path_zip = "./data/2016_zips.csv"
zips = pd.read_csv(data_path_zip, header=0, sep ='\t', usecols = [0,5,6], names = ["GEOID", "INTPTLAT", "INTPTLONG"], dtype ={'GEOID': str, 'INTPTLAT': float, 'INTPTLONG': float})
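# Keep only zip-code centroids inside a rough Northern California bounding box to shrink the nearest-centroid search.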
zips_cali = zips[(zips['INTPTLAT'] > 36) & (zips['INTPTLAT'] < 42) & (zips['INTPTLONG'] > -125) & (zips['INTPTLONG'] < -118)]
In [4]:
### Mapping longitude/latitude to zip codes
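# Euclidean distance in degrees is only an approximation to true distance, but should be adequate for ranking nearby centroids at city scale.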
def dist(lat1, long1, lat2, long2):
return np.sqrt((lat1-lat2)**2+(long1-long2)**2)
def find_zipcode(lat, long):
distances = zips_cali.apply(lambda row: dist(lat, long, row["INTPTLAT"], row["INTPTLONG"]), axis=1)
return zips_cali.loc[distances.idxmin(), "GEOID"]
x_data['zipcode'] = x_data.apply(lambda row: find_zipcode(row['y'], row['x']), axis=1)
In [5]:
x_data.columns
Out[5]:
In [6]:
type(x_data)
Out[6]:
In [7]:
### read in school data
data_path_schools = "./data/pubschls.csv"
schools = pd.read_csv(data_path_schools,header=0, sep ='\t', usecols = ["StatusType", "School", "EILCode", "EILName", "Zip", "Latitude", "Longitude"], dtype ={'StatusType': str, 'School': str, 'EILCode': str,'EILName': str,'Zip': str, 'Latitude': float, 'Longitude': float})
schools = schools[schools["StatusType"] == 'Active']  # keep only currently active schools
In [ ]:
schools.head()
Out[ ]:
In [ ]:
### find closest school, get the distance, and the type
def dist(lat1, long1, lat2, long2):
return np.sqrt((lat1-lat2)**2+(long1-long2)**2)
def find_closest_school(lat, long):
distances = schools.apply(lambda row: dist(lat, long, row["Latitude"], row["Longitude"]), axis=1)
return schools.loc[distances.idxmin(), "School"]
def get_school_distance(lat, long, schoolName):
    # .iloc[0] pulls the scalar out of the one-row Series the filter returns
    lat2 = schools[schools['School'] == schoolName]['Latitude'].iloc[0]
    long2 = schools[schools['School'] == schoolName]['Longitude'].iloc[0]
    return dist(lat, long, lat2, long2)
def get_school_type(schoolName):
    return schools[schools['School'] == schoolName]['EILCode'].iloc[0]
x_data['closest_school'] = x_data.apply(lambda row: find_closest_school(row['y'], row['x']), axis=1)
x_data['school_distance'] = x_data.apply(lambda row: get_school_distance(row['y'], row['x'], row['closest_school']), axis=1)
x_data['school_type'] = x_data.apply(lambda row: get_school_type(row['closest_school']), axis=1)
In [ ]:
x_data[0:10]
In [ ]:
x_data.to_csv(path_or_buf="C:/MIDS/W207 final project/x_data_2.csv")
In [ ]:
## Impute missing values with mean values:
#x_complete = x_data.fillna(x_data.mean())
#X_raw = x_complete.as_matrix()
## Scale the data between 0 and 1:
#X = MinMaxScaler().fit_transform(X_raw)
## Shuffle data to remove any underlying pattern that may exist:
#shuffle = np.random.permutation(np.arange(X.shape[0]))
#X, y = X[shuffle], y[shuffle]
## Separate training, dev, and test data:
#test_data, test_labels = X[800000:], y[800000:]
#dev_data, dev_labels = X[700000:800000], y[700000:800000]
#train_data, train_labels = X[:700000], y[:700000]
#mini_train_data, mini_train_labels = X[:75000], y[:75000]
#mini_dev_data, mini_dev_labels = X[75000:100000], y[75000:100000]
#labels_set = set(mini_dev_labels)
#print(labels_set)
#print(len(labels_set))
#print(train_data[:10])
In [ ]:
### Data sub-setting quality check-point
#print(train_data[:1])
#print(train_labels[:1])
In [ ]: