From 1934 to 1963, San Francisco was infamous for housing some of the world's most notorious criminals on the inescapable island of Alcatraz.
Today, the city is known more for its tech scene than its criminal past. But, with rising wealth inequality, housing shortages, and a proliferation of expensive digital toys riding BART to work, there is no scarcity of crime in the city by the bay.
From Sunset to SOMA, and Marina to Excelsior, this competition's dataset provides nearly 12 years of crime reports from across all of San Francisco's neighborhoods. Given time and location, you must predict the category of crime that occurred.
In [1]:
__author__ = 'alaa'
# Step 1 - import the libraries used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# show plots inline
%matplotlib inline
In [3]:
# Global constants and variables
TRAIN_FILENAME = 'train.csv'
TEST_FILENAME = 'test.csv'
train = pd.read_csv('../input/' + TRAIN_FILENAME, parse_dates=['Dates'],
                    dtype={'X': np.float64, 'Y': np.float64})
test = pd.read_csv('../input/' + TEST_FILENAME, parse_dates=['Dates'],
                   dtype={'X': np.float64, 'Y': np.float64})
In [4]:
train.info()
# Drop the free-text columns; Descript and Resolution exist only in the training set
train = train.drop(['Descript', 'Resolution', 'Address'], axis=1)
test = test.drop(['Address'], axis=1)
In [5]:
def feature_engineering(data):
    # Break the timestamp into calendar fields the model can use
    data['Day'] = data['Dates'].dt.day
    data['Month'] = data['Dates'].dt.month
    data['Year'] = data['Dates'].dt.year
    data['Hour'] = data['Dates'].dt.hour
    data['Minute'] = data['Dates'].dt.minute
    # Overwrite the string day name with a numeric code (Monday=0)
    data['DayOfWeek'] = data['Dates'].dt.dayofweek
    # dt.weekofyear is deprecated in recent pandas; dt.isocalendar().week is the replacement
    data['WeekOfYear'] = data['Dates'].dt.weekofyear
    return data
train = feature_engineering(train)
test = feature_engineering(test)
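Hour and Month wrap around (23:00 sits next to 00:00), so one optional refinement is to project them onto sine/cosine pairs. A minimal sketch of that encoding, not part of the pipeline above; the HourSin/HourCos column names are illustrative:
In [ ]:
# Optional: cyclical encoding of periodic time features (illustrative only)
def add_cyclical_features(data):
    # Map hour-of-day onto the unit circle so 23:00 and 00:00 end up adjacent
    data['HourSin'] = np.sin(2 * np.pi * data['Hour'] / 24)
    data['HourCos'] = np.cos(2 * np.pi * data['Hour'] / 24)
    # Same idea for month-of-year (period 12)
    data['MonthSin'] = np.sin(2 * np.pi * data['Month'] / 12)
    data['MonthCos'] = np.cos(2 * np.pi * data['Month'] / 12)
    return data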
In [6]:
from sklearn.preprocessing import LabelEncoder
# Use a single encoder for PdDistrict so train and test share one mapping
# (fitting a separate encoder on test only works if both sets happen to
# contain the same sorted categories)
district_encoder = LabelEncoder()
train['PdDistrict'] = district_encoder.fit_transform(train['PdDistrict'])
test['PdDistrict'] = district_encoder.transform(test['PdDistrict'])
# Encode the target; keep the encoder around to invert predictions later
category_encoder = LabelEncoder()
train['CategoryEncoded'] = category_encoder.fit_transform(train['Category'])
print(category_encoder.classes_)
print(train.columns)
print(test.columns)
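One caveat with LabelEncoder: transform raises a ValueError on labels it never saw during fit. That cannot happen here (train and test contain the same districts), but a defensive variant is easy to sketch; encode_districts_safe is a hypothetical helper, not used above:
In [ ]:
# Hypothetical guard: map districts unseen at fit time to a known fallback value
def encode_districts_safe(series, encoder):
    known = set(encoder.classes_)
    safe = series.where(series.isin(known), encoder.classes_[0])
    return encoder.transform(safe)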
In [7]:
# Columns 2:12 span the feature columns (skipping Dates, Category, and CategoryEncoded)
x_cols = list(train.columns[2:12].values)
# Exclude Minute from the feature set
x_cols.remove('Minute')
print(x_cols)
In [8]:
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection instead
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Reuse the feature list selected above
predictors = list(x_cols)
print(predictors)
# Initialize the algorithm with near-default parameters:
# n_estimators is the number of trees in the forest;
# min_samples_split is the minimum number of rows required to split a node;
# min_samples_leaf is the minimum number of samples allowed at a leaf
# (the latter two stay at their defaults here and are set explicitly below)
alg = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(alg, train[predictors], train['CategoryEncoded'], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
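The competition itself is scored on multi-class log loss rather than the accuracy that cross_val_score reports by default. A sketch of evaluating against that metric instead, using sklearn's built-in 'neg_log_loss' scorer (the sign is flipped so that higher is better):
In [ ]:
# Evaluate with the competition metric (multi-class log loss) instead of accuracy
logloss = cross_val_score(alg, train[predictors], train['CategoryEncoded'],
                          cv=3, scoring='neg_log_loss')
print(-logloss.mean())  # flip the sign back to a plain log loss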
In [9]:
from sklearn.feature_selection import SelectKBest, f_classif
# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(train[predictors], train["CategoryEncoded"])
# Get the raw p-values for each feature and turn them into scores
scores = -np.log10(selector.pvalues_)
# Plot the scores to see which features look most predictive
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
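The F-test scores each feature in isolation; the forest's own impurity-based importances are a quick cross-check that reflects the splits the trees actually make. A minimal sketch, refitting the 10-tree forest from above:
In [ ]:
# Cross-check with the forest's impurity-based feature importances
alg.fit(train[predictors], train['CategoryEncoded'])
for name, importance in sorted(zip(predictors, alg.feature_importances_),
                               key=lambda pair: -pair[1]):
    print('{:12s} {:.3f}'.format(name, importance))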
In [10]:
# Pick only the four best features.
predictors = ["Y", "Day", "Month","WeekOfYear"]
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4)
scores = cross_validation.cross_val_score(alg, train[predictors], train["CategoryEncoded"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
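The hyperparameters above (150 trees, min_samples_split=8, min_samples_leaf=4) are hand-picked; a small grid search is one way to tune them systematically. A sketch assuming sklearn's GridSearchCV; the grid values are illustrative guesses, not tuned results:
In [ ]:
from sklearn.model_selection import GridSearchCV

# Illustrative grid; expect this to be slow on the full training set
param_grid = {'n_estimators': [50, 150], 'min_samples_leaf': [2, 4, 8]}
search = GridSearchCV(RandomForestClassifier(random_state=1),
                      param_grid, cv=3, scoring='neg_log_loss')
search.fit(train[predictors], train['CategoryEncoded'])
print(search.best_params_, search.best_score_)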
In [12]:
alg.fit(train[predictors], train['CategoryEncoded'])
test['predictions'] = alg.predict(test[predictors])
test['Category'] = category_encoder.inverse_transform(test['predictions'])
test.tail()
In [14]:
# Build a one-hot submission: one column per category, 1 for the predicted class
y = train['Category'].astype('category')
submit = pd.DataFrame({'Id': test.Id.tolist()})
for category in y.cat.categories:
    submit[category] = np.where(test.Category == category, 1, 0)
submit.to_csv('kaggle_random_forest.csv', index=False)
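Because the leaderboard metric is log loss, submitting the forest's class probabilities instead of hard 0/1 labels generally scores better. A sketch of a probability-based submission; the column order follows alg.classes_, which category_encoder maps back to the category names:
In [ ]:
# Alternative submission: one probability column per crime category
probs = alg.predict_proba(test[predictors])
submit_proba = pd.DataFrame(probs,
                            columns=category_encoder.inverse_transform(alg.classes_))
submit_proba.insert(0, 'Id', test['Id'])
submit_proba.to_csv('kaggle_random_forest_proba.csv', index=False)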