In [331]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [332]:
train_df = pd.read_csv('train.csv', parse_dates=['Dates'])
#test_df = pd.read_csv('test.csv')
print(train_df.shape)
#print(test_df.shape)
In [333]:
# from datetime import date
# date_x = date(2006, 1, 1)
# train_df = train_df[pd.to_datetime(train_df['Dates'].dt.date) >= date_x]
# print(train_df.shape)
In [334]:
train_df = train_df.sample(frac=0.5, random_state=1)
In [335]:
all_classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
'WARRANTS', 'WEAPON LAWS']
In [336]:
print('Any missing values?')
print('In train: ', pd.isnull(train_df).values.any())
#print('In test: ', pandas.isnull(test).values.any())
In [337]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from pandas.tseries.holiday import Holiday, HolidayCalendarFactory, TH, FR, DateOffset
def transform_date(df):
    """
    Split Dates into Date, Year, Month, Day, DayOfYear, Time, Hour and Minutes
    """
    df['Date'] = pd.to_datetime(df['Dates'].dt.date)
    df['Year'] = df['Dates'].dt.year
    df['Month'] = df['Dates'].dt.month
    df['Day'] = df['Dates'].dt.day
    df['DayOfYear'] = df['Dates'].dt.dayofyear  # the ordinal day of the year
    df['Time'] = df['Dates'].dt.hour + df['Dates'].dt.minute/60  # fractional hour of the day
    df['Hour'] = df['Dates'].dt.hour
    df['Minutes'] = df['Dates'].dt.minute
    del df['Dates']
    return df
def dummy_day_of_week(df):
    # one boolean dummy column per day of the week, e.g. isMonday
    listOfDays = df['DayOfWeek'].unique()
    for day in listOfDays:
        col_name = "is" + day
        df[col_name] = (df['DayOfWeek'] == day)
    return df
def compute_working_days(df):
    """
    Check whether each day is a working day, taking holidays into account:
    New Year, Memorial Day, Independence Day, Labor Day, Thanksgiving, Black Friday, Christmas
    """
    cal = calendar()
    # new rule for Black Friday: the day after the fourth Thursday of November
    USBlackFriday = Holiday('BlackFriday', month=11, day=1,
                            offset=[DateOffset(weekday=TH(4)), DateOffset(1)])
    # create our own holiday calendar from the federal rules plus Black Friday
    ownCal = HolidayCalendarFactory('OwnCalendar', cal, USBlackFriday)
    cal = ownCal()
    holidays = cal.holidays(start='2003-01-01', end='2015-05-13')
    # a working day is neither a weekend day nor a holiday
    df['isWorkingday'] = (~df['DayOfWeek'].isin(['Saturday', 'Sunday']) &
                          ~df['Date'].isin(holidays))
    # transform from True/False into 1/0
    df['isWorkingday'] = df['isWorkingday'].astype(int)
    return df
def compute_night(df):
    # flag records that fall outside an approximate daylight window for each month
    # (thresholds are rough local sunrise/sunset times in fractional hours)
    df['Night'] = True
    df.loc[(df['Month'] == 1) & (df['Time'] > 7.3) & (df['Time'] < 17.3), 'Night'] = False
    df.loc[(df['Month'] == 2) & (df['Time'] > 7.0) & (df['Time'] < 17.8), 'Night'] = False
    df.loc[(df['Month'] == 3) & (df['Time'] > 6.3) & (df['Time'] < 18.3), 'Night'] = False
    df.loc[(df['Month'] == 4) & (df['Time'] > 5.5) & (df['Time'] < 18.8), 'Night'] = False
    df.loc[(df['Month'] == 5) & (df['Time'] > 5.0) & (df['Time'] < 19.3), 'Night'] = False
    df.loc[(df['Month'] == 6) & (df['Time'] > 4.8) & (df['Time'] < 19.5), 'Night'] = False
    df.loc[(df['Month'] == 7) & (df['Time'] > 5.0) & (df['Time'] < 19.5), 'Night'] = False
    df.loc[(df['Month'] == 8) & (df['Time'] > 5.5) & (df['Time'] < 19.0), 'Night'] = False
    df.loc[(df['Month'] == 9) & (df['Time'] > 5.8) & (df['Time'] < 18.3), 'Night'] = False
    df.loc[(df['Month'] == 10) & (df['Time'] > 6.3) & (df['Time'] < 17.5), 'Night'] = False
    df.loc[(df['Month'] == 11) & (df['Time'] > 6.8) & (df['Time'] < 17.0), 'Night'] = False
    df.loc[(df['Month'] == 12) & (df['Time'] > 7.3) & (df['Time'] < 16.9), 'Night'] = False
    # transform from True/False into 1/0
    df['Night'] = df['Night'].astype(int)
    return df
def features_date(df):
    df = transform_date(df)
    df = compute_working_days(df)
    df = compute_night(df)
    df = dummy_day_of_week(df)
    return df
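A quick way to sanity-check these date helpers is to run them on a couple of hand-written rows. The cell below is an added illustrative sketch; the toy timestamps and the selected columns are my own, not part of the original notebook.
In [ ]:
# illustrative only: two fabricated rows with the same column names as the training data
sample = pd.DataFrame({'Dates': pd.to_datetime(['2010-12-25 23:30:00', '2010-06-15 12:00:00']),
                       'DayOfWeek': ['Saturday', 'Tuesday']})
print(features_date(sample)[['Year', 'Month', 'Time', 'isWorkingday', 'Night', 'isSaturday']])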
In [338]:
def categorize_crimes(df):
    otherCrimes = ['WARRANTS', 'OTHER OFFENSES', 'WEAPON LAWS', 'SECONDARY CODES']
    familyCrimes = ['MISSING PERSON', 'RUNAWAY', 'FAMILY OFFENSES', 'SUICIDE']
    violentCrimes = ['ASSAULT', 'SEX OFFENSES FORCIBLE', 'ARSON', 'KIDNAPPING']
    theftBurglary = ['LARCENY/THEFT', 'VEHICLE THEFT', 'ROBBERY', 'BURGLARY', 'STOLEN PROPERTY', 'RECOVERED VEHICLE']
    economicCrimes = ['FORGERY/COUNTERFEITING', 'FRAUD', 'BRIBERY', 'EMBEZZLEMENT', 'BAD CHECKS', 'EXTORTION']
    publicDisorder = ['VANDALISM', 'NON-CRIMINAL', 'SUSPICIOUS OCC', 'TRESPASS', 'PROSTITUTION', 'DISORDERLY CONDUCT',
                      'LOITERING', 'SEX OFFENSES NON FORCIBLE', 'PORNOGRAPHY/OBSCENE MAT', 'TREA', 'GAMBLING']
    drugsAlcohol = ['DRUNKENNESS', 'DRUG/NARCOTIC', 'DRIVING UNDER THE INFLUENCE', 'LIQUOR LAWS']
    df['isOtherCrimes'] = df['Category'].isin(otherCrimes)
    df['isFamilyCrimes'] = df['Category'].isin(familyCrimes)
    df['isViolentCrimes'] = df['Category'].isin(violentCrimes)
    df['isTheftBurglary'] = df['Category'].isin(theftBurglary)
    df['isEconomicCrimes'] = df['Category'].isin(economicCrimes)
    df['isPublicDisorder'] = df['Category'].isin(publicDisorder)
    df['isDrugsAlcohol'] = df['Category'].isin(drugsAlcohol)
    return df
def check_intersection(df):
    # check whether the address is an intersection or not;
    # one column suffices: 1 for an intersection, 0 for a block address
    df['isStreetCorner'] = df['Address'].apply(lambda x: 1 if '/' in x else 0)
    return df
def dummy_districts(df):
    # one boolean dummy column per police district, e.g. isSOUTHERN
    listOfDistricts = df['PdDistrict'].unique()
    for district in listOfDistricts:
        col_name = "is" + district
        df[col_name] = (df['PdDistrict'] == district)
    return df
def features_crime(df):
    #df = categorize_crimes(df)
    df = check_intersection(df)
    df = dummy_districts(df)
    return df
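The intersection flag can be illustrated on the two address formats that occur in the data. The cell below is an added sketch; the sample addresses are typical of the dataset but chosen by me.
In [ ]:
# illustrative only: a '/' marks an intersection such as "OAK ST / LAGUNA ST",
# while block addresses look like "800 Block of BRYANT ST"
print(check_intersection(pd.DataFrame({'Address': ['OAK ST / LAGUNA ST', '800 Block of BRYANT ST']})))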
In [339]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
def xy_clean(df):
    # medians of X and Y by police district
    listOfPrecincts = df['PdDistrict'].unique()
    precinctX = df.groupby('PdDistrict')['X'].median()
    precinctY = df.groupby('PdDistrict')['Y'].median()
    # impute clearly wrong coordinates (Y > 38 lies far outside San Francisco) with the district medians
    for i in range(len(listOfPrecincts)):
        df.loc[(df['Y'] > 38) & (df['PdDistrict'] == listOfPrecincts[i]), 'X'] = \
            precinctX[listOfPrecincts[i]]
        df.loc[(df['Y'] > 38) & (df['PdDistrict'] == listOfPrecincts[i]), 'Y'] = \
            precinctY[listOfPrecincts[i]]
    return df
def xy_scale(df):
    # scale X and Y into the [0, 1] range
    xy_scaler = MinMaxScaler()
    xy_scaler.fit(df[["X", "Y"]])
    df[["X", "Y"]] = xy_scaler.transform(df[["X", "Y"]])
    return df
def xy_rotate(df):
    """
    Rotate coordinates by 30, 45 and 60 degrees
    (0.866 = cos 30, 0.707 = cos 45 = sin 45, 0.5 = cos 60)
    """
    df["rot45_X"] = 0.707 * df["Y"] + 0.707 * df["X"]
    df["rot45_Y"] = 0.707 * df["Y"] - 0.707 * df["X"]
    df["rot30_X"] = 0.866 * df["X"] + 0.5 * df["Y"]
    df["rot30_Y"] = 0.866 * df["Y"] - 0.5 * df["X"]
    df["rot60_X"] = 0.5 * df["X"] + 0.866 * df["Y"]
    df["rot60_Y"] = 0.5 * df["Y"] - 0.866 * df["X"]
    return df
def xy_distance_from_center(df):
    """
    Compute the radial distance from the origin of the scaled map
    """
    df["radial_r"] = np.sqrt(np.power(df["Y"], 2) + np.power(df["X"], 2))
    return df
def xy_distance_from_crime_median(df, df_test=None):
    # medians of X and Y by crime category
    listOfCrimes = df['Category'].unique()
    crimeX = df.groupby('Category')['X'].median()
    crimeY = df.groupby('Category')['Y'].median()
    if df_test is None:
        # distance of each point from the crime-category median, one column per category
        for crime_name in listOfCrimes:
            df[crime_name + "_r"] = np.sqrt(np.power(df["Y"] - crimeY[crime_name], 2) +
                                            np.power(df["X"] - crimeX[crime_name], 2))
        return df
    else:
        # same distances for the test data, using the medians computed on the training data
        for crime_name in listOfCrimes:
            df_test[crime_name + "_r"] = np.sqrt(np.power(df_test["Y"] - crimeY[crime_name], 2) +
                                                 np.power(df_test["X"] - crimeX[crime_name], 2))
        return df_test
def create_grid_xy(df, x_axis=5, y_axis=5):
    """
    Overlay an x_axis-by-y_axis grid on the scaled coordinates and create one
    boolean column per grid cell. Cells are left-closed/right-open, except that
    the last cell along each axis also includes its upper edge.
    """
    step_x = 1.0 / x_axis
    step_y = 1.0 / y_axis
    for i in range(x_axis):
        for j in range(y_axis):
            bin_name = 'Bin' + str(i) + '_' + str(j)
            if i == x_axis - 1 and j != y_axis - 1:
                df[bin_name] = ((df['X'] >= i * step_x) & (df['X'] <= (i + 1) * step_x) &
                                (df['Y'] >= j * step_y) & (df['Y'] < (j + 1) * step_y))
            elif i != x_axis - 1 and j == y_axis - 1:
                df[bin_name] = ((df['X'] >= i * step_x) & (df['X'] < (i + 1) * step_x) &
                                (df['Y'] >= j * step_y) & (df['Y'] <= (j + 1) * step_y))
            elif i == x_axis - 1 and j == y_axis - 1:
                df[bin_name] = ((df['X'] >= i * step_x) & (df['X'] <= (i + 1) * step_x) &
                                (df['Y'] >= j * step_y) & (df['Y'] <= (j + 1) * step_y))
            else:
                df[bin_name] = ((df['X'] >= i * step_x) & (df['X'] < (i + 1) * step_x) &
                                (df['Y'] >= j * step_y) & (df['Y'] < (j + 1) * step_y))
    return df
def features_xy(df):
    df = xy_clean(df)
    df = xy_scale(df)
    df = xy_rotate(df)
    df = xy_distance_from_center(df)
    df = xy_distance_from_crime_median(df)
    df = create_grid_xy(df, x_axis=5, y_axis=5)
    return df
In [340]:
def label_encode(df):
    """
    Label encode PdDistrict and DayOfWeek
    """
    lab_enc = LabelEncoder()
    df['PdDistrict'] = lab_enc.fit_transform(df.PdDistrict)
    df['DayOfWeek'] = lab_enc.fit_transform(df.DayOfWeek)
    #df['Bin'] = lab_enc.fit_transform(df.Bin)
    return df
In [341]:
train_df = features_date(train_df)
train_df = features_crime(train_df)
train_df = features_xy(train_df)
train_df = label_encode(train_df)
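Optional sanity checks on the engineered features; this cell is an added sketch and was not part of the original run. After min-max scaling, X and Y should lie in [0, 1], and every row should fall into exactly one grid cell.
In [ ]:
# hedged verification of the coordinate features built above
print(train_df[['X', 'Y']].describe().loc[['min', 'max']])
bin_cols = [col for col in train_df.columns if col.startswith('Bin')]
print('every row in exactly one grid cell:', (train_df[bin_cols].sum(axis=1) == 1).all())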
In [342]:
train_df = train_df.drop('Resolution', axis=1)
train_df = train_df.drop('Descript', axis=1)
train_df = train_df.drop('Date', axis=1)
train_df = train_df.drop('Address', axis=1)
train_df = train_df.drop('PdDistrict', axis=1)
#train_df = train_df.drop('DayOfWeek', axis=1)
#train_df = train_df.drop('Year', axis=1)
In [343]:
best_features_sorted = ['Time','DayOfYear','Day','Minutes', 'Year','X','rot60_Y','Y','RUNAWAY_r','rot45_Y',
'rot30_X', 'rot30_Y','rot60_X','rot45_X','TREA_r','radial_r','RECOVERED VEHICLE_r','DRUG/NARCOTIC_r',
'VEHICLE THEFT_r','FAMILY OFFENSES_r', 'PORNOGRAPHY/OBSCENE MAT_r','SEX OFFENSES NON FORCIBLE_r','BURGLARY_r',
'isStreetCorner','TRESPASS_r','MISSING PERSON_r','GAMBLING_r','LARCENY/THEFT_r','VANDALISM_r','isWorkingday',
'ARSON_r','SECONDARY CODES_r','BRIBERY_r','DRIVING UNDER THE INFLUENCE_r', 'Month','WEAPON LAWS_r', 'KIDNAPPING_r',
'OTHER OFFENSES_r', 'EXTORTION_r', 'Night', 'SUICIDE_r', 'STOLEN PROPERTY_r', 'PROSTITUTION_r', 'WARRANTS_r', 'BAD CHECKS_r',
'isFriday','FORGERY/COUNTERFEITING_r', 'LOITERING_r', 'isWednesday', 'DISORDERLY CONDUCT_r', 'NON-CRIMINAL_r',
'isSunday', 'isMonday', 'FRAUD_r', 'isTuesday', 'EMBEZZLEMENT_r', 'isSaturday', 'isThursday', 'ROBBERY_r',
'ASSAULT_r', 'DRUNKENNESS_r','SUSPICIOUS OCC_r','LIQUOR LAWS_r','SEX OFFENSES FORCIBLE_r','isTENDERLOIN',
'Bin2_3','isNORTHERN','Bin2_1','Bin1_1','Bin1_2','isINGLESIDE','isSOUTHERN','isPARK','isCENTRAL','Bin3_3',
'Bin4_1','Bin3_1','Bin2_0','Bin1_3','Bin3_0','Bin2_2','Bin1_0','isRICHMOND','Bin0_1','Bin3_4','Bin4_2','isBAYVIEW',
'Bin0_3','Bin0_2','Bin4_3','Bin3_2','Bin2_4','isTARAVAL','isMISSION','Bin4_0','Bin0_0']
In [344]:
y_name = 'Category'
X_names = [col for col in train_df.columns.values if col != y_name and col in best_features_sorted]
X = train_df[X_names]
y = train_df[y_name]
print(X.shape)
print(y.shape)
In [345]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
In [346]:
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_tree
def xgb_probs(X_train, y_train, X_test):
    xgb_model = XGBClassifier(n_estimators=20,
                              learning_rate=0.2,
                              max_depth=11,
                              min_child_weight=4,
                              gamma=0.4,
                              reg_alpha=0.05,
                              reg_lambda=2,
                              subsample=1.0,
                              colsample_bytree=1.0,
                              max_delta_step=1,
                              scale_pos_weight=1,
                              objective='multi:softprob',
                              nthread=8,
                              seed=0  # ,
                              # silent=False
                              )
    xgb_model.fit(X_train, y_train)
    y_probs = xgb_model.predict_proba(X_test)
    return (y_probs, xgb_model)
In [347]:
from sklearn.ensemble import RandomForestClassifier
def rf_probs(X_train, y_train, X_test):
    rf_model = RandomForestClassifier(n_estimators=10,
                                      criterion='gini',
                                      max_depth=None,
                                      min_samples_split=2,
                                      min_samples_leaf=1,
                                      min_weight_fraction_leaf=0.0,
                                      max_features='auto',
                                      max_leaf_nodes=None,
                                      min_impurity_split=1e-07,
                                      bootstrap=True,
                                      oob_score=False,
                                      n_jobs=4,
                                      random_state=None,
                                      verbose=0,
                                      warm_start=False,
                                      class_weight=None)
    rf_model.fit(X_train, y_train)
    y_probs = rf_model.predict_proba(X_test)
    return y_probs, rf_model
In [348]:
from sklearn.ensemble import GradientBoostingClassifier
def gb_probs(X_train, y_train, X_test):
    gb_model = GradientBoostingClassifier(loss='deviance',
                                          learning_rate=0.1,
                                          n_estimators=100,
                                          subsample=1.0,
                                          criterion='friedman_mse',
                                          min_samples_split=2,
                                          min_samples_leaf=1,
                                          min_weight_fraction_leaf=0.0,
                                          max_depth=3,
                                          min_impurity_split=1e-07,
                                          init=None,
                                          random_state=None,
                                          max_features=None,
                                          verbose=0,
                                          max_leaf_nodes=None,
                                          warm_start=False,
                                          presort='auto')
    gb_model.fit(X_train, y_train)
    y_probs = gb_model.predict_proba(X_test)
    return y_probs, gb_model
In [358]:
from sklearn.neural_network import MLPClassifier
def nn_probs(X_train, y_train, X_test):
    nn_model = MLPClassifier(hidden_layer_sizes=(200, 180, 200),
                             activation='logistic',
                             solver='adam',
                             alpha=0.0001,
                             batch_size='auto',
                             learning_rate='constant',
                             learning_rate_init=0.001,
                             power_t=0.5, max_iter=200,
                             shuffle=True,
                             random_state=None,
                             tol=0.0001,
                             verbose=False,
                             warm_start=False,
                             momentum=0.9,
                             nesterovs_momentum=True,
                             early_stopping=False,
                             validation_fraction=0.1,
                             beta_1=0.9,
                             beta_2=0.999,
                             epsilon=1e-08)
    nn_model.fit(X_train, y_train)
    y_probs = nn_model.predict_proba(X_test)
    return y_probs, nn_model
In [ ]:
from sklearn.naive_bayes import MultinomialNB
def nb_probs(X_train, y_train, X_test):
    # note: MultinomialNB expects non-negative feature values
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)
    y_probs = nb_model.predict_proba(X_test)
    return y_probs, nb_model
In [362]:
from sklearn.metrics import log_loss
y_probs, model = xgb_probs(X_train, y_train, X_test)
score = log_loss(y_test, y_probs, labels=all_classes)
print("Score: {}".format(score))
In [352]:
feat_imp = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)
plt.figure(figsize=(20,10))
feat_imp.plot(kind='bar', title='Feature Importances')
plt.show()
In [354]:
features = list(feat_imp.index)
features
Out[354]: