In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split
In [ ]:
# Read the train and test CSVs, parsing the "Dates" column as datetimes.
training, testing = (
    pd.read_csv(csv_path, parse_dates=["Dates"])
    for csv_path in ("data/train.csv", "data/test.csv")
)
In [ ]:
# Map each crime Category string to an integer class label.
# Despite the variable name, this is a LabelEncoder, not a one-hot encoder.
crime_OHE = preprocessing.LabelEncoder().fit(training["Category"])
crime_labels = crime_OHE.transform(training["Category"])
In [2]:
def OHE_crime(df):
    """One-hot encode the day-of-week and police-district columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``DayOfWeek`` and ``PdDistrict`` columns.

    Returns
    -------
    pandas.DataFrame
        Indicator columns for each distinct ``DayOfWeek`` value followed by
        indicator columns for each distinct ``PdDistrict`` value.
    """
    days = pd.get_dummies(df["DayOfWeek"])
    district = pd.get_dummies(df["PdDistrict"])
    # Only categories present in ``df`` get a column, so two different frames
    # can yield different column sets.
    return pd.concat([days, district], axis=1)
In [4]:
# Build the training feature matrix: one-hot DayOfWeek and PdDistrict columns.
training_OHE = OHE_crime(training)
In [ ]:
# Build the testing feature matrix and force it onto the training columns:
# a category absent from the test data becomes an all-zero column, and the
# column order matches the frame the classifier's training split is drawn from.
testing_OHE = OHE_crime(testing).reindex(columns=training_OHE.columns, fill_value=0)
In [ ]:
# Hold out 35% of the rows for validation; a fixed random_state makes the
# split (and therefore the reported score) reproducible across runs.
train, validation, train_labels, validation_labels = train_test_split(
    training_OHE,
    crime_labels,
    train_size=0.65,
    random_state=42,
)
In [ ]:
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
In [ ]:
# Classifier under evaluation. The commented-out lines are alternative models
# (imported in this notebook) that can be swapped in by uncommenting one.
#clf = BernoulliNB()
clf = LogisticRegression()
#clf= RandomForestClassifier()
In [ ]:
# Fit on the training split, then score the held-out split with multiclass log loss.
clf.fit(train, train_labels)
predicted = clf.predict_proba(validation)
# predict_proba columns follow clf.classes_; passing them as `labels` keeps
# log_loss well-defined even when the validation split lacks some class.
log_loss(validation_labels, predicted, labels=clf.classes_)
In [ ]:
# Write results: one probability column per crime category, one row per test record.
results_predicted = clf.predict_proba(testing_OHE)
# Probability columns are ordered by clf.classes_ (the encoded labels), so map
# those back to category names with the encoder fitted on training.Category.
results = pd.DataFrame(
    results_predicted,
    columns=crime_OHE.inverse_transform(clf.classes_),
)
results.to_csv('results/testResults.csv', index=True, index_label='Id')