In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

try:
    # train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
    # the old sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [ ]:
# Read the raw train/test CSVs, asking pandas to parse the "Dates" column
# as datetimes while loading.
training, testing = (
    pd.read_csv(csv_path, parse_dates=["Dates"])
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [ ]:
# Map each crime Category string to an integer class id.
# Note: despite the "OHE" in its name, crime_OHE is a LabelEncoder
# (integer codes), not a one-hot encoder.
crime_OHE = preprocessing.LabelEncoder().fit(training.Category)
crime_labels = crime_OHE.transform(training.Category)

In [2]:
def OHE_crime(df):
    """One-hot encode the day-of-week and police-district columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing ``DayOfWeek`` and ``PdDistrict`` columns.

    Returns
    -------
    pandas.DataFrame
        The indicator columns for ``DayOfWeek`` followed by those for
        ``PdDistrict``, concatenated side by side. No other column of
        ``df`` is carried over, and ``df`` itself is not modified.
    """
    encoded_parts = [
        pd.get_dummies(df[column]) for column in ("DayOfWeek", "PdDistrict")
    ]
    return pd.concat(encoded_parts, axis=1)

In [4]:
# Feature matrix for the training rows: one-hot day-of-week and district.
training_OHE = training.pipe(OHE_crime)

In [ ]:
#Build new testing set
# pd.get_dummies only creates columns for values present in the frame it is
# given, so encoding train and test independently can yield different column
# sets or orders. Reindex onto the training columns (filling absent ones with
# 0) so the test matrix has exactly the layout the classifier is fitted on.
testing_OHE = OHE_crime(testing).reindex(columns=training_OHE.columns, fill_value=0)

In [ ]:
# Keep 65% of the encoded rows for fitting and hold out the rest for validation.
# random_state pins the shuffle so the validation score is reproducible on
# Restart & Run All.
train, validation, train_labels, validation_labels = train_test_split(
    training_OHE, crime_labels, train_size=0.65, random_state=42
)

In [ ]:
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [ ]:
# Classifier under evaluation. The commented-out assignments are alternative
# models (imported in the cell above) that can be swapped in by changing which
# line is active.
#clf = BernoulliNB()
clf = LogisticRegression()
#clf= RandomForestClassifier()

In [ ]:
# Fit on the training split, then predict class probabilities for the
# held-out validation rows.
clf.fit(train, train_labels)
predicted = clf.predict_proba(validation)
# log_loss requires both the true labels and the predicted probabilities.
# labels=clf.classes_ ties the columns of `predicted` to the class ids the
# model was fitted on, so scoring still works when a rare category happens to
# be absent from the validation split.
log_loss(validation_labels, predicted, labels=clf.classes_)

In [ ]:
#Write results
# Predict per-class probabilities for the test rows and label each column with
# the original Category string. predict_proba orders its columns by
# clf.classes_ (integer ids), so those ids are mapped back through the fitted
# LabelEncoder (crime_OHE) to recover the names.
results_predicted = clf.predict_proba(testing_OHE)
results = pd.DataFrame(
    results_predicted,
    columns=crime_OHE.inverse_transform(clf.classes_),
)
results.to_csv('results/testResults.csv', index=True, index_label='Id')