In [1]:
from types import NoneType
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import BernoulliNB
import matplotlib.pyplot as plt
import gzip

%matplotlib inline


//anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [2]:
# Column labels produced by convert_data() that are used as model features.
YEARS = list(range(2003, 2016))
# Months are stored negated (-1 .. -12): convert_data() negates the month before
# one-hot encoding it, which keeps month labels distinct from the hour labels 0-23.
MONTHS = [-month for month in range(1, 13)]
HOURS = list(range(24))
DAYS_OF_WEEK = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
PD_DISTRICT = ['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
# Resolution of the spatial grid: NX columns by NY rows.
NX = 1000
NY = 1000
# One label per grid cell: 'B0' .. 'B<NX*NY - 1>'.
COORDS = ['B%d' % cell for cell in range(NX * NY)]

In [3]:
def convert_data(data, limits):
    """Build the one-hot feature matrix for a crime data frame.

    Each categorical variable (year, month, hour, weekday, police district and
    grid cell) is expanded into indicator columns with ``pd.get_dummies``; the
    resulting blocks are concatenated side by side.

    data   -- frame with ``Dates`` (datetime), ``DayOfWeek``, ``PdDistrict``,
              and the ``X``/``Y`` columns read by get_coordinates_variable()
    limits -- bounding box forwarded to get_coordinates_variable()

    Returns a DataFrame whose columns are, in order: years, negated months,
    hours, weekday names, district names and grid-cell labels.
    """
    timestamps = data.Dates.dt
    indicator_blocks = [
        pd.get_dummies(timestamps.year),
        # Month is negated so its labels do not clash with the hour labels.
        pd.get_dummies(-timestamps.month),
        pd.get_dummies(timestamps.hour),
        pd.get_dummies(data.DayOfWeek),
        pd.get_dummies(data.PdDistrict),
        pd.get_dummies(get_coordinates_variable(data, limits)),
    ]
    return pd.concat(indicator_blocks, axis=1)

def get_coordinates_variable(data, limits, nx=None, ny=None):
    """Label every row of ``data`` with the grid cell that contains its point.

    The bounding box given by ``limits`` is divided into ``nx`` columns and
    ``ny`` rows. A point in column ``c`` and row ``r`` receives the label
    ``'B' + str(c + r * nx)``, so labels run from ``'B0'`` to
    ``'B' + str(nx * ny - 1)``.

    Parameters
    ----------
    data : object exposing ``X`` and ``Y`` as pandas Series of equal length
    limits : sequence ``(x_min, x_max, y_min, y_max)``
    nx, ny : int, optional
        Grid resolution; default to the module-level ``NX`` and ``NY``.

    Returns
    -------
    list of str
        One cell label per row, in row order.
    """
    nx = NX if nx is None else nx
    ny = NY if ny is None else ny
    x_min, x_max, y_min, y_max = limits[0], limits[1], limits[2], limits[3]
    columns = _grid_index(data.X, x_min, x_max, nx)
    rows = _grid_index(data.Y, y_min, y_max, ny)
    return ['B' + str(cell) for cell in columns + rows * nx]


def _grid_index(values, lower, upper, n_cells):
    """Return, as an integer array, the bin in [0, n_cells - 1] of each value."""
    # float() keeps the division exact even if integer limits are passed (Python 2).
    step = (upper - lower) / float(n_cells)
    if step == 0:
        # Degenerate box: every point shares the single bin 0.
        return (values * 0).astype(int).values
    # Truncate to the bin number, then clip so that a point lying exactly on the
    # upper limit lands in the last bin instead of spilling into the next row.
    return ((values - lower) / step).astype(int).clip(0, n_cells - 1).values

def compute_coordinates_limits(train, test):
    """Return the bounding box of all points in ``train`` and ``test``.

    Both arguments must expose ``X`` and ``Y`` as pandas Series. Missing values
    are ignored, so a NaN coordinate cannot turn a limit into NaN.

    Returns the tuple ``(x_min, x_max, y_min, y_max)``.
    """
    # pandas reductions are vectorised and skip NaN by default.
    all_x = pd.concat([train.X, test.X])
    all_y = pd.concat([train.Y, test.Y])
    return all_x.min(), all_x.max(), all_y.min(), all_y.max()

In [5]:
# Load data
# The path is handed straight to pandas, which opens, decompresses and closes
# the gzip file itself, so no file handle is left open.
train = pd.read_csv('../input/train.csv.gz', compression='gzip', parse_dates=['Dates'])
test = pd.read_csv('../input/test.csv.gz', compression='gzip', parse_dates=['Dates'])

In [6]:
# Bounding box shared by both data sets, so train and test use the same grid
coord_limits = compute_coordinates_limits(train, test)

# One-hot feature matrices for train and test
train_data = convert_data(train, coord_limits)
test_data = convert_data(test, coord_limits)

# Encode the crime categories as integer class labels and attach them as the target
crime_label_encoder = preprocessing.LabelEncoder()
crime_label = crime_label_encoder.fit_transform(train.Category)
train_data['crime'] = crime_label

# Features: year, month, hour, weekday, district and grid cell.
# Only grid cells that occur as columns in both train and test are kept.
COORDS = [x for x in COORDS if x in train_data.keys() and x in test_data.keys()]
features = YEARS + MONTHS + HOURS + DAYS_OF_WEEK + PD_DISTRICT + COORDS

In [7]:
# Fit a Bernoulli naive Bayes classifier on the binary indicator features
model = BernoulliNB().fit(train_data[features], train_data['crime'])

# Per-class probabilities for every test row
predicted = model.predict_proba(test_data[features])

# One column per crime category; the row index is written out as the 'Id' column
result = pd.DataFrame(data=predicted, columns=crime_label_encoder.classes_)
result.to_csv('kaggle_bernoulli.csv', index_label='Id')

In [ ]: