In this exercise we create an improved version of last year's model.
Please be aware that the following packages and additional assets have to be installed in the conda environment to guarantee proper execution of this notebook.
In [1]:
    
# imports
import math
import datetime
import matplotlib
import matplotlib.pyplot as plt
import osmnx as ox
import pandas as pd
import numpy as np
import pprint
import requests
import gmaps
import seaborn as sns
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.neural_network import MLPClassifier
    
First we load and explore the dataset a little.
In [2]:
    
train_data = pd.read_csv("../../data/raw/train.csv")
train_data['Dates'] = pd.to_datetime(train_data['Dates'])
test_data = pd.read_csv("../../data/raw/test.csv")
test_data['Dates'] = pd.to_datetime(test_data['Dates'])
    
In [3]:
    
print("Size of train_data: ", len(train_data))
print("Size of test_data: ", len(test_data))
    
    
In [4]:
    
train_data.head()
    
    Out[4]:
There are strong differences in the frequencies with which the different categories of crime occur. Larceny/Theft makes up most of the committed crimes, while other, possibly minor offenses form the second most frequent category.
In [5]:
    
# visualizing category distribution of the crimes
#crimes = train_data['Category'].unique()
#print("Categories:", crimes)
#train_data['Category'].value_counts(sort=False).plot.bar()
#plt.show()
    
Next we take a look at the concentration of criminal activity for 2010-2015, which gives us a hint that crimes are most concentrated around Union Square.
In [6]:
    
# load private google api key
with open("./assets/gapi.key", 'r') as file:
    key = file.read()
gmaps.configure(api_key=key)
    
In [7]:
    
# Creating a location subset from the most current crimes (2010-2015) for heatmap visualization
import datetime
start_date = datetime.date(2010,1,1)
end_date = datetime.date(2016,1,1)
date_mask = (train_data['Dates'] > start_date) & (train_data['Dates'] <= end_date)
location_subset = train_data.loc[date_mask][['Y', 'X']]
locations = [tuple(x) for x in location_subset.values]
fig = gmaps.figure()
fig.add_layer(gmaps.heatmap_layer(locations))
fig
    
    
 
 
Next we look at the locations of different crime categories to examine whether there are category-specific spatial patterns in the underlying data.
In [8]:
    
# Manual Inspection of specific incidents to check if different crime types are locally bounded
def draw_category_gmap(category, start_date, end_date):
    specific_incidents = train_data.loc[train_data['Category']==category]
    date_mask = (specific_incidents['Dates'] > start_date) & (specific_incidents['Dates'] <= end_date)
    location_subset = specific_incidents.loc[date_mask][['Y', 'X']]
    locations = [tuple(x) for x in location_subset.values]
    spec_fig = gmaps.figure()
    # configure intensity and radius on the layer instance
    # (setting them on the gmaps.heatmap_layer factory itself has no effect)
    heatmap = gmaps.heatmap_layer(locations)
    heatmap.max_intensity = 1000
    heatmap.point_radius = 2
    spec_fig.add_layer(heatmap)
    return spec_fig
    
In [9]:
    
draw_category_gmap('KIDNAPPING', datetime.date(2010,1,1), datetime.date(2016,1,1))
    
    
 
 
In [10]:
    
draw_category_gmap('WARRANTS', datetime.date(2010,1,1), datetime.date(2016,1,1))
    
    
 
 
In [11]:
    
draw_category_gmap('DRUNKENNESS', datetime.date(2010,1,1), datetime.date(2016,1,1))
    
    
 
 
Most crimes are concentrated around Union Square and, at lower density, spread across the rest of San Francisco, whereas PROSTITUTION is well contained in two areas.
In the following we present the different features that we combine in our data preparation method.
In [12]:
    
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
# some functions for time based features
first_sixth = datetime.time(23,59,59)
second_sixth = datetime.time(3,59,59)
third_sixth = datetime.time(7,59,59)
fourth_sixth = datetime.time(11,59,59)
fifth_sixth = datetime.time(15,59,59)
sixth_sixth = datetime.time(19,59,59)
cal = calendar()
holiday_timestamps = cal.holidays(start='2003-01-01', end='2015-05-13')
holidays = []
for t in holiday_timestamps:
    holidays.append(t.date())
holidays = set(holidays)
def get_halfhour(minute):
    if minute < 30:
        return 0
    else:
        return 1
def get_daynight(hour):
    if 5 < hour and hour < 23:
        return 0
    else:
        return 1
def generate_day_sixths(times):
    """This function has to be executed on the original datetime features"""
    def day_sixths(dt):
        """
        Mapping time to sixths of a day
        """
        if dt.time() <= second_sixth:  # includes midnight
            return 0/6
        if dt.time() > second_sixth and dt.time() <= third_sixth:
            return 1/6
        if dt.time() > third_sixth and dt.time() <= fourth_sixth:
            return 2/6
        if dt.time() > fourth_sixth and dt.time() <= fifth_sixth:
            return 3/6
        if dt.time() > fifth_sixth and dt.time() <= sixth_sixth:
            return 4/6
        if dt.time() > sixth_sixth and dt.time() <= first_sixth:
            return 5/6
        
    return times.map(day_sixths)
def get_holiday(day, date):
    if day == "Sunday" or date in holidays:
        return 1
    else:
        return 0
    
def generate_time_features(data):
    times = data["Dates"]    
    days = data["DayOfWeek"]
    #perhaps try one hot encoding for some of the series.    
    minute_series = pd.Series([x.minute for x in times], name='minute')
    # halfhour_series = pd.Series([get_halfhour(x.minute) for x in times], name='halfhour')
    # hour_series = pd.Series([x.hour for x in times], name='hour')
    daynight_series = pd.Series([get_daynight(x.hour) for x in times], name='day_night')
    day_series = pd.Series([x.day for x in times], name='day')
    month_series = pd.Series([x.month for x in times], name='month')
    year_series = pd.Series([x.year for x in times], name='year')
    # sixths = pd.Series(generate_day_sixths(times), name='day_sixths')
    day_of_week = pd.get_dummies(days)
    is_holiday = pd.Series([get_holiday(days[i], times[i].date()) for i in range(len(times))], name='is_holiday')
    minute_one_hot = pd.get_dummies(minute_series)
    # better than phase and hour if no one hot encoding is used
    rel_time = pd.Series([(x.hour + x.minute / 60) for x in times], name='rel_time')
    
    time_features = pd.concat([minute_one_hot, rel_time, day_of_week, is_holiday, daynight_series,
                               day_series, month_series, year_series], axis=1)
    return time_features
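    
As a quick sanity check of the time helpers above, the following unexecuted cell maps a few hand-picked example values (assumed for illustration, not taken from the dataset) through get_daynight, get_halfhour and get_holiday:
In [ ]:
    
# sanity check with hand-picked example values (assumed, not from the dataset)
example_dt = datetime.datetime(2014, 7, 4, 23, 30)        # Independence Day, late evening
print(get_daynight(example_dt.hour))                      # 1 -> night (hours 6-22 count as day)
print(get_halfhour(example_dt.minute))                    # 1 -> second half of the hour
print(get_holiday("Friday", example_dt.date()))           # 1 -> federal holiday
print(get_holiday("Tuesday", datetime.date(2014, 7, 8)))  # 0 -> ordinary weekday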
    
In [13]:
    
# show the structure of our time based features
#time_features = generate_time_features(train_data)
#time_features['sixths'] = generate_day_sixths(train_data["Dates"])
#time_features.head()
    
As in the presentation, we create a grid that covers San Francisco and calculate the distribution of the different crime categories for each cell. As the heatmaps above show, there are certain hot spots for different categories of crime.
In [14]:
    
# We define a bounding box around San Francisco
min_x = -122.53
max_x = -122.35
min_y = 37.65
max_y = 37.84
dif_x = max_x - min_x
dif_y = max_y - min_y
    
We noticed that there are 67 crimes in the training set that were committed outside San Francisco. All of these outliers were recorded at the same position (X=-120.5, Y=90), which lies right at the geographic North Pole. To deal with them, we map all of them into an empty subregion in the sea at the north-western corner of San Francisco.
In [15]:
    
# Please zoom out a little bit to identify the location of this point
marker_loc = [(37.783939,-122.412614)]
spec_fig = gmaps.figure()
spec_fig.add_layer(gmaps.marker_layer(marker_loc))
spec_fig
    
    
In [15]:
    
# Functions to reposition outliers into a separate valid region.
def reposition_x(x):
    if x < min_x or max_x <= x:
        return -122.412614
    else: 
        return x
def reposition_y(y):
    if y < min_y or max_y <= y:
        return 37.783939
    else:
        return y
def reposition_outliers(data):
    repositioning = data.copy()
    new_X = pd.Series([reposition_x(x) for x in data["X"]], name='X')
    repositioning = repositioning.drop('X', axis=1)
    new_Y = pd.Series([reposition_y(y) for y in data["Y"]], name="Y")
    repositioning = repositioning.drop('Y', axis=1)
    repositioning = pd.concat([repositioning, new_X, new_Y], axis=1)
    return repositioning
    
In [16]:
    
train_data = reposition_outliers(train_data)
    
Next we define functions that assign each crime, according to its position, to the correct cell of the grid.
In [17]:
    
# grid functions
def assign_subregion(pos_x, pos_y, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    x = pos_x - min_x
    x_sec = int(x_sections * x / dif_x)
    y = pos_y - min_y
    y_sec = int(y_sections * y / dif_y)
    return x_sec + x_sections * y_sec
    
def create_subregion_series(data, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    subregion_list = []
    for i in range(len(data)):
        pos_x = data["X"][i]
        pos_y = data["Y"][i]
        subregion = assign_subregion(pos_x, pos_y, min_x, min_y, dif_x, dif_y, x_sections, y_sections)
        subregion_list.append(subregion)
    return pd.Series(subregion_list, name='subregion')
def get_subregion_pos(subregion_id, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    x = subregion_id % x_sections
    x_pos = ((x + 1/2) / x_sections) * dif_x + min_x
    y = subregion_id // x_sections
    y_pos = ((y + 1/2) / y_sections) * dif_y + min_y
    return (x_pos, y_pos)
def get_subregion_rectangle(subregion_id, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    x = subregion_id % x_sections
    y = subregion_id // x_sections
    x_pos_ll = (x / x_sections) * dif_x + min_x
    y_pos_ll = (y  / y_sections) * dif_y + min_y
    lower_left = (x_pos_ll, y_pos_ll)
    x_pos_ur = ((x + 1) / x_sections) * dif_x + min_x
    y_pos_ur = ((y + 1)  / y_sections) * dif_y + min_y
    upper_right= (x_pos_ur, y_pos_ur)
    return lower_left, upper_right
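    
To illustrate how the grid helpers work together, the following unexecuted cell assigns one example coordinate near Union Square (the values are purely illustrative) to its cell on a 20x20 grid and recovers the cell centre and bounding rectangle:
In [ ]:
    
# example only: map a coordinate near Union Square to its grid cell and back
example_x, example_y = -122.407437, 37.787994
cell = assign_subregion(example_x, example_y, min_x, min_y, dif_x, dif_y, 20, 20)
print("cell id:", cell)
print("cell centre (x, y):", get_subregion_pos(cell, min_x, min_y, dif_x, dif_y, 20, 20))
print("cell rectangle (ll, ur):", get_subregion_rectangle(cell, min_x, min_y, dif_x, dif_y, 20, 20))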
    
In [18]:
    
# show the structure of subregion feature
#subregions = create_subregion_series(train_data, min_x, min_y, dif_x, dif_y, 20, 20)
#subregions_df = pd.concat([subregions], axis=1)
#subregions_df.head()
    
Because many probabilities in the following statistics are close to zero, we use the logit transform to avoid floating-point inaccuracies.
In [19]:
    
from math import log
def logit(p):        
    return log(p / (1 - p))
logit_eps = 0.0001 
upper_bound = 1 - logit_eps
logit_one = logit(1 - logit_eps)
logit_zero = logit(logit_eps)
def calc_logit(p):
    if p < logit_eps:
        return logit_zero
    elif p > upper_bound:
        return logit_one
    else:
        return logit(p)
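    
A minimal illustration of the clipping behaviour (using the logit_eps of 0.0001 defined above):
In [ ]:
    
# calc_logit behaves like logit away from the boundaries and clips near 0 and 1
print(calc_logit(0.5))   # logit(0.5) = 0.0
print(calc_logit(0.0))   # clipped to logit(0.0001) instead of -infinity
print(calc_logit(1.0))   # clipped to logit(0.9999) instead of +infinity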
    
As in the presentation, we calculate the non-prosecution rate for each cell in the grid. Areas with a high non-prosecution rate may have lower police presence.
In [20]:
    
# functions to calculate the non-prosecution rate for each cell in the grid.
def count_non_prosecuted_crimes(data, subregions, num_regions):
    none_prosecutions_local = {}
    none_prosecutions_overall = data["Resolution"].value_counts()["NONE"]
    resolution = data["Resolution"]
    for r in range(num_regions):
        none_prosecutions_local[r] = 0
    for i, r in enumerate(subregions):
        if resolution[i] == "NONE":
            none_prosecutions_local[r] += 1
    return none_prosecutions_local, none_prosecutions_overall
def calculate_prosecution_rates(counts_local, counts_overall, subregions, num_regions, sufficient_n):
    none_prosecutions_rate_overall = calc_logit(counts_overall / len(subregions))
    none_prosecution_rate_local = {}
    occupied_regions = subregions.unique()
    counts = subregions.value_counts()
    for r in range(num_regions):
        if r in occupied_regions and counts[r] >= sufficient_n:
            none_prosecution_rate_local[r] = calc_logit(counts_local[r] / counts[r])
        else: 
            none_prosecution_rate_local[r] = none_prosecutions_rate_overall
    return none_prosecution_rate_local
            
def get_non_prosecution_rate_frame(data, subregions, num_regions, sufficient_n):
    counts_local, counts_overall = count_non_prosecuted_crimes(data, subregions, num_regions)
    rates_local = calculate_prosecution_rates(counts_local, counts_overall, subregions, num_regions, sufficient_n)
    non_prosecution_series = pd.Series([rates_local[x] for x in range(num_regions)], name='non_prosecution_rate')
    return non_prosecution_series
    
In [21]:
    
# show the structure of non-prosecution rate feature
#non_prosecution_rate = get_non_prosecution_rate_frame(train_data, subregions, 400, 50)
#non_prosecution_rate.head()
    
Our next feature is the distribution of crime categories within a grid cell. Certain areas might be hot spots for different types of crime, and we want to capture what is normal for a region.
In [22]:
    
# functions to calculate the category distribution rate for each cell in the grid.
def count_crimes_per_category(data, subregions, crimes, num_regions):
    # count crimes per region and category
    criminal_activity_local = {}
    criminal_activity_overall = data["Category"].value_counts()
    category = data["Category"]
    for r in range(num_regions):
        criminal_activity_local[r] = {}
        for c in crimes:
            criminal_activity_local[r][c] = 0
    for i, r in enumerate(subregions):
        criminal_activity_local[r][category[i]] += 1
    return criminal_activity_local, criminal_activity_overall
def determine_distribution_categories(data, activity_lokal, activity_overall, subregions, crimes, num_regions, sufficient_n):
    distribution_global = {}
    for c in crimes:
        distribution_global[c] = calc_logit(activity_overall[c] / len(data))
    occupied_regions = subregions.unique()
    counts = subregions.value_counts()
    distribution_local = {}
    for r in range(num_regions):
        distribution_local[r] = {}
        for c in crimes: 
            if r in occupied_regions and counts[r] >= sufficient_n:
                distribution_local[r][c] = calc_logit(activity_lokal[r][c] / counts[r])
            else:
                distribution_local[r][c] = distribution_global[c]
    return distribution_local
def get_crime_distribution_frame(data, subregions, crimes, num_regions, sufficient_n):
    activity_lokal, activity_overall = count_crimes_per_category(data, subregions, crimes, num_regions)
    distribution_local = determine_distribution_categories(data, activity_lokal, activity_overall, subregions, crimes, num_regions, sufficient_n)
    # convert to dataframe
    distribution_frame = pd.DataFrame()
    for c in crimes:
        category_series = pd.Series([distribution_local[r][c] for r in range(num_regions)], name=c)
        distribution_frame = pd.concat([distribution_frame, category_series], axis=1)
    return distribution_frame
    
In [23]:
    
# show the structure of category distribution feature
#distribution_frame = get_crime_distribution_frame(train_data, subregions, crimes, 400, 50)
#distribution_frame.head()
    
Our next feature is the distribution of crime categories per street. Certain streets might be hot spots for different types of crime.
In [24]:
    
# functions for street statistics
def get_relevant_streets(data, sufficient_n):
    streets = data["Address"]
    street_counts = streets.value_counts()
    relevant_streets = []
    for k in street_counts.keys():
        if street_counts[k] >= sufficient_n:
            relevant_streets.append(k)
        else:
            break
    return relevant_streets
def count_street_crime_per_category(data, relevant_streets, crimes):
    # count crimes per region and category
    street_activity = {}
    streets = data["Address"]
    category = data["Category"]
    for s in relevant_streets:
        street_activity[s] = {}
        street_activity[s]["crime_count"] = 0
        for c in crimes:
            street_activity[s][c] = 0
    for i, s in enumerate(streets):
        if s in street_activity:
            street_activity[s][category[i]] += 1
            street_activity[s]["crime_count"] += 1
    return street_activity
def determine_street_crime_distribution_categories(data, relevant_streets, street_activity, crimes):
    # default distribution
    street_distribution = {}
    street_distribution["DEFAULT_DISTRIBUTION"] = {}
    overall_counts = data["Category"].value_counts()
    for c in crimes:
         street_distribution["DEFAULT_DISTRIBUTION"][c] = calc_logit(overall_counts[c] / len(data))
    # street distribution
    for s in relevant_streets:
        street_distribution[s] = {}
        for c in crimes: 
            street_distribution[s][c] = calc_logit(street_activity[s][c] / street_activity[s]["crime_count"])
    return street_distribution
def get_street_crime_distribution_dict(data, crimes, sufficient_n=48):
    rel_streets = get_relevant_streets(data, sufficient_n)
    street_activity = count_street_crime_per_category(data, rel_streets, crimes)
    street_distribution = determine_street_crime_distribution_categories(data, rel_streets, street_activity, crimes)
    # convert to dataframe
    '''
    street_distribution_frame = pd.DataFrame()
    for c in crimes:
        category_series = pd.Series([street_distribution[s][c] for s in rel_streets] + [street_distribution["DEFAULT_DISTRIBUTION"][c]],
                                    name=("street_" + c))
        street_distribution_frame = pd.concat([street_distribution_frame, category_series], axis=1)
    '''
    return street_distribution
def get_street_crime_rate(street, crime, street_crime_distribution):
    if street in street_crime_distribution.keys():
        return street_crime_distribution[street][crime]
    else:
        return street_crime_distribution['DEFAULT_DISTRIBUTION'][crime]
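    
The following unexecuted cell sketches how the street lookup falls back to the overall distribution for unknown or rarely occurring addresses (it rebuilds the dictionary, which is also done further below, and uses the first address and category of the training set so the keys are guaranteed to exist):
In [ ]:
    
# sketch: build the street statistics once and query a known and an unknown address
street_dist = get_street_crime_distribution_dict(train_data, train_data['Category'].unique(), sufficient_n=48)
example_street = train_data['Address'].iloc[0]      # an address taken directly from the data
example_category = train_data['Category'].iloc[0]   # its category, so the key exists
print(get_street_crime_rate(example_street, example_category, street_dist))
print(get_street_crime_rate('SOME UNKNOWN ST', example_category, street_dist))  # falls back to DEFAULT_DISTRIBUTION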
    
In [25]:
    
def create_police_destrict_frame(data):
    one_hot_police_destricts = pd.get_dummies(data["PdDistrict"])
    return one_hot_police_destricts
    
In [26]:
    
# show the structure of the police destrict feature
#police_destricts = create_police_destrict_frame(train_data)
#police_destricts.head()
    
Our next feature is the distance between the place where a crime is committed and the nearest police station, since police presence may influence which category of crime is committed.
In [27]:
    
# function to measure distance to next police station
from geopy.distance import vincenty
police_stations = pd.read_json("https://data.sfgov.org/resource/me2e-mc38.json")
police_station_coordinates = []
for elem in police_stations.iterrows():
    """Create police station coordinates as tuple"""
    police_station_coordinates.append(tuple(elem[1]['location']['coordinates']))
def get_crime_coordinate_series(data):
    # prepare crime X,Y coordinates as tuple of coordinates
    return list(zip(data['X'], data['Y']))
    
def calculate_min_distance_police(crime_coordinate):
    """calculate distance from crime to nearest police station"""
    current_min = 10000000
    for police_station_coordinate in police_station_coordinates:
        current_min = min(current_min, vincenty(police_station_coordinate, crime_coordinate).meters)
    return current_min
def get_police_distance_series(data):
    get_crime_coordinates = get_crime_coordinate_series(data)
    police_distance_series = pd.Series([calculate_min_distance_police(c) for c in get_crime_coordinates], name='police_distance')
    return police_distance_series
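    
Note that recent geopy releases (2.x) removed vincenty in favour of geodesic. If the import above fails in your environment, a drop-in replacement along the following lines should work (a sketch assuming the same coordinate convention as above; the distances differ only marginally):
In [ ]:
    
# fallback sketch for newer geopy versions where vincenty is no longer available
from geopy.distance import geodesic

def calculate_min_distance_police_geodesic(crime_coordinate):
    """Distance in metres from a crime location to the nearest police station."""
    return min(geodesic(station, crime_coordinate).meters
               for station in police_station_coordinates)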
    
In [28]:
    
# show the structure of the police distance feature
#police_dist_series = get_police_distance_series(train_data)
#police_dist_pd = pd.concat([police_dist_series], axis=1)
#police_dist_pd.head()
    
If a mapping can be calculated, the corresponding street type is used; the first street type occurring in the address is taken. If the crime address includes a "/" symbol, this indicates a crossing (a corner with another street) and the type CRO (Crossing) is inferred; otherwise the most common type, RD (Road), is used as a fallback. This basically represents a simple street taxonomy. A one-hot encoded dataset is returned.
In [29]:
    
types = ['ST','AV','BL', 'BL NORTH', 'BL SOUTH', 'AVE','AL', 'ALY', 'CT', 'WY', 'WAY', 'TER', 'BLVD', 'RP','RAMP', 'PL', 'LN',
       'LOOP', 'DR', 'RD','CR', 'CIR','WL', 'WK', 'WALK','PK', 'PARK','RW', 'ROW', 'PATH','HY', 'HW',
       'HWY', 'EXPY', 'HL', 'PZ','PLZ', 'STPS','I-80', 'MAR','BLVD NORTH', 'BLVD SOUTH',
       'STWY','PALMS','EX', 'TR','TUNL','FERLINGHETTI', 'BUFANO']
def generate_street_types(addresses):
    
    def map_street_to_street_type(street):
        addrl = street.split(' ')
        if '/' in addrl:
            # a "/" in the address indicates a crossing of two streets
            return 'CRO'
        for elem in addrl:
            if elem in types:
                return elem
        # fall back to the most common type if no known street type is found
        return 'RD'
        
    return pd.get_dummies(pd.Series(addresses.map(map_street_to_street_type), name='StreetType'))
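    
A few made-up example addresses illustrate the mapping (the strings are illustrative and not taken from the data):
In [ ]:
    
# illustrative addresses only; the real mapping is applied to train_data['Address'] below
example_addresses = pd.Series(['800 MARKET ST', 'OAK ST / LAGUNA ST', '0 UNKNOWNPLACE'])
generate_street_types(example_addresses)   # one-hot columns for the inferred types ST, CRO, RD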
    
In [30]:
    
# Show the structure of the street type feature
#street_type_frame = generate_street_types(train_data['Address'])
#street_type_frame.head()
    
In [31]:
    
def create_subregions_graphs(subregions, x_sections, y_sections, from_point=False):
    """
    Creates a subregions graph dictionary for each unique
    subregion
    """
    global subregions_graphs
    subregions_graphs = {}
    for subregion in subregions.unique():
        if from_point:      
            subregion_graph = create_subregion_graph_from_coordinate(subregion, x_sections, y_sections)# create_subregion_graph_from_bb(bb_coord)
        else:
            bb_coord = get_subregion_rectangle(subregion, min_x, min_y, dif_x, dif_y, x_sections, y_sections)
            subregion_graph = create_subregion_graph_from_bb(bb_coord)
        if subregion_graph:
            subregions_graphs[subregion] = subregion_graph
    print(subregions_graphs.keys())
    return subregions_graphs
def create_subregion_graph_from_coordinate(subregion, x_sections, y_sections, radius=100):
    """
    Creates a subregion graph by a subregion coordinate and a radius
    """
    G = None
    try:
        point = get_subregion_pos(subregion, min_x, min_y, dif_x, dif_y, x_sections, y_sections)
        G = ox.graph_from_point((point[1],point[0]), distance=radius, network_type='all')
    except (Exception, RuntimeError, ValueError):
        print("A RuntimeError, ValueError, or NetworkXPointlessConcept Error occured probably due to invalid coordinates") 
    return G
    
def create_subregion_graph_from_bb(subregion_rectangle):
    """
    Creates a subregion graph by a subregion bounding box (rectangle)
    """
    G = None
    try:
        G = ox.graph_from_bbox(subregion_rectangle[1][1],
                               subregion_rectangle[0][1],
                               subregion_rectangle[1][0],
                               subregion_rectangle[0][0],
                               network_type='all')
    except (Exception, RuntimeError, ValueError):
        print("A RuntimeError, ValueError, or NetworkXPointlessConcept Error occured probably due to invalid coordinates")
    
    return G
def calculate_subregion_deden(subregion_id):
    """
    The cul de sac density is calculated as the ratio of
    dead end roads to all roads in the subregion
    """
    if subregion_id in subregions_graphs.keys():
        subregion = subregions_graphs[subregion_id]
        culdesacs = [key for key, value in subregion.graph['streets_per_node'].items() if value==1]
        return len(culdesacs) / len(subregion.edges())
    else:
        return 0
def calculate_subregion_reg(subregion_id):
    """
    The regularity of the street network is calculated as the standard
    deviation of node degrees for the subregion normalized by the average
    node degree
    """
    if subregion_id in subregions_graphs.keys():
        subregion = subregions_graphs[subregion_id]
        subregion_nodes = subregion.nodes()
        degree_values = [value for key, value in subregion.graph['streets_per_node'].items()]
        degree_sum = sum(degree_values)
        node_degree_mean = degree_sum/len(degree_values)
        var = 1/(len(degree_values)-1) * sum([(value - node_degree_mean)**2 for value in degree_values]) 
        return math.sqrt(var) / node_degree_mean
    else:
        return 0
def calculate_subregion_cnr(subregion_id):
    if subregion_id in subregions_graphs.keys():
        subregion = subregions_graphs[subregion_id]
        realintersect = [value for key, value in subregion.graph['streets_per_node'].items() if value>1]
        return len(realintersect) / len(subregion.nodes())
    else:
        return 0
def calculate_subregion_iden(subregion_id):
    pass
def calculate_subregion_osperc(subregion_id):
    pass
global betweenes_dict
betweenes_dict = {}
def calculate_subregion_bet(subregion_id):
    """
    Calculates the betweenness centrality for the region
    """  
    if subregion_id in betweenes_dict.keys():
        return betweenes_dict[subregion_id]
    else:
        if subregion_id in subregions_graphs.keys():
            subregion = subregions_graphs[subregion_id]
            extended_stats = ox.extended_stats(subregion, bc=True)
            betweenes_dict[subregion_id] = extended_stats['betweenness_centrality_avg'] 
            return betweenes_dict[subregion_id]
        else:
            return 0
def calculate_one_way_density(subregion_id):
    pass
def generate_subregion_architectural_features(subregions, x_sections, y_sections, from_point=False):
    """Generates a dataframe with the subregions architectural features"""
    subregions_graphs = create_subregions_graphs(subregions, x_sections, y_sections, from_point=from_point)
    deden = pd.Series(subregions.map(calculate_subregion_deden), name = 'DED')
    reg = pd.Series(subregions.map(calculate_subregion_reg), name = 'REG')
    cnr = pd.Series(subregions.map(calculate_subregion_cnr), name = 'CNR')
    # iden = pd.Series(subregions.map(calculate_subregion_iden), name = "IDEN")
    # osperc = pd.Series(subregions.map(calculate_subregion_osperc), name = 'OSPERC')
    bet = pd.Series(subregions.map(calculate_subregion_bet), name = 'BET')
    #owd = 
    return pd.concat([deden, reg, cnr, bet], axis=1)
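    
Since the metrics above only rely on the streets_per_node counts and the edge list, their definitions can be illustrated on a small hand-made example without downloading any OSM data (all numbers below are made up):
In [ ]:
    
# toy illustration of DED (cul-de-sac density) and REG (degree regularity); values are made up
toy_streets_per_node = {0: 1, 1: 3, 2: 4, 3: 1, 4: 3}  # node id -> number of streets at that node
toy_num_edges = 6
culdesac_nodes = [n for n, deg in toy_streets_per_node.items() if deg == 1]
toy_deden = len(culdesac_nodes) / toy_num_edges
degrees = list(toy_streets_per_node.values())
mean_degree = sum(degrees) / len(degrees)
variance = sum((d - mean_degree) ** 2 for d in degrees) / (len(degrees) - 1)
toy_reg = math.sqrt(variance) / mean_degree
print("DED:", toy_deden, "REG:", toy_reg)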
    
In [32]:
    
# Generating and viewing the subregion architectural features is a very expensive procedure! Please make sure to save these features
# to avoid recalculation
#architectural_features = generate_subregion_architectural_features(subregions, 20, 20)
#architectural_features.head()
    
In [33]:
    
# Visualizing Street Network and cul-de-sacs in union square grid
union_square = ox.graph_from_point((37.787994,-122.407437), distance=300, network_type='all')
ox.plot_graph(union_square)
    
    
    Out[33]:
In [34]:
    
# Red Nodes are endpoints of cul-de-sacs
culdesacs = [key for key, value in union_square.graph['streets_per_node'].items() if value==1]
nc = ['r' if node in culdesacs else 'b' for node in union_square.nodes()]
ox.plot_graph(union_square, node_color=nc)
    
    
    Out[34]:
In [35]:
    
# Visualizing one way streets (network_type has to be drive)
# Red streets are one way streets
union_square_d = ox.graph_from_point((37.787994,-122.407437), distance=300, network_type='drive')
ec = ['r' if data['oneway'] else 'b' for u, v, key, data in union_square_d.edges(keys=True, data=True)]
ox.plot_graph(union_square_d, node_size=0, edge_color=ec)
    
    
    Out[35]:
In [36]:
    
# first initialise important variables
if not os.path.exists("../../data/interim/train_cleaned.csv"):
    train_data = pd.read_csv("../../data/raw/train.csv")
    cleaned_data = reposition_outliers(train_data)
    cleaned_data.to_csv("../../data/interim/train_cleaned.csv", index=False)
    
if not os.path.exists("../../data/interim/test_cleaned.csv"):
    test_data = pd.read_csv("../../data/raw/test.csv")
    cleaned_data = reposition_outliers(test_data)
    cleaned_data.to_csv("../../data/interim/test_cleaned.csv", index=False)
    
train_data = pd.read_csv("../../data/interim/train_cleaned.csv")
train_data['Dates'] = pd.to_datetime(train_data['Dates'])
test_data = pd.read_csv("../../data/interim/test_cleaned.csv")
test_data['Dates'] = pd.to_datetime(test_data['Dates'])
# bounding box
min_x = -122.53
max_x = -122.35
min_y = 37.65
max_y = 37.84
dif_x = max_x - min_x
dif_y = max_y - min_y
# grid resolution
x_sections = 30
y_sections = 30
num_subregions = x_sections * y_sections
sufficient_n_non_prosecution = 100
sufficient_n_distribution = 200
sufficient_n_streets = 48
crimes = train_data['Category'].unique()
street_crime_distribution_dict = get_street_crime_distribution_dict(train_data, crimes, sufficient_n_streets)
    
In [37]:
    
# functions which build and store the dataset features
# always calculate the regional features first
def build_and_store_regional_features(train_data):
    print("Processing regional features")
    print("-----------------------------")
    
    if not os.path.exists("../../data/processed/train_subregions.csv"):
        subregions = create_subregion_series(train_data, min_x, min_y, dif_x, dif_y, x_sections, y_sections)
        subregions = pd.concat([subregions], axis=1)
        subregions.to_csv("../../data/processed/train_subregions.csv", index=False)
    print("Finished: subregions")
        
    if not os.path.exists("../../data/processed/regional_non_prosecution_rates.csv"):
        subregions = pd.read_csv("../../data/processed/train_subregions.csv")
        subregions = subregions["subregion"]
        non_prosecution_rate = get_non_prosecution_rate_frame(train_data, subregions, num_subregions,
                                                          sufficient_n_non_prosecution)
        non_prosecution_rate = pd.concat([non_prosecution_rate], axis=1)
        non_prosecution_rate.to_csv("../../data/processed/regional_non_prosecution_rates.csv", index=False)
    print("Finished: regional non prosecution rates")
        
    if not os.path.exists("../../data/processed/regional_crime_distribution.csv"):
        subregions = pd.read_csv("../../data/processed/train_subregions.csv")
        subregions = subregions["subregion"]
        crimes = train_data['Category'].unique()
        crime_distribution = get_crime_distribution_frame(train_data, subregions, crimes, num_subregions, 
                                                          sufficient_n_distribution)
        crime_distribution.to_csv("../../data/processed/regional_crime_distribution.csv", index=False)
    print("Finished: regional crime distributions")
    print("Finished: build_and_store_regional_features")
    print()
        
def build_and_store_crime_features(data, name):
    print("Processing crime features")
    print("-----------------------------")
    
    if not os.path.exists("../../data/processed/" + name + "_time_features.csv"):
        time_features = generate_time_features(data)
        time_features.to_csv("../../data/processed/" + name + "_time_features.csv", index=False)
    print("Finished: time features")
    
    if not os.path.exists("../../data/processed/" + name + "_subregions.csv"):
        subregions = create_subregion_series(data, min_x, min_y, dif_x, dif_y, x_sections, y_sections)
        subregions = pd.concat([subregions], axis=1)
        subregions.to_csv("../../data/processed/" + name + "_subregions.csv", index=False)
    print("Finished: subregions")
    if not os.path.exists("../../data/processed/" + name + "_police_destrict.csv"):
        police_destricts = create_police_destrict_frame(data)
        police_destricts.to_csv("../../data/processed/" + name + "_police_destrict.csv", index=False)
    print("Finished: police destricts")
        
    if not os.path.exists("../../data/processed/" + name + "_police_distance.csv"):
        police_distance = get_police_distance_series(data)
        police_distance = pd.concat([police_distance], axis=1)
        police_distance.to_csv("../../data/processed/" + name + "_police_distance.csv", index=False)
    print("Finished: police distances")
        
    if not os.path.exists("../../data/processed/" + name + "_street_types.csv"):
        street_type = generate_street_types(data['Address'])
        street_type.to_csv("../../data/processed/" + name + "_street_types.csv", index=False)
    print("Finished: street types")
        
    if not os.path.exists("../../data/processed/" + name + "_non_prosecution_rates.csv"):
        subregions = pd.read_csv("../../data/processed/" + name + "_subregions.csv")
        subregions = subregions["subregion"]
        regional_non_prosecution_rate = pd.read_csv("../../data/processed/regional_non_prosecution_rates.csv")
        regional_non_prosecution_rate = regional_non_prosecution_rate["non_prosecution_rate"]
        non_prosecution_rates = pd.Series([regional_non_prosecution_rate[r] for r in subregions], name='non_prosecution_rate')
        non_prosecution_rates = pd.concat([non_prosecution_rates], axis=1)
        non_prosecution_rates.to_csv("../../data/processed/" + name + "_non_prosecution_rates.csv", index=False)
    print("Finished: non prosecution rates")
        
    if not os.path.exists("../../data/processed/" + name + "_crime_distribution.csv"):
        subregions = pd.read_csv("../../data/processed/" + name + "_subregions.csv")
        subregions = subregions["subregion"]
        distribution_local = pd.read_csv("../../data/processed/regional_crime_distribution.csv")
        crime_distribution = pd.DataFrame()
        for c in crimes:
            category_series = pd.Series([distribution_local[c][r] for r in subregions], name=c)
            crime_distribution = pd.concat([crime_distribution, category_series], axis=1)
        crime_distribution.to_csv("../../data/processed/" + name + "_crime_distribution.csv", index=False)
    print("Finished: crime distributions")
    
    if not os.path.exists("../../data/processed/" + name + "_street_crime_distribution.csv"):
        streets = data["Address"]
        street_crime_distribution = pd.DataFrame()
        for c in crimes:
            category_series = pd.Series([get_street_crime_rate(s, c, street_crime_distribution_dict) for s in streets], name=("street_" + c))
            street_crime_distribution = pd.concat([street_crime_distribution, category_series], axis=1)
        street_crime_distribution.to_csv("../../data/processed/" + name + "_street_crime_distribution.csv", index=False)
    print("Finished: finished street crime distributions")
    # only subregions with criminal activity are considered here
    if not os.path.exists("../../data/processed/" + name + "_architectural_features.csv"):
        subregions = pd.read_csv("../../data/processed/" + name + "_subregions.csv")
        subregions = subregions["subregion"]
        architectural_features = generate_subregion_architectural_features(subregions, x_sections, y_sections, from_point=False)
        architectural_features.to_csv("../../data/processed/" + name + "_architectural_features.csv", index=False)
    print("Finished: architectural features")
    print("Finished: build_and_store_crime_features")
    print()
    
In [38]:
    
build_and_store_regional_features(train_data)
build_and_store_crime_features(train_data, "train")
build_and_store_crime_features(test_data, "test")
    
    
In [39]:
    
t = pd.read_csv("../../data/processed/train_street_crime_distribution.csv")
t.head()
    
    Out[39]:
In [40]:
    
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
# globally available label encoder to retrieve the category descriptions later on
le = preprocessing.LabelEncoder()
def concat_generated_subdataset(data, name):
    # loading distinct datasets
    base_path = "../../data/processed/"
    subregions = pd.read_csv(base_path + name + "_subregions.csv")
    non_prosecution_rate = pd.read_csv(base_path + name + "_non_prosecution_rates.csv")
    crime_distribution = pd.read_csv(base_path + name + "_crime_distribution.csv")
    time_features = pd.read_csv(base_path + name + "_time_features.csv")
    police_districts = pd.read_csv(base_path + name + "_police_destrict.csv")
    police_distance = pd.read_csv(base_path + name + "_police_distance.csv")
    street_types = pd.read_csv(base_path + name + "_street_types.csv")
    street_crime_distribution = pd.read_csv(base_path + name + "_street_crime_distribution.csv")
    architectural_feat = pd.read_csv(base_path + name + "_architectural_features.csv" )
    
    #print("subregions: ", len(subregions))
    #print("non_pres: ", len(non_presecution_rate))
    #print("crim_dis: ", len(crime_distribution))
    #print("time feat: ", len(time_features))
    #print("police districts: ", len(police_districts))
    #print("police dist: ", len(police_distance))
    #print("street types: ", len(street_types))
    #print("architect: ", len(architectural_feat))
     
    series = [
        data['X'],
        data['Y'],
        subregions,
        time_features,
        police_districts,
        crime_distribution,
        street_crime_distribution,
        non_prosecution_rate,
        police_distance, street_types,
        architectural_feat
    ]
    
    if name == 'train':
        # label encoding category
        categories = pd.Series(le.fit_transform(data['Category']), name='Category')
        series = [categories] + series
    
    
    return pd.concat(series, axis=1)
    
In [41]:
    
def build_final_datasets(pca_components=10):
    """
    Builds the final datasets for processing with the neuronal network
    Performs PCA and standard scaling on these datasets
    
    This is done this way instead of a pipline due to a separatly
    provided testset by kaggle
    """
    
    pca = PCA(n_components=pca_components)
    ss = StandardScaler()
    
    train_data = pd.read_csv("../../data/interim/train_cleaned.csv")
    train_data['Dates'] = pd.to_datetime(train_data['Dates'])
    train = concat_generated_subdataset(train_data, 'train')
    test_data = pd.read_csv("../../data/interim/test_cleaned.csv")
    test_data['Dates'] = pd.to_datetime(test_data['Dates'])
    test = concat_generated_subdataset(test_data, 'test')
    
    
    missing_columns = set(train.columns) - set(test.columns)
    missing_columns.remove('Category')
    
    print("Missing columns in test set: ", set(train.columns) - set(test.columns))
    
    print("Imputing empty {} (0) columns into test set".format(missing_columns))
    
    test['BUFANO'] = 0
    test['FERLINGHETTI'] = 0
      
    print("Extracting values and ravel categories")
    
    X = train.iloc[:,1:].values
    y = train.iloc[:,:1].values.ravel()
    test = test.iloc[:].values
    
    print("Standard Scaling train and test set")
    
    X = ss.fit_transform(X)
    test = ss.transform(test)
    
    print("Applying PCA on training and test set")
    
    # X = pca.fit_transform(X)
    # test = pca.transform(test)
  
    print("\n----Done----")
    
    return X, y, test
    
In [42]:
    
X,y,test = build_final_datasets()
    
    
In [43]:
    
mlp = MLPClassifier(hidden_layer_sizes=(200, 180, 200),
                    activation='tanh',
                    learning_rate_init=0.005,
                    max_iter=400)
    
In [ ]:
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
In [ ]:
    
mlp.fit(X_train, y_train)
    
In [ ]:
    
mlp_proba = mlp.predict_proba(X_test)
mlp_pred = mlp.predict(X_test)
log_score = log_loss(y_test, mlp_proba)
acc_score = accuracy_score(y_test, mlp_pred)
print("log_loss: ", log_score)
print("Accuracy: ", acc_score)
    
In [44]:
    
mlp.fit(X, y)
    
    Out[44]:
In [45]:
    
def create_submission(probabilities):
    """
    Creates a kaggle csv submission file within the notebook folder
    """
    submission = pd.DataFrame(probabilities, columns=list(le.classes_))
    submission.insert(0, 'Id', range(0, len(submission)))
    submission.to_csv("submission.csv", index=False)
    
In [ ]:
    
create_submission(mlp.predict_proba(test))
    