In [1]:
# Imports for data manipulation
import pandas as pd
import numpy as np
import pickle

In [2]:
# Importa for data visualization
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
def plotCrimeCounts(data, features, city):
    '''
    Data is the dataframe containing the crime statistics.
    Features is  disctionary mapping {description: column}
    city is the name of the city for which we are plotting.
    '''
    for description, columnName in features.iteritems():
        tmp = data.sort_values(by=columnName)
        sns.countplot(y=columnName, data=data)
        plt.title('Crimes in {} by {}'.format(city, description))
        plt.show()

In [35]:
# Helper functions to help clean SF data. We do this only once, as we 
# now save the results using pickle!
def bucket(series, n):
    # Takes a series and returns a series mapping each element to a
    # one of n buckets.
    mi, ma = series.min(), series.max()
    buckets = np.linspace(mi, ma, n + 1)
    
    res = np.zeros(len(series))
    array = series.values
    if np.isnan(array).any():
        print "Error! A total of {} nans!".format(np.isnan(array).sum())
    for i in xrange(n):
        res[(buckets[i] <= array) & (array < buckets[i+1])] = i
    return res

def cleanColumns(data):
    # Used to rename the columns in our data grame to their appropriate names.
    # Also drops unnecessary columns.
    data['Latitude'] = data['Y']
    data['Longitude'] = data['X']
    data['Type'] = data['Category']
    
    # print data.columns
    data = data.drop(['IncidntNum', 'Descript','Resolution','Address','X','Y', 'Location'], axis=1)
    
    return data

def createPartitions(data, n):
    # Remove outliers from the latitude/longitude issues
    # We know that the city lies between -130, -120 longitude
    # We also know that the citiy lies between 37 and 40 degrees latitude
    data = data[-120 > data.Longitude][data.Longitude > (-130)]
    data = data[data.Latitude > 37][data.Latitude < 40]
    
    # Each row is an occurrance of a single crime. 
    # Keep around the original data
    # data['Region'] =  n *  buckets(data['Latitude'], n) + buckets(data['Longitude'],n) + 1
    data['xRegion'] = bucket(data['Latitude'], n)
    data['yRegion'] =  bucket(data['Longitude'],n)
    data['Region'] =  n * data.xRegion + data.yRegion    
    
    # Add in the types into the results.
    mapping = {key: i for i,key in enumerate(data['Type'].unique())}
    data['TypeIndex'] = data['Type'].map(mapping)

    # Now we can add the crime counts. 
    data['CrimeCount'] = np.ones(len(data))
    return data

def extractDateFeatures(data):
    # Creates a new data frame and returns it as copy with all the data that we're interested in
    # Create map from week days to integers
    DayOfWeek = {'Sunday': 1,
                 'Monday': 2,
                 'Tuesday': 3,
                 'Wednesday': 4,
                 'Thursday': 5,
                 'Friday': 6,
                 'Saturday': 7 }
    data['DoW'] = data['DayOfWeek'].map(DayOfWeek)
    data = data.drop('DayOfWeek', axis=1)
    print "Created Weeks"
    
    # We assume that the Date column is already in datetime format
    data['Month'] = data.Date.map(lambda x: x.month)
    data['DoM'] = data.Date.map(lambda x: x.day)
    data['Year'] = data.Date.map(lambda x: x.year)
    data['ToD'] = data.Time.map(lambda x: x.minute)
    data['Time'] = data.Time.map(lambda x: x.value / 10**9) - data.Date.min().value / 10**9
    
    # We add an additional column that combines the month and the year into number of months since beginning
    min_year = data.Year.min()
    data['TimeFeature'] = data.ix[:, ['Year', 'Month']].apply(lambda s: 12*(s[0]-min_year) + s[1], axis=1)
    
    data = data.drop('Date', axis=1)
    
    print "Created the time data features!"
    
    return data

In [26]:
# Make histograms of the crimes by month for each given year
def histograms(data, city, directory):
    years = data.Year.unique()
    plt.clf()
    for year in years:
        # Subset the data
        tmp = data[data.Year == year]
        
        # Make Histogram
        sns.countplot(x = 'Month', data=tmp)
        plt.title('{} Crime Histogram for {}'.format(city, year))
        plt.savefig('../figures/{}/crimes_monthly_{}'.format(
                directory, year))
        plt.close()

In [6]:
# Let's do one really large heatmap
def createHeatmapData(data, size = 30):
    # data is what we're making a heatmap on.
    # Using the San Francisco data set, we create an nxn matrix
    # which counts the number of crimes in a given area!
    res = np.zeros((size, size))
    for i in range(size):
        for j in range(size):
            res[i,j] = len(data[(data.xRegion == i) & (data.yRegion == j)])
    return res

In [27]:
# Let's redo the heatmap year by year?
def yearByYear(data, folder, city):
    # Find unique years!
    years = data.Year.unique()
    
    plt.clf()  # Clear figure before plotting.
    for year in years:
        # Deal with data in SF set being zero indexed
        tmp = data[data.Year == year]
        # Only plot if there's data:
        if len(tmp) > 0:
            sns.heatmap(createHeatmapData(tmp))
            plt.title('Crime Distribution in {} for {}'.format(city, year))
            plt.savefig('../figures/{}/heat_map_{}'.format(folder, year))
            plt.close()

In [28]:
# Let's redo it month by month because that's what we want to analyze?
def monthByMonth(data, folder, city):
    years = data.Year.unique()
    months = data.Month.unique()
    
    plt.clf()  # Clear figure before plotting
    for year in years:
        for month in months:
            tmp = data[(data.Year == year) & (data.Month == month)]
            # Only plot if there is data to be plotted
            if len(tmp) > 0:
                sns.heatmap(createHeatmapData(tmp))
                plt.title('Crime Distribution in {} for {}, {}'.format(
                        city, month, year))
                plt.savefig('../figures/{}/heat_map_{}_{}'.format(
                        folder, month, year))
                plt.close()

In [34]:
# Now we write a function to partition the data
def createSimplePartitions(data, n):
    # Returns a partitioned version of the bosclean data!
    data['xRegion'] = bucket(data.Latitude, n)
    data['yRegion'] = bucket(data.Longitude, n)
    data['Region'] = n * data.xRegion + data.yRegion

    return data

In [10]:
sfdata_file = '../data/sf.csv'
sfdata = pd.read_csv(sfdata_file)

In [11]:
# Create histograms on some important metrics
# Map from english_description : column name
columnsToPlotSF = { 'Police District' : 'PdDistrict',
                    'Crime Type' : 'Category',
                    'Day of the Week' : 'DayOfWeek',
                   'Crime Outcome' : 'Resolution' }

In [12]:
plotCrimeCounts(sfdata, columnsToPlotSF, 'San Francisco')

In [13]:
# Read bosdata
import re
import warnings

bos_file = '../data/boston.csv'
target_type = str  # The desired output type

with warnings.catch_warnings(record=True) as ws:
    warnings.simplefilter("always")

    bosData = pd.read_csv(bos_file, sep=",", header=0)
    print("Warnings raised:", ws)
    # We have an error on specific columns, try and load them as string
    for w in ws:
        s = str(w.message)
        print("Warning message:", s)
        match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
        if match:
            columns = match.group(1).split(',') # Get columns as a list
            columns = [int(c) for c in columns]
            print("Applying %s dtype to columns:" % target_type, columns)
            bosData.iloc[:,columns] = bosData.iloc[:,columns].astype(target_type)


('Warnings raised:', [<warnings.WarningMessage object at 0x7fdc17196650>])
('Warning message:', 'Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.')
("Applying <type 'str'> dtype to columns:", [10])

In [14]:
# Create histograms on some important metrics for Boston
columnsToPlotBos = { 'Reporting District' : 'REPTDISTRICT', 
                     'Weapon Type' : 'WEAPONTYPE',
                     'Shooting/No Shooting' : 'Shooting',
                     'Officer Shift' : 'SHIFT',
                     'Year' : 'Year', 
                     'Month' : 'Month',
                     'Day of the Week' : 'DAY_WEEK'}

In [15]:
plotCrimeCounts(bosData, columnsToPlotBos, 'Boston')

In [16]:
loadPickle = True

In [17]:
# Convert date to actual date format. This might take a while!
# Note that we should not have to do this once the data
# has been cleaned!
# SKIP IF CLEAN DATA EXISTS
if not loadPickle:
    sfdata.Date = sfdata['Date'].apply(lambda x: pd.to_datetime(x, errors='raise'))
    sfdata.Time = sfdata['Time'].apply(lambda x: pd.to_datetime(x, errors='raise'))

In [18]:
sfpickle_file = '../../cs281_data/large_data/sfclean.pk'

In [19]:
# We only need to do this once, afterwards we should load from the 
# saved location.
# DO NOT RUN IF PICKLED FILE ALREADY EXISTS
if not loadPickle:
    sfclean = cleanColumns(sfdata)
    sfclean = extractDateFeatures(sfclean)
    sfclean.to_pickle(sfpickle_file)

In [20]:
# Load sfclean from pickled location!
sfclean = open(sfpickle_file)
sfclean = pickle.load(sfclean)

In [21]:
# Sort the data by time
sfclean = sfclean.sort_values(by='TimeFeature')

In [22]:
# Save memory by deleting old data
del(sfdata)
# del(sfclean_data)

In [23]:
# Generate some additional histograms
columnsToPlotSF2 = { 'Month' : 'Month',
                    'Day of Month' : 'DoM',
                    'Year' : 'Year',
                   'Hour of Day' : 'ToD' }

In [24]:
plotCrimeCounts(sfclean, columnsToPlotSF2, 'San Franscisco')

In [29]:
histograms(sfclean, 'San Francisco', 'sf_data_analysis')

In [36]:
# Let's generate some heatmaps for both of these data crimes
# Can we overlay these on top of the geographical location???
n = 30
SFByRegion = createPartitions(sfclean, n)

In [39]:
sns.heatmap(createHeatmapData(SFByRegion))
plt.title('Crime Distribution in San Francisco')
plt.show()


/usr/local/lib/python2.7/dist-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [ ]:
yearByYear(SFByRegion, 'sf_data_analysis', 'San Francisco')

In [ ]:
monthByMonth(SFByRegion, 'sf_data_analysis', 'San Francisco')

In [40]:
# We now process the Boston Data: Again, we pickle the results.
loadBosPickle = True
bos_pickle_file = '../../cs281_data/large_data/bosclean.pk'

In [41]:
# Let's process the boston data
if not loadBosPickle:
    # Clean the columns
    bosData['Latitude'] = bosData['X']
    bosData['Longitude'] = bosData['Y']
    
    # Drop unused columns
    toDrop = ['X', 'Y']
    bosData = bosData.drop(toDrop, axis=1)
    
    # Extract date features
    # day of week
    day = np.array(bosData.DAY_WEEK)
    day[ day == "Sunday"] = 0
    day[ day == "Monday"] = 1
    day[ day == "Tuesday"] = 2
    day[ day == "Wednesday"] = 3
    day[ day == "Thursday"] = 4
    day[ day == "Friday"] = 5
    day[ day == "Saturday"] = 6

    date_time = np.array([x.split() for x in bosData.FROMDATE])
    date = date_time[:,0]
    time = date_time[:,1]
    tod = date_time[:,2]

    # month, day, year
    date = np.array([x.split('/') for x in date])
    month = [int(x) for x in date[:,0]]
    dom = [int(x) for x in date[:,1]]
    year = [int(x) for x in date[:,2]]
    min_year = np.min(year)
    min_month = np.min(month)
    time_feat = np.subtract(year, min_year)*12 + np.subtract(month,min_month)

    # time of day
    time_c = [x.split(':') for x in time]
    time = [int(x[1]) if (y == 'AM' and int(x[0]) == 12) else 60*int(x[0])+int(x[1]) 
            if (y =='AM' and int(x[0]) != 12) or (int(x[0]) == 12 and y == 'PM') else 12*60+60*int(x[0])+int(x[1]) 
            for x,y in zip(time_c, tod)]
    
    # Add them back to the data frame
    bosData['Day'] = day
    del day
    bosData['Month'] = month
    del month
    bosData['Dom'] = dom
    del dom
    bosData['Year'] = year
    del year
    bosData['Time'] = time
    del time
    bosData['TimeFeature'] = time_feat
    del time_feat

In [42]:
# Drop some more unnecessary columns
if not loadBosPickle:
    toDrop = ['COMPNOS', 'NatureCode', 'Location', 'XSTREETNAME', 'STREETNAME']
    toDrop += ['INCIDENT_TYPE_DESCRIPTION', 'MAIN_CRIMECODE', 'REPTDISTRICT']
    toDrop += ['REPORTINGAREA', 'FROMDATE', 'WEAPONTYPE', 'Shooting', 'DOMESTIC']
    toDrop += ['Location']

    bosData = bosData.drop(toDrop, axis=1)
    print "Records before: {}".format(len(bosData))
    bosData = bosData.dropna(axis=0)
    print "Records after: {}".format(len(bosData))

In [43]:
# Lets's save the data
if not loadBosPickle:
    bosData.to_pickle(bos_pickle_file)
    del(bosData)

In [44]:
# Let's load the data
with open(bos_pickle_file) as bosclean_file:
    bosclean = pickle.load(bosclean_file)

In [45]:
histograms(bosclean, 'Boston', 'boston_data_analysis')

In [46]:
# Let's generate some heatmaps on the boston crime data
# Can we overlay these over a boston map?
n = 30
regionedData = createSimplePartitions(bosclean, n)

In [47]:
# Let's do a single large heatmap
sns.heatmap(createHeatmapData(regionedData, n))
plt.title('Crime Distribution in Boston')
plt.show()

In [ ]:
yearByYear(regionedData, 'boston_data_analysis', 'Boston')

In [ ]:
monthByMonth(regionedData, 'boston_data_analysis', 'Boston')

In [48]:
# Read in the Chicago data set
import re
import warnings

years = 2000 + np.linspace(1,15, 15)
buckets = np.zeros(1)
target_type = str  # The desired output type

chicagoData = pd.DataFrame()
for year in years:

    data_file = '../../cs281_data/large_data/chicago/%d.0.csv' % year    
    with warnings.catch_warnings(record=True) as ws:
        warnings.simplefilter("always")

        data = pd.read_csv(data_file, sep=",", header=None)
        print("Warnings raised:", ws)
        # We have an error on specific columns, try and load them as string
        for w in ws:
            s = str(w.message)
            print("Warning message:", s)
            match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
            if match:
                columns = match.group(1).split(',') # Get columns as a list
                columns = [int(c) for c in columns]
                print("Applying %s dtype to columns:" % target_type, columns)
                data.iloc[:,columns] = data.iloc[:,columns].astype(target_type)
        
    chicagoData = chicagoData.append(data,ignore_index=True)
    print len(chicagoData)


('Warnings raised:', [])
485710
('Warnings raised:', [])
972433
('Warnings raised:', [])
1448320
('Warnings raised:', [])
1917629
('Warnings raised:', [])
2371247
('Warnings raised:', [])
2819223
('Warnings raised:', [])
3256088
('Warnings raised:', [])
3682969
('Warnings raised:', [])
4074628
('Warnings raised:', [])
4444648
('Warnings raised:', [])
4795983
('Warnings raised:', [])
5131475
('Warnings raised:', [])
5437860
('Warnings raised:', [])
5711613
('Warnings raised:', [])
5939364

In [49]:
# Extract Features
chicagoData.columns = ['Date', 'Latitude', 'Longitude']
pickledChicago = True
chicago_pickle_file = '../../cs281_data/large_data/chicago.pkl'
if not pickledChicago:
    
    date_time = np.array([x.split() for x in chicagoData.Date])
    date = date_time[:,0]
    time = date_time[:,1]
    tod = date_time[:,2]

    # month, day, year
    date = np.array([x.split('/') for x in date])
    month = [int(x) for x in date[:,0]]
    dom = [int(x) for x in date[:,1]]
    year = [int(x) for x in date[:,2]]
    min_year = np.min(year)
    min_month = np.min(month)
    time_feat = np.subtract(year, min_year)*12 + np.subtract(month,min_month)
    
    time_c = [x.split(':') for x in time]
    time = [int(x[1]) if (y == 'AM' and int(x[0]) == 12) else 60*int(x[0])+int(x[1]) 
            if (y =='AM' and int(x[0]) != 12) or (int(x[0]) == 12 and y == 'PM') else 12*60+60*int(x[0])+int(x[1]) 
            for x,y in zip(time_c, tod)]
    
    chicagoData['Month'] = month
    del month
    chicagoData['DoM'] = dom
    del dom
    chicagoData['Year'] = year
    del year
    chicagoData['ToD'] = tod
    del tod
    chicagoData['Time'] = time
    del time
    chicagoData['TimeFeature'] = time_feat
    del time_feat
    
    # Sort the results
    chicagoData = chicagoData.sort_values(by='TimeFeature')
    
    chicagoData = chicagoData.dropna()
    
    # Save the results to a pickle file
    chicagoData.to_pickle(chicago_pickle_file)
    print "Saved to Pickle File"

In [75]:
# Let's load the data 
with open(chicago_pickle_file) as f:
    %time
    chicagoClean = pickle.load(f)
del chicagoData


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.15 µs

In [51]:
# Create heatmaps and histograms if possible
columnsToPlotChicago = { 'Year' : 'Year', 
                         'Month' : 'Month',
                         'Day of Month' : 'DoM',
                         'Hour of Day' : 'ToD'}
plotCrimeCounts(chicagoClean, columnsToPlotChicago, 'Chicago')

In [52]:
# Make histograms of the crime
histograms(chicagoClean, 'Chicago', 'chicago_data_analysis')

In [92]:
# Partition into n = 30 
n = 30
%time
chicagoPartitions = createSimplePartitions(chicagoClean, n)


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.87 µs

In [93]:
# Heatmap of the entire Chicago region!
%time
sns.heatmap(createHeatmapData(chicagoPartitions, n))
plt.title('Crime Distribution in Chicago')
plt.show()


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 15 µs

In [94]:
%%timeit
yearByYear(chicagoPartitions, 'chicago_data_analysis', 'Chicago')


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs

In [95]:
%%timeit
monthByMonth(chicagoPartitions, 'chicago_data_analysis', 'Chicago')


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10 µs

In [ ]:
'''
THE BELOW HAS BEEN MOVED TO BASELINES.ipython
'''
# Calculate averages for each region based on the training set
# Last year
last_year = sfclean.Year.max()
training = sfclean[sfclean.Year != last_year]
test = sfclean[sfclean.Year == last_year]

# Partition the data
averageTrainingData = createPartitions(training, 10)

In [ ]:
years = averageTrainingData.Year.unique()
months = averageTrainingData.Month.unique()
regions = averageTrainingData.Region.unique()

# Calculate how many months of data we have
nmonths = 0
for year in years:
    nmonths += len(averageTrainingData[averageTrainingData.Year == year].Month.unique())

In [ ]:
nmonths

In [ ]:
# Now calculat ethe predictions!
predictions = {}
for region in regions:
    data = partitionedData[partitionedData.Region == region]
    crimes = len(data)
    predictions[region] = crimes / float(nmonths)

In [ ]:
# Now calculate the rmse
testAverage = createPartitions(test, 10)

In [ ]:
# Let's create a dictionary with the counts for region,month,year
from collections import defaultdict
crimeCounts = defaultdict(int)
for i, row in testAverage.iterrows():
    crimeCounts[(row.Region, row.Year, row.Month)] += 1

In [ ]:
# Now calculate the RMSE of this function:
error = []
for region in testAverage.Region.unique():
    data = testAverage[testAverage.Region == region]
    for year in data.Year.unique():
        data2 = data[data.Year == year]
        for month in data2.Month.unique():
            try:
                # print crimeCounts[(region, year, month)]
                # print predictions[region]
                error.append(crimeCounts[(region, year, month)] - 
                         predictions[region])
            except KeyError:
                print region

error = np.array(error)
rmse = np.sqrt(np.sum(error ** 2) / len(error))

In [ ]:
rmse

In [ ]:
np.std(crimeCounts.values())

In [ ]:
np.max(crimeCounts.values())

In [ ]: