In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# set neat seaborn whitegrid styles for matplotlib charts
plt.style.use('seaborn')
sns.set_style('whitegrid')
In [3]:
# load csv data into dask df; malformed rows are skipped instead of aborting the load
file_name = '../data/Crimes_-_2017.csv'
# pandas 1.3 replaced `error_bad_lines=False` with `on_bad_lines` and pandas 2.0
# removed the old keyword, so pass whichever spelling this install understands.
# 'warn' matches the old behaviour: the bad line is reported and then skipped.
pandas_major_minor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandas_major_minor >= (1, 3):
    bad_line_options = {'on_bad_lines': 'warn'}
else:
    bad_line_options = {'error_bad_lines': False}
crimes = dd.read_csv(file_name,
                     assume_missing=True,
                     **bad_line_options)
In [4]:
%%time
# log records count and data partitions
print('{:,} total records in {} partitions'\
.format(len(crimes), crimes.npartitions))
In [5]:
%%time
# drop duplicates
crimes.drop_duplicates(subset=['ID', 'Case Number'], inplace=True)
Out[5]:
In [6]:
%%time
# persist in memory and rebind `crimes` to the persisted dataframe
crimes = crimes.persist()
In [7]:
%%time
# report the dataframe's size; `.size` is evaluated with .compute() before formatting
print("DataFrame size: {:,}".format(crimes.size.compute()))
In [8]:
# preview the first 2 records
crimes.head(2)
Out[8]:
In [9]:
# preview the last 2 records
crimes.tail(2)
Out[9]:
In [10]:
# remove the spaces from column names; later cells refer to the
# space-free names such as 'PrimaryType' and 'LocationDescription'
crimes = crimes.rename(
    columns={name: name.replace(' ', '') for name in crimes.columns})
# preview the first 2 records with the new column names
crimes.head(2)
Out[10]:
In [11]:
# list the column names after the rename
crimes.columns
Out[11]:
In [12]:
# show the data type inferred for each column
crimes.dtypes
Out[12]:
In [13]:
def unique_column_values(df):
    """Print one ``name | unique value count | dtype`` line per column of ``df``.

    Args:
        df: dataframe whose columns are inspected one at a time.

    Returns:
        None. The summary lines are written to stdout.
    """
    for column in df.columns:
        series = df[column]
        # the length of unique() is the number of distinct values in the column
        distinct_count = len(series.unique())
        print("{} | {} | {}".format(series.name, distinct_count, series.dtype))
In [14]:
%%time
# print a header row, then one "name | unique count | dtype" line per column
print("Name | Unique # | Type")
unique_column_values(crimes)
In [15]:
# reduce data set: the columns kept for the rest of the analysis
select_columns = [
    'Date', 'Block', 'PrimaryType', 'Description', 'LocationDescription',
    'Arrest', 'Domestic', 'Latitude', 'Longitude',
]
In [16]:
# keep only the selected columns, then report the record count and preview the result
crimes = crimes[select_columns]
print("{:,} total records".format(len(crimes)))
crimes.head(2)
Out[16]:
In [17]:
# drop rows that are exact duplicates across the remaining columns
# NOTE: rows with missing values are kept -- the trailing .dropna() is disabled
crimes = crimes.drop_duplicates() #.dropna()
# report how many records remain and preview the first 2
print("{:,} total records".format(len(crimes)))
crimes.head(2)
Out[17]:
In [18]:
# count arrests: keep only the reports whose Arrest value is True
arrests = crimes[crimes.Arrest==True]
print("{:,} arrests".format(len(arrests)))
# preview the first arrest records
arrests.head()
Out[18]:
In [19]:
# domestic violence: keep only the reports whose Domestic value is True
domestic = crimes[crimes.Domestic==True]
print("{:,} domestic crime reports".format(len(domestic)))
# preview the first domestic crime records
domestic.head()
Out[19]:
In [20]:
# print dask's summary of the reduced crimes dataframe
crimes.info()
In [21]:
# get narcotics crimes
narcotics = crimes[crimes['PrimaryType']=='NARCOTICS']
# index the subset by its Date column
# NOTE(review): no visible cell parses Date, so it is presumably still the raw
# csv text rather than a datetime -- confirm before relying on time ordering
narcotics = narcotics.set_index('Date')
narcotics.head()
Out[21]:
In [22]:
# count narcotics reports per Description (the PrimaryType column holds the
# per-group count after count()), computed eagerly and sorted largest first
narcotics_crimes = (
    narcotics[['PrimaryType', 'Description']]
    .groupby('Description')
    .count()
    .compute()
    .sort_values(by='PrimaryType', ascending=False)
)
# show the 10 most frequent descriptions, then how many descriptions exist in total
print(narcotics_crimes.head(10))
print('...')
print('Total Narcotics Crime Descriptions: {}'.format(len(narcotics_crimes)))
In [23]:
# plot top 20 narcotics crimes as horizontal bars; the slice is re-sorted
# ascending before plotting
ax = narcotics_crimes[:20].sort_values(by='PrimaryType', ascending=True).plot(
    kind='barh', figsize=(6, 6), color='#cc0000')
ax.set_title('Top 2017 Chicago Narcotics Crimes')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Narcotics Crime Type')
# format the count axis with thousands separators and no decimals
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [24]:
# get other offenses crimes
other_offenses = crimes[crimes['PrimaryType']=='OTHER OFFENSE']
# index the subset by its Date column
# NOTE(review): no visible cell parses Date, so it is presumably still the raw
# csv text rather than a datetime -- confirm before relying on time ordering
other_offenses = other_offenses.set_index('Date')
other_offenses.head()
Out[24]:
In [25]:
# count other-offense reports per Description (the PrimaryType column holds the
# per-group count after count()), computed eagerly and sorted largest first
other_offense_crimes = (
    other_offenses[['PrimaryType', 'Description']]
    .groupby('Description')
    .count()
    .compute()
    .sort_values(by='PrimaryType', ascending=False)
)
# show the 10 most frequent descriptions, then how many descriptions exist in total
print(other_offense_crimes.head(10))
print('...')
print('Total Other Offense Crime Descriptions: {}'.format(len(other_offense_crimes)))
In [26]:
# plot top 20 other offense crimes as horizontal bars; the slice is re-sorted
# ascending before plotting
ax = other_offense_crimes[:20].sort_values(by='PrimaryType', ascending=True).plot(
    kind='barh', figsize=(6, 6), color='#cc0000')
ax.set_title('Top 2017 Chicago Other Offense Crimes')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Other Offense Crime Type')
# format the count axis with thousands separators and no decimals
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [ ]: