The Atlanta Police Department provides Part 1 crime data at http://www.atlantapd.org/i-want-to/crime-data-downloads
A recent copy of the data file is stored in the cluster. Please, do not copy this data file into your home directory!
In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
In [3]:
# Load the APD COBRA crime table from the shared copy on the cluster.
df = pd.read_csv('/home/data/APD/COBRA-YTD2017.csv.gz')
# A single pre-formatted argument keeps this line valid both as a Python 2
# print statement and as a Python 3 print() call (the two spaces reproduce
# the separator the original two-item print statement emitted).
print("Shape of table:  %s" % (df.shape,))
Let's look at the structure of this table. We're actually creating some text output that can be used to create a data dictionary.
In [ ]:
# Data-dictionary skeleton: one row per column of df, holding that column's
# dtype and an (initially empty) free-text description.
dataDict = pd.DataFrame(
    {'DataType': df.dtypes.values, 'Description': ''},
    index=df.columns.values,
)
We need to enter the descriptions for each entry in our dictionary manually. However, why not just create the Python code automatically...
Run the code below only if you haven't already edited the datadict.py file, since it will overwrite what you have so far. (That's why the code is commented-out.)
In [ ]:
# with open("datadict.py", "w") as io:
# for i in dataDict.index:
# io.write("dataDict.loc['%s'].Description = '' # type: %s\n" % (i, str(dataDict.loc[i].DataType)))
In [ ]:
# %load datadict.py
# Each description is written with a single .loc[row, column] assignment.
# The generated form `dataDict.loc[row].Description = ...` is chained
# assignment: it only reaches dataDict when .loc[row] happens to return a
# view, which pandas does not guarantee.
# NOTE: .loc[row, column] = value appends a new row when `row` is missing, so
# the labels below must match df.columns exactly.
# NOTE: the commented-out generator cell above still emits the chained form;
# adjust its template before regenerating datadict.py.
dataDict.loc['MI_PRINX', 'Description'] = '' # type: int64
dataDict.loc['offense_id', 'Description'] = 'Unique ID in the format YYDDDNNNN with the year YY, the day of the year DDD and a counter NNNN' # type: int64
dataDict.loc['rpt_date', 'Description'] = 'Date the crime was reported' # type: object
dataDict.loc['occur_date', 'Description'] = 'Estimated date when the crime occured' # type: object
dataDict.loc['occur_time', 'Description'] = 'Estimated time when the crime occured' # type: object
dataDict.loc['poss_date', 'Description'] = '' # type: object
dataDict.loc['poss_time', 'Description'] = '' # type: object
dataDict.loc['beat', 'Description'] = '' # type: int64
dataDict.loc['apt_office_prefix', 'Description'] = '' # type: object
dataDict.loc['apt_office_num', 'Description'] = '' # type: object
dataDict.loc['location', 'Description'] = '' # type: object
dataDict.loc['MinOfucr', 'Description'] = '' # type: int64
dataDict.loc['MinOfibr_code', 'Description'] = '' # type: object
dataDict.loc['dispo_code', 'Description'] = '' # type: object
dataDict.loc['MaxOfnum_victims', 'Description'] = '' # type: float64
dataDict.loc['Shift', 'Description'] = 'Zones have 8 or 10 hour shifts' # type: object
dataDict.loc['Avg Day', 'Description'] = '' # type: object
dataDict.loc['loc_type', 'Description'] = '' # type: float64
dataDict.loc['UC2 Literal', 'Description'] = '' # type: object
dataDict.loc['neighborhood', 'Description'] = '' # type: object
dataDict.loc['npu', 'Description'] = '' # type: object
dataDict.loc['x', 'Description'] = '' # type: float64
dataDict.loc['y', 'Description'] = '' # type: float64
# Persist the dictionary (index = column names) next to the notebook.
dataDict.to_csv("COBRA_Data_Dictionary.csv")
In [ ]:
# Distinct values of the Shift column, taken from the group keys.
# Parenthesised single argument: valid as a Python 2 print statement and as a
# Python 3 print() call, with identical output.
print(df.groupby("Shift").count().index)
Python provides the `datetime` module to deal with converting, parsing and formatting dates and times. See https://docs.python.org/2/library/datetime.html
The `pandas` package provides functionality to convert text fields into date/time fields...given the values adhere to a given format. See http://pandas.pydata.org/pandas-docs/version/0.20/generated/pandas.to_datetime.html
In [4]:
# closure factory: fixdatetime(fld) returns a row-wise function for
# df.apply(..., axis=1)
def fixdatetime(fld):
    """Build a row function that joins '<fld>_date' and '<fld>_time'.

    Parameters
    ----------
    fld : str
        Column-name prefix, e.g. 'rpt', 'occur' or 'poss'.

    Returns
    -------
    callable
        A function taking one row (a Series indexed by column name) and
        returning the string '<date> <time>'. When the row has no
        '<fld>_time' entry, ' 00:00:00' is appended to the date instead.
    """
    # The column names depend only on fld, so build them once here rather
    # than once per row inside _fix.
    date_col = '%s_date' % fld
    time_col = '%s_time' % fld

    def _fix(s):
        """Return the combined date/time string for the row ``s``."""
        if time_col in s.index:
            return str(s[date_col]) + ' ' + str(s[time_col])
        # No time column for this prefix: fall back to midnight.
        return str(s[date_col]) + ' 00:00:00'

    return _fix
# Preview: combined date/time strings of the 'rpt' fields for the first ten rows.
df.apply(fixdatetime('rpt'), axis=1).head(10)
Out[4]:
In [5]:
# Add parsed timestamp columns rpt_dt / occur_dt / poss_dt to df.
# Strings that do not match the format become NaT (errors='coerce').
for col in ('rpt', 'occur', 'poss'):
    df['%s_dt' % col] = pd.to_datetime(
        df.apply(fixdatetime(col), axis=1),
        format="%m/%d/%Y %H:%M:%S",
        errors='coerce',
    )
In [6]:
# Show the first rows of df, which now carries the rpt_dt / occur_dt / poss_dt columns.
df.head()
Out[6]:
In [41]:
# Zone = beat number integer-divided by 100
# (presumably the leading digit of a beat encodes its zone — TODO confirm).
df['Zone'] = df['beat'].floordiv(100)
In [ ]:
In [ ]:
# List the distinct values found in the 'UC2 Literal' column.
df['UC2 Literal'].unique()
In [42]:
# Rows whose 'UC2 Literal' is exactly 'LARCENY-FROM VEHICLE'.
df.loc[df['UC2 Literal'] == 'LARCENY-FROM VEHICLE']
Out[42]:
In [ ]:
# Distinct occurrence years, via the vectorised .dt accessor instead of a
# per-element Python lambda (rows with NaT show up as NaN).
df.occur_dt.dt.year.unique()
In [43]:
# Year of occurrence via the vectorised .dt accessor (NaT rows become NaN).
df['Year'] = df.occur_dt.dt.year
# Keep incidents that occurred in 2010-2017. between() is inclusive on both
# ends, and NaN years compare False, so rows without a valid date are dropped.
df2 = df[df.Year.between(2010, 2017)]
# Compare the filtered and the full table sizes.
df2.shape, df.shape
Out[43]:
https://pandas.pydata.org/pandas-docs/stable/basics.html#descriptive-statistics
In [ ]:
In [ ]:
In [44]:
# 2017 larceny-from-vehicle incidents only. The .copy() gives an independent
# frame, so columns added to it later do not touch df2.
df_LarcenyFromVehicle = df2.loc[
    (df2['UC2 Literal'] == 'LARCENY-FROM VEHICLE') & (df2.Year == 2017)
].copy()
# Number of incidents per weekly ('W') bin of the occurrence timestamp.
agr_LarcenyFromVehicle = (
    df_LarcenyFromVehicle
    .set_index('occur_dt')
    .resample('W')['offense_id']
    .count()
)
agr_LarcenyFromVehicle
Out[44]:
In [30]:
# Hour of day of each incident, via the vectorised .dt accessor instead of a
# per-element Python lambda.
df_LarcenyFromVehicle["Hour"] = df_LarcenyFromVehicle.occur_dt.dt.hour
In [33]:
# Incident count for each value of the Hour column.
df_LarcenyFromVehicle.groupby("Hour")["offense_id"].count()
Out[33]:
In [53]:
# Incident counts in hourly ('H') bins of the occurrence timestamp.
hourly = df_LarcenyFromVehicle.resample('H', on='occur_dt')['offense_id'].count()
In [71]:
# Turn the hourly index (occur_dt) back into a column and map each timestamp
# to its `week` attribute.
hourly.reset_index().occur_dt.map(lambda d: d.week)
Out[71]:
In [64]:
# Wrap the hourly counts in a one-column frame named "N"; the occur_dt index
# is carried over unchanged.
df3 = hourly.to_frame(name="N")
##df3['Day'] = df3.reset_index().occur_dt ##.map(lambda d: d.day)
df3
Out[64]:
In [73]:
ls
In [ ]:
# Monthly counts of residential vs. non-residential burglaries as a scatter
# plot, saved as SVG.
# NOTE(review): this cell reads `resdf`, which the visible notebook only
# defines in a later cell; that cell has to be run first.
fig = plt.figure(figsize=(10,6)) # 10in x 6in
#plt.plot(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'])
# Label both series so the legend tells the two marker styles apart.
plt.scatter(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'], marker='x', label='BURGLARY-RESIDENCE')
plt.scatter(resdf['BURGLARY-NONRES'].index, resdf['BURGLARY-NONRES'], marker='o', label='BURGLARY-NONRES')
plt.ylim(0, 500)
plt.title('BURGLARY-RESIDENCE')
# Tick 0 is left blank so that ticks 1..12 line up with the month numbers.
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.legend()
fig.savefig('BurglaryResidence_over_month.svg')
# NOTE(review): `x` is not read by any visible cell — presumably a leftover;
# kept in case something outside this view relies on it.
x = 1
In [ ]:
def getTheMonth(x):
    """Return the ``month`` attribute of ``x`` (e.g. of a timestamp)."""
    return x.month

# Use 'occur_dt', the occurrence timestamp column built earlier in this
# notebook; no visible cell creates an 'occur_ts' column.
df['occur_month'] = df['occur_dt'].map(getTheMonth)
In [ ]:
# Incident counts per (crime type, month of occurrence).
resdf = df.groupby(['UC2 Literal', 'occur_month'])['offense_id'].count()

# Scatter plot of the monthly residential-burglary counts, saved as PNG.
fig = plt.figure(figsize=(10, 6))
residence_counts = resdf['BURGLARY-RESIDENCE']
plt.scatter(residence_counts.index, residence_counts, marker='x')
plt.ylim(0, 500)
plt.title('BURGLARY-RESIDENCE')
# Tick 0 is left blank so that ticks 1..12 line up with the month numbers.
plt.xticks(
    range(13),
    ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
)
plt.savefig('quiz3-burglary-residence.png')
In [ ]:
# One bar chart per crime type on a 4x3 grid; each panel shows the per-year
# values as a percentage of that crime type's maximum.
# NOTE(review): `crime_year` is not defined in the visible cells; from its use
# here it is presumably a Series with a two-level (crime type, year) index —
# TODO confirm.
fig = plt.figure(figsize=(40,30))
crime_types = crime_year.index.levels[0]
years = crime_year.index.levels[1]
for c, crime_type in enumerate(crime_types):
    counts = crime_year.loc[crime_type]
    y_max = max(counts)
    plt.subplot(4, 3, c + 1)
    # Dashed red reference line at the level of the last entry of the series.
    plt.hlines(counts.iloc[-1]*100/y_max, years[0], years[-1], linestyles="dashed", color="r")
    plt.bar(counts.index, counts*100/y_max, label=crime_type, alpha=0.5)
    ##plt.legend()
    plt.ylim(0, 100)
    plt.xticks(years+0.4, [str(int(y)) for y in years], rotation=0, fontsize=24)
    plt.yticks([0,20,40,60,80,100], ['0%','20%','40%','60%','80%','100%'], fontsize=24)
    plt.title(crime_type, fontsize=30)
# Bare None as the last expression so the cell displays no output value.
None
In [ ]:
# One line per year of the selected crime type's values, on a shared axis.
# NOTE(review): `crime_year_month` is not defined in the visible cells;
# presumably it is indexed by (crime type, year, month) — TODO confirm.
c = 3 ## 'BURGLARY-RESIDENCE'
resburglaries = crime_year_month.loc[crime_types[c]]
fig = plt.figure(figsize=(20,10))
for y in years:
    per_year = resburglaries.loc[y]
    plt.plot(per_year.index, per_year, label=("%4.0f" % y))
plt.legend()
plt.title("Seasonal Trends - %s" % crime_types[c], fontsize=20)
# Tick 0 is left blank so that ticks 1..12 line up with the month numbers.
plt.xticks(
    range(13),
    ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
)
plt.xlim(0,13)
# Bare None as the last expression so the cell displays no output value.
None
In [ ]:
# Same per-year lines as the un-normalized plot, but each year's series is
# standardised: (value - that year's mean) / that year's standard deviation.
# Reads `resburglaries`, `years` and `crime_types` set by earlier cells.
c = 3 ## 'BURGLARY-RESIDENCE'
fig = plt.figure(figsize=(20,10))
for y in years:
    per_year = resburglaries.loc[y]
    avg = per_year.mean()
    std = per_year.std()
    ##plt.hlines(avg, 1, 13, linestyle='dashed')
    plt.plot(per_year.index, (per_year-avg)/std, label=("%4.0f"%y))
plt.legend()
plt.title("Seasonal Trends - %s (normalized)"%crime_types[c], fontsize=20)
plt.xticks(list(range(1,13)), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
# The text passed to matplotlib is unchanged.
plt.ylabel(r"Standard deviations $\sigma_y$")
# Bare None as the last expression so the cell displays no output value.
None
In [ ]:
In [ ]:
In [ ]: