The Atlanta Police Department provides Part 1 crime data at http://www.atlantapd.org/i-want-to/crime-data-downloads
A recent copy of the data file is stored in the cluster. Please, do not copy this data file into your home directory!
In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
ls -l /home/data/APD/COBRA-YTD*.csv.gz
In [ ]:
df = pd.read_csv('/home/data/APD/COBRA-YTD-multiyear.csv.gz')
In [13]:
df.shape
Out[13]:
In [14]:
df.dtypes
Out[14]:
In [15]:
#brdf = pd.read_csv('/home/pmolnar/burglary_residence.csv')
#brdf.head()
In [ ]:
dataDict = pd.DataFrame({'DataType': df.dtypes.values, 'Description': '', }, index=df.columns.values)
We need to enter the descriptions for each entry in our dictionary manually. However, why not just create a the Python code automatically...
Run the code below only if you haven't edited the datadict.py
file in a different way, since it will overwrite what you have so far. (That's why the code is commented-out.)
In [ ]:
dataDict
In [ ]:
with open("datadict2.py", "w") as io:
for i in dataDict.index:
io.write("dataDict.loc['%s'].Description = '' # type: %s\n" % (i, str(dataDict.loc[i].DataType)))
In [ ]:
ls -l datadict2.py
In [ ]:
# %load datadict.py
dataDict.loc['MI_PRINX'].Description = '' # type: int64
dataDict.loc['offense_id'].Description = 'Unique ID in the format YYDDDNNNN with the year YY, the day of the year DDD and a counter NNNN' # type: int64
dataDict.loc['rpt_date'].Description = 'Date the crime was reported' # type: object
dataDict.loc['occur_date'].Description = 'Estimated date when the crime occured' # type: object
dataDict.loc['occur_time'].Description = 'Estimated time when the crime occured' # type: object
dataDict.loc['poss_date'].Description = '' # type: object
dataDict.loc['poss_time'].Description = '' # type: object
dataDict.loc['beat'].Description = '' # type: int64
dataDict.loc['apt_office_prefix'].Description = '' # type: object
dataDict.loc['apt_office_num'].Description = '' # type: object
dataDict.loc['location'].Description = '' # type: object
dataDict.loc['MinOfucr'].Description = '' # type: int64
dataDict.loc['MinOfibr_code'].Description = '' # type: object
dataDict.loc['dispo_code'].Description = '' # type: object
dataDict.loc['MaxOfnum_victims'].Description = '' # type: float64
dataDict.loc['Shift'].Description = 'Zones have 8 or 10 hour shifts' # type: object
dataDict.loc['Avg Day'].Description = '' # type: object
dataDict.loc['loc_type'].Description = '' # type: float64
dataDict.loc['UC2 Literal'].Description = '' # type: object
dataDict.loc['neighborhood'].Description = '' # type: object
dataDict.loc['npu'].Description = '' # type: object
dataDict.loc['x'].Description = '' # type: float64
dataDict.loc['y'].Description = '' # type: float64
dataDict.to_csv("COBRA_Data_Dictionary.csv")
In [ ]:
sorted(df.npu.unique())
In [ ]:
len(df.neighborhood.unique())
datetime
to deal with converting parsing and formatting dates and times. See https://docs.python.org/2/library/datetime.htmlpandas
package provides functionality to convert text fields into date/time fields...given the values adhere to a given format. See http://pandas.pydata.org/pandas-docs/version/0.20/generated/pandas.to_datetime.html
In [16]:
df[['occur_date', 'occur_time']][0:5]
Out[16]:
In [18]:
# function currying
def fixdatetime(fld):
def _fix(s):
date_col = '%s_date' % fld
time_col = '%s_time' % fld
if time_col in s.index:
return str(s[date_col])+' '+str(s[time_col])
else:
return str(s[date_col])+' 00:00:00'
return _fix
##df.apply(fixdatetime('occur'), axis=1)[:10]
In [19]:
for col in ['rpt', 'occur', 'poss']:
datser = df.apply(fixdatetime(col), axis=1)
df['%s_dt'%col] = pd.to_datetime(datser, format="%m/%d/%Y %H:%M:%S", errors='coerce')
In [20]:
df.head()
Out[20]:
In [21]:
df.dtypes
Out[21]:
In [ ]:
df.beat[:10]
In [ ]:
df['Zone'] = df['beat']//100
In [ ]:
df.Zone[:4]
In [ ]:
df['UC2 Literal'].unique()
In [ ]:
##df[df['UC2 Literal']=='LARCENY-FROM VEHICLE']
In [ ]:
df.occur_date.min(), df.occur_date.max()
In [ ]:
df['Year'] = df.rpt_dt.map(lambda d: d.year)
In [ ]:
df.groupby('Year').offense_id.count()
In [ ]:
brdf = df[df['UC2 Literal']=='BURGLARY-RESIDENCE'].copy()
In [ ]:
brdf.shape, df.shape
In [ ]:
def gethour(d):
return d.hour
brdf.occur_dt.map(gethour)
##brdf['occur_dt'].map(gethour)
##brdf.occur_dt.map(lambda d: d.hour)
In [ ]:
print type(brdf.occur_dt)
brdf['Hour'] = brdf.occur_dt.apply(gethour)
In [ ]:
brdf.head()
In [ ]:
br_hr = brdf.groupby(['Hour']).offense_id.count()
In [ ]:
plt.step(br_hr.index, br_hr.values)
In [ ]:
plt.figure(figsize=(20,8))
for z in range(1,7):
plt.subplot(3,2,z)
plt.title("Zone %d" % z)
#brdf[brdf.Zone==z].hist(column='Hour', bins=24)
plt.hist(brdf[brdf.Zone==z].Hour, bins=24)
plt.show()
In [ ]:
plt.figure(figsize=(30,15))
for h in range(24):
plt.subplot(4,6,h+1)
plt.title("Hour %d" % h)
#brdf[brdf.Zone==z].hist(column='Hour', bins=24)
plt.hist(brdf[brdf.Hour==h].Zone, bins=6)
plt.ylim(0,40) ## sets limit on Y-axis
plt.show()
In [ ]:
In [ ]:
df['UC2 Literal'].unique()
In [ ]:
df.groupby(['UC2 Literal', 'Zone']).offense_id.count()
In [ ]:
df['dayofweek'] = df.occur_dt.map(lambda d: d.dayofweek)
In [ ]:
df.groupby(['UC2 Literal','dayofweek']).offense_id.count()
In [ ]:
brdf.apply(lambda r: str(r.location)+', '+str(r.npu), axis=1)
In [ ]:
brdf.apply(np.min, axis=0)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
df.occur_dt.map(lambda d: d.year).unique()
In [ ]:
df['Year'] = df.occur_dt.map(lambda d: d.year)
df2 = df[(df.Year>=2010) & (df.Year<=2017)]
df2.shape, df.shape
https://pandas.pydata.org/pandas-docs/stable/basics.html#descriptive-statistics
In [ ]:
In [ ]:
In [ ]:
df_LarcenyFromVehicle = df2[(df2['UC2 Literal']=='LARCENY-FROM VEHICLE')&(df2.Year==2017)].copy()
agr_LarcenyFromVehicle = df_LarcenyFromVehicle.set_index('occur_dt').resample('W').offense_id.count()
agr_LarcenyFromVehicle
In [ ]:
df_LarcenyFromVehicle["Hour"] = df_LarcenyFromVehicle.occur_dt.map(lambda d: d.hour)
In [ ]:
df_LarcenyFromVehicle.groupby("Hour").offense_id.count()
In [ ]:
hourly = df_LarcenyFromVehicle.resample('H', on='occur_dt').offense_id.count()
In [ ]:
hourly.reset_index().occur_dt.map(lambda d: d.week)
In [ ]:
df3 = pd.DataFrame({"N": hourly})
##df3['Day'] = df3.reset_index().occur_dt ##.map(lambda d: d.day)
df3
In [ ]:
ls
In [22]:
df.columns
Out[22]:
In [24]:
df['occur_month'] = df['occur_dt'].map(lambda dt: dt.month)
In [29]:
df['occur_year'] = df['occur_dt'].map(lambda dt: dt.year)
In [30]:
resdf = df.groupby(['UC2 Literal', 'occur_year', 'occur_month']).offense_id.count()
resdf.head()
Out[30]:
In [26]:
resdf_tbl = resdf.reset_index()
resdf_tbl.head()
Out[26]:
In [38]:
fig = plt.figure(figsize=(10,6))
for yy in range(2009, 2017):
plt.plot(resdf['BURGLARY-RESIDENCE'][yy].index, resdf['BURGLARY-RESIDENCE'][yy], marker='x', label=str(yy))
plt.legend()
plt.ylim(0, 1000)
plt.title('BURGLARY-RESIDENCE')
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
##plt.savefig('quiz3-burglary-residence.png')
;
Out[38]:
In [ ]:
fig = plt.figure(figsize=(10,6)) # 10inx10in
#plt.plot(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'])
plt.scatter(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'], marker='x')
plt.scatter(resdf['BURGLARY-NONRES'].index, resdf['BURGLARY-NONRES'], marker='o')
plt.ylim(0, 500)
plt.title('BURGLARY-RESIDENCE')
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
fig.savefig('BurglaryResidence_over_month.svg')
x = 1
In [ ]:
fig = plt.figure(figsize=(40,30))
crime_types = crime_year.index.levels[0]
years = crime_year.index.levels[1]
for c in range(len(crime_types)):
y_max = max(crime_year.loc[crime_types[c]])
plt.subplot(4,3,c+1)
plt.hlines(crime_year.loc[crime_types[c]].iloc[-1]*100/y_max, years[0], years[-1], linestyles="dashed", color="r")
plt.bar(crime_year.loc[crime_types[c]].index, crime_year.loc[crime_types[c]]*100/y_max, label=crime_types[c], alpha=0.5)
##plt.legend()
plt.ylim(0, 100)
plt.xticks(years+0.4, [str(int(y)) for y in years], rotation=0, fontsize=24)
plt.yticks([0,20,40,60,80,100], ['0%','20%','40%','60%','80%','100%'], fontsize=24)
plt.title(crime_types[c], fontsize=30)
None
In [ ]:
c = 3 ## 'BURGLARY-RESIDENCE'
resburglaries = crime_year_month.loc[crime_types[c]]
fig = plt.figure(figsize=(20,10))
for y in years:
plt.plot(resburglaries.loc[y].index, resburglaries.loc[y], label=("%4.0f"%y))
plt.legend()
plt.title("Seasonal Trends - %s"%crime_types[c], fontsize=20)
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
None
In [ ]:
c = 3 ## 'BURGLARY-RESIDENCE'
fig = plt.figure(figsize=(20,10))
for y in years:
avg = resburglaries.loc[y].mean()
std = resburglaries.loc[y].std()
##plt.hlines(avg, 1, 13, linestyle='dashed')
plt.plot(resburglaries.loc[y].index, (resburglaries.loc[y]-avg)/std, label=("%4.0f"%y))
plt.legend()
plt.title("Seasonal Trends - %s (normalized)"%crime_types[c], fontsize=20)
plt.xticks(list(range(1,13)), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
plt.ylabel("Standard deviations $\sigma_y$")
None
In [ ]:
In [ ]:
In [ ]: