In [1]:
import pandas as pd
# Truncate displayed DataFrames/Series to at most 6 rows to keep cell output short.
pd.options.display.max_rows = 6
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
# Load the crimes table; the 'Date' column is parsed into datetimes while reading.
crimes = pd.read_csv(
    'Crimes_-_2001_to_present.csv',
    parse_dates=['Date'],
)
In [3]:
# Group just the first 50 rows by community area, then show which row
# labels ended up in each group.
crimes50_by_community = crimes.iloc[:50].groupby(by='Community Area')
crimes50_by_community.groups
Out[3]:
In [4]:
# Group the full crimes table by community area.
crimes_by_community = crimes.groupby(by='Community Area')
In [5]:
# Number of non-null 'ID' values in each community-area group.
community_crime_count = crimes_by_community['ID'].count()
community_crime_count
Out[5]:
In [6]:
# Bar chart of the per-community-area counts on a wide (12x5) canvas.
community_crime_count.plot.bar(figsize=(12, 5))
Out[6]:
In [7]:
def to_day(timestamp):
    """Truncate a timestamp to midnight of the same calendar day.

    Parameters
    ----------
    timestamp : datetime-like
        An object exposing a ``datetime.replace``-style method, e.g.
        ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    The value returned by ``timestamp.replace`` with hour, minute, second
    and microsecond all set to zero.
    """
    # The microsecond field has to be cleared as well; otherwise two
    # timestamps on the same day that differ only below one second would
    # map to different "days" and be grouped separately.
    return timestamp.replace(hour=0, minute=0, second=0, microsecond=0)
# Add a 'Day' column holding each row's 'Date' truncated to midnight.
# The vectorised .dt accessor avoids a Python-level call per row and
# clears every sub-day component (including fractions of a second).
crimes['Day'] = crimes['Date'].dt.normalize()
In [8]:
# Group the crimes table by the truncated 'Day' column.
crimes_by_day = crimes.groupby(by='Day')
In [36]:
# Line plot of the number of distinct 'ID' values per day.
crimes_by_day['ID'].nunique().plot()
Out[36]:
What about a time series for a single community area?
In [10]:
# Daily count of 'ID' values for the rows whose community area equals 41.
(
    crimes.loc[crimes['Community Area'] == 41]
    .groupby('Day')['ID']
    .count()
    .plot()
)
Out[10]:
In [11]:
# Two-level grouping: one group per (community area, day) pair.
crimes_by_community_day = crimes.groupby(by=['Community Area', 'Day'])
# Non-null 'ID' count within each (community area, day) group.
crimes_by_community_day_count = crimes_by_community_day['ID'].count()
In [12]:
# Display the per-(community area, day) counts.
crimes_by_community_day_count
Out[12]:
Similarly, we can total the arrests per community area and day; dividing that by the crime count gives the proportion of crimes that led to an arrest:
In [13]:
# Sum of the 'Arrest' column per (community area, day) group.
# NOTE(review): presumably 'Arrest' is boolean/0-1, so this is the number
# of arrests -- confirm against the CSV schema.
crimes_by_community_day_arrests = crimes_by_community_day['Arrest'].sum()
In [14]:
# Element-wise ratio of the arrest sums to the ID counts; both series come
# from the same (community area, day) grouping.
community_day_arrest_prop = crimes_by_community_day_arrests.div(
    crimes_by_community_day_count
)
community_day_arrest_prop
Out[14]:
In [40]:
# Count 'ID' values per (community area, primary type), then pivot the
# community-area level into columns: rows are primary types.
(
    crimes.groupby(['Community Area', 'Primary Type'])['ID']
    .count()
    .unstack(level='Community Area')
)
Out[40]:
In [15]:
# Pivot the community-area index level into columns (one column per area).
community_day_arrest_prop.unstack(level='Community Area')
Out[15]:
There are missing values after reshaping.
A missing value means there were no crimes recorded in that area on that day, so it could be filled with `fillna(0)`.
In [42]:
# Wide table: one row per day, one column per community area.
community_arrest_timeseries = community_day_arrest_prop.unstack(
    level='Community Area'
)
# NOTE(review): the fill step below is commented out, so (day, area)
# combinations with no rows stay NaN in this table.
#community_arrest_timeseries.fillna(0, inplace=True)
community_arrest_timeseries
Out[42]:
Now we can plot multiple community area timeseries:
In [43]:
# Keep only the columns for community areas 40, 41 and 42.
community_arrest_timeseries.loc[:, [40, 41, 42]]
Out[43]:
In [55]:
# Keep only the columns for community areas 1 and 10.
community_arrest_timeseries.loc[:, [1, 10]]
Out[55]:
In [44]:
# One line per selected community area (40, 41, 42) on a shared axis.
community_arrest_timeseries.loc[:, [40, 41, 42]].plot()
Out[44]:
In [18]:
# Load the affordable rental housing developments table.
housing = pd.read_csv(
    'Affordable_Rental_Housing_Developments.csv',
)
In [56]:
# Frequency of each distinct value in the 'Property Type' column.
housing['Property Type'].value_counts()
Out[56]:
In [19]:
# Display the housing table.
housing
Out[19]:
In [20]:
# Promote the per-area count series to a one-column frame named
# 'Crime Count' so it can be merged with other tables.
community_area_crime = community_crime_count.to_frame(name='Crime Count')
community_area_crime
Out[20]:
In [21]:
# Attach the area-level crime count to each housing row: match the
# housing column 'Community Area Number' against the crime table's index.
housing_crime = pd.merge(
    housing,
    community_area_crime,
    left_on='Community Area Number',
    right_index=True,
)
housing_crime
Out[21]:
In [22]:
# Total 'Units' per community area number, as a one-column frame.
community_housing = (
    housing.groupby('Community Area Number')['Units']
    .sum()
    .to_frame(name='Affordable Housing Units')
)
community_housing
Out[22]:
In [57]:
# (rows, columns) of the per-area housing table, for comparison with the merged result.
community_housing.shape
Out[57]:
In [58]:
# Index-on-index merge with the default join type: only index values
# present in both tables are kept.
housing_crime_aggregate = pd.merge(
    community_housing,
    community_area_crime,
    left_index=True,
    right_index=True,
)
housing_crime_aggregate.shape
Out[58]:
In [59]:
# Display the merged housing/crime table.
housing_crime_aggregate
Out[59]:
In [24]:
# Same index-on-index merge, but as an outer join: index values found in
# either table are kept, with NaN where the other table has no row.
housing_crime_aggregate = pd.merge(
    community_housing,
    community_area_crime,
    how='outer',
    left_index=True,
    right_index=True,
)
housing_crime_aggregate.shape
Out[24]:
In [25]:
# Display the merged housing/crime table.
housing_crime_aggregate
Out[25]:
In [63]:
# Scratch frame: 10 rows (index 0..9), two columns, no data supplied,
# so every cell starts out missing.
df = pd.DataFrame(
    index=range(10),
    columns=['column1', 'asdf'],
)
In [70]:
# Set 'column1' in the first row with a single .loc call.
# The chained form df.iloc[0]['column1'] = 10 assigns into the intermediate
# row object returned by df.iloc[0], which may be a copy; in that case
# (and always under copy-on-write) df itself is left unchanged.
df.loc[df.index[0], 'column1'] = 10
In [71]:
# Display the scratch frame.
df
Out[71]:
Rows without a corresponding row in the other table of the merge are filled with missing values.
In our case those should be zeros, so we can use `fillna(0)`.
In [26]:
# Replace every missing value in the merged table with 0, then plot
# housing units against crime count, one point per row.
housing_crime_aggregate = housing_crime_aggregate.fillna(0)
housing_crime_aggregate.plot.scatter(x='Affordable Housing Units', y='Crime Count')
Out[26]: