In [ ]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
The Atlanta Police Department provides Part 1 crime data at http://www.atlantapd.org/i-want-to/crime-data-downloads
A recent copy of the data file is stored in the cluster. Please, do not copy this data file into your home directory!
In [1]:
### Load libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [ ]:
help(plt.legend)
Load data (don't change this if you're running the notebook on the cluster)
We have two files
/home/data/APD/COBRA083016_2015.xlsx
for 2015/home/data/APD/COBRA083016.xlsx
from 2009 to current date
In [2]:
%%time
df = pd.read_excel('/home/data/APD/COBRA083016_2015.xlsx', sheetname='Query')
In [3]:
df.shape
Out[3]:
In [4]:
for c in df.columns:
print(c)
In [5]:
df[0:5]
Out[5]:
In [6]:
df.describe()
Out[6]:
In [7]:
df.offense_id.min(), df.offense_id.max()
Out[7]:
In [19]:
crime_summary = df.groupby(['UC2 Literal', 'Zone']).offense_id.count()
In [21]:
crime_summary.index
Out[21]:
In [22]:
crime_summary.reset_index().head()
Out[22]:
In [ ]:
In [ ]:
In [ ]:
In [16]:
df["Zone"] = df.beat // 100
In [17]:
df
Out[17]:
In [23]:
df[['offense_id', 'occur_date', 'occur_time', 'rpt_date']][1:10]
Out[23]:
Convert into date-time type
In [ ]:
df['occur_ts'] = pd.to_datetime(df.occur_date+' '+df.occur_time)
In [ ]:
#df[['offense_id', 'occur_date', 'occur_time', 'occur_ts', 'rpt_date']][1:10]
In [ ]:
In [24]:
df['occur_ts'] = pd.to_datetime(df.occur_date+' '+df.occur_time)
In [25]:
df['occur_month'] = df['occur_ts'].map(lambda x: x.month)
df['occur_woy'] = df.occur_ts.dt.weekofyear
In [26]:
df.describe()
Out[26]:
In [37]:
df.shape
Out[37]:
In [38]:
df.columns
Out[38]:
In [49]:
df.iloc[9]
Out[49]:
In [45]:
resdf.index
Out[45]:
In [48]:
resdf.loc[['AUTO_THEFT', 6]]
Out[48]:
In [35]:
resdf = df.groupby(['UC2 Literal', 'occur_month']).offense_id.count()
resdf
Out[35]:
In [28]:
resdf['BURGLARY-RESIDENCE'].as_matrix()
Out[28]:
In [ ]:
resdf['BURGLARY-RESIDENCE'].iloc(0)
In [29]:
%matplotlib inline
fig = plt.figure(figsize=(10,6)) # 10inx10in
#plt.plot(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'])
plt.scatter(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'], marker='x')
plt.scatter(resdf['BURGLARY-NONRES'].index, resdf['BURGLARY-NONRES'], marker='o')
plt.ylim(0, 500)
plt.title('BURGLARY-RESIDENCE')
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
fig.savefig('BurglaryResidence_over_month.svg')
x = 1
In [32]:
def getTheMonth(x):
return x.month
df['occur_month'] = df['occur_ts'].map(getTheMonth)
In [30]:
df = pd.read_excel('/home/data/APD/COBRA083016_2015.xlsx', sheetname='Query')
df['occur_ts'] = pd.to_datetime(df.occur_date+' '+df.occur_time)
df['occur_month'] = df['occur_ts'].map(lambda x: x.month)
df['occur_woy'] = df.occur_ts.dt.weekofyear
In [31]:
%matplotlib inline
resdf = df.groupby(['UC2 Literal', 'occur_month']).offense_id.count()
fig = plt.figure(figsize=(10,6))
plt.scatter(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'], marker='x')
plt.ylim(0, 500)
plt.title('BURGLARY-RESIDENCE')
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.savefig('quiz3-burglary-residence.png')
''
Out[31]:
In [ ]:
plt.savefig('quiz3-burglary-residence.png')
In [ ]:
In [ ]:
In [34]:
## load complete dataset
dff = pd.read_excel('/home/data/APD/COBRA083016.xlsx', sheetname='Query')
In [ ]:
dff.shape
In [ ]:
for evt in ['occur', 'poss']:
dff['%s_ts'%evt] = pd.to_datetime(dff['%s_date'%evt]+' '+dff['%s_time'%evt])
dff['rpt_ts'] = pd.to_datetime(dff.rpt_date)
In [ ]:
', '.join(dff.columns)
In [ ]:
dff['occur_year'] = dff.occur_ts.dt.year
dff['occur_month'] = dff.occur_ts.dt.month
dff['occur_dayweek'] = dff.occur_ts.dt.dayofweek
In [33]:
crime_year = dff[dff.occur_year.between(2009, 2015)].groupby(by=['UC2 Literal', 'occur_year']).offense_id.count()
In [ ]:
%matplotlib inline
fig = plt.figure(figsize=(40,30))
crime_types = crime_year.index.levels[0]
years = crime_year.index.levels[1]
for c in range(len(crime_types)):
y_max = max(crime_year.loc[crime_types[c]])
plt.subplot(4,3,c+1)
plt.hlines(crime_year.loc[crime_types[c]].iloc[-1]*100/y_max, years[0], years[-1], linestyles="dashed", color="r")
plt.bar(crime_year.loc[crime_types[c]].index, crime_year.loc[crime_types[c]]*100/y_max, label=crime_types[c], alpha=0.5)
##plt.legend()
plt.ylim(0, 100)
plt.xticks(years+0.4, [str(int(y)) for y in years], rotation=0, fontsize=24)
plt.yticks([0,20,40,60,80,100], ['0%','20%','40%','60%','80%','100%'], fontsize=24)
plt.title(crime_types[c], fontsize=30)
None
Let's look at residential burglary.
In [ ]:
c = 3
crime_types[c]
In [ ]:
crime_year_month = dff[dff.occur_year.between(2009, 2015)].groupby(by=['UC2 Literal', 'occur_year', 'occur_month']).offense_id.count()
In [ ]:
c = 3 ## 'BURGLARY-RESIDENCE'
resburglaries = crime_year_month.loc[crime_types[c]]
fig = plt.figure(figsize=(20,10))
for y in years:
plt.plot(resburglaries.loc[y].index, resburglaries.loc[y], label=("%4.0f"%y))
plt.legend()
plt.title("Seasonal Trends - %s"%crime_types[c], fontsize=20)
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
None
Normalized over the annual average
In [ ]:
c = 3 ## 'BURGLARY-RESIDENCE'
fig = plt.figure(figsize=(20,10))
for y in years:
avg = resburglaries.loc[y].mean()
plt.hlines(avg, 1, 13, linestyle='dashed')
plt.plot(resburglaries.loc[y].index, resburglaries.loc[y], label=("%4.0f"%y))
plt.legend()
plt.title("Seasonal Trends - %s (with annuale averages)"%crime_types[c], fontsize=20)
plt.xticks(list(range(1,13)), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
None
In [ ]:
c = 3 ## 'BURGLARY-RESIDENCE'
fig = plt.figure(figsize=(20,10))
for y in years:
avg = resburglaries.loc[y].mean()
std = resburglaries.loc[y].std()
##plt.hlines(avg, 1, 13, linestyle='dashed')
plt.plot(resburglaries.loc[y].index, (resburglaries.loc[y]-avg)/std, label=("%4.0f"%y))
plt.legend()
plt.title("Seasonal Trends - %s (normalized)"%crime_types[c], fontsize=20)
plt.xticks(list(range(1,13)), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
plt.ylabel("Standard deviations $\sigma_y$")
None
In [ ]:
In [ ]:
seasonal_adjust = resburglaries.reset_index().groupby(by=['occur_month']).offense_id.agg('mean')
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Suppose there are $n$ data points {{math|{(''xi'', ''yi''), ''i'' {{=}} 1, ..., ''n''}.}} The function that describes x and y is:
$$y_i = \alpha + \beta x_i + \varepsilon_i.$$The goal is to find the equation of the straight line
$$y = \alpha + \beta x,$$which would provide a "best" fit for the data points. Here the "best" will be understood as in the [[Ordinary least squares|least-squares]] approach: a line that minimizes the sum of squared residuals of the linear regression model. In other words, {{mvar|α}} (the {{mvar|y}}-intercept) and {{mvar|β}} (the slope) solve the following minimization problem:
$$\text{Find }\min_{\alpha,\,\beta} Q(\alpha, \beta), \qquad \text{for } Q(\alpha, \beta) = \sum_{i=1}^n\varepsilon_i^{\,2} = \sum_{i=1}^n (y_i - \alpha - \beta x_i)^2\ $$By using either [[calculus]], the geometry of [[inner product space]]s, or simply expanding to get a quadratic expression in {{mvar|α}} and {{mvar|β}}, it can be shown that the values of {{mvar|α}} and {{mvar|β}} that minimize the objective function {{mvar|Q}}Kenney, J. F. and Keeping, E. S. (1962) "Linear Regression and Correlation." Ch. 15 in ''Mathematics of Statistics'', Pt. 1, 3rd ed. Princeton, NJ: Van Nostrand, pp. 252–285 are
:
where {{math|''rxy''}} is the [[Correlation#Pearson's product-moment coefficient|sample correlation coefficient]] between {{mvar|x}} and {{mvar|y}}; and {{math|''sx''}} and {{math|''sy''}} are the [[sample standard deviation]] of {{mvar|x}} and {{mvar|y}}. A horizontal bar over a quantity indicates the average value of that quantity. For example:
:
Substituting the above expressions for and into
:
yields
:
This shows that {{math|''rxy''}} is the slope of the regression line of the [[Standard score|standardized]] data points (and that this line passes through the origin).
It is sometimes useful to calculate {{math|''rxy''}} from the data independently using this equation:
:
The [[coefficient of determination]] (R squared) is equal to when the model is linear with a single independent variable. See [[Correlation#Pearson's product-moment coefficient|sample correlation coefficient]] for additional details.
===Linear regression without the intercept term=== Sometimes it is appropriate to force the regression line to pass through the origin, because {{mvar|x}} and {{mvar|y}} are assumed to be proportional. For the model without the intercept term, {{math|''y'' {{=}} ''βx''}}, the OLS estimator for {{mvar|β}} simplifies to
:
Substituting {{math|(''x'' − ''h'', ''y'' − ''k'')}} in place of {{math|(''x'', ''y'')}} gives the regression through {{math|(''h'', ''k'')}}:
:
The last form above demonstrates how moving the line away from the center of mass of the data points affects the slope.
In [ ]:
### in case we want to save a DataFrame
#writer = pd.ExcelWriter('myresults.xlsx')
#df.to_excel(writer,'Results')
#writer.save()
In [ ]:
resdf
In [ ]:
In [ ]:
In [ ]:
In [ ]: