In [1]:
### Notebook-wide display tweak: let cells use the full browser width.
# `IPython.core.display` is a private path deprecated since IPython 7.14;
# the supported public location is `IPython.display`.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
The Atlanta Police Department provides Part 1 crime data at http://www.atlantapd.org/i-want-to/crime-data-downloads
A recent copy of the data file is stored in the cluster. Please, do not copy this data file into your home directory!
In [1]:
### Load libraries (all imports for the notebook live in this cell)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Load data (don't change this if you're running the notebook on the cluster)
We have two files:
- /home/data/APD/COBRA083016_2015.xlsx — data for 2015 only
- /home/data/APD/COBRA083016.xlsx — data from 2009 to the current date
In [2]:
%%time
df = pd.read_excel('/home/data/APD/COBRA083016_2015.xlsx', sheetname='Query')
In [3]:
df.shape  # (rows, columns) of the 2015 extract
Out[3]:
In [4]:
# Print one column name per line for a quick schema overview.
for c in df.columns:
    print(c)
In [5]:
df.head()  # first five records for a content sanity check
Out[5]:
In [24]:
# Keep a narrow working frame with only the columns we need below.
# `.copy()` makes it an independent DataFrame, so the index assignment in the
# next cell cannot trigger pandas' SettingWithCopyWarning / chained-assignment
# ambiguity on a slice of `df`.
dfshort = df[['offense_id','occur_date','occur_time','UC2 Literal']].copy()
In [26]:
# Index the short frame by time-of-day of each offense.
# NOTE(review): TimedeltaIndex parses occur_time values as offsets from
# midnight -- presumably they are 'HH:MM:SS' strings; confirm against the raw file.
dfshort.index = pd.TimedeltaIndex(list(dfshort.occur_time))
In [27]:
dfshort.head()  # confirm the new time-based index
Out[27]:
In [7]:
df.offense_id.min(), df.offense_id.max()  # range of offense ids in the extract
Out[7]:
In [11]:
df.columns  # full column list after loading
Out[11]:
In [8]:
# Offense counts per (crime type, neighborhood) pair.
grouped_by_type_and_hood = df.groupby(['UC2 Literal', 'neighborhood'])
crime_summary = grouped_by_type_and_hood['offense_id'].count()
In [13]:
crime_summary.index  # two-level MultiIndex: (UC2 Literal, neighborhood)
Out[13]:
In [10]:
crime_summary.reset_index().head(20)  # flatten the MultiIndex into columns for display
Out[10]:
In [ ]:
In [ ]:
In [ ]:
In [15]:
# Derive the APD zone from the beat number.
# NOTE(review): assumes the hundreds digit of `beat` encodes the zone
# (e.g. beat 213 -> zone 2) -- confirm against APD beat documentation.
df["Zone"] = df.beat // 100
In [16]:
df  # full-frame display; prefer df.head() to keep notebook output small
Out[16]:
In [11]:
df[['offense_id', 'occur_date', 'occur_time', 'rpt_date']][1:10]  # rows 1-9, date/time columns only
Out[11]:
Convert into date-time type
In [12]:
# Build a single timestamp from the separate date and time text columns.
df['occur_ts'] = pd.to_datetime(df.occur_date+' '+df.occur_time)
In [19]:
#df[['offense_id', 'occur_date', 'occur_time', 'occur_ts', 'rpt_date']][1:10]
In [ ]:
In [20]:
# NOTE(review): duplicate of cell In [12] above -- harmless but redundant.
df['occur_ts'] = pd.to_datetime(df.occur_date+' '+df.occur_time)
In [21]:
# Month and ISO week-of-year of each offense.
# `.dt.month` is the vectorized equivalent of mapping `lambda x: x.month`,
# and `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the supported replacement (same ISO week numbers,
# returned as a nullable UInt32 column).
df['occur_month'] = df['occur_ts'].dt.month
df['occur_woy'] = df.occur_ts.dt.isocalendar().week
In [22]:
df.describe()  # summary statistics for the numeric columns
Out[22]:
In [23]:
df.shape  # shape after adding the derived columns
Out[23]:
In [24]:
df.columns
Out[24]:
In [13]:
df.iloc[9]  # one full record for spot-checking
Out[13]:
In [27]:
#resdf.index
In [28]:
# Scalar lookup in a MultiIndex takes a tuple key; a list selects two
# independent first-level labels ('AUTO_THEFT' and 6) instead of the single
# ('AUTO_THEFT', 6) row.
# NOTE(review): `resdf` (defined below -- this cell ran out of order) is grouped
# by occur_date, so the second key should be a date string, not 6 -- confirm
# which grouping was intended here.
resdf.loc[('AUTO_THEFT', 6)]
In [16]:
# Daily offense counts for each crime type: (UC2 Literal, occur_date) -> count.
by_type_and_day = df.groupby(['UC2 Literal', 'occur_date'])
resdf = by_type_and_day['offense_id'].count()
resdf  # display the multi-indexed series
Out[16]:
In [17]:
# `.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.to_numpy()` is the supported way to get the underlying ndarray.
resdf['BURGLARY-RESIDENCE'].to_numpy()
Out[17]:
In [19]:
# `.iloc(0)` merely returns the indexer object itself; positional access uses
# square brackets.  This yields the first daily count for the type.
resdf['BURGLARY-RESIDENCE'].iloc[0]
Out[19]:
In [22]:
resdf['BURGLARY-RESIDENCE']  # daily counts for residential burglary only
Out[22]:
In [25]:
%matplotlib inline
fig = plt.figure(figsize=(10,6)) # 10inx10in
#plt.plot(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'])
plt.scatter(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'].values, marker='x')
# plt.scatter(resdf['BURGLARY-NONRES'].index, resdf['BURGLARY-NONRES'], marker='o')
# plt.ylim(0, 500)
# plt.title('BURGLARY-RESIDENCE')
# plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
# fig.savefig('BurglaryResidence_over_month.svg')
# x = 1
In [ ]:
def getTheMonth(x):
    """Return the calendar month (1-12) of a single timestamp.

    Named-function equivalent of the ``lambda x: x.month`` used earlier,
    for readability when passed to ``Series.map``.
    """
    return x.month

# Re-derive the month column using the named function.
df['occur_month'] = df['occur_ts'].map(getTheMonth)
In [ ]:
# Self-contained reload: read the 2015 extract and derive the timestamp fields
# in one cell, so the plotting cell below works from a fresh kernel.
# `sheetname=` was removed in pandas 1.0 (use `sheet_name=`), and
# `dt.weekofyear` was removed in pandas 2.0 (use `dt.isocalendar().week`).
df = pd.read_excel('/home/data/APD/COBRA083016_2015.xlsx', sheet_name='Query')
df['occur_ts'] = pd.to_datetime(df.occur_date+' '+df.occur_time)
df['occur_month'] = df['occur_ts'].dt.month
df['occur_woy'] = df.occur_ts.dt.isocalendar().week
In [ ]:
%matplotlib inline
resdf = df.groupby(['UC2 Literal', 'occur_month']).offense_id.count()
fig = plt.figure(figsize=(10,6))
plt.scatter(resdf['BURGLARY-RESIDENCE'].index, resdf['BURGLARY-RESIDENCE'], marker='x')
plt.ylim(0, 500)
plt.title('BURGLARY-RESIDENCE')
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.savefig('quiz3-burglary-residence.png')
''
In [ ]:
# NOTE(review): with the inline backend the figure is closed once the previous
# cell finishes, so calling savefig here likely writes an empty canvas --
# save inside the plotting cell instead.  Confirm before relying on this file.
plt.savefig('quiz3-burglary-residence.png')
In [28]:
# `pd.unique` expects a 1-D array-like and raises on a whole DataFrame.
# To answer the question in the original comment -- how to get duplicate
# records -- use DataFrame.duplicated; keep=False marks every copy of a
# duplicated row, not just the later occurrences.
df[df.duplicated(keep=False)]
In [29]:
pd.unique(df['UC2 Literal'])  # the distinct Part-1 crime categories
Out[29]:
In [30]:
len(pd.unique(df.MI_PRINX))  # count of distinct MI_PRINX keys (unique-record check)
Out[30]:
In [ ]:
## Load the complete dataset (2009 to current date).
# `sheetname=` was removed in pandas 1.0; the supported keyword is `sheet_name=`.
dff = pd.read_excel('/home/data/APD/COBRA083016.xlsx', sheet_name='Query')
In [ ]:
dff.shape  # size of the full 2009-present extract
In [ ]:
# Build proper timestamps from the split date/time text columns, for both the
# occurrence and possession events; the report date has no time component.
for prefix in ('occur', 'poss'):
    date_col = dff[prefix + '_date']
    time_col = dff[prefix + '_time']
    dff[prefix + '_ts'] = pd.to_datetime(date_col + ' ' + time_col)
dff['rpt_ts'] = pd.to_datetime(dff.rpt_date)
In [ ]:
', '.join(dff.columns)  # one-line view of all column names
In [ ]:
# Calendar fields derived from the occurrence timestamp.
occur = dff.occur_ts.dt
dff['occur_year'] = occur.year
dff['occur_month'] = occur.month
dff['occur_dayweek'] = occur.dayofweek
In [ ]:
# Annual counts per crime type, restricted to complete years (2009-2015).
crime_year = dff[dff.occur_year.between(2009, 2015)].groupby(by=['UC2 Literal', 'occur_year']).offense_id.count()
In [ ]:
%matplotlib inline
# One bar chart per crime type, each scaled to its own peak year (y axis is
# % of that type's maximum), with a dashed red line marking the most recent
# year's relative level.
fig = plt.figure(figsize=(40,30))
crime_types = crime_year.index.levels[0]
years = crime_year.index.levels[1]
for c in range(len(crime_types)):
    y_max = max(crime_year.loc[crime_types[c]])
    plt.subplot(4,3,c+1)
    # dashed line at the last (most recent) year's relative level
    plt.hlines(crime_year.loc[crime_types[c]].iloc[-1]*100/y_max, years[0], years[-1], linestyles="dashed", color="r")
    plt.bar(crime_year.loc[crime_types[c]].index, crime_year.loc[crime_types[c]]*100/y_max, label=crime_types[c], alpha=0.5)
    ##plt.legend()
    plt.ylim(0, 100)
    # NOTE(review): the +0.4 tick offset assumes matplotlib < 2.0 left-aligned
    # bars; newer matplotlib centers bars, so the offset would mislabel them.
    plt.xticks(years+0.4, [str(int(y)) for y in years], rotation=0, fontsize=24)
    plt.yticks([0,20,40,60,80,100], ['0%','20%','40%','60%','80%','100%'], fontsize=24)
    plt.title(crime_types[c], fontsize=30)
None
Let's look at residential burglary.
In [ ]:
c = 3  # NOTE(review): assumes crime_types[3] is 'BURGLARY-RESIDENCE' -- fragile if categories change
crime_types[c]  # display the selected crime type
In [ ]:
# Counts per (type, year, month) for the seasonal analysis, complete years only.
crime_year_month = dff[dff.occur_year.between(2009, 2015)].groupby(by=['UC2 Literal', 'occur_year', 'occur_month']).offense_id.count()
In [ ]:
c = 3 ## 'BURGLARY-RESIDENCE'
# Overlay the monthly counts of each year to expose the seasonal pattern.
resburglaries = crime_year_month.loc[crime_types[c]]
fig = plt.figure(figsize=(20,10))
for y in years:
    plt.plot(resburglaries.loc[y].index, resburglaries.loc[y], label=("%4.0f"%y))
plt.legend()
plt.title("Seasonal Trends - %s"%crime_types[c], fontsize=20)
plt.xticks(range(13), ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
None  # suppress the cell's text output
Normalized over the annual average
In [ ]:
c = 3 ## 'BURGLARY-RESIDENCE'
# Same seasonal overlay, with each year's annual mean drawn as a dashed line.
fig = plt.figure(figsize=(20,10))
for y in years:
    avg = resburglaries.loc[y].mean()
    plt.hlines(avg, 1, 13, linestyle='dashed')  # annual average for year y
    plt.plot(resburglaries.loc[y].index, resburglaries.loc[y], label=("%4.0f"%y))
plt.legend()
# NOTE(review): 'annuale' in the title string is a typo for 'annual'.
plt.title("Seasonal Trends - %s (with annuale averages)"%crime_types[c], fontsize=20)
plt.xticks(list(range(1,13)), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
None
In [ ]:
c = 3 ## 'BURGLARY-RESIDENCE'
# Seasonal overlay with each year standardized (z-scores): subtract the year's
# mean and divide by its standard deviation so years are directly comparable.
fig = plt.figure(figsize=(20,10))
for y in years:
    avg = resburglaries.loc[y].mean()
    std = resburglaries.loc[y].std()
    plt.plot(resburglaries.loc[y].index, (resburglaries.loc[y]-avg)/std, label=("%4.0f"%y))
plt.legend()
plt.title("Seasonal Trends - %s (normalized)"%crime_types[c], fontsize=20)
plt.xticks(list(range(1,13)), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlim(0,13)
# Raw string: "\s" in a plain literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12+).
plt.ylabel(r"Standard deviations $\sigma_y$")
None
In [ ]:
In [ ]:
# Average monthly count across all years for this crime type -- the basis for
# a per-calendar-month seasonal-adjustment factor.
seasonal_adjust = resburglaries.reset_index().groupby(by=['occur_month']).offense_id.agg('mean')
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Suppose there are $n$ data points $(x_i, y_i)$, $i = 1, \ldots, n$. The function that describes $x$ and $y$ is:

$$y_i = \alpha + \beta x_i + \varepsilon_i.$$

The goal is to find the equation of the straight line

$$y = \alpha + \beta x,$$

which would provide a "best" fit for the data points. Here the "best" will be understood as in the least-squares approach: a line that minimizes the sum of squared residuals of the linear regression model. In other words, $\alpha$ (the $y$-intercept) and $\beta$ (the slope) solve the following minimization problem:

$$\text{Find }\min_{\alpha,\,\beta} Q(\alpha, \beta), \qquad \text{for } Q(\alpha, \beta) = \sum_{i=1}^n\varepsilon_i^{\,2} = \sum_{i=1}^n (y_i - \alpha - \beta x_i)^2.$$

By using either calculus, the geometry of inner product spaces, or simply expanding to get a quadratic expression in $\alpha$ and $\beta$, it can be shown that the values of $\alpha$ and $\beta$ that minimize the objective function $Q$ (Kenney, J. F. and Keeping, E. S. (1962) "Linear Regression and Correlation." Ch. 15 in *Mathematics of Statistics*, Pt. 1, 3rd ed. Princeton, NJ: Van Nostrand, pp. 252–285) are

$$\hat\beta = \frac{\sum_{i=1}^n (x_i - \bar x)(y_i - \bar y)}{\sum_{i=1}^n (x_i - \bar x)^2} = r_{xy}\,\frac{s_y}{s_x}, \qquad \hat\alpha = \bar y - \hat\beta\,\bar x,$$

where $r_{xy}$ is the sample correlation coefficient between $x$ and $y$, and $s_x$ and $s_y$ are the sample standard deviations of $x$ and $y$. A horizontal bar over a quantity indicates the average value of that quantity. For example:

$$\bar x = \frac{1}{n}\sum_{i=1}^n x_i.$$

Substituting the above expressions for $\hat\alpha$ and $\hat\beta$ into

$$f = \hat\alpha + \hat\beta x$$

yields

$$\frac{f - \bar y}{s_y} = r_{xy}\,\frac{x - \bar x}{s_x}.$$

This shows that $r_{xy}$ is the slope of the regression line of the standardized data points (and that this line passes through the origin).

It is sometimes useful to calculate $r_{xy}$ from the data independently using this equation:

$$r_{xy} = \frac{\overline{xy} - \bar x\,\bar y}{\sqrt{\left(\overline{x^2} - \bar x^2\right)\left(\overline{y^2} - \bar y^2\right)}}.$$

The coefficient of determination ($R$ squared) is equal to $r_{xy}^2$ when the model is linear with a single independent variable. See the sample correlation coefficient for additional details.

### Linear regression without the intercept term

Sometimes it is appropriate to force the regression line to pass through the origin, because $x$ and $y$ are assumed to be proportional. For the model without the intercept term, $y = \beta x$, the OLS estimator for $\beta$ simplifies to

$$\hat\beta = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2}.$$

Substituting $(x - h,\, y - k)$ in place of $(x,\, y)$ gives the regression through $(h, k)$:

$$\hat\beta = \frac{\sum_{i=1}^n (x_i - h)(y_i - k)}{\sum_{i=1}^n (x_i - h)^2}.$$

The last form above demonstrates how moving the line away from the center of mass of the data points affects the slope.
In [29]:
### in case we want to save a DataFrame
#writer = pd.ExcelWriter('myresults.xlsx')
#df.to_excel(writer,'Results')
#writer.save()
In [30]:
#resdf
In [ ]:
In [ ]:
In [ ]:
In [ ]: