Phoenix Crime

Cleaning data

The dataset was downloaded from Phoenix city website.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('./data/crimedataset.csv',)
print (df.shape)
df.head(10)


(143108, 7)
/usr/local/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2728: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[2]:
INC NUMBER OCCURRED ON OCCURRED TO UCR CRIME CATEGORY 100 BLOCK ADDR ZIP PREMISE TYPE
0 201600000052855 11/01/2015 00:00 01/09/2016 00:00 MOTOR VEHICLE THEFT N 43RD AVE & W CACTUS RD 85029.0 SINGLE FAMILY HOUSE
1 201500002103724 11/01/2015 00:00 11/01/2015 15:21 DRUG OFFENSE 54XX W INDIAN SCHOOL RD 85031.0 APARTMENT
2 201500002102327 11/01/2015 00:00 11/01/2015 09:00 LARCENY-THEFT 51XX N 15TH ST 85014.0 APARTMENT
3 201500002101405 11/01/2015 00:00 11/01/2015 05:00 MOTOR VEHICLE THEFT 102XX W MEDLOCK AVE 85307.0 SINGLE FAMILY HOUSE
4 201500002102668 11/01/2015 00:00 11/01/2015 11:50 MOTOR VEHICLE THEFT 69XX W WOOD ST 85043.0 SINGLE FAMILY HOUSE
5 201600000527709 11/01/2015 00:00 03/22/2016 00:36 LARCENY-THEFT 33XX W CAMELBACK RD 85017.0 PARKING LOT
6 201600000594484 11/01/2015 00:00 NaN RAPE 13XX E ALMERIA RD 85006.0 SINGLE FAMILY HOUSE
7 201700001722914 11/01/2015 00:00 NaN LARCENY-THEFT 279XX N 23RD LN 85085.0 SINGLE FAMILY HOUSE
8 201700001603695 11/01/2015 00:00 03/31/2016 00:00 RAPE 38XX W CAMELBACK RD 85019.0 HOSPITAL
9 201500002168686 11/01/2015 00:00 11/11/2015 09:30 LARCENY-THEFT 14XX E HIGHLAND AVE 85014.0 PARKING LOT

In [3]:
df['OCCURRED ON'].fillna(df['OCCURRED TO'], inplace=True)

In [4]:
temp=pd.DatetimeIndex(df['OCCURRED ON'])

In [5]:
df['date'] = temp.date
df['hour'] = temp.time
df['year'] = (temp.year).astype(int)
df['month'] = (temp.month).astype(int)
df['day'] = (temp.day).astype(int)
df['weekday'] = temp.weekday_name

df.head()


Out[5]:
INC NUMBER OCCURRED ON OCCURRED TO UCR CRIME CATEGORY 100 BLOCK ADDR ZIP PREMISE TYPE date hour year month day weekday
0 201600000052855 11/01/2015 00:00 01/09/2016 00:00 MOTOR VEHICLE THEFT N 43RD AVE & W CACTUS RD 85029.0 SINGLE FAMILY HOUSE 2015-11-01 00:00:00 2015 11 1 Sunday
1 201500002103724 11/01/2015 00:00 11/01/2015 15:21 DRUG OFFENSE 54XX W INDIAN SCHOOL RD 85031.0 APARTMENT 2015-11-01 00:00:00 2015 11 1 Sunday
2 201500002102327 11/01/2015 00:00 11/01/2015 09:00 LARCENY-THEFT 51XX N 15TH ST 85014.0 APARTMENT 2015-11-01 00:00:00 2015 11 1 Sunday
3 201500002101405 11/01/2015 00:00 11/01/2015 05:00 MOTOR VEHICLE THEFT 102XX W MEDLOCK AVE 85307.0 SINGLE FAMILY HOUSE 2015-11-01 00:00:00 2015 11 1 Sunday
4 201500002102668 11/01/2015 00:00 11/01/2015 11:50 MOTOR VEHICLE THEFT 69XX W WOOD ST 85043.0 SINGLE FAMILY HOUSE 2015-11-01 00:00:00 2015 11 1 Sunday

In [6]:
cols = ['OCCURRED ON','date','hour','year','month','day','weekday','ZIP','UCR CRIME CATEGORY','PREMISE TYPE']
df = df[cols]
df.head(10)
df.columns = ['datetime','date','hour','year','month','day','weekday','zip','crime','place']

In [7]:
df.to_csv('./data/cleaneddataset.csv', encoding='utf-8',index=False)

In [8]:
df['year'].unique()


Out[8]:
array([2015, 2016, 2017, 2018])

In [ ]: