In [1]:
import pandas as pd
import numpy as np
import datetime

data = pd.read_csv('311_data.csv')

In [2]:
ccab_data = data[['Created Date', 'Closed Date', 'Agency', 'Borough']]
ccab_data[0:5]


Out[2]:
Created Date Closed Date Agency Borough
0 01/01/2016 12:00:09 AM 01/01/2016 01:57:32 AM NYPD BROOKLYN
1 01/01/2016 12:00:40 AM 01/01/2016 03:12:53 AM NYPD BRONX
2 01/01/2016 12:01:09 AM 01/21/2016 09:20:55 AM HPD BRONX
3 01/01/2016 12:02:59 AM 01/01/2016 11:35:50 PM NYPD Unspecified
4 01/01/2016 12:03:03 AM 01/08/2016 01:13:00 AM HPD BRONX

In [3]:
ccab_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1527 entries, 0 to 1526
Data columns (total 4 columns):
Created Date    1527 non-null object
Closed Date     1518 non-null object
Agency          1527 non-null object
Borough         1526 non-null object
dtypes: object(4)
memory usage: 47.8+ KB

In [25]:
#ccab_data = ccab_data[(ccab_data['Closed Date'].notnull()) & (ccab_data['Borough'].notnull())]
#ccab_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1517 entries, 0 to 1525
Data columns (total 4 columns):
Created Date    1517 non-null datetime64[ns]
Closed Date     1517 non-null object
Agency          1517 non-null object
Borough         1517 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 59.3+ KB

In [6]:
ccab_data['Created Date'] = ccab_data['Created Date'].apply(lambda x:datetime.datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p'))
ccab_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1527 entries, 0 to 1526
Data columns (total 4 columns):
Created Date    1527 non-null datetime64[ns]
Closed Date     1518 non-null object
Agency          1527 non-null object
Borough         1526 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 47.8+ KB

In [8]:
ccab_data['Closed Date'] = ccab_data['Closed Date'].apply(lambda x:pd.to_datetime(x, format='%m/%d/%Y %I:%M:%S %p', errors='ignore'))


/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [9]:
ccab_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1527 entries, 0 to 1526
Data columns (total 4 columns):
Created Date    1527 non-null datetime64[ns]
Closed Date     1518 non-null datetime64[ns]
Agency          1527 non-null object
Borough         1526 non-null object
dtypes: datetime64[ns](2), object(2)
memory usage: 47.8+ KB

In [10]:
ccab_data['processing_time'] = ccab_data['Closed Date'] - ccab_data['Created Date']


/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [11]:
ccab_data[0:5]


Out[11]:
Created Date Closed Date Agency Borough processing_time
0 2016-01-01 00:00:09 2016-01-01 01:57:32 NYPD BROOKLYN 0 days 01:57:23
1 2016-01-01 00:00:40 2016-01-01 03:12:53 NYPD BRONX 0 days 03:12:13
2 2016-01-01 00:01:09 2016-01-21 09:20:55 HPD BRONX 20 days 09:19:46
3 2016-01-01 00:02:59 2016-01-01 23:35:50 NYPD Unspecified 0 days 23:32:51
4 2016-01-01 00:03:03 2016-01-08 01:13:00 HPD BRONX 7 days 01:09:57

In [12]:
ccab_data['start_time_window'] = ccab_data['Created Date'].apply(lambda x:x.hour)


/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [13]:
ccab_data.to_csv('output.csv')

In [14]:
data_output = pd.read_csv('output.csv')

In [16]:
def part1():
    data = pd.read_csv('311_data.csv')
    # Create a view of the dataframe containing specific columns
    ccab_data = data[['Created Date', 'Closed Date', 'Agency', 'Borough']]
    # Convert string to datetime type
    ccab_data['Created Date'] = ccab_data['Created Date'].apply(lambda x:datetime.datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p'))
    ccab_data['Closed Date'] = ccab_data['Closed Date'].apply(lambda x:pd.to_datetime(x, format='%m/%d/%Y %I:%M:%S %p', errors='ignore'))
    # create a column 'processing_time'
    ccab_data['processing_time'] = ccab_data['Closed Date'] - ccab_data['Created Date']
    # Create a new column - 'start_time_window' - that contains the hour of the day that the incident report was created
    ccab_data['start_time_window'] = ccab_data['Created Date'].apply(lambda x:x.hour)
    # output a csv file named output1.csv 
    ccab_data.to_csv('output1.csv')
    return ccab_data

In [17]:
part1()


/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [201]:
import pandas as pd
import numpy as np
import datetime
%matplotlib inline
import seaborn as sns
plt.gcf().subplots_adjust(bottom=0.15)
#Your code starts here
data = pd.read_csv('311_data_2.csv')
# Convert string to datetime type
data['Created Date'] = data['Created Date'].apply(lambda x:datetime.datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p'))
data['Closed Date'] = data['Closed Date'].apply(lambda x:pd.to_datetime(x, format='%m/%d/%Y %I:%M:%S %p', errors='ignore'))
# create a column 'processing_time'
data['processing_time'] = data['Closed Date'] - ccab_data['Created Date']
# Create a new column - 'start_time_window' - that contains the hour of the day that the incident report was created
data['start_time_window'] = data['Created Date'].apply(lambda x:x.hour)


<matplotlib.figure.Figure at 0x127d997b8>

In [202]:
def convert_time(x):
    try:
        x = np.timedelta64(x, 'h')
        return x.astype('int').item()
    except:
        return 0

In [203]:
group_data = data
group_data['processing_time'] = group_data['processing_time'].apply(lambda x:convert_time(x))
#start_time_group['processing_time'] = start_time_group['processing_time'].apply(lambda x:np.timedelta64(x,'m'))
#start_time_group.size().plot(kind='bar', figsize=(8,5.5))
# plt.savefig("1.png",dpi=100)
# plt.clf()
#group_data['processing_time']

In [238]:
start_time_group = group_data.groupby('start_time_window').agg({'processing_time':['mean']})
#start_time_group.size().plot('bar', figsize=(8,5.5))
start_time_group['processing_time']['mean']


Out[238]:
start_time_window
0     15.206612
1     70.585294
2     55.587940
3     73.202703
4    110.696203
5     60.723214
6    124.646341
7     60.652778
8    138.030000
Name: mean, dtype: float64

In [244]:
a4_dims = (8, 5.5)
fig, ax = plt.subplots(figsize=a4_dims)
#seaborn.voilinplot(ax=ax, data=df, **violin_options)
plt.bar([0,1,2,3,4,5,6,7,8], start_time_group['processing_time']['mean'], align='center')
ax.set_title('start_time_window')
plt.savefig("1.png",dpi=100)
plt.clf()


<matplotlib.figure.Figure at 0x13d254940>

In [40]:
a4_dims = (8, 5.5)
fig, ax = plt.subplots(figsize=a4_dims)
#seaborn.voilinplot(ax=ax, data=df, **violin_options)
sns.boxplot(x='start_time_window', y='processing_time',ax=ax, data=data)
plt.savefig("2.png",dpi=100)
plt.clf()


<matplotlib.figure.Figure at 0x130cf8d30>

In [44]:
data.groupby('Agency').size().sort_values(ascending=False).plot(kind='bar',figsize=(8,5.5))
plt.savefig("3.png",dpi=100)
plt.clf()


<matplotlib.figure.Figure at 0x12e25ec18>

In [51]:
COL_NUM = 2
ROW_NUM = 3
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12,12))

for i, (label, col) in enumerate(data.groupby(['Agency', 'Borough']).size().unstack().iteritems()):
    ax = axes[int(i/COL_NUM), i%COL_NUM]
    col = col.sort_values(ascending=False)
    ax.set_title(label)
    col.plot(kind='barh', ax=ax)
plt.tight_layout()
plt.savefig("4.png",dpi=100)
plt.clf()


<matplotlib.figure.Figure at 0x13dcc0da0>

In [ ]: