notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import datetime

data = pd.read_csv('311_data.csv')



In [2]:

    
ccab_data = data[['Created Date', 'Closed Date', 'Agency', 'Borough']]
ccab_data[0:5]









    Out[2]:







  
    
      
      Created Date
      Closed Date
      Agency
      Borough
    
  
  
    
      0
      01/01/2016 12:00:09 AM
      01/01/2016 01:57:32 AM
      NYPD
      BROOKLYN
    
    
      1
      01/01/2016 12:00:40 AM
      01/01/2016 03:12:53 AM
      NYPD
      BRONX
    
    
      2
      01/01/2016 12:01:09 AM
      01/21/2016 09:20:55 AM
      HPD
      BRONX
    
    
      3
      01/01/2016 12:02:59 AM
      01/01/2016 11:35:50 PM
      NYPD
      Unspecified
    
    
      4
      01/01/2016 12:03:03 AM
      01/08/2016 01:13:00 AM
      HPD
      BRONX



In [3]:

    
ccab_data.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1527 entries, 0 to 1526
Data columns (total 4 columns):
Created Date    1527 non-null object
Closed Date     1518 non-null object
Agency          1527 non-null object
Borough         1526 non-null object
dtypes: object(4)
memory usage: 47.8+ KB



In [25]:

    
#ccab_data = ccab_data[(ccab_data['Closed Date'].notnull()) & (ccab_data['Borough'].notnull())]
#ccab_data.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1517 entries, 0 to 1525
Data columns (total 4 columns):
Created Date    1517 non-null datetime64[ns]
Closed Date     1517 non-null object
Agency          1517 non-null object
Borough         1517 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 59.3+ KB



In [6]:

    
ccab_data['Created Date'] = ccab_data['Created Date'].apply(lambda x:datetime.datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p'))
ccab_data.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1527 entries, 0 to 1526
Data columns (total 4 columns):
Created Date    1527 non-null datetime64[ns]
Closed Date     1518 non-null object
Agency          1527 non-null object
Borough         1526 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 47.8+ KB



In [8]:

    
ccab_data['Closed Date'] = ccab_data['Closed Date'].apply(lambda x:pd.to_datetime(x, format='%m/%d/%Y %I:%M:%S %p', errors='ignore'))









    



/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [9]:

    
ccab_data.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1527 entries, 0 to 1526
Data columns (total 4 columns):
Created Date    1527 non-null datetime64[ns]
Closed Date     1518 non-null datetime64[ns]
Agency          1527 non-null object
Borough         1526 non-null object
dtypes: datetime64[ns](2), object(2)
memory usage: 47.8+ KB



In [10]:

    
ccab_data['processing_time'] = ccab_data['Closed Date'] - ccab_data['Created Date']









    



/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [11]:

    
ccab_data[0:5]









    Out[11]:







  
    
      
      Created Date
      Closed Date
      Agency
      Borough
      processing_time
    
  
  
    
      0
      2016-01-01 00:00:09
      2016-01-01 01:57:32
      NYPD
      BROOKLYN
      0 days 01:57:23
    
    
      1
      2016-01-01 00:00:40
      2016-01-01 03:12:53
      NYPD
      BRONX
      0 days 03:12:13
    
    
      2
      2016-01-01 00:01:09
      2016-01-21 09:20:55
      HPD
      BRONX
      20 days 09:19:46
    
    
      3
      2016-01-01 00:02:59
      2016-01-01 23:35:50
      NYPD
      Unspecified
      0 days 23:32:51
    
    
      4
      2016-01-01 00:03:03
      2016-01-08 01:13:00
      HPD
      BRONX
      7 days 01:09:57



In [12]:

    
ccab_data['start_time_window'] = ccab_data['Created Date'].apply(lambda x:x.hour)









    



/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [13]:

    
ccab_data.to_csv('output.csv')



In [14]:

    
data_output = pd.read_csv('output.csv')



In [16]:

    
def part1():
    data = pd.read_csv('311_data.csv')
    # Create a view of the dataframe containing specific columns
    ccab_data = data[['Created Date', 'Closed Date', 'Agency', 'Borough']]
    # Convert string to datetime type
    ccab_data['Created Date'] = ccab_data['Created Date'].apply(lambda x:datetime.datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p'))
    ccab_data['Closed Date'] = ccab_data['Closed Date'].apply(lambda x:pd.to_datetime(x, format='%m/%d/%Y %I:%M:%S %p', errors='ignore'))
    # create a column 'processing_time'
    ccab_data['processing_time'] = ccab_data['Closed Date'] - ccab_data['Created Date']
    # Create a new column - 'start_time_window' - that contains the hour of the day that the incident report was created
    ccab_data['start_time_window'] = ccab_data['Created Date'].apply(lambda x:x.hour)
    # output a csv file named output1.csv 
    ccab_data.to_csv('output1.csv')
    return ccab_data



In [17]:

    
part1()









    



/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [201]:

    
import pandas as pd
import numpy as np
import datetime
%matplotlib inline
import seaborn as sns
plt.gcf().subplots_adjust(bottom=0.15)
#Your code starts here
data = pd.read_csv('311_data_2.csv')
# Convert string to datetime type
data['Created Date'] = data['Created Date'].apply(lambda x:datetime.datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p'))
data['Closed Date'] = data['Closed Date'].apply(lambda x:pd.to_datetime(x, format='%m/%d/%Y %I:%M:%S %p', errors='ignore'))
# create a column 'processing_time'
data['processing_time'] = data['Closed Date'] - ccab_data['Created Date']
# Create a new column - 'start_time_window' - that contains the hour of the day that the incident report was created
data['start_time_window'] = data['Created Date'].apply(lambda x:x.hour)









    





<matplotlib.figure.Figure at 0x127d997b8>



In [202]:

    
def convert_time(x):
    try:
        x = np.timedelta64(x, 'h')
        return x.astype('int').item()
    except:
        return 0



In [203]:

    
group_data = data
group_data['processing_time'] = group_data['processing_time'].apply(lambda x:convert_time(x))
#start_time_group['processing_time'] = start_time_group['processing_time'].apply(lambda x:np.timedelta64(x,'m'))
#start_time_group.size().plot(kind='bar', figsize=(8,5.5))
# plt.savefig("1.png",dpi=100)
# plt.clf()
#group_data['processing_time']



In [238]:

    
start_time_group = group_data.groupby('start_time_window').agg({'processing_time':['mean']})
#start_time_group.size().plot('bar', figsize=(8,5.5))
start_time_group['processing_time']['mean']









    Out[238]:





start_time_window
0     15.206612
1     70.585294
2     55.587940
3     73.202703
4    110.696203
5     60.723214
6    124.646341
7     60.652778
8    138.030000
Name: mean, dtype: float64



In [244]:

    
a4_dims = (8, 5.5)
fig, ax = plt.subplots(figsize=a4_dims)
#seaborn.voilinplot(ax=ax, data=df, **violin_options)
plt.bar([0,1,2,3,4,5,6,7,8], start_time_group['processing_time']['mean'], align='center')
ax.set_title('start_time_window')
plt.savefig("1.png",dpi=100)
plt.clf()









    





<matplotlib.figure.Figure at 0x13d254940>



In [40]:

    
a4_dims = (8, 5.5)
fig, ax = plt.subplots(figsize=a4_dims)
#seaborn.voilinplot(ax=ax, data=df, **violin_options)
sns.boxplot(x='start_time_window', y='processing_time',ax=ax, data=data)
plt.savefig("2.png",dpi=100)
plt.clf()









    





<matplotlib.figure.Figure at 0x130cf8d30>



In [44]:

    
data.groupby('Agency').size().sort_values(ascending=False).plot(kind='bar',figsize=(8,5.5))
plt.savefig("3.png",dpi=100)
plt.clf()









    





<matplotlib.figure.Figure at 0x12e25ec18>



In [51]:

    
COL_NUM = 2
ROW_NUM = 3
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12,12))

for i, (label, col) in enumerate(data.groupby(['Agency', 'Borough']).size().unstack().iteritems()):
    ax = axes[int(i/COL_NUM), i%COL_NUM]
    col = col.sort_values(ascending=False)
    ax.set_title(label)
    col.plot(kind='barh', ax=ax)
plt.tight_layout()
plt.savefig("4.png",dpi=100)
plt.clf()









    





<matplotlib.figure.Figure at 0x13dcc0da0>



In [ ]:

	Created Date	Closed Date	Agency	Borough
0	01/01/2016 12:00:09 AM	01/01/2016 01:57:32 AM	NYPD	BROOKLYN
1	01/01/2016 12:00:40 AM	01/01/2016 03:12:53 AM	NYPD	BRONX
2	01/01/2016 12:01:09 AM	01/21/2016 09:20:55 AM	HPD	BRONX
3	01/01/2016 12:02:59 AM	01/01/2016 11:35:50 PM	NYPD	Unspecified
4	01/01/2016 12:03:03 AM	01/08/2016 01:13:00 AM	HPD	BRONX

	Created Date	Closed Date	Agency	Borough	processing_time
0	2016-01-01 00:00:09	2016-01-01 01:57:32	NYPD	BROOKLYN	0 days 01:57:23
1	2016-01-01 00:00:40	2016-01-01 03:12:53	NYPD	BRONX	0 days 03:12:13
2	2016-01-01 00:01:09	2016-01-21 09:20:55	HPD	BRONX	20 days 09:19:46
3	2016-01-01 00:02:59	2016-01-01 23:35:50	NYPD	Unspecified	0 days 23:32:51
4	2016-01-01 00:03:03	2016-01-08 01:13:00	HPD	BRONX	7 days 01:09:57