In [2]:
#import warnings
#warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import re
import plotly
import plotly.offline as py
from plotly.graph_objs import *
from ipywidgets import interact
plotly.offline.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns



In [67]:
data = pd.read_csv("https://raw.githubusercontent.com/jasonjgy2000/Cuny/master/Data%20608/Homework%204/Data/riverkeeper_data_2013.csv")

In [68]:
data.head(5)


Out[68]:
Site Date EnteroCount FourDayRainTotal SampleCount
0 Hudson above Mohawk River 10/16/2011 1733 1.5 35
1 Hudson above Mohawk River 10/21/2013 4 0.2 35
2 Hudson above Mohawk River 9/21/2013 20 0.0 35
3 Hudson above Mohawk River 8/19/2013 6 0.0 35
4 Hudson above Mohawk River 7/21/2013 31 0.0 35

Data Cleansing


In [5]:
data.dtypes


Out[5]:
Site                 object
Date                 object
EnteroCount          object
FourDayRainTotal    float64
SampleCount           int64
dtype: object

The Date column will be converted from a string data type to date. Removing less and more than signs from EnterCount and convert it to int datatype. Changing Site from string to category


In [32]:
data['Date'] = pd.to_datetime(data["Date"])
data['Site'] = data['Site'].astype('category')
data['EnteroCount'] = data['EnteroCount'].str.replace(r'\<|>', '').astype('int64')
# remove Na
data = data.dropna()

In [7]:
data[3230:3234]


Out[7]:
Site Date EnteroCount FourDayRainTotal SampleCount
3230 East River mid-channel at 23rd St. 2013-06-19 10 0.2 50
3231 East River mid-channel at 23rd St. 2013-05-31 10 0.5 50
3232 East River mid-channel at 23rd St. 2012-11-08 121 1.2 50
3233 East River mid-channel at 23rd St. 2012-10-10 31 0.5 50

In [33]:
data.dtypes


Out[33]:
Site                      category
Date                datetime64[ns]
EnteroCount                  int64
FourDayRainTotal           float64
SampleCount                  int64
dtype: object

Create lists & graphs of the best and worst places to swim in the dataset


In [34]:
# water is unacceptable if a single sample is greater than 110 Enterococcus/100 mL
# water is unacceptable if five(5) or more samples weighted average is more than 30 Enterococcus/100 mL

data['Year'] = data['Date'].apply(lambda x: x.year)
placesToSwim = data.groupby(['Site','Year'])['EnteroCount'].sum()
placesToSwim = placesToSwim.to_frame().reset_index()
placesToSwim['RecordCount'] = data.groupby(['Site','Year'])['EnteroCount'].transform('count')
placesToSwim['yearSum'] = placesToSwim.groupby(['Year'])['RecordCount'].transform('sum')
placesToSwim['WeightedMean'] = (placesToSwim['EnteroCount'] * placesToSwim['RecordCount'])/ placesToSwim['yearSum']

In [35]:
# add unaccecptable/ acceptable check
placesToSwim['Classification'] = np.where(((placesToSwim.WeightedMean > 110) & (placesToSwim.RecordCount < 5 )) |
                                          ((placesToSwim.WeightedMean > 30) & (placesToSwim.RecordCount > 4 ))
                                          ,'unacceptable', 'acceptable')

In [36]:
placesToSwim.head()


Out[36]:
Site Year EnteroCount RecordCount yearSum WeightedMean Classification
0 125th St. Pier 2006 3751 6 141 159.617021 unacceptable
1 125th St. Pier 2007 5804 6 160 217.650000 unacceptable
2 125th St. Pier 2008 105 6 456 1.381579 acceptable
3 125th St. Pier 2009 237 6 452 3.146018 acceptable
4 125th St. Pier 2010 246 6 449 3.287305 acceptable

In [ ]:
#Worse Places to Swim

In [63]:
def plot_it(Year,Filter,Sorting):
    placesToSwimData = placesToSwim[(placesToSwim['Year']==int(Year)) & (placesToSwim['Classification']==Filter)]
    if Filter == 'acceptable':
        placesToSwimData = placesToSwimData.head(10)
    else:
        placesToSwimData = placesToSwimData.tail(10)
    
    if Sorting == 'ascending':
        placesToSwimData = placesToSwimData.sort_values(['WeightedMean'],ascending=[True])
    else:
        placesToSwimData = placesToSwimData.sort_values(['WeightedMean'],ascending=[False])
    
    fig = {
        'data': [
            {
                'x': placesToSwimData['Site'],
                'y': placesToSwimData['WeightedMean'],
                'name': Year,
                'type': 'bar'
            }
        ],
        
    'layout': Layout(xaxis=XAxis(title='Site'), yaxis=YAxis(title='Weighted Mean'))
    }
    py.iplot(fig)

In [64]:
interact(plot_it, Year=('2006', '2007','2008', '2009', '2010', '2011', '2012', '2013'),         
         Filter =('acceptable','unacceptable'),Sorting=('ascending','descending'))


Out[64]:
<function __main__.plot_it>

In [39]:
years = [2006,2007,2008,2009,2010,2011,2012,2013]
for year in years:
    test = placesToSwim[((placesToSwim['Year']==year) & (placesToSwim['Classification'] == 'acceptable') )]
    test = test.sort_values(['WeightedMean'],ascending=True).head(10)[['Site','WeightedMean']].reset_index(drop=True)
    test.columns = ['Site','Weighted EnteroCount']
    print '---------------------------------------------'
    print 'Best Places to swim for the Year '+ str(year)
    print test


---------------------------------------------
Best Places to swim for the Year 2006
                         Site  Weighted EnteroCount
0     Stony Point mid-channel              0.085106
1          Croton Point Beach              1.191489
2              Ossining Beach              3.659574
3  Haverstraw Bay mid-channel              3.872340
4         Yonkers STP Outfall              4.127660
5    North River STP at 145th              4.638298
6               Furnace Brook              4.765957
7       GW Bridge mid-channel              6.340426
8        79th St. mid-channel              7.787234
9         Yonkers mid-channel              8.170213
---------------------------------------------
Best Places to swim for the Year 2007
                              Site  Weighted EnteroCount
0  Peekskill Riverfront Green Park                1.5375
1               Croton Point Beach                1.7250
2              Yonkers mid-channel                3.6750
3                    Furnace Brook                3.8250
4                   Ossining Beach                4.9125
5              Yonkers STP Outfall                5.6250
6          Stony Point mid-channel                7.6875
7          The Battery mid-channel               11.1750
8       Haverstraw Bay mid-channel               11.9875
9         North River STP at 145th               13.7625
---------------------------------------------
Best Places to swim for the Year 2008
                                 Site  Weighted EnteroCount
0            Norrie Point mid-channel              0.118421
1  Poughkeepsie Drinking Water Intake              0.184211
2                      Tivoli Landing              0.184211
3     Port Ewen Drinking Water Intake              0.197368
4                Ulster Landing Beach              0.210526
5            Poughkeepsie Launch Ramp              0.342105
6              West Point STP Outfall              0.421053
7                         Inbocht Bay              0.434211
8                  Malden Launch Ramp              0.434211
9                  Cold Spring Harbor              0.567982
---------------------------------------------
Best Places to swim for the Year 2009
                                 Site  Weighted EnteroCount
0              West Point STP Outfall              0.464602
1           Emeline Beach- Haverstraw              0.862832
2                     Irvington Beach              0.929204
3               TZ Bridge mid-channel              0.929204
4                 Yonkers STP Outfall              0.929204
5               GW Bridge mid-channel              0.929204
6  Poughkeepsie Drinking Water Intake              0.955752
7          Haverstraw Bay mid-channel              1.022124
8                  Little Stony Point              1.048673
9                      Ossining Beach              1.061947
---------------------------------------------
Best Places to swim for the Year 2010
                                 Site  Weighted EnteroCount
0  Poughkeepsie Drinking Water Intake              0.293987
1            Norrie Point mid-channel              0.387528
2     Port Ewen Drinking Water Intake              0.414254
3             Stony Point mid-channel              0.801782
4               TZ Bridge mid-channel              0.890869
5                         Inbocht Bay              0.935412
6           Emeline Beach- Haverstraw              1.069042
7                 Yonkers mid-channel              1.091314
8                      Tivoli Landing              1.162584
9                     Irvington Beach              1.216036
---------------------------------------------
Best Places to swim for the Year 2011
                                 Site  Weighted EnteroCount
0                 Yonkers mid-channel              1.098655
1               TZ Bridge mid-channel              1.255605
2                    Marlboro Landing              1.331839
3                  Little Stony Point              1.426009
4                  Cold Spring Harbor              1.775785
5          Haverstraw Bay mid-channel              1.923767
6  Poughkeepsie Drinking Water Intake              1.950673
7                 Yonkers STP Outfall              2.085202
8               GW Bridge mid-channel              2.192825
9             Stony Point mid-channel              2.246637
---------------------------------------------
Best Places to swim for the Year 2012
                                 Site  Weighted EnteroCount
0            Norrie Point mid-channel              0.291391
1  Poughkeepsie Drinking Water Intake              0.384106
2                      Tivoli Landing              0.397351
3                         Inbocht Bay              0.437086
4                Ulster Landing Beach              0.582781
5                  Little Stony Point              0.596026
6     Port Ewen Drinking Water Intake              0.649007
7                     Irvington Beach              0.662252
8             Gay's Point mid-channel              0.701987
9                  Cold Spring Harbor              0.728477
---------------------------------------------
Best Places to swim for the Year 2013
                                 Site  Weighted EnteroCount
0     Port Ewen Drinking Water Intake              0.441964
1            Norrie Point mid-channel              0.455357
2  Poughkeepsie Drinking Water Intake              0.468750
3        Catskill Creek- First Bridge              0.763393
4                     Irvington Beach              0.781250
5                      Tivoli Landing              0.790179
6                79th St. mid-channel              0.803571
7          Haverstraw Bay mid-channel              0.803571
8                  Croton Point Beach              0.803571
9                    Marlboro Landing              0.870536

In [40]:
for year in years:
    test = placesToSwim[((placesToSwim['Year']==year) & (placesToSwim['Classification'] == 'unacceptable') )]
    test = test.sort_values(['WeightedMean'],ascending=False).head(10)[['Site','WeightedMean']].reset_index(drop=True)
    test.columns = ['Site','Weighted EnteroCount']
    print '---------------------------------------------'
    print 'Worst Places to swim for the Year '+ str(year)
    print test


---------------------------------------------
Worst Places to swim for the Year 2006
                                      Site  Weighted EnteroCount
0                     Upper Sparkill Creek            360.340426
1                   Orangetown STP Outfall            212.085106
2                           125th St. Pier            159.617021
3                           Saw Mill River            148.936170
4                            Piermont Pier            116.042553
5  Newtown Creek- Metropolitan Ave. Bridge            106.170213
6               Newtown Creek- Dutch Kills             74.425532
7          Harlem River- Washington Bridge             68.340426
---------------------------------------------
Worst Places to swim for the Year 2007
                                      Site  Weighted EnteroCount
0                     Upper Sparkill Creek             445.50000
1                           125th St. Pier             217.65000
2  Newtown Creek- Metropolitan Ave. Bridge             108.37500
3                   Orangetown STP Outfall              98.58750
4                           Saw Mill River              83.51875
5                    GW Bridge mid-channel              66.33750
6                            Piermont Pier              62.28750
7                     79th St. mid-channel              46.83750
8               Newtown Creek- Dutch Kills              42.75000
---------------------------------------------
Worst Places to swim for the Year 2008
                                      Site  Weighted EnteroCount
0                     Upper Sparkill Creek           1119.960526
1  Newtown Creek- Metropolitan Ave. Bridge            968.407895
2                            Gowanus Canal            394.815789
3                            Piermont Pier            251.342105
4                           Saw Mill River            189.491228
5                   Orangetown STP Outfall            161.710526
6         Harlem River- Willis Ave. Bridge             89.725877
7                         Tarrytown Marina             54.868421
8                           Ossining Beach             52.368421
9               Newtown Creek- Dutch Kills             48.210526
---------------------------------------------
Worst Places to swim for the Year 2009
                                      Site  Weighted EnteroCount
0  Newtown Creek- Metropolitan Ave. Bridge            669.384956
1                     Upper Sparkill Creek            447.783186
2               Newtown Creek- Dutch Kills            343.234513
3                            Gowanus Canal            232.685841
4                            Piermont Pier            213.663717
5      Kingsland Pt. Park- Pocantico River            116.216814
6             Rondout- Eddyville Anchorage            111.086283
7                     Kingston STP Outfall             78.292035
8            Rondout- Kingston Public Dock             71.734513
9                 Catskill Creek- East End             64.101770
---------------------------------------------
Worst Places to swim for the Year 2010
                                      Site  Weighted EnteroCount
0                     Upper Sparkill Creek            607.616927
1                            Piermont Pier            496.035635
2                            Gowanus Canal            401.358575
3  Newtown Creek- Metropolitan Ave. Bridge            377.131403
4               Newtown Creek- Dutch Kills            370.075724
5                         Tarrytown Marina            310.356347
6                 Catskill Creek- East End             63.434298
7                     Newburgh Launch Ramp             62.231626
8             Catskill Creek- First Bridge             50.000000
9             Hudson River above Troy Lock             45.781737
---------------------------------------------
Worst Places to swim for the Year 2011
                                  Site  Weighted EnteroCount
0                       Saw Mill River            443.650224
1                        Gowanus Canal            439.336323
2  Kingsland Pt. Park- Pocantico River            328.748879
3                     Tarrytown Marina            273.867713
4            Emeline Beach- Haverstraw            145.533632
5                        Piermont Pier            114.766816
6                 Upper Sparkill Creek            100.735426
7                     Cedar Pond Brook             95.219731
8                        Furnace Brook             91.170404
9      Peekskill Riverfront Green Park             89.986547
---------------------------------------------
Worst Places to swim for the Year 2012
                         Site  Weighted EnteroCount
0               Gowanus Canal            647.973510
1      Orangetown STP Outfall            322.079470
2        Upper Sparkill Creek            287.364238
3   Mohawk River at Waterford             82.278146
4    North River STP at 145th             80.423841
5        Kingston STP Outfall             65.470199
6               Piermont Pier             63.086093
7  Newtown Creek- Dutch Kills             34.821192
8       Esopus Creek Entrance             33.668874
9         Yonkers STP Outfall             32.847682
---------------------------------------------
Worst Places to swim for the Year 2013
                                      Site  Weighted EnteroCount
0  Newtown Creek- Metropolitan Ave. Bridge             60.763393
1                Mohawk River at Waterford             60.508929
2             Dunn Memorial Bridge- Albany             44.651786
3      Kingsland Pt. Park- Pocantico River             35.531250
4                     Kingston STP Outfall             35.008929
5             Hudson River above Troy Lock             34.915179
6                     Newburgh Launch Ramp             34.285714
7                    Bethlehem Launch Ramp             33.669643
8                         Coeymans Landing             32.281250

2. Which sites have been tested most regularly?


In [91]:
testRegularly = data.groupby(['Site'])['EnteroCount'].count()
testRegularly = testRegularly.to_frame().reset_index()
testRegularly.columns = ['Site','Number of Tests']
testRegularly = testRegularly.sort_values(['Number of Tests'],ascending=False).head(10).reset_index(drop=True)
testRegularly


Out[91]:
Site Number of Tests
0 Piermont Pier 187
1 Upper Sparkill Creek 165
2 125th St. Pier 66
3 Nyack Launch Ramp 61
4 Newtown Creek- Dutch Kills 57
5 TZ Bridge mid-channel 57
6 Orangetown STP Outfall 57
7 Newtown Creek- Metropolitan Ave. Bridge 57
8 Yonkers mid-channel 52
9 Yonkers STP Outfall 51

3. Which ones have long gaps between tests


In [41]:
for year in years:
    testingGap = data[data['Year'] == year]
    testingGap = testingGap.sort_values('Date').groupby(['Site'])['Date'].apply(lambda x: x.diff().mean()).to_frame().reset_index()
    # remove Nat
    testingGap = testingGap[testingGap.Date.notnull()]
    testingGap['Date']= testingGap['Date'].apply(lambda x: x.days)
    testingGap.columns = ['Site','Gap(Days)']
    print '---------------------------------------------------'
    print 'Year '+ str(year)
    print testingGap.sort_values('Gap(Days)',ascending=False).head(10).reset_index(drop=True)


---------------------------------------------------
Year 2006
                                      Site  Gap(Days)
0                         Cedar Pond Brook         47
1               Haverstraw Bay mid-channel         39
2                       Croton Point Beach         39
3                            Furnace Brook         39
4                           Ossining Beach         39
5                           Saw Mill River         33
6  Newtown Creek- Metropolitan Ave. Bridge         26
7                      Yonkers STP Outfall         26
8               Newtown Creek- Dutch Kills         26
9                      Yonkers mid-channel         26
---------------------------------------------------
Year 2007
                                      Site  Gap(Days)
0                  Stony Point mid-channel         45
1          Peekskill Riverfront Green Park         37
2                      Yonkers STP Outfall         28
3                  The Battery mid-channel         28
4  East River mid-channel at Roosevelt Is.         28
5                    GW Bridge mid-channel         28
6                            Gowanus Canal         28
7          Harlem River- Washington Bridge         28
8         Harlem River- Willis Ave. Bridge         28
9                     79th St. mid-channel         28
---------------------------------------------------
Year 2008
                        Site  Gap(Days)
0              Gowanus Canal         76
1                Inbocht Bay         38
2    Gay's Point mid-channel         32
3  Coxsackie Waterfront Park         31
4            Fort Montgomery         31
5                     Athens         31
6                  Castleton         31
7           Marlboro Landing         31
8  Island Creek/Normans Kill         31
9         Cold Spring Harbor         31
---------------------------------------------------
Year 2009
                           Site  Gap(Days)
0     Hudson above Mohawk River         40
1     Mohawk River at Waterford         40
2          Kingston Point Beach         32
3                 Gowanus Canal         32
4      Norrie Point mid-channel         32
5               Fort Montgomery         32
6             Esopus Creek West         32
7         Esopus Creek Entrance         32
8      Norrie Point Yacht Basin         32
9  Hudson River above Troy Lock         32
---------------------------------------------------
Year 2010
                                  Site  Gap(Days)
0                        Gowanus Canal         31
1                       125th St. Pier         28
2                       Ossining Beach         28
3                          Inbocht Bay         28
4                 79th St. mid-channel         28
5            Island Creek/Normans Kill         28
6  Kingsland Pt. Park- Pocantico River         28
7                 Kingston Point Beach         28
8                   Little Stony Point         28
9                   Malden Launch Ramp         28
---------------------------------------------------
Year 2011
                                      Site  Gap(Days)
0                     Upper Sparkill Creek         37
1                           125th St. Pier         31
2                   Orangetown STP Outfall         31
3                            Gowanus Canal         31
4          Harlem River- Washington Bridge         31
5         Harlem River- Willis Ave. Bridge         31
6               Haverstraw Bay mid-channel         31
7                     79th St. mid-channel         31
8               Newtown Creek- Dutch Kills         31
9  Newtown Creek- Metropolitan Ave. Bridge         31
---------------------------------------------------
Year 2012
                                      Site  Gap(Days)
0                           125th St. Pier         29
1                     Newburgh Launch Ramp         29
2                   Orangetown STP Outfall         29
3                 North River STP at 145th         29
4                 Norrie Point mid-channel         29
5                 Norrie Point Yacht Basin         29
6  Newtown Creek- Metropolitan Ave. Bridge         29
7               Newtown Creek- Dutch Kills         29
8                Mohawk River at Waterford         29
9                Hudson above Mohawk River         29
---------------------------------------------------
Year 2013
                              Site  Gap(Days)
0                  Irvington Beach         28
1                 Cedar Pond Brook         28
2       Haverstraw Bay mid-channel         28
3                Nyack Launch Ramp         28
4                    Furnace Brook         28
5           Orangetown STP Outfall         28
6                   Ossining Beach         28
7  Peekskill Riverfront Green Park         28
8        Emeline Beach- Haverstraw         28
9             Kingston STP Outfall         28

In [60]:
def plot_gaps(Year,Sorting):
    
    testingGap = data[data['Year']== int(Year)]
    testingGap = testingGap.sort_values('Date').groupby(['Site'])['Date'].apply(lambda x: x.diff().mean()).to_frame().reset_index()
    # remove Nat
    testingGap = testingGap[testingGap.Date.notnull()]
    testingGap['Date']= testingGap['Date'].apply(lambda x: x.days)
    testingGap.columns = ['Site','Gap(Days)']
    if Sorting == 'ascending':
        testingGap = testingGap.sort_values('Gap(Days)',ascending=True).head(10)
    else:
        testingGap = testingGap.sort_values('Gap(Days)',ascending=False).head(10)
    
    fig = {
        'data': [
            {
                'x': testingGap['Site'],
                'y': testingGap['Gap(Days)'],
                'name': Year,
                'type': 'bar'
            }
        ],
    'layout': Layout(xaxis=XAxis(title='Site'), yaxis=YAxis(title='Gap(Days)'))
    }
    py.iplot(fig)

In [61]:
interact(plot_gaps, Year=('2006', '2007','2008', '2009', '2010', '2011', '2012', '2013'),Sorting=('ascending','descending'))


Out[61]:
<function __main__.plot_gaps>

4. Is there a relationship between the amount of rain and water quality? Show this relationship graphically. If you can, estimate the effect of rain on quality at different sites and create a visualization to compare them


In [45]:
relationship = data[['EnteroCount','FourDayRainTotal']]
relationship.head()


Out[45]:
EnteroCount FourDayRainTotal
0 1733 1.5
1 4 0.2
2 20 0.0
3 6 0.0
4 31 0.0

Scatterplot of the data


In [65]:
trace = Scatter(
    x = relationship['FourDayRainTotal'],
    y = relationship['EnteroCount'],
    mode = 'markers',
    name='Scatter Plot'
)

plotData = [trace]

layout = Layout(
    title='Scatter Plot',
    xaxis=dict(
        title='Four Day Rain Total',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Entero Count',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = Figure(data=plotData, layout=layout)
py.iplot(fig, filename='basic-scatter')


There seems to be a strong relationship between rainfall and Entero Count. It seems that once there is more than 2.5 inches of rain the Entero readings values are lower.


In [66]:
import seaborn as sns
corr = relationship.corr()
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)


Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0xce6ffd0>