Using the dataset available here, complete the following:
Background: The United States Environmental Protection Agency (EPA) reports Entero counts as colonies (or cells) per 100 mL of water. The federal standard for unacceptable water quality is a single sample value greater than 110 Enterococcus/100mL, or five or more samples with a geometric mean (the nth root of the product of the n values) greater than 30 Enterococcus/100mL.
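As a quick illustration of the two thresholds (a minimal sketch with made-up readings, not values from the dataset):
from scipy.stats.mstats import gmean
# Hypothetical Entero readings from one site (Enterococcus/100mL)
samples = [20, 25, 35, 40, 50]
print max(samples) > 110                          # single-sample rule: not violated here
print len(samples) >= 5 and gmean(samples) > 30   # geometric mean is ~32.3, so this rule is violated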
In [190]:
# Import modules for analysis and visualization
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats.mstats import gmean
import numpy as np
from numpy import genfromtxt
import os.path
from datetime import datetime
In [198]:
# Clean up and import (pandas)
data_url = "https://raw.githubusercontent.com/jlaurito/CUNY_IS608/master/lecture4/data/riverkeeper_data_2013.csv"
raw_data = pd.read_csv(data_url)
# info like what you get with str() in R
raw_data.dtypes
Out[198]:
In [199]:
# look at the first 25 rows
# remember the first row is the colnames
raw_data.head(25)
Out[199]:
In [200]:
# Adjust the data types to make the data easier to work with
raw_data["Site"] = raw_data["Site"].astype('category') # like facets
raw_data["Date"] = pd.to_datetime(raw_data["Date"]) # dates as dates
# We need to remove the greater than and less than symbols and treat "EnteroCount" as an integer -- 'int64'
raw_data["EnteroCount"] = raw_data["EnteroCount"].str.lstrip('><')
raw_data["EnteroCount"] = raw_data["EnteroCount"].astype('int64')
In [201]:
# Check to make sure the changes are correct
raw_data.dtypes
Out[201]:
In [202]:
raw_data.head(20) # there are no more < or > symbols
Out[202]:
In [7]:
# Create a column showing whether or not the water quality is acceptable at a given site
# Unacceptable means either:
# a single sample value > 110 Enterococcus/100mL, OR
# five or more samples with a geometric mean > 30 Enterococcus/100mL
# The prompt is vague about how to apply the geometric mean condition, so I used gmean() on the
# five highest readings per site. Combined with the single-sample > 110 rule, this flags
# nearly every site as having unacceptable water quality at some point
raw_data['swim'] = np.where(
(raw_data.groupby('Site').EnteroCount.transform(max) > 110)
| (raw_data.groupby('Site').EnteroCount.transform(lambda group: gmean(group.nlargest(5))) > 30),
'unacceptable', 'acceptable')
raw_data.head(10)
Out[7]:
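A quick check of that claim (a small sketch reusing the swim column just created):
# Count how many sites are flagged acceptable vs. unacceptable
print raw_data.groupby('Site')['swim'].first().value_counts()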
In [18]:
# Since the above criteria flag nearly every site, they aren't useful for ranking. Instead,
# I grouped the data by site and calculated the mean EnteroCount for each site
mean_entero = raw_data.groupby(['Site'])['EnteroCount'].mean().sort_values()
In [19]:
# Ten best places to swim (lowest mean EnteroCount)
print "Ten best places to swim"
print "+++++++++++++++++++++++"
mean_entero.head(10)
Out[19]:
In [20]:
# Ten worst places to swim (highest mean EnteroCount)
print "Ten worst places to swim"
print "++++++++++++++++++++++++"
mean_entero.tail(10).sort_values(ascending=False)
Out[20]:
In [21]:
# Write the results to a csv and read them back in for more flexible use (now and later)
mean_entero.to_csv('mean_entero.csv', header=True)  # keep the header so read_csv doesn't consume the first site as column names
In [22]:
mean_entero1 = pd.read_csv('mean_entero.csv')
mean_entero1.columns = ['Site','Average_Entero_Count']
mean_entero1.head() #make sure it looks right
Out[22]:
In [164]:
# Plot the results
%matplotlib inline
sns.set_style("whitegrid")
sns.set_style("ticks")
plt.figure(figsize=(10, 20))
ax = sns.barplot(x ="Average_Entero_Count", y="Site", data=mean_entero1, palette="Blues_d")
ax.set(xlabel='Average Entero Count', ylabel='Site')
ax.set_title('Waterways ranked by Average Entero Count')
sns.despine()
# Don't go swimming in the Gowanus Canal
In [68]:
# Are we looking for consistency or frequency in testing?
# Note to self - learn more about plotting time series data
# Determine the most recent reading for each site (might be useful?)
maxDate = raw_data.groupby(by=["Site"])["Date"].max()
maxDate.sort_values(ascending=False, inplace=True)
# what are our most FREQUENTLY tested Sites?
test_Site_counts = raw_data.groupby(by=["Site"])["Date"].count()
test_Site_counts.sort_values(ascending=False, inplace=True)
# Print out results
print "Most recently tested site dates"
print "++++++++++++++++++++++++"
print maxDate.head(15)
print "\n"
print "Most frequently tested sites"
print "++++++++++++++++++++++++"
print test_Site_counts.head(15)
In [94]:
# Consistency is more important
# Figure out how many days elapsed between readings and calculate average by group.
# This gives us a general sense of the regularity of testing
# After lots of cursing and fiddling!
lag_date_data = (raw_data.groupby('Site')['Date']
                 .apply(lambda x: x.diff().mean())
                 .reset_index()
                 .rename(columns={'Date': 'Mean_lag_days'}))
lag_date_data.head()
Out[94]:
In [96]:
# write it to a csv to make it more flexible
lag_date_data.to_csv( 'lag_data.csv' )
In [113]:
# Clean up the csv for plotting: the mean lag comes back from the csv as a Timedelta string,
# so split it on whitespace and keep only the leading day count
lag_data = pd.read_csv('lag_data.csv')
lag_data.columns = ['index','Site','mean_test_lag']
lag_data_num_col = pd.DataFrame(lag_data['mean_test_lag'].str.split().tolist(), columns=['mean_lag_days','junk','more_junk'])
mean_lag_data = pd.concat([lag_data, lag_data_num_col], axis=1)
del mean_lag_data['index']
del mean_lag_data['mean_test_lag']
del mean_lag_data['junk']
del mean_lag_data['more_junk']
# convert mean column to numeric type
mean_lag_data["mean_lag_days"] = pd.to_numeric(mean_lag_data["mean_lag_days"])
# dates run newest to oldest within each site, so the mean diff is negative -- take the absolute value
mean_lag_data['mean_lag_days'] = mean_lag_data['mean_lag_days'].abs()
mean_lag_data.head(10)
Out[113]:
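As an aside, the CSV round-trip and string splitting above could be skipped by converting the Timedelta column directly (a sketch, assuming lag_date_data from In [94] is still in memory; alt_lag is just an illustrative name):
# Convert the per-site mean Timedelta straight to positive fractional days
alt_lag = lag_date_data.copy()
alt_lag['mean_lag_days'] = alt_lag['Mean_lag_days'].abs().dt.total_seconds() / (24 * 3600)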
In [114]:
print mean_lag_data.dtypes
In [123]:
# Get a general sense of the spread - on average how often are the sites tested
%matplotlib inline
plt.figure(figsize=(4.5, 10))
sns.violinplot(mean_lag_data, palette="colorblind")
sns.despine()
In [162]:
# Subset a random sample of sites to compare and plot
# Random ~10% of the sites, sampled without replacement so no site appears twice
sub_lag_data = mean_lag_data.sample(frac=0.1, replace=False)
%matplotlib inline
plt.figure(figsize=(10, 8))
ax = sns.barplot(x ="mean_lag_days", y="Site", data=sub_lag_data, palette="Blues_d")
ax.set(xlabel='Mean Lag Time', ylabel='Site')
ax.set_title('Mean Lag Time Between Tests in Days')
sns.despine()
# From the plot we can see there is a huge range in average lag between tests!
In [203]:
# Cleanup so we're using just the entero count and rain totals
del raw_data['Site']
del raw_data['Date']
del raw_data['SampleCount']
raw_data.head()
Out[203]:
In [253]:
%matplotlib inline
# Scatterplot to show relationship between rainfall and Entero Count
plt.figure(figsize=(7.5, 6.5))
sns.regplot('FourDayRainTotal', 'EnteroCount', data=raw_data, fit_reg=False, x_jitter=1)
sns.despine()
plt.ylim(0)
plt.xlim(0)
# There seems to be a noticeable relationship between rainfall and water quality once
# rainfall goes beyond ~3 inches: past that point there are far fewer high EnteroCount readings
Out[253]:
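To put a rough number on that visual impression (a minimal sketch, assuming raw_data now holds only the EnteroCount and FourDayRainTotal columns as above):
# Rank correlation between four-day rain totals and Entero counts
from scipy.stats import spearmanr
rho, p = spearmanr(raw_data['FourDayRainTotal'], raw_data['EnteroCount'])
print rho, p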
In [227]:
# Compare any two locations with the following cells
# Re-read and clean up the data (Site, Date and SampleCount were deleted from raw_data above)
data_url = "https://raw.githubusercontent.com/jlaurito/CUNY_IS608/master/lecture4/data/riverkeeper_data_2013.csv"
site_rain_data = pd.read_csv(data_url)
site_rain_data["Site"] = site_rain_data["Site"].astype('category') # like facets
site_rain_data["Date"] = pd.to_datetime(site_rain_data["Date"]) # dates as dates
site_rain_data["EnteroCount"] = site_rain_data["EnteroCount"].str.lstrip('><')
site_rain_data["EnteroCount"] = site_rain_data["EnteroCount"].astype('int64')
site_rain_data.head()
Out[227]:
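Since this repeats the cleanup from the top of the notebook, those steps could be wrapped in a small helper (a sketch; load_riverkeeper is a hypothetical name, not something defined elsewhere in the notebook):
def load_riverkeeper(url):
    # Read the raw csv and apply the same type fixes used above
    df = pd.read_csv(url)
    df["Site"] = df["Site"].astype('category')
    df["Date"] = pd.to_datetime(df["Date"])
    df["EnteroCount"] = df["EnteroCount"].str.lstrip('><').astype('int64')
    return df

# site_rain_data = load_riverkeeper(data_url)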
In [283]:
site1 = raw_input('enter your first site --> ') # Gowanus Canal <- worst
In [284]:
site2 = raw_input('enter your second site --> ') # Croton Point Beach <- best water
In [285]:
# Gowanus is dirtiest, Croton Point Beach is cleanest
x = site_rain_data.loc[site_rain_data['Site'] == site1]
y = site_rain_data.loc[site_rain_data['Site'] == site2]
frames = [x, y]
result = pd.concat(frames)
del result['Date']
del result['SampleCount']
result.head()
Out[285]:
In [287]:
# lmplot manages its own FacetGrid figure, so an explicit plt.figure() call is not needed here
sns.lmplot('FourDayRainTotal', 'EnteroCount', data=result, hue='Site', palette="Set2",
           fit_reg=False, scatter_kws={"s": 50}, legend=False)
plt.legend(loc='upper right')
sns.despine()
plt.ylim(0)
plt.xlim(0)
Out[287]: