Code for Final Project

Understanding Water Quality in the Main Arm of the Fraser River


In [2]:
# import libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#stats library

import statsmodels.api as sm
import scipy

#The function below is used to show the plots within the notebook

%matplotlib inline

In [26]:
#Show the installed pandas version so the environment can be reproduced
pd.__version__


Out[26]:
'0.17.0'

In [27]:
#Show the installed numpy version so the environment can be reproduced
np.__version__


Out[27]:
'1.10.1'

In [28]:
#Read the Fraser River water-quality CSV into a pandas DataFrame
#(a comma is read_csv's default separator, so it is not spelled out)

water_quality_data = pd.read_csv('../data/Fraser_River_Water_Data.csv')

In [32]:
#Preview the first rows of water_quality_data

water_quality_data.head()


Out[32]:
Sample time Sample number Sample type Arsenic Dissolved [Lab: 1] [VMV: 107942] Status(0) Unit code(0) Value modifier code(0) Carbon Dissolved Organic [Lab: -54] [VMV: 1067] Status(1) Unit code(1) ... Unit code(13) Value modifier code(13) Temperature Water [Lab: 80] [VMV: 1125] Status(14) Unit code(14) Value modifier code(14) Temperature Water [Lab: -54] [VMV: 1125] Status(15) Unit code(15) Value modifier code(15)
0 2008-08-27 21:50:00 08PY001319 1 NaN NaN NaN NaN 8.2 U MG/L ... NaN NaN 16 U DEG C NaN NaN NaN NaN NaN
1 2008-09-03 00:25:00 08PY001043 1 NaN NaN NaN NaN 2.0 U MG/L ... NaN NaN 16 U DEG C NaN NaN NaN NaN NaN
2 2008-09-17 22:15:00 08PY001061 1 NaN NaN NaN NaN 2.9 U MG/L ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 2008-09-30 20:30:00 08PY001124 1 NaN NaN NaN NaN 1.6 U MG/L ... NaN NaN 14 U DEG C NaN NaN NaN NaN NaN
4 2008-10-15 19:24:00 08PY001200 1 NaN NaN NaN NaN 4.7 U MG/L ... NaN NaN 11 U DEG C NaN NaN NaN NaN NaN

5 rows × 67 columns


In [30]:
#Preview the last rows of water_quality_data
water_quality_data.tail()


Out[30]:
Sample time Sample number Sample type Arsenic Dissolved [Lab: 1] [VMV: 107942] Status(0) Unit code(0) Value modifier code(0) Carbon Dissolved Organic [Lab: -54] [VMV: 1067] Status(1) Unit code(1) ... Unit code(13) Value modifier code(13) Temperature Water [Lab: 80] [VMV: 1125] Status(14) Unit code(14) Value modifier code(14) Temperature Water [Lab: -54] [VMV: 1125] Status(15) Unit code(15) Value modifier code(15)
91 2015-01-21 22:00:00 15PY005242 1 0.34 U UG/L NaN 2.70 U MG/L ... NaN NaN NaN NaN NaN NaN 4.5 U DEG C NaN
92 2015-02-13 19:05:00 15PY005461 1 0.36 U UG/L NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 6.3 U DEG C NaN
93 2015-03-26 11:04:00 15PY005597 1 NaN NaN NaN NaN 5.53 U MG/L ... NaN NaN NaN NaN NaN NaN 7.7 U DEG C NaN
94 2015-04-16 20:50:00 15PY005838 1 NaN NaN NaN NaN 4.67 U MG/L ... NaN NaN NaN NaN NaN NaN 9.0 U DEG C NaN
95 2015-04-30 20:45:00 15PY005887 1 NaN NaN NaN NaN 5.07 U MG/L ... NaN NaN NaN NaN NaN NaN 10.1 U DEG C NaN

5 rows × 67 columns


In [31]:
#Show the data type of the first few columns to help understand the dataset better

water_quality_data.dtypes.head()


Out[31]:
Sample time                                  object
Sample number                                object
Sample type                                   int64
Arsenic Dissolved [Lab: 1] [VMV: 107942]    float64
Status(0)                                    object
dtype: object

In [28]:
# Extract the four variables of interest from water_quality_data.
# No earlier cell assigns fraser_water_quality, so it is built here; without
# this the cell fails with a NameError when the notebook is run top to bottom
# on a fresh kernel.

fraser_water_quality = water_quality_data[['Temperature Water [Lab: 80] [VMV: 1125]',
                                           'Oxygen Dissolved [Lab: -54] [VMV: 1124]',
                                           'Specific Conductance [Lab: -54] [VMV: 2041]',
                                           'Salinity [Lab: -54] [VMV: 1318]']]

# Print data frame for variables extracted from water_quality_data

fraser_water_quality.head()


Out[28]:
Temperature Water [Lab: 80] [VMV: 1125] Oxygen Dissolved [Lab: -54] [VMV: 1124] Specific Conductance [Lab: -54] [VMV: 2041] Salinity [Lab: -54] [VMV: 1318]
0 16 NaN 250 NaN
1 16 NaN 93 NaN
2 NaN NaN 110 NaN
3 14 NaN 320 NaN
4 11 NaN 150 NaN

Statistical Analysis of the Dataset: Linear Modelling


In [98]:
#Import the patsy library, which handles the model formulas

from patsy import dmatrices
from patsy.builtins import *

In [7]:
#Keep only the two columns to be analyzed for the assignment:
#water temperature and dissolved oxygen

fraser_water_quality_testing = water_quality_data[
    [
        'Temperature Water [Lab: 80] [VMV: 1125]',
        'Oxygen Dissolved [Lab: -54] [VMV: 1124]',
    ]
]

In [33]:
#Preview the first rows of the two-column dataframe

fraser_water_quality_testing.head()


Out[33]:
Temperature Water [Lab: 80] [VMV: 1125] Oxygen Dissolved [Lab: -54] [VMV: 1124]
0 16 NaN
1 16 NaN
2 NaN NaN
3 14 NaN
4 11 NaN

In [11]:
#Drop every row that has a NaN in either column, leaving only the samples
#where both temperature and dissolved oxygen were measured
water_quality_DO = fraser_water_quality_testing.dropna(axis=0, how='any')

In [34]:
#Show the rows that remain after the rows containing NaN were dropped
water_quality_DO


Out[34]:
Temperature Water [Lab: 80] [VMV: 1125] Oxygen Dissolved [Lab: -54] [VMV: 1124]
49 7.5 12.00
53 16.5 10.50
64 7.5 12.70
69 17.5 10.15
70 19.5 9.38
73 20.0 9.05
74 20.6 8.96
75 12.2 9.70

In [14]:
#Give both columns shorter names for use in the model and the plot

water_quality_DO2 = water_quality_DO.rename(
    columns={
        'Temperature Water [Lab: 80] [VMV: 1125]': 'Temperature_Lab80',
        'Oxygen Dissolved [Lab: -54] [VMV: 1124]': 'Oxygen_Dissolved_Lab-54',
    }
)

In [35]:
#Show the dataframe with the renamed columns

water_quality_DO2


Out[35]:
Temperature_Lab80 Oxygen_Dissolved_Lab-54
49 7.5 12.00
53 16.5 10.50
64 7.5 12.70
69 17.5 10.15
70 19.5 9.38
73 20.0 9.05
74 20.6 8.96
75 12.2 9.70

In [18]:
#Fit an ordinary least squares model of dissolved oxygen on temperature.
#Q('...') quotes the column names inside the formula.

lm = sm.formula.ols(
    formula="Q('Oxygen_Dissolved_Lab-54') ~ Q('Temperature_Lab80')",
    data=water_quality_DO2,
).fit()

In [19]:
# This gives the fitted beta values: the intercept and the gradient (slope),
# which is the coefficient on Temperature_Lab80

lm.params


Out[19]:
Intercept                 13.780598
Q('Temperature_Lab80')    -0.229223
dtype: float64

In [20]:
# The predict function is given its inputs as a data frame, so build one here:
# a single Temperature_Lab80 column holding the values 1 through 699 at which
# the fitted line will be evaluated

x_new = pd.DataFrame(data={'Temperature_Lab80': range(1, 700)})

In [21]:
# Preview the first rows of the prediction inputs
x_new.head()


Out[21]:
Temperature_Lab80
0 1
1 2
2 3
3 4
4 5

In [22]:
# Evaluate the fitted linear model at every temperature in x_new,
# then show the predictions at positions 1 through 9
y_preds = lm.predict(exog=x_new)
y_preds[1:10]


Out[22]:
array([ 13.32215146,  13.09292818,  12.86370489,  12.63448161,
        12.40525833,  12.17603504,  11.94681176,  11.71758848,  11.4883652 ])

In [36]:
#Scatter the observations, then overlay the fitted regression line

plot_water_quality = water_quality_DO2.plot(
    kind='scatter',
    x='Temperature_Lab80',
    y="Oxygen_Dissolved_Lab-54",
)

# Set the axis window and the title directly on the Axes returned by pandas
plot_water_quality.set_xlim(0, 25)
plot_water_quality.set_ylim(0, 18)
plot_water_quality.set_title('Graph showing the relationship between Dissolved Oxygen and Temperature in the Fraser River Main Arm')

# Draw the model predictions as a line on the same Axes
plot_water_quality.plot(x_new, y_preds, c='blue', linewidth=3)


Out[36]:
[<matplotlib.lines.Line2D at 0x375b4a8>]
C:\Users\Jhanelle\Anaconda3\lib\site-packages\matplotlib\collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [25]:
#Print the full OLS regression summary table (coefficients, fit statistics and diagnostics)

print(lm.summary())


                                 OLS Regression Results                                 
========================================================================================
Dep. Variable:     Q('Oxygen_Dissolved_Lab-54')   R-squared:                       0.812
Model:                                      OLS   Adj. R-squared:                  0.781
Method:                           Least Squares   F-statistic:                     25.96
Date:                          Sun, 15 Nov 2015   Prob (F-statistic):            0.00223
Time:                                  19:12:39   Log-Likelihood:                -6.6875
No. Observations:                             8   AIC:                             17.37
Df Residuals:                                 6   BIC:                             17.53
Df Model:                                     1                                         
Covariance Type:                      nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------
Intercept                 13.7806      0.719     19.159      0.000        12.021    15.541
Q('Temperature_Lab80')    -0.2292      0.045     -5.095      0.002        -0.339    -0.119
==============================================================================
Omnibus:                        7.204   Durbin-Watson:                   0.784
Prob(Omnibus):                  0.027   Jarque-Bera (JB):                2.168
Skew:                          -1.207   Prob(JB):                        0.338
Kurtosis:                       3.823   Cond. No.                         50.6
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
C:\Users\Jhanelle\Anaconda3\lib\site-packages\scipy\stats\stats.py:1233: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
  int(n))

In [37]:
# DataFrame.plot returns a matplotlib Axes, and an Axes has no savefig method
# (calling it raised AttributeError). Saving is done by the Figure that owns
# the Axes, so fetch that with get_figure() and save from there.
plot_water_quality.get_figure().savefig('plot1.png')

In [ ]:
# I need help finding other statistical techniques
# I also need help saving the plot
# Do you guys think I have enough visuals for my data?

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: