Code for Final Project

Understanding Water Quality in the Main Arm of the Fraser River


In [126]:
# import libraries

import pandas as pd
import matplotlib.pyplot as plt

#stats library

import statsmodels.api as sm
import scipy

#The function below is used to show the plots within the notebook

%matplotlib inline

In [91]:
#Check version of pandas library
pd.__version__


Out[91]:
'0.17.0'

In [93]:
#Path from which the data is retrieved
filepath= '../data/Fraser_River_Water_Data.csv'

def load_water_quality_data(path=None):
    '''Load the water quality dataset required for this project.

    Parameters
    ----------
    path : str, optional
        Location of the comma-delimited file to read. When omitted, the
        notebook-level ``filepath`` is used, so existing zero-argument calls
        behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``.
    '''

    #Resolve the default at call time so a later change to `filepath` is honoured
    if path is None:
        path = filepath

    #Read the comma-delimited file into a data frame
    return pd.read_csv(path, delimiter=',')

In [94]:
#Print the head of the data set
load_water_quality_data().head()


Out[94]:
Sample time Sample number Sample type Arsenic Dissolved [Lab: 1] [VMV: 107942] Status(0) Unit code(0) Value modifier code(0) Carbon Dissolved Organic [Lab: -54] [VMV: 1067] Status(1) Unit code(1) ... Unit code(13) Value modifier code(13) Temperature Water [Lab: 80] [VMV: 1125] Status(14) Unit code(14) Value modifier code(14) Temperature Water [Lab: -54] [VMV: 1125] Status(15) Unit code(15) Value modifier code(15)
0 2008-08-27 21:50:00 08PY001319 1 NaN NaN NaN NaN 8.2 U MG/L ... NaN NaN 16 U DEG C NaN NaN NaN NaN NaN
1 2008-09-03 00:25:00 08PY001043 1 NaN NaN NaN NaN 2.0 U MG/L ... NaN NaN 16 U DEG C NaN NaN NaN NaN NaN
2 2008-09-17 22:15:00 08PY001061 1 NaN NaN NaN NaN 2.9 U MG/L ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 2008-09-30 20:30:00 08PY001124 1 NaN NaN NaN NaN 1.6 U MG/L ... NaN NaN 14 U DEG C NaN NaN NaN NaN NaN
4 2008-10-15 19:24:00 08PY001200 1 NaN NaN NaN NaN 4.7 U MG/L ... NaN NaN 11 U DEG C NaN NaN NaN NaN NaN

5 rows × 67 columns


In [95]:
#Import patsy library (formula handling for the regression model)

from patsy import dmatrices
from patsy.builtins import *

In [129]:
def extract_and_drop_data(water_quality_data=None):

    '''Extract the variables to analyse, drop rows with NaN values and rename the columns.

    Parameters
    ----------
    water_quality_data : pandas.DataFrame, optional
        Frame holding the raw 'Sample time', water temperature (Lab 80) and
        dissolved oxygen (Lab -54) columns. When omitted, the data is read
        with ``load_water_quality_data()``, so existing zero-argument calls
        behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'Sample_time', 'Temperature_Lab80' and
        'Oxygen_Dissolved_Lab-54', containing only the rows in which all
        three values are present. The input frame is not modified.

    Notes
    -----
    'Oxygen_Dissolved_Lab-54' still contains a '-', so it is not a valid
    Python identifier and has to be quoted with Q('...') inside a formula.
    The name is kept unchanged because later cells refer to it.
    '''

    #Original column names, in the order they should appear in the result
    source_columns = ['Sample time',
                      'Temperature Water [Lab: 80] [VMV: 1125]',
                      'Oxygen Dissolved [Lab: -54] [VMV: 1124]']

    #Shorter names without spaces or brackets, used by the regression and the plot
    short_names = {'Sample time': 'Sample_time',
                   'Temperature Water [Lab: 80] [VMV: 1125]': 'Temperature_Lab80',
                   'Oxygen Dissolved [Lab: -54] [VMV: 1124]': 'Oxygen_Dissolved_Lab-54'}

    #Only read the file when the caller did not supply a frame
    if water_quality_data is None:
        water_quality_data = load_water_quality_data()

    #Extract variables to be analyzed for assignment
    fraser_water_quality_testing = water_quality_data[source_columns]

    #Remove the rows that contain NaN in any of the selected columns
    water_quality_DO = fraser_water_quality_testing.dropna()

    #Rename the columns; rename returns a new frame
    return water_quality_DO.rename(columns=short_names)

In [132]:
#Use the defined function to rename column names and drop empty values

edited_columns=extract_and_drop_data()

In [137]:
#Print a new dataframe with the munged data
edited_columns


Out[137]:
Sample_time Temperature_Lab80 Oxygen_Dissolved_Lab-54
49 2012-04-27 21:40:00 7.5 12.00
53 2012-07-13 20:40:00 16.5 10.50
64 2013-05-03 19:30:00 7.5 12.70
69 2013-07-11 18:35:00 17.5 10.15
70 2013-07-25 18:00:00 19.5 9.38
73 2013-09-04 19:10:00 20.0 9.05
74 2013-09-17 19:25:00 20.6 8.96
75 2013-10-02 20:45:00 12.2 9.70

In [283]:
def linear_model(x, y, data=None):
    '''Fit an ordinary least squares regression of column y on column x.

    Parameters
    ----------
    x : str
        Name of the explanatory column (for example 'Temperature_Lab80').
    y : str
        Name of the response column (for example 'Oxygen_Dissolved_Lab-54').
    data : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level
        ``edited_columns`` frame is used, so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    The fitted results object returned by ``sm.formula.ols(...).fit()``.
    Callers in this notebook use its ``params``, ``summary()`` and
    ``predict()`` members.
    '''

    #Fall back to the notebook's cleaned data set when no frame is supplied
    if data is None:
        data = edited_columns

    #Q('...') is patsy's quoting helper: it lets the formula refer to column names
    #that are not valid Python identifiers, such as the '-' in 'Oxygen_Dissolved_Lab-54'
    formula = "Q('" + y + "') ~ Q('" + x + "')"

    #Fit the model and return it; predictions are made by the caller via .predict()
    return sm.formula.ols(formula=formula, data=data).fit()

In [271]:
#Create a variable for the parameters of the linear model 
parameters_linear_model=linear_model('Temperature_Lab80','Oxygen_Dissolved_Lab-54').params

In [272]:
#Print the parameters of the linear model
parameters_linear_model


Out[272]:
Intercept                 13.780598
Q('Temperature_Lab80')    -0.229223
dtype: float64

In [273]:
#Create a summary for linear model 
summary_of_linear_model=linear_model('Temperature_Lab80','Oxygen_Dissolved_Lab-54').summary()


C:\Users\Jhanelle\Anaconda3\lib\site-packages\scipy\stats\stats.py:1233: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
  int(n))

In [260]:
#Print the summary of the linear model values
summary_of_linear_model


Out[260]:
OLS Regression Results
Dep. Variable: Q('Oxygen_Dissolved_Lab-54') R-squared: 0.812
Model: OLS Adj. R-squared: 0.781
Method: Least Squares F-statistic: 25.96
Date: Tue, 17 Nov 2015 Prob (F-statistic): 0.00223
Time: 21:25:17 Log-Likelihood: -6.6875
No. Observations: 8 AIC: 17.37
Df Residuals: 6 BIC: 17.53
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 13.7806 0.719 19.159 0.000 12.021 15.541
Q('Temperature_Lab80') -0.2292 0.045 -5.095 0.002 -0.339 -0.119
Omnibus: 7.204 Durbin-Watson: 0.784
Prob(Omnibus): 0.027 Jarque-Bera (JB): 2.168
Skew: -1.207 Prob(JB): 0.338
Kurtosis: 3.823 Cond. No. 50.6

In [300]:
#Create the figure and its axes together so that the requested size applies to the
#plot that is actually drawn and saved (a bare plt.figure() is left empty because
#pandas opens its own figure when no axes are passed)
fig, ax = plt.subplots(figsize=(20,20))

#Scatter plot of the observations, drawn on the axes created above
plot_water_quality= edited_columns.plot(kind='scatter', x='Temperature_Lab80', y="Oxygen_Dissolved_Lab-54", ax=ax)

#Create the x limits for the graph
ax.set_xlim(0,25)

#Create the y limits for the graph
ax.set_ylim(0,18)

#Temperatures at which the fitted line is evaluated; covers the x limits set above.
#Defined here because this cell cannot see variables local to linear_model()
x_new=pd.DataFrame({'Temperature_Lab80': range(0,26)})

#Define the y_preds variable: model predictions at the temperatures in x_new
y_preds=linear_model('Temperature_Lab80','Oxygen_Dissolved_Lab-54').predict(x_new)

#Create a plot that displays the linear model
ax.plot(x_new['Temperature_Lab80'], y_preds, c='blue', linewidth=3)

#Set title for the linear regression model
ax.set_title('The relationship between Dissolved Oxygen and Temperature in the Fraser River Main Arm', fontsize=9)

#Save the linear model plot
fig.savefig('../results/Fraser_River_Water_Quality_Graph_01.pdf')

plt.show()


C:\Users\Jhanelle\Anaconda3\lib\site-packages\matplotlib\collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):
<matplotlib.figure.Figure at 0xbbedc88>

In [ ]:


In [ ]: