In [2]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#stats library
import statsmodels.api as sm
import scipy
#The function below is used to show the plots within the notebook
%matplotlib inline
In [26]:
# Display the version string of the imported pandas library
pd.__version__
Out[26]:
In [27]:
# Display the version string of the imported numpy library
np.__version__
Out[27]:
In [28]:
# Read the Fraser River water-quality CSV (comma-separated) into a DataFrame
water_quality_data = pd.read_csv('../data/Fraser_River_Water_Data.csv', sep=',')
In [32]:
# Preview the first rows of the loaded water_quality_data frame
water_quality_data.head()
Out[32]:
In [30]:
# Preview the last rows of the loaded water_quality_data frame
water_quality_data.tail()
Out[30]:
In [31]:
# Show the column names and their dtypes to help understand the dataset.
# Note: .head() limits this listing to the first few columns only.
water_quality_data.dtypes.head()
Out[31]:
In [28]:
# Preview the data frame of variables extracted from water_quality_data.
# NOTE(review): `fraser_water_quality` is not assigned in any cell visible in this
# notebook, so on a fresh kernel (Restart & Run All) this line would raise NameError
# unless it is defined elsewhere. Confirm whether `fraser_water_quality_testing`
# (created in a later cell) was the intended frame.
fraser_water_quality.head()
Out[28]:
Statistical Analysis of the Dataset - Linear Modelling
In [98]:
#Import the patsy library used for the model formula
from patsy import dmatrices
from patsy.builtins import *
In [7]:
# Keep only the two columns analysed in this assignment:
# water temperature and dissolved oxygen
analysis_columns = [
    'Temperature Water [Lab: 80] [VMV: 1125]',
    'Oxygen Dissolved [Lab: -54] [VMV: 1124]',
]
fraser_water_quality_testing = water_quality_data[analysis_columns]
In [33]:
# Preview the first rows of the two-column frame selected for analysis
fraser_water_quality_testing.head()
Out[33]:
In [11]:
# Drop every row that is missing a value in any of the selected columns
water_quality_DO = fraser_water_quality_testing.dropna(axis=0, how='any')
In [34]:
# Display the rows that remain after the rows containing NaN were dropped
water_quality_DO
Out[34]:
In [14]:
# Replace the verbose source headers with shorter column names
short_column_names = {
    'Temperature Water [Lab: 80] [VMV: 1125]': 'Temperature_Lab80',
    'Oxygen Dissolved [Lab: -54] [VMV: 1124]': 'Oxygen_Dissolved_Lab-54',
}
water_quality_DO2 = water_quality_DO.rename(columns=short_column_names)
In [35]:
# Display the data frame with the renamed columns
water_quality_DO2
Out[35]:
In [18]:
# Ordinary least squares: dissolved oxygen regressed on water temperature.
# Q(...) quotes column names inside the formula, which is needed here because
# 'Oxygen_Dissolved_Lab-54' contains a hyphen.
oxygen_vs_temperature = sm.formula.ols(
    formula="Q('Oxygen_Dissolved_Lab-54') ~ Q('Temperature_Lab80')",
    data=water_quality_DO2,
)
lm = oxygen_vs_temperature.fit()
In [19]:
# Fitted coefficients (beta values): the intercept and the gradient,
# where the gradient is the coefficient on water temperature
lm.params
Out[19]:
In [20]:
# predict() is given a DataFrame, so build one whose column name matches the
# predictor used in the model formula; values are the integers 1 to 699
temperature_values = range(1, 700)
x_new = pd.DataFrame({'Temperature_Lab80': temperature_values})
In [21]:
# Preview the first rows of the prediction inputs
x_new.head()
Out[21]:
In [22]:
# Use the fitted linear model to predict a value for each temperature in x_new
y_preds=lm.predict(x_new)
# Show the predictions at positions 1 through 9 (the slice skips position 0)
y_preds[1:10]
Out[22]:
In [36]:
# Scatter the observations, then overlay the fitted regression line on the same axes
plot_water_quality = water_quality_DO2.plot(
    kind='scatter',
    x='Temperature_Lab80',
    y="Oxygen_Dissolved_Lab-54",
)
# Restrict the view to the plotted temperature / dissolved-oxygen window
plot_water_quality.set_xlim(0, 25)
plot_water_quality.set_ylim(0, 18)
plot_water_quality.set_title('Graph showing the relationship between Dissolved Oxygen and Temperature in the Fraser River Main Arm')
plot_water_quality.plot(x_new, y_preds, c='blue', linewidth=3)
Out[36]:
In [25]:
# Build the regression report for the fitted model and print it as text
regression_summary = lm.summary()
print(regression_summary)
In [37]:
# DataFrame.plot returns a matplotlib Axes, which has no savefig() method,
# so save through the Figure that owns those axes instead.
plot_water_quality.get_figure().savefig('plot1.png')
In [ ]:
# TODO: I need help finding other statistical techniques to apply to this data
# TODO: I also need help saving the plot
# TODO: Do you guys think I have enough visuals for my data?
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: