In [36]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
import statsmodels.formula.api as smf # package we'll be using for linear regression
import numpy as np
import scipy as sp
In [4]:
# Read the Hanford CSV into a DataFrame; the bare `df` on the last line
# renders the loaded frame as this cell's output.
df = pd.read_csv(filepath_or_buffer="data/hanford.csv")
df
Out[4]:
In [5]:
# Summary statistics for the columns of df (pandas' default describe() output).
df.describe()
Out[5]:
In [19]:
# Scatter plot of Mortality (vertical axis) against Exposure (horizontal axis).
df.plot.scatter(x='Exposure', y='Mortality')
Out[19]:
In [11]:
# Pearson correlation between Exposure and Mortality, computed directly on the
# two columns. This avoids building the full correlation matrix just to read
# one cell, and it does not break if df also holds non-numeric columns
# (DataFrame.corr() raises on those in pandas 2.x).
r = df['Exposure'].corr(df['Mortality'])
r
Out[11]:
Yes, there seems to be a correlation worthy of investigation.
In [16]:
# Fit an ordinary least squares regression of Mortality on Exposure.
lm = smf.ols(formula="Mortality~Exposure", data=df).fit()
# Look the coefficients up by label instead of unpacking lm.params by position,
# so the assignment cannot silently swap if the parameter order ever differs.
intercept = lm.params["Intercept"]
slope = lm.params["Exposure"]
In [18]:
# Display the fitted coefficients of the regression model.
lm.params
Out[18]:
In [ ]:
In [22]:
# Method 01 (the approach learned in class): scatter plot of the data with the
# fitted regression line drawn over it in red.
ax = df.plot.scatter(x='Exposure', y='Mortality')
ax.plot(df["Exposure"], intercept + slope * df["Exposure"], "-", color="red")
Out[22]:
In [42]:
# Method 02 (another version) -- more involved than the approach from class
def plot_correlation(ds, x, y, ylim=(100, 240), xlim=(0, 14)):
    """Draw a labelled scatter plot of ``ds[y]`` against ``ds[x]`` with a trend line.

    Everything is drawn on the current matplotlib axes through ``plt``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Data to plot; each point is annotated with its row index label.
    x, y : str
        Names of the columns used for the horizontal and vertical axes.
    ylim : tuple of (low, high), default (100, 240)
        Limits of the vertical axis.
    xlim : tuple of (low, high), default (0, 14)
        Limits of the horizontal axis; the trend line spans exactly this range.
    """
    plt.xlim(xlim[0], xlim[1])
    plt.ylim(ylim[0], ylim[1])
    plt.scatter(ds[x], ds[y], alpha=0.6, s=50)

    # Annotate every point with its index label.
    for label, x_val, y_val in zip(ds.index, ds[x], ds[y]):
        plt.text(x_val, y_val, label)
    plt.xlabel(x)
    plt.ylabel(y)

    # Degree-1 least-squares fit; two endpoints are enough to draw a straight line.
    trend = np.poly1d(np.polyfit(ds[x], ds[y], 1))
    trend_x = np.linspace(xlim[0], xlim[1], 2)
    plt.plot(trend_x, trend(trend_x), color='r')

    # Pearson r via numpy, so the function does not depend on the scipy.stats
    # submodule happening to be loaded behind the bare `scipy` import.
    pearson_r = np.corrcoef(ds[x], ds[y])[0, 1]

    # Print r at the right-hand end of the trend line.
    plt.text(trend_x[-1], trend(trend_x[-1]), 'r={:.3f}'.format(pearson_r), color='r')
    plt.tight_layout()
In [41]:
# Draw the annotated Exposure-vs-Mortality plot with the default axis limits.
plot_correlation(ds=df, x='Exposure', y='Mortality')
In [12]:
# Square the correlation coefficient r and show the result.
r_squared = pow(r, 2)
r_squared
Out[12]:
In [25]:
def predicting_mortality_rate(exposure):
    """Return the mortality predicted by the fitted line for ``exposure``.

    ``exposure`` is converted with ``float()``; the result is computed from the
    notebook-level ``intercept`` and ``slope`` values.
    """
    exposure_value = float(exposure)
    return intercept + exposure_value * slope
In [26]:
# Predicted mortality for an exposure value of 10.
predicting_mortality_rate(exposure=10)
Out[26]: