In [36]:
    
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
import statsmodels.formula.api as smf # package we'll be using for linear regression
import numpy as np
import scipy as sp
    
In [4]:
    
# Read the Hanford CSV (path is relative to the notebook's working directory).
df = pd.read_csv("data/hanford.csv")
# Bare expression: the frame itself is rendered as this cell's output.
df
    
    Out[4]:
In [5]:
    
# Summary statistics of the loaded frame, shown as the cell output.
df.describe()
    
    Out[5]:
In [19]:
    
# Scatter plot of Mortality (vertical axis) against Exposure (horizontal axis).
df.plot.scatter(x='Exposure', y='Mortality')
    
    Out[19]:
    
In [11]:
    
# Correlation coefficient between Exposure and Mortality.
# Computed column-to-column rather than via df.corr(): the full-frame call
# builds an entire correlation matrix just to read one entry, and on
# pandas >= 2.0 it raises if df holds any non-numeric column.
r = df['Exposure'].corr(df['Mortality'])
r
    
    Out[11]:
Yes, there seems to be a correlation worthy of investigation.
In [16]:
    
# Fit an ordinary-least-squares model with Mortality as the response and
# Exposure as the single predictor.
lm = smf.ols(formula="Mortality~Exposure",data=df).fit()
# Positional unpacking: assumes lm.params lists the intercept first and the
# Exposure coefficient second — confirm against the lm.params output below.
intercept, slope = lm.params
    
In [18]:
    
# Display the fitted coefficients that were unpacked into intercept / slope.
lm.params
    
    Out[18]:
In [ ]:
    
    
In [22]:
    
# Method 01 (What we've learned from the class)
# Draw the scatter first, then overlay the fitted regression line on the
# very same axes that pandas returned.
scatter_ax = df.plot.scatter(x='Exposure', y='Mortality')
scatter_ax.plot(df["Exposure"], slope * df["Exposure"] + intercept, "-", color="red")
    
    Out[22]:
    
In [42]:
    
# Method 02 (another version): much harder than what we have learned in class
def plot_correlation( ds, x, y, ylim=(100,240), xlim=(0,14) ):
    """Scatter-plot column ``y`` against column ``x`` with a fitted trend line.

    Draws on the current pyplot axes: every point is labelled with its index
    value, a degree-1 polynomial fit is overlaid in red, and the Pearson
    correlation coefficient is written at the right-hand end of that line.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame holding the two columns to compare.
    x, y : str
        Names of the columns used for the horizontal and vertical axes.
    ylim : tuple of (low, high), optional
        Limits of the vertical axis.
    xlim : tuple of (low, high), optional
        Limits of the horizontal axis; the trend line spans exactly this
        range. Defaults to the previously hard-coded ``(0, 14)``.

    Returns
    -------
    None
        The function is called for its plotting side effects only.
    """
    # Import the submodule explicitly: `import scipy as sp` on its own does
    # not guarantee that `sp.stats` has been loaded on older SciPy releases.
    from scipy import stats

    x_low, x_high = xlim
    plt.xlim(x_low, x_high)
    plt.ylim(ylim[0], ylim[1])
    plt.scatter(ds[x], ds[y], alpha=0.6, s=50)

    # Label each point with its index value; zipping the columns avoids
    # building a full row Series per point as iterrows() does.
    for label, x_value, y_value in zip(ds.index, ds[x], ds[y]):
        plt.text(x_value, y_value, label)
    plt.xlabel(x)
    plt.ylabel(y)

    # Trend line: straight-line fit evaluated at four evenly spaced points
    # across the visible x-range.
    trend_variable = np.poly1d(np.polyfit(ds[x], ds[y], 1))
    trendx = np.linspace(x_low, x_high, 4)
    plt.plot(trendx, trend_variable(trendx), color='r')

    # Correlation: pearsonr returns (coefficient, p-value); only the
    # coefficient is shown, anchored at the last point of the trend line.
    r_value = stats.pearsonr(ds[x], ds[y])[0]
    plt.text(trendx[-1], trend_variable(trendx[-1]), 'r={:.3f}'.format(r_value), color = 'r' )
    plt.tight_layout()
    
In [41]:
    
# Render the labelled scatter + trend line for the Hanford columns.
plot_correlation(ds=df, x='Exposure', y='Mortality')
    
    
In [12]:
    
# Square the correlation coefficient r computed in an earlier cell; that cell
# must have been run first, since r is not defined here.
r_squared = r **2
# Bare expression so the value is shown as the cell output.
r_squared
    
    Out[12]:
In [25]:
    
def predicting_mortality_rate(exposure):
    """Return the mortality value predicted by the fitted line for ``exposure``.

    The input is converted with ``float`` before use. The function reads the
    notebook-level ``intercept`` and ``slope`` variables, so the cell that
    fits the regression must have been executed beforehand.
    """
    exposure_value = float(exposure)
    return intercept + exposure_value * slope
    
In [26]:
    
# Predicted mortality for an exposure index of 10.
predicting_mortality_rate(exposure=10)
    
    Out[26]: