In [2]:
    
import pandas as pd
%matplotlib inline
    
In [3]:
    
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
    
In [4]:
    
# Load the Hanford data set from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="hanford.csv")
    
In [5]:
    
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)
    
    Out[5]:
In [6]:
    
# Summary statistics of the loaded data, shown as the cell output.
df.describe()
    
    Out[6]:
In [7]:
    
# Pairwise correlation matrix of the numeric columns only. Non-numeric columns
# are excluded explicitly: pandas >= 2.0 no longer drops them silently in
# corr() and raises instead, while older versions dropped them by default.
df.select_dtypes("number").corr()
    
    Out[7]:
There appears to be a strong positive correlation between the two variables: the correlation coefficient is 0.92.
In [8]:
    
# Scatter plot of Mortality against Exposure to eyeball the relationship.
df.plot.scatter(x="Exposure", y="Mortality")
    
    Out[8]:
    
In [9]:
    
# Ordinary least squares fit of Mortality as a linear function of Exposure.
lm = smf.ols(
    formula="Mortality~Exposure",
    data=df,
).fit()
    
In [17]:
    
# Fitted coefficients of the regression (unpacked as intercept and slope in the next cell).
lm.params
    
    Out[17]:
In [18]:
    
# Read the fitted coefficients by label rather than by position, so the
# assignment cannot silently swap the two values if the order of `lm.params`
# ever differs. The formula interface labels the constant term "Intercept"
# and the regressor by its column name.
intercept = lm.params["Intercept"]
slope = lm.params["Exposure"]
    
In [19]:
    
def mortality_rate(exposure):
    """Predict mortality for a given exposure using the fitted regression line.

    Evaluates ``exposure * slope + intercept`` with the module-level ``slope``
    and ``intercept`` taken from the fitted model.

    Parameters
    ----------
    exposure
        Exposure value to predict for. Any object supporting ``*`` and ``+``
        with the coefficients works, so a scalar yields a scalar.

    Returns
    -------
    The predicted mortality, ``exposure * slope + intercept``.
    """
    # No loop over the data set is needed: the prediction depends only on the
    # argument and the two fitted coefficients. The former loop recomputed the
    # same value once per row and failed with UnboundLocalError on an empty frame.
    return exposure * slope + intercept
    
In [20]:
    
# Predicted mortality for an exposure index of 3.
mortality_rate(exposure=3)
    
    Out[20]:
In [13]:
    
# Scatter of the observations with the fitted regression line on top.
# The line is drawn on the axes returned by the scatter call instead of going
# through pyplot's implicit "current axes" state.
ax = df.plot.scatter(x="Exposure", y="Mortality")
ax.plot(
    df["Exposure"],
    slope * df["Exposure"] + intercept,
    "-",
    color="green",
)
    
    Out[13]:
    
In [14]:
    
# Coefficient of determination (r squared): the element-wise square of the
# correlation matrix. The matrix is computed once instead of twice, and only
# numeric columns are used so corr() also works on pandas >= 2.0, which
# raises on non-numeric columns instead of dropping them.
det_corr = df.select_dtypes("number").corr() ** 2
    
In [15]:
    
# Display the element-wise squared correlation matrix computed in the previous cell.
det_corr
    
    Out[15]:
In [16]:
    
# Predicted mortality for an exposure index of 10.
mortality_rate(exposure=10)
    
    Out[16]: