In [1]:
    
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf
    
In [3]:
    
# Read the Hanford data set from the notebook's working directory into a
# DataFrame; every later cell works off this `df`.
df = pd.read_csv(filepath_or_buffer='hanford.csv')
# Bare last expression so the notebook renders the frame as a table.
df
    
    Out[3]:
In [4]:
    
# Summary statistics (count, mean, std, min, quartiles, max) for `df`,
# shown as the cell's output.
df.describe()
    
    Out[4]:
In [7]:
    
# Interquartile range per column: 75th percentile minus 25th percentile.
# numeric_only=True is spelled out because pandas 2.0 changed the default to
# False, which makes quantile() raise when the frame has a non-numeric column
# (e.g. a name/label column). With it, the result matches the old default.
iqr = df.quantile(q=0.75, numeric_only=True) - df.quantile(q=0.25, numeric_only=True)
iqr
    
    Out[7]:
In [9]:
    
# Upper fence per column: third quartile plus 1.5 * IQR.
# NOTE(review): `ual` presumably stands for "upper adjacent limit" (the usual
# box-plot outlier cut-off) -- confirm the intended name.
# numeric_only=True keeps this working on pandas >= 2.0 when the frame also
# holds non-numeric columns; it matches the pre-2.0 default behaviour.
ual = df.quantile(q=0.75, numeric_only=True) + (iqr * 1.5)
ual
    
    Out[9]:
In [10]:
    
# Lower fence per column: first quartile minus 1.5 * IQR.
# NOTE(review): `lal` presumably stands for "lower adjacent limit" (the usual
# box-plot outlier cut-off) -- confirm the intended name.
# numeric_only=True keeps this working on pandas >= 2.0 when the frame also
# holds non-numeric columns; it matches the pre-2.0 default behaviour.
lal = df.quantile(q=0.25, numeric_only=True) - (iqr * 1.5)
lal
    
    Out[10]:
In [6]:
    
# Pairwise Pearson correlation between the numeric columns.
# Restricting to numeric dtypes first keeps this cell working on pandas >= 2.0,
# where corr() no longer silently drops non-numeric columns and raises instead.
df.select_dtypes(include='number').corr()
    
    Out[6]:
Yes, there very much appears to be a correlation worth investigating.
In [21]:
    
# Ordinary least squares fit of Mortality on Exposure (formula interface).
# `lm` is the fitted results object reused by the prediction and plot cells.
lm = smf.ols(
    formula="Mortality~Exposure",
    data=df,
).fit()
# Show the fitted coefficients as the cell output.
lm.params
    
    Out[21]:
In [22]:
    
# Pull the coefficients out by label rather than by position, so the values
# cannot be swapped if the order of `lm.params` ever differs. This also matches
# the label-based lookups used in the prediction cells.
intercept = lm.params['Intercept']
slope = lm.params['Exposure']
    
In [17]:
    
# Ask for an exposure value and print the mortality predicted by the fitted
# line (slope * exposure + intercept). An empty or whitespace-only answer
# skips the prediction.
# NOTE(review): input() blocks "Restart & Run All"; consider replacing the
# prompt with a constant if this notebook must run unattended.
exposure_input = input("Type in an exposure you'd like to know the mortality for:").strip()
if exposure_input:
    try:
        exposure_value = float(exposure_input)
    except ValueError:
        # Non-numeric text previously aborted the cell with a traceback.
        print("Please enter a number, got: {!r}".format(exposure_input))
    else:
        prediction = (float(lm.params['Exposure']) * exposure_value) + (float(lm.params['Intercept']))
        print(prediction)
    
    
In [24]:
    
# Scatter plot of the data with the fitted regression line on top.
# The style is applied BEFORE the figure is created: style settings only
# affect artists created afterwards, so calling it after plt.subplots() made
# the first run and a re-run of this cell look different.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(7, 7))
ax = df.plot(ax=ax, kind='scatter', x='Exposure', y='Mortality')
# Draw the line on the same Axes explicitly instead of via the pyplot state
# machine; `slope` and `intercept` come from the fitted model's parameters.
ax.plot(df['Exposure'], slope * df['Exposure'] + intercept, color="red", linewidth=2)
# Trailing semicolon suppresses the text repr of the last expression.
ax.set_title('Mortality vs. Exposure with fitted regression line');
    
    Out[24]:
    
In [25]:
    
# Pearson correlation coefficient between Exposure and Mortality.
# Computed directly on the two columns instead of building the full
# correlation matrix and chain-indexing into it; this also avoids corr()
# failing on non-numeric columns under pandas >= 2.0.
r = df['Exposure'].corr(df['Mortality'])
r
    
    Out[25]:
In [27]:
    
# Coefficient of determination: the square of the correlation coefficient `r`.
coefficient_determination = pow(r, 2)
coefficient_determination
    
    Out[27]:
In [30]:
    
# Mortality predicted by the fitted line at an exposure of 10:
# intercept + exposure * slope.
prediction = float(lm.params['Intercept']) + 10 * float(lm.params['Exposure'])
print(prediction)
    
    
In [ ]: