In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf
In [3]:
# Read the Hanford data from a CSV file located next to the notebook.
# Later cells rely on this frame having 'Exposure' and 'Mortality' columns.
df = pd.read_csv(filepath_or_buffer='hanford.csv')
df
Out[3]:
In [4]:
# Descriptive statistics for each column of the frame.
summary_stats = df.describe()
summary_stats
Out[4]:
In [7]:
# Inter-quartile range per column: third quartile minus first quartile.
third_quartile = df.quantile(q=0.75)
first_quartile = df.quantile(q=0.25)
iqr = third_quartile - first_quartile
iqr
Out[7]:
In [9]:
# Upper outlier fence per column: Q3 + 1.5 * IQR.
# Relies on `iqr` computed in an earlier cell.
third_quartile = df.quantile(q=0.75)
fence_width = iqr * 1.5
ual = third_quartile + fence_width
ual
Out[9]:
In [10]:
# Lower outlier fence per column: Q1 - 1.5 * IQR.
# Relies on `iqr` computed in an earlier cell.
first_quartile = df.quantile(q=0.25)
fence_width = iqr * 1.5
lal = first_quartile - fence_width
lal
Out[10]:
In [6]:
# Pairwise correlation matrix of the columns (pandas default method).
correlation_matrix = df.corr()
correlation_matrix
Out[6]:
Yes, there very much seems to be a correlation worth investigating.
In [21]:
# Ordinary least squares regression of Mortality on Exposure.
# The model is specified first and fitted in a separate step; the fitted
# coefficients are shown as the cell output.
ols_model = smf.ols(formula="Mortality~Exposure", data=df)
lm = ols_model.fit()
lm.params
Out[21]:
In [22]:
# Extract the fitted coefficients by label rather than by position, so the
# assignment cannot silently swap the two values if the order of `lm.params`
# ever differs. The same labels are used by the prediction cells below.
intercept = lm.params['Intercept']
slope = lm.params['Exposure']
In [17]:
# Interactive prediction: ask for an exposure value and print the mortality
# predicted by the fitted line (slope * exposure + intercept).
# An empty answer skips the prediction; a non-numeric answer is reported
# instead of raising, so the cell never leaves a traceback or a stale
# `prediction` behind.
exposure_input = input("Type in an exposre you'd like to know the mortality for:")
if exposure_input:
    try:
        exposure_value = float(exposure_input)
    except ValueError:
        print("Could not interpret {!r} as a number.".format(exposure_input))
    else:
        prediction = (float(lm.params['Exposure']) * exposure_value) + (float(lm.params['Intercept']))
        print(prediction)
In [24]:
# Scatter plot of Mortality against Exposure with the fitted regression line.
# The style must be selected BEFORE the figure is created: style settings are
# read when the figure and axes are constructed, so selecting it afterwards
# only takes effect on a later re-run of the cell.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(7,7))
ax = df.plot(ax = ax, kind= 'scatter', x = 'Exposure', y = 'Mortality')
ax.set_title('Mortality vs. Exposure with fitted regression line')
# Draw the line on the same axes explicitly instead of relying on pyplot's
# "current axes" state. `slope` and `intercept` come from the fitted model cell.
ax.plot(df['Exposure'], slope*df['Exposure']+intercept, color="red", linewidth=2)
Out[24]:
In [25]:
# Correlation coefficient between Exposure and Mortality, read from the
# correlation matrix with a single label-based lookup.
correlation_matrix = df.corr()
r = correlation_matrix.loc['Mortality', 'Exposure']
r
Out[25]:
In [27]:
# Coefficient of determination: the square of the correlation coefficient `r`
# computed in the previous cell.
coefficient_determination = pow(r, 2)
coefficient_determination
Out[27]:
In [30]:
# Mortality predicted by the fitted line for an exposure of 10:
# slope * exposure + intercept, using the coefficients stored on `lm`.
exposure_value = 10
slope_term = float(lm.params['Exposure']) * exposure_value
prediction = slope_term + float(lm.params['Intercept'])
print(prediction)
In [ ]: