In [2]:
import pandas as pd
%matplotlib inline
In [3]:
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
In [4]:
# Load the Hanford data set from the CSV file next to this notebook.
df = pd.read_csv(filepath_or_buffer="hanford.csv")
In [5]:
# Preview the first five rows (5 is also the default for head()).
df.head(n=5)
Out[5]:
In [6]:
# Show summary statistics for the loaded data.
df.describe()
Out[6]:
In [7]:
# Pairwise Pearson correlation between the two variables analysed below.
# The columns are selected explicitly so the cell keeps working on
# pandas >= 2.0, where corr() no longer silently drops non-numeric columns
# and raises instead.
df[["Exposure", "Mortality"]].corr()
Out[7]:
There is a strong positive correlation between the two variables: the correlation coefficient is approximately 0.92.
In [8]:
# Scatter plot of Mortality against Exposure.
df.plot(
    x='Exposure',
    y='Mortality',
    kind='scatter',
)
Out[8]:
In [9]:
# Fit an ordinary least squares regression of Mortality on Exposure.
lm = smf.ols(
    "Mortality~Exposure",
    df,
).fit()
In [17]:
# Display the fitted parameters of the regression model.
lm.params
Out[17]:
In [18]:
# Read the coefficients by label instead of unpacking by position, so the
# two names cannot be silently swapped if the parameter order ever changes.
# The formula interface names the constant term "Intercept" and the
# regressor after its column, "Exposure".
intercept = lm.params["Intercept"]
slope = lm.params["Exposure"]
In [19]:
def mortality_rate(exposure, coefficients=None):
    """Predict the mortality rate for a given exposure from the fitted line.

    Parameters
    ----------
    exposure : number or array-like
        Exposure value(s) to evaluate. Anything that supports ``*`` and
        ``+`` with the coefficients works (a scalar or a pandas Series).
    coefficients : tuple of (intercept, slope), optional
        Line coefficients to use. When omitted, the notebook-level
        ``intercept`` and ``slope`` variables are used.

    Returns
    -------
    The value of ``exposure * slope + intercept``.
    """
    if coefficients is None:
        # Fall back to the coefficients fitted earlier in the notebook.
        coefficients = (intercept, slope)
    line_intercept, line_slope = coefficients
    # The prediction depends only on `exposure`, so no loop over the
    # DataFrame is needed: the old loop recomputed the same value for every
    # row and left the result undefined when the frame was empty.
    return exposure * line_slope + line_intercept
In [20]:
# Predicted mortality rate for an exposure of 3.
mortality_rate(exposure=3)
Out[20]:
In [13]:
# Scatter of the raw data with the fitted regression line on the same axes.
ax = df.plot(kind='scatter', x='Exposure', y='Mortality')
# Draw on the returned Axes explicitly instead of relying on pyplot's
# implicit "current axes", so the line always lands on this scatter plot.
ax.plot(df["Exposure"], slope * df["Exposure"] + intercept, "-", color="green")
Out[13]:
In [14]:
# Squared pairwise correlations (r^2) between Exposure and Mortality.
# The correlation matrix is computed once and squared element-wise. The
# columns are selected explicitly so a non-numeric column, if present,
# cannot make corr() fail on newer pandas versions.
det_corr = df[["Exposure", "Mortality"]].corr() ** 2
In [15]:
# Display the element-wise squared correlation matrix computed above.
det_corr
Out[15]:
In [16]:
# Predicted mortality rate for an exposure of 10.
mortality_rate(exposure=10)
Out[16]: