In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
In [3]:
# Load the Hanford data set from the repository's data directory.
df = pd.read_csv(filepath_or_buffer='../data/hanford.csv')
In [4]:
# Peek at the first five rows to check that the file loaded as expected.
df.head(n=5)
Out[4]:
In [5]:
# Mean of every numeric column. numeric_only=True keeps the cell working if
# the CSV contains any non-numeric column: pandas >= 2.0 raises a TypeError
# for those instead of silently dropping them as older versions did.
df.mean(numeric_only=True)
Out[5]:
In [6]:
# Median of every numeric column. numeric_only=True keeps the cell working if
# the CSV contains any non-numeric column: pandas >= 2.0 raises a TypeError
# for those instead of silently dropping them as older versions did.
df.median(numeric_only=True)
Out[6]:
In [7]:
# Most frequent value(s) of each column; the defaults are spelled out so the
# reader can see that missing values are ignored and all dtypes are included.
df.mode(axis=0, numeric_only=False, dropna=True)
Out[7]:
In [9]:
# Range of Exposure (largest minus smallest value). The Series methods are
# vectorized and skip NaN, whereas the builtin max()/min() iterate element by
# element and give order-dependent results when a NaN is present.
df['Exposure'].max() - df['Exposure'].min()
Out[9]:
In [10]:
# Range of Mortality (largest minus smallest value). The Series methods are
# vectorized and skip NaN, whereas the builtin max()/min() iterate element by
# element and give order-dependent results when a NaN is present.
df['Mortality'].max() - df['Mortality'].min()
Out[10]:
In [11]:
# Interquartile range of Exposure: request both quartiles in one call, then
# take the difference between the 75th and the 25th percentile.
df['Exposure'].quantile([0.25, 0.75]).diff().iloc[-1]
Out[11]:
In [12]:
# Interquartile range of Mortality: request both quartiles in one call, then
# take the difference between the 75th and the 25th percentile.
df['Mortality'].quantile([0.25, 0.75]).diff().iloc[-1]
Out[12]:
In [13]:
# Sample standard deviation of every numeric column. numeric_only=True keeps
# the cell working if the CSV contains any non-numeric column: pandas >= 2.0
# raises a TypeError for those instead of silently dropping them.
df.std(numeric_only=True)
Out[13]:
In [14]:
# Pairwise Pearson correlation between the numeric columns. numeric_only=True
# keeps the cell working if the CSV contains any non-numeric column: pandas
# >= 2.0 raises on those instead of silently dropping them.
df.corr(numeric_only=True)
Out[14]:
In [15]:
# Scatter plot of Mortality against Exposure to eyeball their relationship.
df.plot.scatter(x='Exposure', y='Mortality')
Out[15]:
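Yes — the scatter plot above suggests a roughly linear relationship between Exposure and Mortality, so a linear model is fitted next.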
In [16]:
# Fit an ordinary least squares model with Mortality as the response and
# Exposure as the single predictor.
lm = smf.ols(
    formula='Mortality~Exposure',
    data=df,
).fit()
In [18]:
# Read the fitted coefficients by label instead of unpacking lm.params by
# position, so the intercept and slope cannot be swapped silently if the
# parameter order ever differs.
b = lm.params['Intercept']  # intercept of the fitted line
m = lm.params['Exposure']   # slope: change in Mortality per unit of Exposure
In [21]:
def predicted_mortality_rate(exposure, slope=None, intercept=None):
    """Predict the mortality rate for a given exposure using a straight line.

    Parameters
    ----------
    exposure : number or array-like
        Exposure value(s) at which to evaluate the line.
    slope : number, optional
        Slope of the line. When omitted, the notebook-level ``m`` is used.
    intercept : number, optional
        Intercept of the line. When omitted, the notebook-level ``b`` is used.

    Returns
    -------
    ``slope * exposure + intercept``, with the same shape as ``exposure``.
    """
    # Resolve the fallbacks at call time so the function always picks up the
    # most recently fitted coefficients when none are passed explicitly.
    if slope is None:
        slope = m
    if intercept is None:
        intercept = b
    return slope * exposure + intercept
In [20]:
# Scatter the raw data, then overlay the fitted regression line on the very
# same Axes. Drawing through the returned Axes object avoids depending on
# pyplot's implicit "current axes" state.
ax = df.plot(kind='scatter', x='Exposure', y='Mortality')
ax.plot(df['Exposure'], m * df['Exposure'] + b, '-', color='red')
Out[20]:
In [22]:
# Predicted mortality rate at an exposure index of 10.
predicted_mortality_rate(exposure=10)
Out[22]:
In [ ]: