In [8]:
import pandas as pd
import matplotlib.pyplot as plt
# Display matplotlib figures inline in the notebook instead of in a pop-up window
%matplotlib inline
import statsmodels.formula.api as smf # package we'll be using for linear regression
In [9]:
# Load the Hanford CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/hanford.csv')
In [10]:
# Peek at the first five rows to check that the file parsed as expected.
df.head(n=5)
Out[10]:
In [11]:
# Pairwise Pearson correlation between the columns of df.
# NOTE(review): pandas >= 2.0 no longer drops non-numeric columns by default here;
# if df contains any, pass numeric_only=True — confirm against the data.
df.corr(method="pearson")
Out[11]:
In [12]:
# Ordinary least squares fit. The formula regresses Y on X (Y~X):
# Mortality is the response, Exposure the single predictor.
lm = (
    smf.ols(formula="Mortality~Exposure", data=df)
    .fit()
)
In [13]:
# Show the fitted coefficients: the intercept and the slope for Exposure.
lm.params
Out[13]:
In [14]:
lm.summary() # R-squared is 0.858, which should be investigated!
Out[14]:
In [15]:
# Pull each coefficient out of the fitted parameters by its label:
# "Intercept" is the constant term, "Exposure" is the slope on the predictor.
intercept = lm.params["Intercept"]
slope = lm.params["Exposure"]
In [16]:
# Scatter plot of mortality against exposure, one semi-transparent point per row of df.
ax = df.plot(kind='scatter', x='Exposure', y='Mortality', alpha=0.5)
ax.set_title('Cancer Mortality Rates Based on Exposure')  # title typo fixed (was "Camcer")
ax.set_xlabel('Index of Exposure')
ax.set_ylabel('Cancer Mortality per 100,000 man-years');  # trailing ';' hides the Text repr
Out[16]:
In [17]:
# Overlay the fitted regression line on the observed data.
# Draw on the Axes returned by df.plot rather than relying on pyplot's implicit
# "current axes", so the line is guaranteed to land on this scatter plot.
ax = df.plot(kind="scatter", x="Exposure", y="Mortality")
# Best-fit line computed from the fitted model's slope and intercept.
ax.plot(df["Exposure"], slope * df["Exposure"] + intercept, "-", color="red", label="Best-fit line")
ax.set_title("Cancer Mortality vs. Exposure with Best-Fit Line")
ax.legend();  # trailing ';' hides the Legend repr
Out[17]:
In [18]:
# Read R-squared from the fitted model instead of hand-copying a rounded value
# from the summary output, so it stays correct if the data or model changes.
R_squared = lm.rsquared
In [19]:
index_ex = 10  # exposure index at which to evaluate the fitted line
# A single (x, y) pair has no segment to draw, so a "-" line style renders nothing;
# use a point marker to actually show the predicted value.
plt.plot(index_ex, slope * index_ex + intercept, "o", color="red")
Out[19]:
In [20]:
# y = mx + b
# Take b from the fitted model: a hand-typed, rounded literal loses precision and
# silently goes stale if the data or the model changes.
intercept = lm.params["Intercept"]
In [21]:
# Predicted mortality at an exposure index of 10, using the intercept variable
# rather than a retyped copy of its value.
slope * 10 + intercept
Out[21]:
In [22]:
def predicting_mortality_rate(exposure, coef=None, const=None):
    """Predict the mortality rate for a given exposure using a straight line.

    Evaluates ``coef * exposure + const``.

    Parameters
    ----------
    exposure : number or array-like
        Exposure value(s) at which to evaluate the line.
    coef : number, optional
        Slope of the line. Defaults to the notebook-level ``slope``.
    const : number, optional
        Intercept of the line. Defaults to the notebook-level ``intercept``.

    Returns
    -------
    The predicted value(s), with the same shape as ``exposure``.
    """
    # Fall back to the coefficients fitted earlier in the notebook only when
    # the caller does not supply their own.
    if coef is None:
        coef = slope
    if const is None:
        const = intercept
    return coef * exposure + const
In [23]:
# Predicted mortality rate at an exposure index of 10.
predicting_mortality_rate(exposure=10)
Out[23]:
In [ ]: