In [4]:
    
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
    
In [5]:
    
# Load the Hanford CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/hanford.csv")
    
In [6]:
    
# Show the loaded frame as a visual sanity check.
# NOTE(review): this renders every row; prefer df.head() if the file is large.
df
    
    Out[6]:
In [4]:
    
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
    
    Out[4]:
In [5]:
    
# One histogram per numeric column, to eyeball each distribution.
df.hist()
    
    Out[5]:
    
In [7]:
    
# Pairwise correlation between columns (pandas defaults to Pearson).
df.corr()
    
    Out[7]:
In [8]:
    
# Exposure on the x-axis against Mortality on the y-axis, one point per row.
df.plot.scatter(x='Exposure', y='Mortality')
    
    Out[8]:
    
In [19]:
    
# Ordinary least-squares estimator from scikit-learn, constructed with default options.
lm = LinearRegression()
    
In [20]:
    
# Pull the two modelling columns into a plain ndarray:
# column 0 is Mortality (the target), column 1 is Exposure (the only feature).
data = np.asarray(df.loc[:, ['Mortality', 'Exposure']])
# y is the 1-D target; slicing with 1: (rather than 1) keeps x two-dimensional,
# i.e. one row per observation and a single feature column.
y, x = data[:, 0], data[:, 1:]
    
In [21]:
    
# Fit on the arrays sliced from df: x is the Exposure column, y is Mortality.
lm.fit(x,y)
    
    Out[21]:
In [22]:
    
# R^2 of the fit, scored on the very data it was trained on (no hold-out set).
lm.score(x,y)
    
    Out[22]:
In [25]:
    
# Slope of the fitted line: x has a single feature column, so coef_ has one entry.
m = lm.coef_[0]
m
    
    Out[25]:
In [26]:
    
# Intercept of the fitted line; the bare name echoes it as the cell output.
b = lm.intercept_
b
    
    Out[26]:
In [21]:
    
# Raw observations as a scatter, with the fitted line m*Exposure + b drawn
# on the very same axes that pandas returned.
ax = df.plot.scatter(x='Exposure', y='Mortality')
ax.plot(df['Exposure'], m * df['Exposure'] + b, '-')
    
    Out[21]:
    
In [24]:
    
# Predicted Mortality at Exposure = 10. predict() takes a 2-D array-like of
# shape (n_samples, n_features), so the single value is wrapped as [[10]];
# a bare scalar is rejected by scikit-learn's input validation.
lm.predict([[10]])
    
    Out[24]:
In [1]:
    
import statsmodels.formula.api as smf
    
In [27]:
    
# Refit Mortality on Exposure with the statsmodels formula interface.
# NOTE: this rebinds `lm`, which until here held the scikit-learn estimator;
# every later use of `lm` refers to the statsmodels results object.
lm = smf.ols(formula='Mortality~Exposure',data=df).fit()
    
In [28]:
    
# Fitted coefficients of the formula fit.
lm.params
    
    Out[28]:
In [29]:
    
# Look the coefficients up by label instead of unpacking positionally, so a
# changed formula fails loudly (KeyError) rather than silently mislabelling
# the terms. 'Intercept' and 'Exposure' are the labels the formula interface
# assigns for 'Mortality~Exposure'.
intercept = lm.params['Intercept']
slope = lm.params['Exposure']
    
In [33]:
    
# Raw observations as a scatter, with the statsmodels line
# slope*Exposure + intercept drawn on the axes that pandas returned.
ax = df.plot.scatter(x='Exposure', y='Mortality')
ax.plot(df['Exposure'], slope * df['Exposure'] + intercept, '-')
    
    Out[33]:
    
In [34]:
    
# Same figure in XKCD sketch style. plt.xkcd() works by rewriting global
# rcParams; used as a context manager, the previous settings are restored on
# exit, so other plotting cells keep the default look even when they are
# re-run after this one.
with plt.xkcd():
    df.plot(kind='scatter',x='Exposure',y='Mortality')
    plt.plot(df['Exposure'],slope*df['Exposure']+intercept,'-')
    
    Out[34]:
    
In [30]:
    
# Full regression report for the formula fit.
lm.summary()
    
    Out[30]:
In [31]:
    
# Mean squared error of the model: per the statsmodels docs, the explained
# sum of squares divided by the model degrees of freedom.
lm.mse_model
    
    Out[31]:
In [32]:
    
# p-values for the t-statistics of the fitted coefficients.
lm.pvalues
    
    Out[32]:
In [ ]: