In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
import statsmodels.formula.api as smf # package we'll be using for linear regression
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
plt.style.use('ggplot')
import dateutil.parser
import math
import random
import matplotlib.ticker as plticker
# Font type 42 makes matplotlib embed TrueType fonts in PostScript output
# (the default, Type 3, converts glyphs to outlines), so saved figures keep
# editable text when opened in a vector editor.
matplotlib.rcParams['ps.fonttype'] = 42
In [2]:
# Read the Hanford dataset from disk and preview its first rows.
HANFORD_CSV = "data/hanford.csv"
df = pd.read_csv(HANFORD_CSV)
df.head()
Out[2]:
In [16]:
# Show pandas' default summary statistics for the loaded frame.
df.describe()
Out[16]:
In [17]:
# Column medians. numeric_only=True is passed explicitly: pandas >= 2.0 no
# longer drops non-numeric columns silently and raises a TypeError instead
# if the frame contains any (e.g. a text label column).
df.median(numeric_only=True)
Out[17]:
In [19]:
# Range of the Mortality column: largest value minus smallest value.
mortality_max = df['Mortality'].max()
mortality_min = df['Mortality'].min()
rang = mortality_max - mortality_min
rang
Out[19]:
In [20]:
# Interquartile range of Mortality: third quartile minus first quartile.
mortality_q3 = df['Mortality'].quantile(q=0.75)
mortality_q1 = df['Mortality'].quantile(q=0.25)
iqr_m = mortality_q3 - mortality_q1
iqr_m
Out[20]:
In [27]:
# Interquartile range of Exposure: third quartile minus first quartile.
exposure_q3 = df['Exposure'].quantile(q=0.75)
exposure_q1 = df['Exposure'].quantile(q=0.25)
iqr_e = exposure_q3 - exposure_q1
iqr_e
Out[27]:
In [22]:
# Upper outlier fence for Mortality: third quartile plus 1.5 times the IQR.
UAL_m = df['Mortality'].quantile(q=0.75) + 1.5 * iqr_m
UAL_m
Out[22]:
In [28]:
# Upper outlier fence for Exposure: third quartile plus 1.5 times the
# Exposure IQR. (The previous version used the Mortality IQR, iqr_m, here,
# which put the fence on the wrong scale.)
UAL_e = (iqr_e * 1.5) + df['Exposure'].quantile(q=0.75)
UAL_e
Out[28]:
In [29]:
# Lower outlier fence for Mortality: first quartile minus 1.5 times the
# Mortality IQR. (The previous version subtracted the Exposure IQR, iqr_e,
# which mixed the two variables' scales.)
LAL_m = df['Mortality'].quantile(q=0.25) - (iqr_m * 1.5)
LAL_m
Out[29]:
In [30]:
# Lower outlier fence for Exposure: first quartile minus 1.5 times the IQR.
exposure_first_quartile = df['Exposure'].quantile(q=0.25)
LAL_e = exposure_first_quartile - 1.5 * iqr_e
LAL_e
Out[30]:
In [25]:
# Number of rows whose Mortality lies above the upper fence.
int((df['Mortality'] > UAL_m).sum())
Out[25]:
In [31]:
# Number of rows whose Exposure lies above the upper fence.
int((df['Exposure'] > UAL_e).sum())
Out[31]:
In [26]:
# Number of rows whose Mortality lies below the lower fence.
int((df['Mortality'] < LAL_m).sum())
Out[26]:
In [32]:
# Number of rows whose Exposure lies below the lower fence. This cell used to
# repeat the "Mortality > UAL_m" check from above, so the low-Exposure
# outliers were never counted and LAL_e was never used.
len(df[df['Exposure'] < LAL_e])
Out[32]:
In [8]:
# Pairwise correlations between the numeric columns. The numeric columns are
# selected explicitly because pandas >= 2.0 raises on non-numeric columns
# instead of silently skipping them; select_dtypes also works on older pandas.
df.select_dtypes(include="number").corr()
Out[8]:
In [10]:
# Ordinary least squares: regress Mortality on Exposure, then show the
# fitted coefficients.
ols_spec = smf.ols(data=df, formula="Mortality~Exposure")
lm = ols_spec.fit()
lm.params
Out[10]:
In [11]:
def exposure_predict(exposure, model=None):
    """Predict mortality from exposure using the linear Mortality~Exposure fit.

    Parameters
    ----------
    exposure : number or array-like
        Exposure value(s) at which to evaluate the fitted line.
    model : fitted regression result, optional
        Any object whose ``params`` exposes ``Intercept`` and ``Exposure``
        (for example the ``lm`` result fitted earlier in this notebook).
        Passing it avoids re-reading the CSV and refitting on every call.
        When omitted, the model is fitted from ``data/hanford.csv`` exactly
        as before, so existing ``exposure_predict(x)`` calls are unchanged.

    Returns
    -------
    The predicted mortality, ``exposure * slope + intercept``.
    """
    if model is None:
        # No fit supplied: load the data and fit the regression here. Local
        # names are used so the notebook-level ``df`` / ``lm`` are not shadowed.
        hanford = pd.read_csv("data/hanford.csv")
        model = smf.ols(formula="Mortality~Exposure", data=hanford).fit()
    coefficients = model.params
    return exposure * coefficients.Exposure + coefficients.Intercept
In [13]:
# Pull the two fitted coefficients out of the regression result by label.
intercept = lm.params["Intercept"]
slope = lm.params["Exposure"]
In [14]:
# Scatter of the raw data with the fitted regression line drawn over it on
# the same axes.
scatter_ax = df.plot(x="Exposure", y="Mortality", kind="scatter")
fitted_mortality = slope * df["Exposure"] + intercept
scatter_ax.plot(df["Exposure"], fitted_mortality, "-", color="red")
Out[14]:
In [15]:
# Predicted mortality at an exposure index of 10.
exposure_predict(exposure=10)
Out[15]:
In [ ]: