In [55]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.linear_model import LogisticRegression
In [24]:
# Load the Hanford data set from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='hanford.csv')
In [25]:
# Summary statistics for the columns of the loaded frame
df.describe()
Out[25]:
In [26]:
# Interquartile range (Q3 - Q1) per column.
# numeric_only=True keeps any non-numeric column the CSV may contain out of the
# computation; since pandas 2.0 quantile() no longer skips such columns by
# default and would raise on them instead.
iqr = df.quantile(q=0.75, numeric_only=True) - df.quantile(q=0.25, numeric_only=True)
iqr
Out[26]:
In [27]:
# Upper acceptable limit per column: Q3 + 1.5 * IQR.
# numeric_only=True keeps any non-numeric column the CSV may contain out of the
# computation; since pandas 2.0 quantile() no longer skips such columns by
# default and would raise on them instead.
ual = df.quantile(q=0.75, numeric_only=True) + (iqr * 1.5)
ual
Out[27]:
In [28]:
# Lower acceptable limit per column: Q1 - 1.5 * IQR.
# numeric_only=True keeps any non-numeric column the CSV may contain out of the
# computation; since pandas 2.0 quantile() no longer skips such columns by
# default and would raise on them instead.
lal = df.quantile(q=0.25, numeric_only=True) - (iqr * 1.5)
lal
Out[28]:
In [29]:
# Scatter plot of Mortality (y axis) against Exposure (x axis)
df.plot.scatter(x='Exposure', y='Mortality')
Out[29]:
In [30]:
# Print every Exposure reading that lies below the upper acceptable limit
exposure_ceiling = ual['Exposure']
for value in df['Exposure']:
    if value < exposure_ceiling:
        print(value)
In [31]:
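# The IQR-based limit is not used as the cutoff; pick a more reasonable threshold instead.
# Chosen threshold: an Exposure above 6 counts as high exposure.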
In [32]:
# Binary flag: 1 where Exposure is strictly greater than 6, otherwise 0
is_high_exposure = df['Exposure'] > 6
df['high_exposure'] = is_high_exposure.astype('int64')
In [33]:
# Display the frame, which now includes the high_exposure column
df
Out[33]:
In [34]:
# dataset = df[['Mortality']].join([pd.get_dummies(df['Exposure'],prefix="Exposure"),df.high_exposure])
In [35]:
# dataset
In [36]:
from sklearn.linear_model import LogisticRegression
In [37]:
# Unfitted logistic-regression classifier, constructed with no arguments
lm = LogisticRegression()
In [43]:
# Feature matrix: Mortality selected as a one-column frame, so the array is 2-D
x = df[['Mortality']].values
# Target vector: the 0/1 high_exposure flag
y = df['high_exposure'].values
In [46]:
# Fit the classifier: predict high_exposure (y) from Mortality (x)
lm = lm.fit(x,y)
In [50]:
# Score the model on the same x, y it was fitted on (not a held-out estimate)
lm.score(x,y)
Out[50]:
In [47]:
# Fitted coefficient(s); the only feature passed to fit() was Mortality
lm.coef_
Out[47]:
In [48]:
# Fitted intercept term of the model
lm.intercept_
Out[48]:
In [58]:
# Plot the model's linear term (coefficient * Mortality + intercept) against Mortality
slope = lm.coef_[0]
offset = lm.intercept_[0]
plt.plot(x, slope * x + offset)
Out[58]:
In [60]:
# Binary flag: 1 where Mortality is strictly greater than 150, otherwise 0
is_high_mortality = df['Mortality'] > 150
df['high_mortality'] = is_high_mortality.astype('int64')
In [61]:
# Second unfitted classifier, used to predict high_mortality from Exposure
lm2 = LogisticRegression()
In [65]:
# Feature matrix: Exposure selected as a one-column frame, so the array is 2-D
x2 = df[['Exposure']].values
# Target vector: the 0/1 high_mortality flag
y2 = df['high_mortality'].values
In [67]:
# Fit the second classifier: predict high_mortality (y2) from Exposure (x2)
lm2 = lm2.fit(x2,y2)
In [68]:
# predict() expects a 2-D array of shape (n_samples, n_features); a bare scalar
# is rejected by current scikit-learn. Wrap the exposure value 50 as one sample
# with one feature to get the predicted high_mortality class for it.
lm2.predict([[50]])
Out[68]:
In [69]:
# According to the prediction the mortality rate is high at an exposure level of 50.
In [ ]: