In [10]:
import pandas as pd
%matplotlib inline
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
In [2]:
# Load the Hanford CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/hanford.csv')
In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
Out[3]:
In [5]:
# Scatter plot of Mortality against Exposure to eyeball the relationship.
df.plot.scatter(x='Exposure', y='Mortality')
Out[5]:
In [12]:
# Mortality value lying three standard deviations above the column mean.
df.Mortality.mean() + df.Mortality.std() * 3
Out[12]:
In [13]:
# Exposure value lying three standard deviations above the column mean.
df.Exposure.mean() + df.Exposure.std() * 3
Out[13]:
In [16]:
# Median of the Exposure column (the name suggests it is meant as the
# cut-off for "high" exposure).
high_exposure = df.Exposure.median()
In [17]:
# Median of the Mortality column; rows at or above it are later flagged
# as high mortality.
high_mortality = df.Mortality.median()
In [18]:
# Binary indicators: 1 when the value is at or above the column's median, else 0.
# Expo_high has to be built from Exposure and its own threshold; building it
# from Mortality would make it an exact copy of Mort_high, so any model fitted
# on the pair would be trivially perfect.
df['Mort_high'] = (df['Mortality'] >= high_mortality).astype(int)
df['Expo_high'] = (df['Exposure'] >= high_exposure).astype(int)
In [19]:
# Display the frame to inspect the Mort_high / Expo_high indicator columns.
df
Out[19]:
In [20]:
# Fit a logistic regression that predicts the Mort_high flag from the
# Expo_high flag.
lm = LogisticRegression()
data = np.asarray(df[['Mort_high', 'Expo_high']])  # 2-D array: column 0 = target, column 1 = feature
x = data[:, 1:]  # slicing (rather than indexing) keeps x two-dimensional, as fit() requires
y = data[:, 0]
lm.fit(x, y)
slope = lm.coef_[0]  # first row of the coefficient matrix: one coefficient per input feature
intercept = lm.intercept_
# For a classifier, score() returns the mean accuracy on the given data, not
# r^2. It is the last expression so that the notebook actually displays it.
lm.score(x, y)
In [22]:
# Scatter the raw measurements and mark the two median cut-offs
# (high_exposure, high_mortality).
# The logistic model is fitted on 0/1 indicator columns, so
# slope * indicator + intercept is a log-odds value; drawing it as a line on
# these raw Exposure/Mortality axes would not be meaningful.
ax = df.plot(kind='scatter', x='Exposure', y='Mortality')
ax.axvline(high_exposure, linestyle='--', label='median exposure')
ax.axhline(high_mortality, linestyle=':', label='median mortality')
ax.legend();
Out[22]:
In [23]:
# The model was trained on a binary "high exposure" feature, so the raw
# exposure value has to be encoded with the same median threshold first.
# predict() also needs a 2-D (n_samples, n_features) input; a bare scalar is
# rejected by scikit-learn.
exposure = 50
lm.predict([[int(exposure >= high_exposure)]])
Out[23]:
Yes
In [ ]: