Apply logistic regression to categorize whether a county had high mortality rate due to contamination

1. Import the necessary packages to read in the data, plot, and create a logistic regression model


In [1]:
import pandas as pd
%matplotlib inline
import numpy as np
from sklearn.linear_model import LogisticRegression

2. Read in the hanford.csv file in the data/ folder


In [5]:
df = pd.read_csv("hanford.csv")
df.head()


Out[5]:
County Exposure Mortality
0 Umatilla 2.49 147.1
1 Morrow 2.57 130.1
2 Gilliam 3.41 129.9
3 Sherman 1.25 113.5
4 Wasco 1.62 137.5

3. Calculate the basic descriptive statistics on the data


In [6]:
df.describe()


Out[6]:
Exposure Mortality
count 9.000000 9.000000
mean 4.617778 157.344444
std 3.491192 34.791346
min 1.250000 113.500000
25% 2.490000 130.100000
50% 3.410000 147.100000
75% 6.410000 177.900000
max 11.640000 210.300000

In [7]:
df.median()


Out[7]:
Exposure       3.41
Mortality    147.10
dtype: float64

In [8]:
rang= df['Mortality'].max() - df['Mortality'].min()
rang


Out[8]:
96.800000000000011

In [10]:
iqr_m = df['Mortality'].quantile(q=0.75)- df['Mortality'].quantile(q=0.25)
iqr_m


Out[10]:
47.800000000000011

In [11]:
iqr_e = df['Exposure'].quantile(q=0.75)- df['Exposure'].quantile(q=0.25)
iqr_e


Out[11]:
3.9199999999999999

In [12]:
UAL_m= (iqr_m*1.5) + df['Mortality'].quantile(q=0.75)
UAL_m


Out[12]:
249.60000000000002

In [17]:
UAL_e= (iqr_m*1.5) + df['Exposure'].quantile(q=0.75)
UAL_e


Out[17]:
78.110000000000014

In [13]:
LAL_m= df['Mortality'].quantile(q=0.25) - (iqr_e*1.5)  
LAL_m


Out[13]:
124.22

In [14]:
LAL_e= df['Exposure'].quantile(q=0.25) - (iqr_e*1.5)  
LAL_e


Out[14]:
-3.3899999999999997

In [15]:
len(df[df['Mortality']> UAL_m])


Out[15]:
0

In [18]:
len(df[df['Exposure']> UAL_e])


Out[18]:
0

In [19]:
len(df[df['Mortality']< LAL_m])


Out[19]:
1

In [20]:
len(df[df['Mortality'] > UAL_m])


Out[20]:
0

In [ ]:


In [ ]:

4. Find a reasonable threshold to say exposure is high and recode the data


In [ ]:


In [ ]:

5. Create a logistic regression model


In [41]:
lm = LogisticRegression()

In [49]:
data = np.asarray(df[['Mortality','Exposure']])
x = data[:,1:]
y = data[:,0]

In [50]:
data


Out[50]:
array([[ 147.1 ,    2.49],
       [ 130.1 ,    2.57],
       [ 129.9 ,    3.41],
       [ 113.5 ,    1.25],
       [ 137.5 ,    1.62],
       [ 162.3 ,    3.83],
       [ 207.5 ,   11.64],
       [ 177.9 ,    6.41],
       [ 210.3 ,    8.34]])

In [44]:
x


Out[44]:
array([[  2.49],
       [  2.57],
       [  3.41],
       [  1.25],
       [  1.62],
       [  3.83],
       [ 11.64],
       [  6.41],
       [  8.34]])

In [45]:
y


Out[45]:
array([ 147.1,  130.1,  129.9,  113.5,  137.5,  162.3,  207.5,  177.9,
        210.3])

In [48]:
lm.fit(x,y)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-48-fead33db94e1> in <module>()
----> 1 lm.fit(x)

TypeError: fit() missing 1 required positional argument: 'y'

In [35]:
lm.coef_


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-35-f029e8367547> in <module>()
----> 1 lm.coef_

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [36]:
lm.score(x,y)


---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
<ipython-input-36-366f7d2147ae> in <module>()
----> 1 lm.score(x,y)

c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
    308         """
    309         from .metrics import accuracy_score
--> 310         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    311 
    312 

c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\linear_model\base.py in predict(self, X)
    266             Predicted class label per sample.
    267         """
--> 268         scores = self.decision_function(X)
    269         if len(scores.shape) == 1:
    270             indices = (scores > 0).astype(np.int)

c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\linear_model\base.py in decision_function(self, X)
    240         if not hasattr(self, 'coef_') or self.coef_ is None:
    241             raise NotFittedError("This %(name)s instance is not fitted "
--> 242                                  "yet" % {'name': type(self).__name__})
    243 
    244         X = check_array(X, accept_sparse='csr')

NotFittedError: This LogisticRegression instance is not fitted yet

In [37]:
slope = lm.coef_[0]


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-37-84e9eb32b1fa> in <module>()
----> 1 slope = lm.coef_[0]

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [38]:
intercept = lm.intercept_


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-38-0f828ae237a1> in <module>()
----> 1 intercept = lm.intercept_

AttributeError: 'LogisticRegression' object has no attribute 'intercept_'

6. Predict whether the mortality rate (Cancer per 100,000 man years) will be high at an exposure level of 50


In [40]:
lm.predict(50)


---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
<ipython-input-40-abad1de0f282> in <module>()
----> 1 lm.predict(50)

c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\linear_model\base.py in predict(self, X)
    266             Predicted class label per sample.
    267         """
--> 268         scores = self.decision_function(X)
    269         if len(scores.shape) == 1:
    270             indices = (scores > 0).astype(np.int)

c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\linear_model\base.py in decision_function(self, X)
    240         if not hasattr(self, 'coef_') or self.coef_ is None:
    241             raise NotFittedError("This %(name)s instance is not fitted "
--> 242                                  "yet" % {'name': type(self).__name__})
    243 
    244         X = check_array(X, accept_sparse='csr')

NotFittedError: This LogisticRegression instance is not fitted yet

In [ ]: