## 1. Import the necessary packages to read in the data, plot, and create a linear regression model

``````

In [10]:

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np

``````

## 2. Read in the hanford.csv file

``````

In [25]:

``````

## 3. Calculate the basic descriptive statistics on the data

``````

In [26]:

# Display the whole frame: one row per county with its Exposure and Mortality values.
df

``````
``````

Out[26]:

County
Exposure
Mortality

0
Umatilla
2.49
147.1

1
Morrow
2.57
130.1

2
Gilliam
3.41
129.9

3
Sherman
1.25
113.5

4
Wasco
1.62
137.5

5
HoodRiver
3.83
162.3

6
Portland
11.64
207.5

7
Columbia
6.41
177.9

8
Clatsop
8.34
210.3

``````
``````

In [4]:

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

``````
``````

Out[4]:

Exposure
Mortality

count
9.000000
9.000000

mean
4.617778
157.344444

std
3.491192
34.791346

min
1.250000
113.500000

25%
2.490000
130.100000

50%
3.410000
147.100000

75%
6.410000
177.900000

max
11.640000
210.300000

``````
``````

In [5]:

# Draw one histogram per numeric column.
df.hist()

``````
``````

Out[5]:

array([[<matplotlib.axes.AxesSubplot object at 0x1071de490>,
<matplotlib.axes.AxesSubplot object at 0x107289850>]], dtype=object)

``````

## 4. Calculate the coefficient of correlation (r) and generate the scatter plot. Does there seem to be a correlation worthy of investigation?

``````

In [7]:

# Correlate only the numeric columns: 'County' is text, and recent pandas
# versions raise on non-numeric columns instead of silently dropping them.
df[['Exposure', 'Mortality']].corr()

``````
``````

Out[7]:

Exposure
Mortality

Exposure
1.000000
0.926345

Mortality
0.926345
1.000000

``````
``````

In [8]:

# Scatter plot of Mortality against Exposure, one point per county.
df.plot.scatter(x='Exposure', y='Mortality')

``````
``````

Out[8]:

<matplotlib.axes.AxesSubplot at 0x108ff8190>

``````

## 5. Create a linear regression model based on the available data to predict the mortality rate given a level of exposure

``````

In [9]:

# Create an unfitted linear regression estimator with scikit-learn's default settings.
lm = LinearRegression()

``````
``````

In [11]:

# Pull the modelling columns out as a plain NumPy array:
# column 0 is the target (Mortality), column 1 the single predictor (Exposure).
data = np.asarray(df[['Mortality', 'Exposure']])
# Slice x with `1:` rather than index with `1` so it keeps its column axis
# (a 2-D array), while y is taken as a 1-D vector.
y, x = data[:, 0], data[:, 1:]

``````
``````

In [12]:

# Fit the regression of Mortality (y) on Exposure (x).
lm.fit(x,y)

``````
``````

Out[12]:

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

``````
``````

In [14]:

# Coefficient of determination (R^2) of the fit, evaluated on the same data it was trained on.
lm.score(x,y)

``````
``````

Out[14]:

0.85811472686989465

``````
``````

In [19]:

# Slope of the fitted line: the coefficient of the only predictor (Exposure).
m = lm.coef_[0]

``````
``````

In [20]:

# Intercept of the fitted line.
b = lm.intercept_

``````

## 6. Plot the linear regression line on the scatter plot of values. Calculate the r^2 (coefficient of determination)

``````

In [21]:

# Scatter the observations, then overlay the fitted line y = m*x + b on the
# same axes by drawing on the Axes object that the scatter call returns.
ax = df.plot(kind='scatter', x='Exposure', y='Mortality')
exposure = df['Exposure']
ax.plot(exposure, m * exposure + b, '-')

``````
``````

Out[21]:

[<matplotlib.lines.Line2D at 0x10918ba10>]

``````

## 7. Predict the mortality rate (Cancer per 100,000 man years) given an index of exposure = 10

``````

In [24]:

# Predict mortality for an exposure index of 10. scikit-learn expects a 2-D
# array of shape (n_samples, n_features); a bare scalar is rejected by current
# releases, so pass one sample with one feature.
lm.predict([[10]])

``````
``````

Out[24]:

array([ 207.03019353])

``````
``````

In [ ]:

``````