``````

In :

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
from sklearn.linear_model import LinearRegression # package for doing linear regression (there are others)
import numpy as np # necessary for working with the data and getting it properly shaped

``````
``````

In :

``````
``````

In :

# Scatter plot of weight (y axis) against height (x axis).
# NOTE(review): `df` is not created in any cell visible here; confirm that the
# (currently empty) cell above is where the data gets loaded.
df.plot.scatter(x="height", y="weight")

``````
``````

Out:

<matplotlib.axes.AxesSubplot at 0x108e44490>

``````
``````

In :

df.corr() # correlation matrix; the off-diagonal height/weight entry is r (coefficient of correlation)

``````
``````

Out:

height
weight

height
1.000000
0.877785

weight
0.877785
1.000000

``````
``````

In :

# Convert the two columns to a plain ndarray: column 0 is weight, column 1 is height.
data = np.asarray(df[["weight", "height"]])
# Features: slice (rather than index) the height column so it stays 2-D, one row per sample.
x = data[:, 1:]
# Target: weight as a 1-D array.
y = data[:, 0]

``````
``````

In :

data  # display the array: column 0 is weight, column 1 is height

``````
``````

Out:

array([[  50.5,   51.3],
[  77. ,   56.3],
[  84. ,   56.5],
[  83. ,   57.3],
[  85. ,   57.5],
[  99.5,   59. ],
[  84.5,   59.8],
[  84. ,   62.5],
[ 112.5,   62.5],
[ 102.5,   62.8],
[ 102.5,   63.5],
[  90. ,   64.3],
[ 128. ,   64.8],
[  98. ,   65.3],
[ 112. ,   66.5],
[ 112. ,   66.5],
[ 133. ,   67. ],
[ 112.5,   69. ],
[ 150. ,   72. ]])

``````
``````

In :

lr = LinearRegression() # create an (unfitted) LinearRegression object with default settings

``````
``````

In :

type(lr)  # confirm that lr is a LinearRegression estimator object

``````
``````

Out:

sklearn.linear_model.base.LinearRegression

``````
``````

In :

lr.fit(x,y) # fit the model: estimate the line that predicts weight (y) from height (x)

``````
``````

Out:

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

``````
``````

In :

# the slope of our regression line
# coef_ is an array holding one coefficient per feature; with height as the only
# feature, element 0 is the slope as a plain scalar
m = lr.coef_[0]

``````
``````

In :

m  # display the slope: change in predicted weight per one unit of height

``````
``````

Out:

3.8990302687839535

``````
``````

In :

b = lr.intercept_ # the y-intercept (predicted weight when height = 0; an extrapolation far below the observed heights)

``````
``````

In :

b  # display the y-intercept of the fitted line

``````
``````

Out:

-143.02691843935344

``````
``````

In :

# Draw the raw observations, then overlay the fitted line m*height + b on the same axes.
ax = df.plot(kind="scatter", x="height", y="weight")
ax.plot(df["height"], m * df["height"] + b, "-")

``````
``````

Out:

[<matplotlib.lines.Line2D at 0x108f14550>]

``````
``````

In :

# simple function to predict values based on coefficients
def pred_weight(height, slope=3.8990302687839535, intercept=-143.02691843935344):
    """Predict weight from height with a straight line: slope * height + intercept.

    Parameters
    ----------
    height : number or array-like supporting arithmetic
        Height value(s) to predict a weight for.
    slope : float, optional
        Slope of the regression line. Defaults to the value obtained from the
        fit earlier in this notebook.
    intercept : float, optional
        Y-intercept of the regression line. Defaults to the value obtained
        from the fit earlier in this notebook.

    Returns
    -------
    The predicted weight(s), of the same kind as ``height``.
    """
    return (slope * height) + intercept

``````
``````

In :

pred_weight(70) # test the function: predicted weight for a height of 70

``````
``````

Out:

129.9052003755233

``````
``````

In :

# predict for a given value
# predict() expects a 2-D array of shape (n_samples, n_features), so wrap the single
# height as [[70]]; element 0 of the returned array is the scalar prediction
lr.predict([[70]])[0]

``````
``````

Out:

129.90520037552329

``````
``````

In :

# calculate the R-squared (coefficient of determination) on the same data the model was fitted to
# with a single predictor this equals r squared: 0.877785**2 is about 0.7705
lr.score(x,y)

``````
``````

Out:

0.77050684271597392

``````
``````

In :

``````
``````

In :

# Scatter plot of the second data set (note the capitalised column names).
# NOTE(review): `df2` is not created in any cell visible here; confirm that the
# (currently empty) cell above is where it gets loaded.
df2.plot.scatter(x="Height", y="Weight")

``````
``````

Out:

<matplotlib.axes.AxesSubplot at 0x1090e2350>

``````
``````

In :

# Summary statistics (count, mean, std, min, quartiles, max) per column.
# NOTE(review): this summarises `df`, not the `df2` plotted in the previous cell; confirm which was intended.
df.describe()

``````
``````

Out:

height
weight

count
19.000000
19.000000

mean
62.336842
100.026316

std
5.127075
22.773933

min
51.300000
50.500000

25%
58.250000
84.250000

50%
62.800000
99.500000

75%
65.900000
112.250000

max
72.000000
150.000000

``````
``````

In [ ]:

# create the linear regression line and predict the weight for your height. How close is it?

``````