In [18]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
from sklearn.linear_model import LinearRegression # package for doing linear regression (there are others)
import numpy as np # necessary for working with the data and getting it properly shaped

In [3]:
# Load the small height/weight sample from an Excel workbook; the path is relative
# to the notebook's working directory. Later cells use its "height" and "weight" columns.
df = pd.read_excel("data/height_weight.xlsx")

In [32]:
# scatter plot of the raw data: height on the x-axis, weight on the y-axis
df.plot.scatter(x="height", y="weight")


Out[32]:
<matplotlib.axes.AxesSubplot at 0x108e44490>

In [4]:
df.corr() # calculate r (coefficient of correlation) for every pair of columns


Out[4]:
height weight
height 1.000000 0.877785
weight 0.877785 1.000000

In [44]:
# Convert the two columns to an ndarray; note the order: column 0 = weight, column 1 = height.
data = np.asarray(df[['weight', 'height']])
# Predictor: height. Slicing with 1: (rather than indexing with 1) keeps x two-dimensional,
# one row per sample and a single column.
x = data[:, 1:]
# Target: weight, as a flat one-dimensional array.
y = data[:, 0]

In [59]:
data # inspect the converted array: column 0 is weight, column 1 is height


Out[59]:
array([[  50.5,   51.3],
       [  77. ,   56.3],
       [  84. ,   56.5],
       [  83. ,   57.3],
       [  85. ,   57.5],
       [  99.5,   59. ],
       [  84.5,   59.8],
       [  84. ,   62.5],
       [ 112.5,   62.5],
       [ 102.5,   62.8],
       [ 102.5,   63.5],
       [  90. ,   64.3],
       [ 128. ,   64.8],
       [  98. ,   65.3],
       [ 112. ,   66.5],
       [ 112. ,   66.5],
       [ 133. ,   67. ],
       [ 112.5,   69. ],
       [ 150. ,   72. ]])

In [37]:
lr = LinearRegression() # create a LinearRegression object with default settings (not fitted yet)

In [60]:
type(lr) # check which class the estimator object is


Out[60]:
sklearn.linear_model.base.LinearRegression

In [45]:
lr.fit(x,y) # fit the regression line: x holds the heights (2-D), y holds the weights


Out[45]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [49]:
m = lr.coef_[0] # the slope of our regression line (x has a single column, so there is one coefficient)

In [50]:
m # display the fitted slope


Out[50]:
3.8990302687839535

In [51]:
b = lr.intercept_ # the y-intercept (predicted weight when x, the height, is 0)

In [53]:
b # display the fitted intercept


Out[53]:
-143.02691843935344

In [52]:
# Draw the scatter plot, then overlay the fitted line weight = m*height + b
# on the same axes that the scatter plot returned.
ax = df.plot(kind="scatter", x="height", y="weight")
ax.plot(df['height'], m * df['height'] + b, '-')


Out[52]:
[<matplotlib.lines.Line2D at 0x108f14550>]

In [54]:
# simple function to predict values based on coefficients
def pred_weight(height, slope=3.8990302687839535, intercept=-143.02691843935344):
    """Predict weight from height using the line ``slope * height + intercept``.

    The defaults are the slope and intercept displayed for the fitted model in
    this notebook (the values of ``m`` and ``b``), so ``pred_weight(height)``
    returns exactly what it did when those numbers were hard-coded in the body.
    Pass ``slope`` and ``intercept`` explicitly (for example ``m`` and ``b``
    after a re-fit) to reuse the function with other coefficients.

    Parameters
    ----------
    height : number, or any object supporting ``*`` and ``+`` with floats
        The height value(s) to predict for.
    slope : float, optional
        Change in predicted weight per unit of height.
    intercept : float, optional
        Predicted weight at height 0.

    Returns
    -------
    The predicted weight, ``slope * height + intercept``.
    """
    return (slope * height) + intercept

In [55]:
pred_weight(70) # test the function: predicted weight for a height of 70


Out[55]:
129.9052003755233

In [57]:
# predict for a given value
# predict() expects a 2-D input with one row per sample and one column per feature;
# a bare scalar such as lr.predict(70) is rejected by current scikit-learn releases.
# Wrap the height in a 1x1 nested list and take element 0 of the returned array
# so the cell still displays a single predicted weight.
lr.predict([[70]])[0]


Out[57]:
129.90520037552329

In [58]:
# calculate the R-squared (coefficient of determination) on the same data the model was fit to
# with a single predictor this matches the square of r from df.corr(): 0.877785**2 is about 0.7705
lr.score(x,y)


Out[58]:
0.77050684271597392

In [61]:
# Load a second dataset from CSV; the next cell plots its "Height" and "Weight"
# columns, which are capitalized, unlike the lowercase column names in df.
df2 = pd.read_csv("data/heights_weights_genders.csv")

In [62]:
# scatter plot of the second dataset (its column names are capitalized)
df2.plot.scatter(x="Height", y="Weight")


Out[62]:
<matplotlib.axes.AxesSubplot at 0x1090e2350>

In [63]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
# NOTE(review): this summarizes `df` (the original 19-row sample), not the `df2`
# frame loaded and plotted in the preceding cells; confirm which one was intended.
df.describe()


Out[63]:
height weight
count 19.000000 19.000000
mean 62.336842 100.026316
std 5.127075 22.773933
min 51.300000 50.500000
25% 58.250000 84.250000
50% 62.800000 99.500000
75% 65.900000 112.250000
max 72.000000 150.000000

In [ ]:
# create the linear regression line and predict the weight for your height. How close is it?