notebook.community

Edit and run



In [18]:

    
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
from sklearn.linear_model import LinearRegression # package for doing linear regression (there are others)
import numpy as np # necessary for working with the data and getting it properly shaped



In [3]:

    
# load the height/weight spreadsheet (path is relative to the notebook's working directory) into a DataFrame
df = pd.read_excel("data/height_weight.xlsx")



In [32]:

    
# scatter plot of weight (y axis) against height (x axis) to eyeball whether the relationship looks linear
df.plot(kind="scatter",x="height",y="weight")









    Out[32]:





<matplotlib.axes.AxesSubplot at 0x108e44490>



In [4]:

    
df.corr() # calculate r (coefficient of correlation) for each pair of columns



In [44]:

    
# plain 2-D ndarray of the two columns: column 0 is weight, column 1 is height
data = df[['weight', 'height']].values
# target (weight) as a 1-D vector
y = data[:, 0]
# feature (height); the 1: slice keeps it 2-D, i.e. shaped (n_samples, 1), as scikit-learn expects
x = data[:, 1:]



In [59]:

    
# display the array to check its layout: weight in column 0, height in column 1
data









    Out[59]:





array([[  50.5,   51.3],
       [  77. ,   56.3],
       [  84. ,   56.5],
       [  83. ,   57.3],
       [  85. ,   57.5],
       [  99.5,   59. ],
       [  84.5,   59.8],
       [  84. ,   62.5],
       [ 112.5,   62.5],
       [ 102.5,   62.8],
       [ 102.5,   63.5],
       [  90. ,   64.3],
       [ 128. ,   64.8],
       [  98. ,   65.3],
       [ 112. ,   66.5],
       [ 112. ,   66.5],
       [ 133. ,   67. ],
       [ 112.5,   69. ],
       [ 150. ,   72. ]])



In [37]:

    
lr = LinearRegression() # create a LinearRegression object (not fitted yet; default settings)



In [60]:

    
# confirm what kind of object lr is
type(lr)









    Out[60]:





sklearn.linear_model.base.LinearRegression



In [45]:

    
lr.fit(x,y) # fit the model to the data: x is the height column (2-D), y is the weight vector









    Out[45]:





LinearRegression(copy_X=True, fit_intercept=True, normalize=False)



In [49]:

    
# coef_ holds one coefficient per feature; x has a single column (height), so element 0 is the slope
m = lr.coef_[0] # the slope of our regression line



In [50]:

    
# display the fitted slope
m









    Out[50]:





3.8990302687839535



In [51]:

    
b = lr.intercept_ # the y-intercept (predicted weight when x, the height, is 0)



In [53]:

    
# display the fitted intercept
b









    Out[53]:





-143.02691843935344



In [52]:

    
# plot the linear regression line on the scatter plot
# Keep the Axes returned by the scatter plot and draw the line on it explicitly,
# instead of relying on pyplot's implicit "current axes" state.
ax = df.plot(kind="scatter",x="height",y="weight")
ax.plot(df['height'],m*df['height']+b,'-')  # fitted line: weight = m*height + b









    Out[52]:





[<matplotlib.lines.Line2D at 0x108f14550>]



In [54]:

    
# simple function to predict values based on coefficients
def pred_weight(height, slope=3.8990302687839535, intercept=-143.02691843935344):
    """Predict weight from height with the line ``slope * height + intercept``.

    Parameters
    ----------
    height : number (or any object supporting ``*`` and ``+`` with floats)
        Height to predict for, in the same units as the ``height`` column
        the line was fitted on.
    slope : float, optional
        Slope of the regression line. Defaults to the slope that was
        hard-coded here originally.
    intercept : float, optional
        Intercept of the regression line. Defaults to the intercept that was
        hard-coded here originally.

    Returns
    -------
    The predicted weight, ``(slope * height) + intercept``.

    Notes
    -----
    The defaults are frozen numbers; if the model is refitted, pass the new
    coefficients explicitly instead of relying on them.
    """
    return (slope*height) + intercept



In [55]:

    
pred_weight(70) # test the function with a height of 70









    Out[55]:





129.9052003755233



In [57]:

    
# predict for a given value
# predict() expects a 2-D input shaped (n_samples, n_features); a bare scalar raises
# ValueError in current scikit-learn. Wrap the single height as [[70]] and take
# element 0 of the returned array so the cell still shows a plain number.
lr.predict([[70]])[0]









    Out[57]:





129.90520037552329



In [58]:

    
# calculate the R-squared (coefficient of determination)
# note: x and y are the same data the model was fitted on, so this is a training-set score
lr.score(x,y)









    Out[58]:





0.77050684271597392



In [61]:

    
# load a second dataset from CSV (path is relative to the notebook's working directory)
df2 = pd.read_csv("data/heights_weights_genders.csv")



In [62]:

    
# scatter plot of the second dataset; note its column names are capitalized ("Height", "Weight")
df2.plot(kind="scatter",x="Height",y="Weight")









    Out[62]:





<matplotlib.axes.AxesSubplot at 0x1090e2350>



In [63]:

    
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
# NOTE(review): this summarizes df (the Excel data), not the df2 loaded just before - confirm that is intended
df.describe()



In [ ]:

    
# create the linear regression line and predict the weight for your height. How close is it?

	height	weight
count	19.000000	19.000000
mean	62.336842	100.026316
std	5.127075	22.773933
min	51.300000	50.500000
25%	58.250000	84.250000
50%	62.800000	99.500000
75%	65.900000	112.250000
max	72.000000	150.000000