In [18]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
from sklearn.linear_model import LinearRegression # package for doing linear regression (there are others)
import numpy as np # necessary for working with the data and getting it properly shaped
In [3]:
# load the height/weight spreadsheet (path is relative to the notebook's working directory);
# the cells below read its "height" and "weight" columns
df = pd.read_excel("data/height_weight.xlsx")
In [32]:
# scatter plot of the raw data: height on the x-axis, weight on the y-axis
df.plot.scatter(x="height", y="weight")
Out[32]:
In [4]:
# pairwise Pearson correlation coefficient (r) between the columns
df.corr(method="pearson")
Out[4]:
In [44]:
# pull the two columns out as a NumPy array: column 0 = weight, column 1 = height
data = np.asarray(df[["weight", "height"]])
x = data[:, 1:]  # height; the 1: slice keeps this 2-D with a single column
y = data[:, 0]   # weight as a 1-D vector
In [59]:
# show the converted array (columns are weight, height in that order)
data
Out[59]:
In [37]:
lr = LinearRegression() # create a LinearRegression estimator with default settings (not yet fitted)
In [60]:
# show the class of the estimator object
type(lr)
Out[60]:
In [45]:
lr.fit(x,y) # fit the regression: x is the height column (2-D feature matrix), y is the weight column (target)
Out[45]:
In [49]:
m = lr.coef_[0] # slope of the regression line: change in predicted weight per unit of height (one feature, so one coefficient)
In [50]:
# show the fitted slope
m
Out[50]:
In [51]:
b = lr.intercept_ # the y-intercept: predicted weight when height = 0
In [53]:
# show the fitted intercept
b
Out[53]:
In [52]:
# overlay the fitted line (weight = m*height + b) on the scatter plot of the raw data
ax = df.plot.scatter(x="height", y="weight")
heights = df["height"]
ax.plot(heights, m * heights + b, "-")
Out[52]:
In [54]:
# simple function to predict values based on coefficients
def pred_weight(height, slope=3.8990302687839535, intercept=-143.02691843935344):
    """Predict weight from height with a straight-line model.

    Computes ``slope * height + intercept``.

    Parameters
    ----------
    height : number or array-like
        Height value(s); anything supporting ``*`` and ``+`` with floats.
    slope : float, optional
        Slope of the line. The default is the value previously hard-coded
        here (presumably the fitted ``m`` -- confirm against the model output).
        Pass ``slope=m`` to stay in sync after refitting.
    intercept : float, optional
        Intercept of the line. The default is the value previously
        hard-coded here (presumably the fitted ``b``). Pass ``intercept=b``
        to stay in sync after refitting.

    Returns
    -------
    Same type as ``height``
        The predicted weight(s).
    """
    return (slope * height) + intercept
In [55]:
pred_weight(70) # sanity check: predicted weight for a height of 70
Out[55]:
In [57]:
# predict for a given value
# predict() expects a 2-D array of shape (n_samples, n_features); a bare scalar
# raises ValueError, so wrap the single height as one sample with one feature
lr.predict([[70]])
Out[57]:
In [58]:
# calculate the R-squared (coefficient of determination)
# note: scored on the same x, y the model was fitted on, so this is an in-sample value
lr.score(x,y)
Out[58]:
In [61]:
# load a second dataset from CSV; the next cell plots its "Height" and "Weight" columns
# (capitalised here, unlike the lowercase column names used with df)
df2 = pd.read_csv("data/heights_weights_genders.csv")
In [62]:
# scatter plot of the second dataset: Height on the x-axis, Weight on the y-axis
df2.plot.scatter(x="Height", y="Weight")
Out[62]:
In [63]:
# summary statistics for the second dataset (df2), which was just loaded and plotted
df2.describe()
Out[63]:
In [ ]:
# create the linear regression line and predict the weight for your height. How close is it?