In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
import statsmodels.formula.api as smf # package we'll be using for linear regression

In [2]:
# Read the heights/weights dataset into a DataFrame (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer="data/heights_weights_genders.csv")

In [3]:
# Scatter of Weight against Height, via the plot accessor's scatter method.
df.plot.scatter(x="Height", y="Weight")


Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f819e1a2438>

In [4]:
# Ordinary least squares fit. In the formula the response goes on the left
# of "~" and the predictor on the right (Y~X), so Weight is regressed on Height.
lm = (
    smf.ols(formula="Weight~Height", data=df)
    .fit()
)

In [5]:
lm.params  # fitted coefficients as a labelled Series: "Intercept" and the slope on "Height"


Out[5]:
Intercept   -350.737192
Height         7.717288
dtype: float64

In [6]:
# Look the coefficients up by label rather than unpacking by position, so the
# assignment stays correct even if the formula gains terms or their order changes.
intercept = lm.params["Intercept"]
slope = lm.params["Height"]

In [7]:
# Keep the Axes that pandas returns and draw the fit line on it explicitly,
# instead of relying on pyplot's implicit "current axes" state.
ax = df.plot(kind="scatter", x="Height", y="Weight")
# A straight line is fully determined by two points, so evaluate the fitted
# model only at the smallest and largest observed heights rather than at
# every (unsorted) row of the data.
height_range = df["Height"].agg(["min", "max"])
ax.plot(height_range, slope * height_range + intercept, "-", color="red")


Out[7]:
[<matplotlib.lines.Line2D at 0x7f819bb43438>]

In [8]:
# Full regression report: fit statistics, the coefficient table and residual diagnostics.
lm.summary()


Out[8]:
OLS Regression Results
Dep. Variable: Weight R-squared: 0.855
Model: OLS Adj. R-squared: 0.855
Method: Least Squares F-statistic: 5.904e+04
Date: Tue, 26 Jul 2016 Prob (F-statistic): 0.00
Time: 16:09:00 Log-Likelihood: -39219.
No. Observations: 10000 AIC: 7.844e+04
Df Residuals: 9998 BIC: 7.846e+04
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -350.7372 2.111 -166.109 0.000 -354.876 -346.598
Height 7.7173 0.032 242.975 0.000 7.655 7.780
Omnibus: 2.141 Durbin-Watson: 1.677
Prob(Omnibus): 0.343 Jarque-Bera (JB): 2.150
Skew: 0.036 Prob(JB): 0.341
Kurtosis: 2.991 Cond. No. 1.15e+03

In [ ]: