In [1]:
%pylab inline
pylab.style.use('ggplot')
import numpy as np
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

In [2]:
import requests
URL = 'http://people.sc.fsu.edu/~jburkardt/datasets/regression/x11.txt'
response = requests.get(URL)
text = response.text

In [3]:
columns = ['Index',
'Rent per arable acre',
'Milk cows per square mile',
'Difference between pasture and arable land',
'Rental price per grassy acre']
columns = [c.lower().replace(' ', '_') for c in columns]

In [7]:
lines = [line.strip() for line in text.split('\n') if not line.strip().startswith('#')]

In [14]:
data = lines[7:-2]
data = [row.split() for row in data]
data_df = pd.DataFrame(data=data, columns=columns).drop('index', axis=1).astype(np.float)

In [15]:
data_df.head()


Out[15]:
rent_per_arable_acre milk_cows_per_square_mile difference_between_pasture_and_arable_land rental_price_per_grassy_acre
0 15.50 17.25 0.24 18.38
1 22.29 18.51 0.20 20.00
2 12.36 11.13 0.12 11.50
3 31.84 5.54 0.12 25.00
4 83.90 5.44 0.04 62.50

In [16]:
data_df.corr()


Out[16]:
rent_per_arable_acre milk_cows_per_square_mile difference_between_pasture_and_arable_land rental_price_per_grassy_acre
rent_per_arable_acre 1.000000 0.045504 -0.497893 0.885082
milk_cows_per_square_mile 0.045504 1.000000 0.522598 0.303392
difference_between_pasture_and_arable_land -0.497893 0.522598 1.000000 -0.330177
rental_price_per_grassy_acre 0.885082 0.303392 -0.330177 1.000000

In [18]:
data_df.plot(kind='scatter', x='rent_per_arable_acre', y='rental_price_per_grassy_acre')


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x21495796780>

In [19]:
import statsmodels.formula.api as sm
result = sm.ols(formula='rental_price_per_grassy_acre ~ rent_per_arable_acre', data=data_df).fit()
result.summary()


Out[19]:
OLS Regression Results
Dep. Variable: rental_price_per_grassy_acre R-squared: 0.783
Model: OLS Adj. R-squared: 0.780
Method: Least Squares F-statistic: 235.1
Date: Thu, 30 Mar 2017 Prob (F-statistic): 2.86e-23
Time: 22:35:51 Log-Likelihood: -252.88
No. Observations: 67 AIC: 509.8
Df Residuals: 65 BIC: 514.2
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.8915 3.011 0.296 0.768 -5.121 6.904
rent_per_arable_acre 0.9494 0.062 15.331 0.000 0.826 1.073
Omnibus: 3.592 Durbin-Watson: 2.363
Prob(Omnibus): 0.166 Jarque-Bera (JB): 2.722
Skew: 0.440 Prob(JB): 0.256
Kurtosis: 3.447 Cond. No. 112.

In [20]:
import statsmodels.formula.api as sm
result = sm.ols(formula='rental_price_per_grassy_acre ~ rent_per_arable_acre + difference_between_pasture_and_arable_land', data=data_df).fit()
result.summary()


Out[20]:
OLS Regression Results
Dep. Variable: rental_price_per_grassy_acre R-squared: 0.800
Model: OLS Adj. R-squared: 0.793
Method: Least Squares F-statistic: 127.7
Date: Thu, 30 Mar 2017 Prob (F-statistic): 4.57e-23
Time: 22:36:50 Log-Likelihood: -250.27
No. Observations: 67 AIC: 506.5
Df Residuals: 64 BIC: 513.2
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept -6.4823 4.359 -1.487 0.142 -15.191 2.226
rent_per_arable_acre 1.0279 0.069 14.851 0.000 0.890 1.166
difference_between_pasture_and_arable_land 23.2037 10.190 2.277 0.026 2.846 43.561
Omnibus: 3.721 Durbin-Watson: 2.307
Prob(Omnibus): 0.156 Jarque-Bera (JB): 2.834
Skew: 0.413 Prob(JB): 0.242
Kurtosis: 3.577 Cond. No. 412.

In [ ]: