In [1]:
%pylab inline
pylab.style.use('ggplot')
import numpy as np
import pandas as pd
In [2]:
import requests

# Location of the raw plain-text dataset downloaded by this cell.
URL = 'http://people.sc.fsu.edu/~jburkardt/datasets/regression/x11.txt'

# A timeout keeps the cell from hanging indefinitely on an unresponsive host.
response = requests.get(URL, timeout=30)
# Fail right here on an HTTP error (404, 500, ...) instead of silently
# handing an error page to the parsing cells further down.
response.raise_for_status()
text = response.text
In [3]:
# Human-readable column titles, in the order the fields appear in each row.
column_titles = (
    'Index',
    'Rent per arable acre',
    'Milk cows per square mile',
    'Difference between pasture and arable land',
    'Rental price per grassy acre',
)

# Normalise the titles to snake_case identifiers (lower-case, spaces replaced
# by underscores) so they can be used as DataFrame column names and inside
# model formulas.
columns = ['_'.join(title.lower().split(' ')) for title in column_titles]
In [7]:
# Strip surrounding whitespace from every line of the download and discard the
# lines that start with '#'.
# The text is split on '\n' explicitly: this keeps a trailing empty entry when
# the text ends with a newline, and the slice taken from `lines` later on is
# written against that exact layout.
lines = [
    stripped
    for stripped in map(str.strip, text.split('\n'))
    if not stripped.startswith('#')
]
In [14]:
# Keep only the data rows of the non-comment lines.
# NOTE(review): the leading 7 entries are presumably the header (dimension
# lines plus one line per column name) and the trailing 2 presumably blank
# lines -- confirm against the downloaded file before changing the slice.
data = [row.split() for row in lines[7:-2]]

# Build the frame, drop the row-index column and convert the remaining string
# fields to floats. The builtin `float` is used because the `np.float` alias
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
data_df = (
    pd.DataFrame(data=data, columns=columns)
    .drop('index', axis=1)
    .astype(float)
)
In [15]:
# Preview the first five parsed rows as a sanity check on the parsing step.
data_df.head(n=5)
Out[15]:
In [16]:
# Pairwise Pearson correlation between all columns of the frame.
data_df.corr(method='pearson')
Out[16]:
In [18]:
# Scatter plot of the response (rental price per grassy acre) against the
# rent per arable acre.
data_df.plot.scatter(
    x='rent_per_arable_acre',
    y='rental_price_per_grassy_acre',
)
Out[18]:
In [19]:
import statsmodels.formula.api as sm

# Single-predictor OLS: regress the rental price per grassy acre on the rent
# per arable acre.
simple_model = sm.ols(
    formula='rental_price_per_grassy_acre ~ rent_per_arable_acre',
    data=data_df,
)
result = simple_model.fit()

# Display the fitted model's summary as the cell output.
result.summary()
Out[19]:
In [20]:
import statsmodels.formula.api as sm

# Two-predictor OLS: same response as the single-predictor fit, with the
# pasture/arable difference added as a second regressor.
# Note that `result` is rebound here, replacing the previous fit.
multiple_model = sm.ols(
    formula='rental_price_per_grassy_acre ~ rent_per_arable_acre + difference_between_pasture_and_arable_land',
    data=data_df,
)
result = multiple_model.fit()

# Display the fitted model's summary as the cell output.
result.summary()
Out[20]:
In [ ]: