In [1]:
# IPython magic: loads numpy/matplotlib names into the interactive namespace
# (including `pylab`, used on the next line) and renders figures inline.
%pylab inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
pylab.style.use('ggplot')
import numpy as np
import pandas as pd
In [13]:
import requests

# Location of the plain-text regression dataset that the later cells parse.
URL = 'http://people.sc.fsu.edu/~jburkardt/datasets/regression/x04.txt'
# Bound the wait so an unreachable server cannot hang the cell indefinitely.
response = requests.get(URL, timeout=30)
# Raise on a 4xx/5xx status instead of handing an error page to the parser.
response.raise_for_status()
# Decoded body of the response; the parsing cell splits this into lines.
text = response.text
In [32]:
# Labels applied, in order, to the columns of the parsed data array.
columns = [
    'Index',
    'Print Runs',
    'Page Numbers',
    'Orders',
]
In [40]:
# Strip every line of the downloaded text and discard the '#'-prefixed comment lines.
lines = [line.strip() for line in text.split('\n') if not line.strip().startswith('#')]
# Keep only the numeric rows: skip the first six remaining lines and the last two
# (presumably header/label lines and trailing blank lines -- confirm against the file).
data = lines[6:-2]
# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (it raises AttributeError on current NumPy).
data = np.array([row.split() for row in data], dtype=float)
# Same fix for `np.int`. This truncates the first column to whole numbers; the
# array itself stays float, so the values are stored back as floats.
data[:, 0] = data[:, 0].astype(int)
data_df = pd.DataFrame(data, columns=columns)
# Lower-case the labels and replace spaces with underscores so the columns can
# be referenced by name in the model formula strings.
data_df = data_df.rename(columns={name: name.lower().replace(' ', '_') for name in data_df.keys()})
# The 'index' column is not used by the analysis below, so drop it.
data_df = data_df.drop('index', axis=1)
In [43]:
# Preview the first rows of the cleaned table as a sanity check on the parsing.
data_df.head()
Out[43]:
In [42]:
# Pairwise correlation matrix of the remaining columns.
data_df.corr()
Out[42]:
In [44]:
# Scatter plot of orders (y) against page_numbers (x).
data_df.plot.scatter(x='page_numbers', y='orders')
Out[44]:
In [46]:
import statsmodels.formula.api as sm

# Ordinary least squares with a single predictor: orders ~ page_numbers.
result = sm.ols(
    formula='orders ~ page_numbers',
    data=data_df,
).fit()
# Last expression of the cell, so the fit summary is displayed as its output.
result.summary()
Out[46]:
In [47]:
# Ordinary least squares with two predictors: orders ~ page_numbers + print_runs.
# Uses the `sm` alias (statsmodels.formula.api) imported by the preceding
# regression cell; the repeated import that used to sit here was redundant.
# NOTE: this rebinds `result`, replacing the single-predictor fit from that cell.
result = sm.ols(formula='orders ~ page_numbers + print_runs', data=data_df).fit()
# Last expression of the cell, so the fit summary is displayed as its output.
result.summary()
Out[47]:
In [ ]: