In [1]:
%pylab inline
pylab.style.use('ggplot')
import numpy as np
import pandas as pd
In [2]:
import requests

# Plain-text table that the following cells parse into the age /
# systolic blood pressure data set.
URL = 'http://people.sc.fsu.edu/~jburkardt/datasets/regression/x03.txt'

# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(URL, timeout=30)

# Fail loudly on a 4xx/5xx reply so that an error page is never handed to the
# parsing cell as if it were the data file.
response.raise_for_status()

text = response.text
In [6]:
# Column labels for the parsed table, in file order. They are converted to
# snake_case once the DataFrame is built.
columns = [
    'Index',
    'One',  # presumably a constant column of ones -- TODO confirm against the file
    'Age',
    'Systolic Blood Pressure',
]
In [10]:
# Strip every line and discard the '#'-prefixed comment lines.
# NOTE: split('\n') is kept deliberately (instead of splitlines()) because the
# negative slice bound below counts the trailing entries that split() produces.
lines = [line.strip() for line in text.split('\n') if not line.strip().startswith('#')]

# Skip the first 6 non-comment lines and the last 2 entries.
# Presumably these are the header lines (column/row counts and column names)
# and trailing blank lines -- verify against the downloaded file.
data = lines[6:-2]

# Whitespace-separated numeric fields -> 2-D float array.
# `np.float` / `np.int` were only aliases of the builtins and were removed in
# NumPy 1.24, so the builtin types are used directly.
data = np.array([row.split() for row in data], dtype=float)

# Truncate column 1 to whole numbers. The array itself stays float, so the
# truncated values are stored back as floats.
data[:, 1] = data[:, 1].astype(int)
In [12]:
# Wrap the parsed array in a DataFrame and normalise the column labels to
# snake_case (lower-case, spaces replaced by underscores) so they can be used
# as identifiers in model formulas.
data_df = (
    pd.DataFrame(data=data, columns=columns)
    .rename(columns=lambda label: label.lower().replace(' ', '_'))
)
In [14]:
# Keep only the two columns used by the plots and regressions below.
data_pressure = data_df.loc[:, ['age', 'systolic_blood_pressure']]
In [15]:
# Scatter plot of systolic blood pressure against age for every row.
data_pressure.plot.scatter(x='age', y='systolic_blood_pressure')
Out[15]:
In [17]:
import statsmodels.formula.api as sm

# Ordinary least squares fit of systolic blood pressure on age, using every
# row of the table.
result = sm.ols(
    data=data_df,
    formula='systolic_blood_pressure ~ age',
).fit()

# Last expression of the cell, so the fit summary is rendered as its output.
result.summary()
Out[17]:
In [27]:
# Order rows from highest to lowest systolic blood pressure, then drop the
# first one, i.e. the single row with the largest reading.
without_outliers = (
    data_pressure
    .sort_values(by='systolic_blood_pressure', ascending=False)
    .iloc[1:]
)
In [28]:
# Same scatter plot as before, with the highest-pressure row removed.
without_outliers.plot.scatter(x='age', y='systolic_blood_pressure')
Out[28]:
In [29]:
# Refit the same model on the table with the highest-pressure row removed.
# This rebinds `result`, replacing the earlier full-data fit.
result = sm.ols(
    data=without_outliers,
    formula='systolic_blood_pressure ~ age',
).fit()

# Last expression of the cell, so the fit summary is rendered as its output.
result.summary()
Out[29]:
In [ ]: