In [1]:
%pylab inline
pylab.style.use('ggplot')
import numpy as np
import pandas as pd
In [2]:
import requests
# Download the raw dataset file as text.
URL = 'http://people.sc.fsu.edu/~jburkardt/datasets/regression/x02.txt'
# A timeout keeps this cell from hanging indefinitely on an unresponsive server.
response = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page as data.
response.raise_for_status()
text = response.text
In [8]:
# Strip surrounding whitespace from every line of the download and discard
# the lines that start with '#'. Empty lines are kept (as empty strings).
lines = [
    stripped
    for stripped in map(str.strip, text.split('\n'))
    if not stripped.startswith('#')
]
In [9]:
# Display the first ten entries of `lines` to inspect what remains after filtering.
lines[:10]
Out[9]:
In [10]:
# Labels for the table columns, listed in positional order.
columns = [
    'Index',
    'Patient Height',
    'Patient Weight',
    'Catheter Length',
]
In [16]:
# Keep only the entries between the leading and trailing non-data lines.
# NOTE(review): presumably the first six entries are the file's dimension and
# column-name preamble and the last two are trailing blanks; verify against the file.
N_LEADING_SKIPPED = 6
N_TRAILING_SKIPPED = 2
data = lines[N_LEADING_SKIPPED:-N_TRAILING_SKIPPED]
In [34]:
# Split every row on whitespace and convert all fields to floats.
# The builtin `float` / `int` replace the `np.float` / `np.int` aliases, which were
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
data_a = np.array([row.split() for row in data])
data_a = data_a.astype(float)
# Truncate column 1 toward zero. The array stays float, so this only drops the
# fractional part of the values in that column.
# NOTE(review): `columns` labels column 1 as 'Patient Height'; if the intent was to
# make the 'Index' column integral, this discards height precision instead. Confirm.
data_a[:, 1] = data_a[:, 1].astype(int)
In [42]:
# Build the frame, move 'Index' into the row index, and normalise the remaining
# column labels to lower case with underscores instead of spaces.
data_df = (
    pd.DataFrame(data=data_a, columns=columns)
    .set_index('Index')
    .rename(columns=lambda label: label.lower().replace(' ', '_'))
)
In [43]:
# Display the first four rows of the frame as a sanity check on the parsing.
data_df.head(4)
Out[43]:
In [48]:
# Pairwise correlation matrix of all columns in the frame.
data_df.corr()
Out[48]:
In [53]:
# Correlation of height and weight, each taken separately, with catheter length.
target = data_df['catheter_length']
predictors = data_df[['patient_height', 'patient_weight']]
predictors.corrwith(target)
Out[53]:
In [54]:
# Scatter plot of catheter length (y) against patient weight (x).
data_df.plot(
    x='patient_weight',
    y='catheter_length',
    kind='scatter',
)
Out[54]:
In [56]:
# Scatter plot of catheter length (y) against patient height (x).
data_df.plot(
    x='patient_height',
    y='catheter_length',
    kind='scatter',
)
Out[56]:
In [57]:
import statsmodels.formula.api as sm
# Fit catheter_length on patient_weight alone with the formula OLS interface,
# then display the fit summary.
weight_only_model = sm.ols(formula='catheter_length ~ patient_weight', data=data_df)
result = weight_only_model.fit()
result.summary()
Out[57]:
In [58]:
# Fit catheter_length on patient_height alone with the formula OLS interface,
# then display the fit summary. `result` is overwritten by this fit.
height_only_model = sm.ols(formula='catheter_length ~ patient_height', data=data_df)
result = height_only_model.fit()
result.summary()
Out[58]:
In [59]:
# Fit catheter_length on both patient_height and patient_weight, then display
# the fit summary. `result` is overwritten by this fit.
combined_model = sm.ols(
    formula='catheter_length ~ patient_height + patient_weight',
    data=data_df,
)
result = combined_model.fit()
result.summary()
Out[59]:
In [61]:
# Scatter plot of the two predictors against each other: weight (y) vs. height (x).
data_df.plot(
    x='patient_height',
    y='patient_weight',
    kind='scatter',
)
Out[61]:
In [ ]: