In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
# Load the Boston housing dataset through scikit-learn's datasets module.
# The returned object's .data, .feature_names and .target attributes are what
# the next cell reads to build the DataFrame.
# NOTE(review): load_boston is reportedly deprecated/removed in recent
# scikit-learn releases -- confirm the installed version still provides it.
boston = datasets.load_boston()
In [3]:
# Put the feature columns and the target (stored as "MEDV") into one frame.
boston_df = pd.DataFrame(
    boston.data,
    columns=boston.feature_names,
).assign(MEDV=boston.target)
# Simple linear regression needs a single predictor, so keep only the
# response (MEDV) and LSTAT in a two-column frame.
df = boston_df[["MEDV", "LSTAT"]]
# Joint plot of MEDV against LSTAT with kind="reg"; the trailing semicolon
# suppresses the repr of the returned object in the notebook output.
# NOTE(review): newer seaborn versions rename `size` to `height` -- confirm
# against the installed seaborn before changing it.
sns.jointplot(data=boston_df, x="LSTAT", y="MEDV", kind="reg", size=4);
In [6]:
import statsmodels.api as sm
# Caution: no constant column is added here, so only LSTAT is passed as the
# design matrix. statsmodels fits this without complaint, but the result is
# different from the fit that includes a constant (intercept) term.
X = boston_df["LSTAT"]
model = sm.OLS(boston_df["MEDV"], X)
result = model.fit()
# print is called as a function: the single-argument form produces the same
# output on Python 2 and Python 3, while the print statement is a SyntaxError
# on Python 3.
print(result.summary())
In [8]:
import statsmodels.api as sm
# The constant has to be added explicitly: sm.add_constant augments LSTAT
# with a constant column so that the fit includes an intercept.
X = sm.add_constant(boston_df["LSTAT"])
model = sm.OLS(boston_df["MEDV"], X)
result = model.fit()
# print is called as a function: the single-argument form produces the same
# output on Python 2 and Python 3, while the print statement is a SyntaxError
# on Python 3.
print(result.summary())
In [9]:
# Alternatively, use the R-style formula interface: the model is described by
# the formula string and the columns are looked up in boston_df.
# NOTE(review): presumably the formula adds the intercept implicitly, as in
# R -- confirm by comparing this summary with the add_constant fit.
import statsmodels.formula.api as smf
model = smf.ols(formula='MEDV ~ LSTAT', data=boston_df)
res = model.fit()
# print is called as a function: the single-argument form produces the same
# output on Python 2 and Python 3, while the print statement is a SyntaxError
# on Python 3.
print(res.summary())
In [10]:
# Scikit-learn is built on top of SciPy, so the same simple regression can be
# obtained directly from scipy.stats.
# Note: the fit below uses every row of boston_df; no train/test split is
# made in this cell.
# Scikit-learn follows the machine-learning tradition where the main supported
# task is choosing the "best" model for prediction: the emphasis is on model
# selection for out-of-sample prediction and therefore on cross-validation
# against "test data".
from scipy import stats
slope, intercept, r_value, p_value, std_err = stats.linregress(
    boston_df["LSTAT"], boston_df["MEDV"]
)
# r_value * r_value is the squared correlation (R^2) of the fit.
# The values are joined into one space-separated string so that the output is
# the same on Python 2 and Python 3; the multi-argument print statement is a
# SyntaxError on Python 3, and print(a, b, ...) would print a tuple on Python 2.
print(" ".join(
    str(value)
    for value in (slope, intercept, r_value * r_value, p_value, std_err)
))
In [ ]: