In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import statsmodels.api as sm
import patsy as pt
from ggplot import *
# show graphs right in the notebook (rather than in another window)
%matplotlib inline
In [2]:
# Load the sample data set.
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv with
# index_col=0 and parse_dates=True is its documented equivalent.
df = pd.read_csv(
    "http://roman-kh.github.io/files/linear-models/simple1.csv",
    index_col=0,
    parse_dates=True,
)
# x - a table with factors x1,x2,x3 (every column except the last one).
# An explicit copy is taken because a later cell adds a column to x; without it
# x would be a slice of df and the assignment can raise SettingWithCopyWarning.
x = df.iloc[:, :-1].copy()
# y - the dependent variable (the last column)
y = df.iloc[:, -1]
# preview the first rows
df.head()
Out[2]:
In [3]:
# Fit an ordinary least-squares model with scikit-learn.
# fit() returns the estimator itself, so creation and fitting are chained.
skm = lm.LinearRegression().fit(x, y)
# display the estimated intercept followed by the factor coefficients
skm.intercept_, skm.coef_
Out[3]:
In [4]:
# statsmodels does not fit an intercept unless the design matrix has a constant
# column, so add one to the factors first
x_ = sm.add_constant(x)
# describe the model: y is the response (endog), x_ holds the regressors (exog)
smm = sm.OLS(endog=y, exog=x_)
# estimate the parameters
res = smm.fit()
# keep the coefficients as a plain array; later cells index into b
b = res.params.values
b
Out[4]:
In [5]:
# The same regression described by a formula; the variable names in the
# formula are looked up as columns of df.
smm = sm.OLS.from_formula(formula="y ~ x1 + x2 + x3", data=df)
# estimate the parameters
res = smm.fit()
# display the coefficients as a plain array
res.params.values
Out[5]:
In [6]:
# Build the response matrix (pt_y) and the factor matrix (pt_x) from the
# dataframe with patsy, using the same formula as above.
pt_y, pt_x = pt.dmatrices("y ~ x1 + x2 + x3", data=df)
# Solve the least-squares problem directly with NumPy.
# NOTE(review): NumPy >= 1.14 emits a FutureWarning when rcond is omitted;
# consider passing rcond explicitly once the minimum NumPy version is settled.
res = np.linalg.lstsq(pt_x, pt_y)
# the first element of the result is the solution; flatten it to 1-D
res[0].ravel()
Out[6]:
In [7]:
# scatter plot of y against x1
(
    ggplot(aes(y="y", x="x1"), data=df)
    + geom_point()
)
Out[7]:
In [8]:
# scatter plot of y against x2
(
    ggplot(aes(y="y", x="x2"), data=df)
    + geom_point()
)
Out[8]:
In [9]:
# scatter plot of y against x3
(
    ggplot(aes(y="y", x="x3"), data=df)
    + geom_point()
)
Out[9]:
In [10]:
# scatter plot of y against x1, with points coloured by x3
(
    ggplot(aes(color="x3", y="y", x="x1"), data=df)
    + geom_point()
)
Out[10]:
In [11]:
# scatter plot of y against x2, with points coloured by x3
(
    ggplot(aes(color="x3", y="y", x="x2"), data=df)
    + geom_point()
)
Out[11]:
In [12]:
# Regression lines in the (x1, y) plane, with x2 held at its mean value.
# Intercept of the x3=0 line: constant term plus the mean contribution of x2.
b0w = b[0] + df["x2"].mean() * b[2]
# Intercept of the x3=1 line: the same value shifted by the x3 coefficient.
b0m = b0w + b[3]
# Both lines share the x1 slope b[1]; only the intercept differs.
gg = (
    ggplot(aes(x="x1", y="y", color="x3"), data=df)
    + geom_point()
    + geom_abline(intercept=b0w, slope=b[1], color="#CC3333")
    + geom_abline(intercept=b0m, slope=b[1], color="#3399CC")
)
gg
Out[12]:
In [13]:
# Regression lines in the (x2, y) plane, with x1 held at its mean value.
# Intercept of the first line: constant term plus the mean contribution of x1.
b0w = b[0] + df["x1"].mean() * b[1]
# Intercept of the second line: the same value shifted by the x3 coefficient.
b0m = b0w + b[3]
# Both lines share the x2 slope b[2]; only the intercept differs.
gg = (
    ggplot(aes(x="x2", y="y", color="x3"), data=df)
    + geom_point()
    + geom_abline(intercept=b0w, slope=b[2], color="#CC3333")
    + geom_abline(intercept=b0m, slope=b[2], color="#3399CC")
)
gg
Out[13]:
In [14]:
# Create a new factor x4 = x2*x3 (the interaction between x2 and x3).
# assign() builds a new frame instead of writing into x in place: x was sliced
# from df, so an in-place column assignment can raise SettingWithCopyWarning.
# Re-running the cell simply recomputes the same x4 column.
x = x.assign(x4=x["x2"] * x["x3"])
# refit the scikit-learn model on the extended factor table
skm = lm.LinearRegression()
skm.fit(x, y)
# display the intercept followed by the coefficients (x4 is the last one)
skm.intercept_, skm.coef_
Out[14]:
In [15]:
# Make sure the interaction factor x4 = x2*x3 is present.
# assign() builds a new frame instead of writing into x in place: x was sliced
# from df, so an in-place column assignment can raise SettingWithCopyWarning.
x = x.assign(x4=x["x2"] * x["x3"])
# add a constant column so that statsmodels estimates an intercept
x_ = sm.add_constant(x)
smm = sm.OLS(y, x_)
res = smm.fit()
# coefficients of the model with the interaction term; later cells index into b2
b2 = res.params.values
b2
Out[15]:
In [16]:
# The model with the x2/x3 interaction, described by a formula.
# NOTE(review): in formula syntax `x2*x3` should already expand to
# x2 + x3 + x2:x3, so the explicit x2 and x3 terms look redundant - confirm.
smm = sm.OLS.from_formula(formula="y ~ x1 + x2 + x3 + x2*x3", data=df)
# estimate the parameters
res = smm.fit()
# display the coefficients as a plain array
res.params.values
Out[16]:
In [17]:
# Build the response and factor matrices for the interaction model with patsy.
pt_y, pt_x = pt.dmatrices("y ~ x1 + x2 + x3 + x2*x3", data=df)
# Solve the least-squares problem directly with NumPy.
# NOTE(review): NumPy >= 1.14 emits a FutureWarning when rcond is omitted;
# consider passing rcond explicitly once the minimum NumPy version is settled.
res = np.linalg.lstsq(pt_x, pt_y)
# the first element of the result is the solution; flatten it to 1-D
res[0].ravel()
Out[17]:
In [18]:
# Regression lines in the (x2, y) plane for the interaction model, with x1
# held at its mean value.
# Intercept of the first line: constant term plus the mean contribution of x1.
b0w2 = b2[0] + df["x1"].mean() * b2[1]
# Intercept of the second line: the same value shifted by the x3 coefficient.
b0m2 = b0w2 + b2[3]
# The second line also gets a different slope: the interaction coefficient
# b2[4] is added to the x2 slope b2[2].
gg = (
    ggplot(aes(x="x2", y="y", color="x3"), data=df)
    + geom_point()
    + geom_abline(intercept=b0w2, slope=b2[2], color="#CC3333")
    + geom_abline(intercept=b0m2, slope=b2[2] + b2[4], color="#3399CC")
)
gg
Out[18]: