In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [2]:
np.random.seed(1)
N = 500
x = np.random.randn(N)
y = x + 1 * np.random.randn(N)

plt.figure(figsize=(6,6))
plt.plot(x, y, 'k.')

mdl = LinearRegression()
mdl.fit(np.array([x]).T, y)
xs_mdl = [-5, 5]
ys_mdl = mdl.predict(np.array([xs_mdl]).T)
plt.plot(xs_mdl, ys_mdl, 'k--', label='regression')

pmdl = PCA()
pmdl.fit(np.stack([x,y]).T)
pca1x, pca1y = pmdl.components_[0]
x_scale = -xs_mdl[1] / pca1x
y_plt = [x_scale*pca1y, -x_scale*pca1y]
plt.plot(xs_mdl, y_plt, 'r--', label='pca')

plt.plot(xs_mdl, xs_mdl, 'k', label='x=y')

plt.xlim((-5,5))
plt.ylim((-5,5))
plt.legend(loc='best')
plt.title('Data simulated y ~ x + error')


/Users/scott/anaconda/lib/python3.6/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  linalg.lstsq(X, y)
Out[2]:
Text(0.5, 1.0, 'Data simulated y ~ x + error')

In [3]:
np.random.seed(0)
N = 500
p1 = np.random.randn(N)
p2 = .5 * np.random.randn(N)
xp = p1 + p2
yp = p1 - p2

plt.figure(figsize=(6,6))
plt.plot(xp, yp, 'k.')

mdl = LinearRegression()
mdl.fit(np.array([xp]).T, yp)
xps_mdl = [-5, 5]
yps_mdl = mdl.predict(np.array([xps_mdl]).T)
plt.plot(xps_mdl, yps_mdl, 'k--', label='regression')

pmdl = PCA()
pmdl.fit(np.stack([xp,yp]).T)
print(pmdl.components_)
pca1xp, pca1yp = pmdl.components_[0]
xp_scale = -xps_mdl[1] / pca1xp
yp_plt = [xp_scale*pca1yp, -xp_scale*pca1yp]
plt.plot(xps_mdl, yp_plt, 'r--', label='pca')

plt.plot(xs_mdl, xs_mdl, 'k', label='x=y')

plt.xlim((-5,5))
plt.ylim((-5,5))
plt.legend(loc='best')
plt.title('Data simulated as 2 independent,\nperpendicular normal distributions')


[[-0.6883696  -0.72536011]
 [-0.72536011  0.6883696 ]]
Out[3]:
Text(0.5, 1.0, 'Data simulated as 2 independent,\nperpendicular normal distributions')

In [ ]: