In [102]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [103]:
### linearly related, 1 feature
X = np.arange(600)
y = 2 * X
plt.scatter(X, y)
plt.show()

### not linearly related, 1 feature
#np.random.shuffle(X)
#plt.scatter(X, y)
#plt.show()

### not linearly related, 2 features
#X1 = np.asarray(X).reshape(600, 1)
#X2 = X1.copy()              # copy, so shuffling X2 does not also reorder X1
#np.random.shuffle(X2)
#X = np.hstack((X1, X2))
#X.shape



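If the two-feature block above is run with the copy fix, a quick OLS sanity check follows: since y = 2*x1 exactly, the shuffled second column should receive a coefficient of (numerically) zero. A minimal standalone sketch (res2 and the local X1/X2 here are illustrative names, not cells from the original run):

X1 = np.arange(600).reshape(600, 1)
X2 = X1.copy()
np.random.shuffle(X2)
res2 = sm.OLS(2 * X1.ravel(), sm.add_constant(np.hstack((X1, X2)))).fit()
print(res2.params)   # expect approximately [0, 2, 0]
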
In [104]:
def get_sample(X, y, n=10):
    """Draw n (X, y) pairs uniformly at random, without replacement."""
    indices = np.random.choice(len(X), size=n, replace=False)
    sample_X = [X[i] for i in indices]
    sample_y = [y[i] for i in indices]
    return sample_X, sample_y
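
An equivalent shortcut (a sketch) uses NumPy fancy indexing in place of the explicit loop, assuming X and y can be converted to arrays:

idx = np.random.choice(len(X), size=10, replace=False)
sample_X, sample_y = np.asarray(X)[idx], np.asarray(y)[idx]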

In [105]:
train_X, train_y = get_sample(X, y)

train_X = sm.add_constant(train_X)   # prepend a column of ones for the intercept
model = sm.OLS(train_y, train_X)
results = model.fit()
#print(results.params)
#print(results.tvalues)
#print(results.rsquared)
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 2.011e+31
Date:                Thu, 08 Jan 2015   Prob (F-statistic):          6.84e-123
Time:                        22:27:58   Log-Likelihood:                 277.17
No. Observations:                  10   AIC:                            -550.3
Df Residuals:                       8   BIC:                            -549.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       -1.99e-13   1.83e-13     -1.086      0.309     -6.21e-13  2.24e-13
x1             2.0000   4.46e-16   4.48e+15      0.000         2.000     2.000
==============================================================================
Omnibus:                        7.486   Durbin-Watson:                   0.007
Prob(Omnibus):                  0.024   Jarque-Bera (JB):                3.776
Skew:                          -1.500   Prob(JB):                        0.151
Kurtosis:                       3.250   Cond. No.                         959.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

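Because y = 2x with no noise, the fit is exact: the slope is recovered as 2.0, the intercept is numerically zero, and R-squared is 1.000. A quick sanity check on fresh inputs (a sketch; each design-matrix row is [const, x]):

print(results.predict([[1, 100], [1, 250]]))   # expect approximately [200. 500.]
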
In [106]:
regr = linear_model.LinearRegression()
# train_X already carries the constant column from sm.add_constant;
# LinearRegression fits its own intercept by default, so that column is
# redundant here (but harmless).
regr.fit(train_X, train_y)

test_X, test_y = get_sample(X, y)

test_X = sm.add_constant(test_X)   # match the shape of the training design matrix

#print('Coefficients: \n', regr.coef_)
print("Mean squared error: %.2f" % np.mean((regr.predict(test_X) - test_y) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(test_X, test_y))


Mean squared error: 0.00
Variance score: 1.00
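
The r2_score helper imported at the top gives the same number computed directly from the predictions (a sketch):

pred_y = regr.predict(test_X)
print('r2_score: %.2f' % r2_score(test_y, pred_y))   # expect 1.00 on this noiseless data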