In [4]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
### linearly related, 1 feature
X = list(np.arange(600))   # 0..599
y = [2 * x for x in X]
plt.scatter(X, y)
plt.show()

### not linearly related, 1 feature
#np.random.shuffle(X)
#plt.scatter(X, y)
#plt.show()

### not linearly related, 2 features
#X1 = np.array(X)
#X1.shape = (600, 1)
#X2 = X1.copy()              # copy, so shuffling X2 does not also shuffle X1
#np.random.shuffle(X2)
#X = np.hstack((X1, X2))
#X.shape
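
If the two-feature variant is uncommented, a quick sanity check is to regress y on both columns: the shuffled column should pick up a weight near zero. A minimal sketch, assuming the same construction as the commented lines above:

In [ ]:
# Sketch: fit OLS on the two-feature design; the shuffled column should get ~0 weight
X1 = np.arange(600).reshape(600, 1)
X2 = X1.copy()
np.random.shuffle(X2)
X_two = np.hstack((X1, X2))
y_lin = 2 * X1.ravel()
res = sm.OLS(y_lin, sm.add_constant(X_two)).fit()
print(res.params)   # expect roughly [0, 2, 0]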



In [6]:
def get_sample(X, y):
    """Draw 10 (x, y) pairs at random, without replacement."""
    indices = np.random.choice(len(X), size=10, replace=False)
    sample_X = [X[i] for i in indices]
    sample_y = [y[i] for i in indices]
    return sample_X, sample_y
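
Since the indices come from np.random.choice, the same draw can be done in one step with fancy indexing when X and y are numpy arrays. A minimal sketch, assuming array inputs:

In [ ]:
# Sketch: vectorized sampling, assuming X and y are (convertible to) numpy arrays
idx = np.random.choice(len(X), size=10, replace=False)
sample_X, sample_y = np.asarray(X)[idx], np.asarray(y)[idx]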

In [10]:
train_X, train_y = get_sample(X,y)

train_X = sm.add_constant(train_X)
model = sm.OLS(train_y, train_X)
results = model.fit()
#print(results.params)
#print(results.tvalues)
#print(results.rsquared)
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 8.456e+30
Date:                Sun, 11 Jan 2015   Prob (F-statistic):          2.19e-121
Time:                        21:28:55   Log-Likelihood:                 274.47
No. Observations:                  10   AIC:                            -544.9
Df Residuals:                       8   BIC:                            -544.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       7.105e-15   2.24e-13      0.032      0.975     -5.09e-13  5.23e-13
x1             2.0000   6.88e-16   2.91e+15      0.000         2.000     2.000
==============================================================================
Omnibus:                        0.453   Durbin-Watson:                   0.374
Prob(Omnibus):                  0.797   Jarque-Bera (JB):                0.488
Skew:                          -0.134   Prob(JB):                        0.783
Kurtosis:                       1.951   Cond. No.                         708.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
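
The commented lines above hint at the programmatic accessors on the fitted results; the object also exposes fitted values and residuals directly. A minimal sketch using standard statsmodels attributes:

In [ ]:
print(results.params)          # intercept ~0 and slope ~2
print(results.rsquared)        # 1.0 for this noiseless line
print(results.fittedvalues[:3])
print(results.resid[:3])       # essentially zero everywhere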

In [11]:
# Note: train_X already carries the constant column from sm.add_constant;
# LinearRegression fits its own intercept, so that column is redundant (but harmless).
regr = linear_model.LinearRegression()
regr.fit(train_X, train_y)

test_X, test_y = get_sample(X, y)
test_X = sm.add_constant(test_X)

#print('Coefficients: \n', regr.coef_)
print("Residual sum of squares: %.2f" % np.mean((regr.predict(test_X) - test_y) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(test_X, test_y))


Residual sum of squares: 0.00
Variance score: 1.00
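
r2_score was imported above but never used; it computes the same number as regr.score, just from explicit predictions. A quick check:

In [ ]:
y_pred = regr.predict(test_X)
print('r2_score: %.2f' % r2_score(test_y, y_pred))   # matches the variance score above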

In [12]:
y_hat_test = regr.predict(test_X)

In [25]:
# Plot residuals against predictions; a flat band around zero supports the linear fit
plt.scatter(y_hat_test, test_y - y_hat_test)
plt.plot([0, 1200], [0, 0])
plt.title('RESIDUAL PLOT')
plt.xlabel('prediction')
plt.ylabel('residuals')
plt.show()



In [30]:
data = sm.datasets.statecrime.load_pandas().data
murder = data['murder']
X = data[['poverty', 'hs_grad']].copy()   # copy, so adding a column doesn't modify `data`

X["constant"] = 1
y = murder
model = sm.OLS(y, X)
results = model.fit()

# Plot observed and fitted values against the first regressor, 'poverty'
fig = sm.graphics.plot_fit(results, 0)

plt.show()
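
With two regressors, the single plot_fit view can be complemented by a coefficient table and by partial-regression plots, one per term. A minimal sketch using statsmodels' plot_partregress_grid:

In [ ]:
print(results.summary())                # coefficient table for poverty and hs_grad
# One partial-regression plot per regressor (columns 0 and 1, skipping the constant)
fig = sm.graphics.plot_partregress_grid(results, exog_idx=[0, 1])
plt.show()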