In [4]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline
In [5]:
### linearly related, 1 feature
# Population of 600 points lying exactly on the line y = 2x; used as the
# pool that get_sample() draws train/test observations from.
X = list(range(600))
y = [2 * x for x in X]
plt.scatter(X, y)
plt.show()
### not linearly related, 1 feature
# Alternative scenario: shuffle X in place to destroy the linear relationship.
#np.random.shuffle(X)
#plt.scatter(X, y)
#plt.show()
### not linearly related, 2 features
# Alternative scenario: stack an ordered column next to a shuffled copy.
#X1 = np.array(X)
#X1.shape = (600,1)
#X2 = X1
#np.random.shuffle(X2)
#X = np.hstack((X1,X2))
#X.shape
In [6]:
def get_sample(X, y, size=10):
    """Draw a random sample of paired (X, y) observations without replacement.

    Parameters
    ----------
    X : sequence
        Feature values.
    y : sequence
        Target values, index-aligned with X (same length).
    size : int, optional
        Number of observations to draw (default 10, matching original use).

    Returns
    -------
    tuple of (list, list)
        (sample_X, sample_y), each of length ``size``, index-aligned so that
        sample_X[k] and sample_y[k] come from the same original observation.
    """
    # Draw from the actual population size instead of the hard-coded 600,
    # so the helper works for any dataset length.
    indices = np.random.choice(len(X), size=size, replace=False)
    sample_X = [X[i] for i in indices]
    sample_y = [y[i] for i in indices]
    return sample_X, sample_y
In [10]:
train_X, train_y = get_sample(X, y)
# statsmodels OLS does not add an intercept automatically; prepend one.
train_X = sm.add_constant(train_X)
model = sm.OLS(train_y, train_X)
results = model.fit()
# Individual pieces are available as results.params, results.tvalues and
# results.rsquared; the summary table shows all of them at once.
# print(...) function form for consistency with the sklearn cell below
# (and forward compatibility with Python 3).
print(results.summary())
In [11]:
# Fit the same data with sklearn and evaluate on a fresh held-out sample.
regr = linear_model.LinearRegression()
regr.fit(train_X, train_y)
test_X, test_y = get_sample(X, y)
test_X = sm.add_constant(test_X)
#print('Coefficients: \n', regr.coef_)
# np.mean of the squared residuals is the MEAN squared error, not the
# (summed) residual sum of squares -- label the output accordingly.
print("Mean squared error: %.2f" % np.mean((regr.predict(test_X) - test_y) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(test_X, test_y))
In [12]:
# Predictions on the held-out sample; consumed by the residual plot below.
y_hat_test = regr.predict(test_X)
In [25]:
# Residuals vs. actual values: points should scatter evenly around zero
# if the linear fit is adequate.
plt.scatter(test_y, (test_y - y_hat_test))
plt.plot([0, 1200], [0, 0])  # zero-residual reference line
plt.title('RESIDUAL PLOT')
plt.xlabel('actual value')  # x data is test_y (actuals), not the prediction
plt.ylabel('residuals')
plt.show()
In [30]:
# Murder rate regressed on poverty and HS-graduation rate
# (statsmodels' bundled statecrime demo dataset).
data = sm.datasets.statecrime.load_pandas().data
murder = data['murder']
# .copy() so the intercept column is written to an independent frame rather
# than a slice of `data` (avoids pandas' SettingWithCopyWarning / ambiguous
# chained-assignment write). Column order is kept so plot_fit(results, 0)
# still refers to 'poverty'.
X = data[['poverty', 'hs_grad']].copy()
X["constant"] = 1
y = murder
model = sm.OLS(y, X)
results = model.fit()
# Fitted vs. observed values plotted against the first regressor ('poverty').
fig = sm.graphics.plot_fit(results, 0)
plt.show()