In [23]:
%matplotlib inline
# Using Stochastic Gradient Descent (SGD) for Regression
# SGD is widely used for regression because it is simple and fast,
# and the fitted model is easy to explain.
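In [ ]:
# A minimal sketch (illustrative, not part of the original recipe) of the
# update SGDRegressor performs under the hood: for each sample, step the
# weights along the negative gradient of the squared loss. The toy data
# (X_toy, y_toy, true_w) and the learning rate eta are assumptions here.
import numpy as np

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(100, 3))
true_w = np.array([1.0, -2.0, 0.5])
y_toy = X_toy @ true_w

w = np.zeros(3)
eta = 0.01  # constant learning rate, kept fixed for simplicity
for xi, yi in zip(X_toy, y_toy):
    grad = (xi @ w - yi) * xi  # gradient of 0.5 * (xi @ w - yi) ** 2
    w -= eta * grad
w  # after one pass over the data, w has moved close to true_w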
In [7]:
from sklearn import datasets
In [8]:
X,y = datasets.make_regression(int(1e6))
In [9]:
"{:,}".format(int(1e6))
Out[9]:
'1,000,000'
In [10]:
"{:,}".format(X.nbytes)
Out[10]:
'800,000,000'
In [12]:
X.nbytes / 1e6 # the number of megabytes
Out[12]:
800.0
In [13]:
# number of bytes per data point:
X.nbytes / (X.shape[0]*X.shape[1])
Out[13]:
8.0
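In [ ]:
# Sanity check (an added illustration): nbytes is just rows * columns *
# bytes per value, and each float64 value occupies 8 bytes.
X.nbytes == X.shape[0] * X.shape[1] * X.dtype.itemsize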
In [14]:
# Fit a SGDRegressor model
In [25]:
from sklearn import linear_model
import numpy as np
from matplotlib import pyplot as plt
In [26]:
sgd = linear_model.SGDRegressor()
In [27]:
train = np.random.choice([True, False], size=len(y), p=[.75, .25])  # boolean mask: ~75% train, ~25% test
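In [ ]:
# An added check: the realized training fraction should be close to .75.
train.mean()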
In [28]:
sgd.fit(X[train], y[train])
Out[28]:
SGDRegressor()
In [29]:
# The loss defaults to squared error, which makes this ordinary
# linear regression fit by SGD. shuffle=True (the default) reshuffles
# the training data after each epoch.
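In [ ]:
# The same fit with the defaults spelled out (a sketch; note the loss
# is named 'squared_error' in recent scikit-learn releases and
# 'squared_loss' in older ones, so adjust for your version).
sgd_explicit = linear_model.SGDRegressor(loss='squared_error',
                                         shuffle=True, random_state=0)
sgd_explicit.fit(X[train], y[train])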
In [30]:
test_preds = sgd.predict(X[~train])
f, ax = plt.subplots(figsize=(7,5))
f.tight_layout()
ax.hist(test_preds - y[~train], label='Test Predictions', color='b', alpha=.5)
ax.set_title('Residuals')
ax.legend(loc='best')
Out[30]:
[Figure: histogram of the test-set residuals (test_preds - y[~train])]
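In [ ]:
# A numeric companion to the histogram (an added illustration): the
# residuals should be roughly centered on zero with a small spread.
residuals = test_preds - y[~train]
residuals.mean(), residuals.std()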