In [23]:
%matplotlib inline
# Using Stochastic Gradient Descent (SGD) for Regression
# SGD is widely used for regression because it is simple and fast,
# and because the resulting model is easy to explain.

In [7]:
from sklearn import datasets

In [8]:
X, y = datasets.make_regression(int(1e6))

In [9]:
"{:,}".format(int(1e6))


Out[9]:
'1,000,000'

In [10]:
"{:,}".format(X.nbytes)


Out[10]:
'800,000,000'

In [12]:
X.nbytes / 1e6 # the number of megabytes


Out[12]:
800.0

In [13]:
# number of bytes per data point:
X.nbytes / (X.shape[0]*X.shape[1])


Out[13]:
8
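A quick sanity check on these numbers (a sketch, not part of the original session): make_regression defaults to 100 features, and each value is a 64-bit float, so the array size works out as follows.

# each value is a float64 (8 bytes); n_features defaults to 100 in make_regression
n_samples, n_features, bytes_per_value = int(1e6), 100, 8
n_samples * n_features * bytes_per_value  # 800,000,000 bytes, i.e. 800 MB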

In [14]:
# Fit a SGDRegressor model

In [25]:
from sklearn import linear_model
import numpy as np
from matplotlib import pyplot as plt

In [26]:
sgd = linear_model.SGDRegressor()

In [27]:
train = np.random.choice([True, False], size=len(y), p=[.75, .25])

In [28]:
sgd.fit(X[train], y[train])


Out[28]:
SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

In [29]:
# The loss defaults to squared_loss, which makes this ordinary least
# squares linear regression fit by stochastic gradient descent.
# shuffle=True shuffles the training data after each epoch, which
# helps the stochastic updates converge.
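As a minimal sketch (not part of the original recipe), the squared loss means the SGD solution should land close to an ordinary least squares fit; comparing coefficients against a closed-form fit on a subsample of the training data illustrates this.

# sketch: fit LinearRegression on a subsample and compare coefficients
# with the SGD fit; the two should be close
lr = linear_model.LinearRegression()
lr.fit(X[train][:10000], y[train][:10000])
np.abs(lr.coef_ - sgd.coef_).max()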

In [30]:
test_preds = sgd.predict(X[~train])

f, ax = plt.subplots(figsize=(7,5))
f.tight_layout()
ax.hist(test_preds - y[~train], label='Test Predictions', color='b', alpha=.5)

ax.set_title('Residuals')
ax.legend(loc='best')


Out[30]:
<matplotlib.legend.Legend at 0x107aa8e90>
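As a follow-up sketch (assuming the sgd model, test_preds, and train mask defined above), the residuals can also be summarized numerically rather than only plotted.

# sketch: numeric summary of the held-out residuals
residuals = test_preds - y[~train]
residuals.mean(), residuals.std()  # should be centred near zero
# mean absolute error on the held-out rows:
np.abs(residuals).mean()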
