In [1]:
    
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline
    
In [2]:
    
from sklearn.preprocessing import StandardScaler
from sklearn.datasets.samples_generator import make_regression
import numpy as np
def generate_dataset(n_train, n_test, n_features, noise=0.1):
    """Build a standardized synthetic regression problem split into train/test.

    Parameters
    ----------
    n_train : int
        Number of leading samples used as the training split.
    n_test : int
        Number of trailing samples used as the test split.
    n_features : int
        Number of features requested from ``make_regression``.
    noise : float, default 0.1
        ``noise`` argument forwarded to ``make_regression``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature matrices and 1-D targets. Both features and targets are
        standardized with scalers fitted on the training split only.
    """
    # Cast once so the slices below accept the same inputs (e.g. 1e3) that
    # the int() casts already allowed for make_regression.
    n_train = int(n_train)
    n_test = int(n_test)

    # Fixed random_state: identical arguments always give identical data.
    X, y = make_regression(n_samples=int(n_train + n_test),
                           n_features=int(n_features), noise=noise,
                           random_state=101)

    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    # Fit on the training rows only, then apply the same transform to test.
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    # StandardScaler works on 2-D input, so scale the target as a single
    # column and flatten it back to the 1-D shape the regressors expect.
    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
    y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()

    return X_train, X_test, y_train, y_test
    
In [3]:
    
from sklearn.linear_model import LinearRegression, SGDRegressor
import time
import gc
    
In [4]:
    
# Benchmark grid: every regressor is timed on every
# (training-set size, feature count) combination.
n_test = 1000
n_train_v = (1000, 10000, 100000)
n_features_v = (10, 50, 100, 500, 1000)
regr_v = {'LR': LinearRegression(), 'SGD': SGDRegressor(random_state=101)}

# results[regr_name][(n_train, n_features)] -> {'train': [...], 'pred': [...]}
# where each list holds one duration in seconds per repetition.
results = {}
for regr_name, regr in regr_v.items():

    results[regr_name] = {}

    for n_train in n_train_v:
        for n_features in n_features_v:
            # generate_dataset uses a fixed random_state, so every repetition
            # would rebuild the very same arrays: build them once per
            # configuration instead of once per repetition.
            gc.collect()
            X_train, X_test, y_train, y_test = \
                generate_dataset(n_train, n_test, n_features)

            timing_log = {'train': [], 'pred': []}
            results[regr_name][(n_train, n_features)] = timing_log

            for n_repetition in range(5):

                gc.collect()
                # perf_counter is a monotonic, high-resolution clock; the
                # wall clock can be too coarse for the short predict calls.
                tick = time.perf_counter()
                regr.fit(X_train, y_train)
                tock = time.perf_counter()
                pred = regr.predict(X_test)
                predict_time = time.perf_counter() - tock
                train_time = tock - tick

                timing_log['train'].append(train_time)
                timing_log['pred'].append(predict_time)
    
    
In [5]:
    
# Wide canvas (12 x 6) for the two-panel figures drawn in the next cells
pylab.rcParams['figure.figsize'] = (12, 6)
    
In [6]:
    
# LinearRegression mean training time.
# Left panel: against the feature count; right panel: against the
# (log10) number of training points.
fig, (ax_by_features, ax_by_samples) = plt.subplots(1, 2)

# One curve per training-set size
for n_train in n_train_v:
    mean_times = [np.mean(results['LR'][(n_train, n_feat)]['train'])
                  for n_feat in n_features_v]
    ax_by_features.plot(n_features_v, mean_times,
                        label=str(n_train) + " train points")
ax_by_features.set_title('Training time VS num. features')
ax_by_features.set_xlabel('Num features')
ax_by_features.set_ylabel('Training time [s]')
ax_by_features.legend(loc=0)

# One curve per feature count
log_n_train = np.log10(n_train_v)
for n_features in n_features_v:
    mean_times = [np.mean(results['LR'][(n_tr, n_features)]['train'])
                  for n_tr in n_train_v]
    ax_by_samples.plot(log_n_train, mean_times,
                       label=str(n_features) + " features")
ax_by_samples.set_title('Training time VS num. training points')
ax_by_samples.set_xlabel('Num training points [log10]')
ax_by_samples.set_ylabel('Training time [s]')
ax_by_samples.legend(loc=0)

plt.show()
    
    
In [7]:
    
# LinearRegression mean prediction time.
# Left panel: against the feature count; right panel: against the
# (log10) number of training points.
fig, (ax_by_features, ax_by_samples) = plt.subplots(1, 2)

# One curve per training-set size
for n_train in n_train_v:
    mean_times = [np.mean(results['LR'][(n_train, n_feat)]['pred'])
                  for n_feat in n_features_v]
    ax_by_features.plot(n_features_v, mean_times,
                        label=str(n_train) + " train points")
ax_by_features.set_title('Prediction time VS num. features')
ax_by_features.set_xlabel('Num features')
ax_by_features.set_ylabel('Prediction time [s]')
ax_by_features.legend(loc=0)

# One curve per feature count
log_n_train = np.log10(n_train_v)
for n_features in n_features_v:
    mean_times = [np.mean(results['LR'][(n_tr, n_features)]['pred'])
                  for n_tr in n_train_v]
    ax_by_samples.plot(log_n_train, mean_times,
                       label=str(n_features) + " features")
ax_by_samples.set_title('Prediction time VS num. training points')
ax_by_samples.set_xlabel('Num training points [log10]')
ax_by_samples.set_ylabel('Prediction time [s]')
# Legend is pinned by its upper-left corner (loc=2) at a fixed axes position
ax_by_samples.legend(bbox_to_anchor=(0.01, 0.76), loc=2, borderaxespad=0.)

plt.show()
    
    
In [8]:
    
# SGDRegressor mean training time.
# Left panel: against the feature count; right panel: against the
# (log10) number of training points.
fig, (ax_by_features, ax_by_samples) = plt.subplots(1, 2)

# One curve per training-set size
for n_train in n_train_v:
    mean_times = [np.mean(results['SGD'][(n_train, n_feat)]['train'])
                  for n_feat in n_features_v]
    ax_by_features.plot(n_features_v, mean_times,
                        label=str(n_train) + " train points")
ax_by_features.set_title('Training time VS num. features')
ax_by_features.set_xlabel('Num features')
ax_by_features.set_ylabel('Training time [s]')
ax_by_features.legend(loc=0)

# One curve per feature count
log_n_train = np.log10(n_train_v)
for n_features in n_features_v:
    mean_times = [np.mean(results['SGD'][(n_tr, n_features)]['train'])
                  for n_tr in n_train_v]
    ax_by_samples.plot(log_n_train, mean_times,
                       label=str(n_features) + " features")
ax_by_samples.set_title('Training time VS num. training points')
ax_by_samples.set_xlabel('Num training points [log10]')
ax_by_samples.set_ylabel('Training time [s]')
ax_by_samples.legend(loc=0)

plt.show()
    
    
In [9]:
    
# SGDRegressor mean prediction time.
# Left panel: against the feature count; right panel: against the
# (log10) number of training points.
fig, (ax_by_features, ax_by_samples) = plt.subplots(1, 2)

# One curve per training-set size
for n_train in n_train_v:
    mean_times = [np.mean(results['SGD'][(n_train, n_feat)]['pred'])
                  for n_feat in n_features_v]
    ax_by_features.plot(n_features_v, mean_times,
                        label=str(n_train) + " train points")
ax_by_features.set_title('Prediction time VS num. features')
ax_by_features.set_xlabel('Num features')
ax_by_features.set_ylabel('Prediction time [s]')
ax_by_features.legend(loc=0)

# One curve per feature count
log_n_train = np.log10(n_train_v)
for n_features in n_features_v:
    mean_times = [np.mean(results['SGD'][(n_tr, n_features)]['pred'])
                  for n_tr in n_train_v]
    ax_by_samples.plot(log_n_train, mean_times,
                       label=str(n_features) + " features")
ax_by_samples.set_title('Prediction time VS num. training points')
ax_by_samples.set_xlabel('Num training points [log10]')
ax_by_samples.set_ylabel('Prediction time [s]')
ax_by_samples.legend(loc=0)

plt.show()
    
    
In [10]:
    
# Run a garbage-collection pass before allocating the large dataset
gc.collect()
# 2,000,000 training rows, 10,000 test rows, 100 features, noise=10.0
X_train, X_test, y_train, y_test = generate_dataset(
    n_train=2000000, n_test=10000, n_features=100, noise=10.0)
    
In [11]:
    
# Memory footprint of the training matrix, in units of 1e9 bytes
print("Size of X_train is [GB]:", X_train.nbytes / 1E9)
    
    
In [12]:
    
from sklearn.metrics import mean_absolute_error
# Batch learning: one fit() call over the whole training set, timed,
# followed by the log10 of the test-set mean absolute error.
regr = SGDRegressor(random_state=101)

tick = time.time()
regr.fit(X_train, y_train)
elapsed = time.time() - tick
print("With SGD, after", elapsed, "seconds")

pred = regr.predict(X_test)
batch_mae = mean_absolute_error(y_test, pred)
print("the MAE is [log10]:", np.log10(batch_mae))
    
    
In [13]:
    
def get_minibatch(X, y, batch_size):
    """Yield shuffled ``(X_batch, y_batch)`` pairs of at most ``batch_size`` rows.

    ``X`` and ``y`` are first permuted together (sampling without
    replacement, fixed seed), then handed out as consecutive slices. When the
    number of rows is not a multiple of ``batch_size``, one final shorter
    batch carries the leftover rows.
    """
    from sklearn.utils import resample

    # Same fixed seed on every call, so the shuffled order is repeatable
    X_shuffled, y_shuffled = resample(X, y, replace=False, random_state=101)
    n_rows = y_shuffled.shape[0]

    # Full-size batches
    n_full = n_rows // batch_size
    for start in range(0, n_full * batch_size, batch_size):
        stop = start + batch_size
        yield (X_shuffled[start:stop, :], y_shuffled[start:stop])

    # Trailing partial batch, if any rows are left over
    leftover = n_rows % batch_size
    if leftover > 0:
        yield (X_shuffled[-leftover:, :], y_shuffled[-leftover:])
# For each mini-batch size, stream the training set through partial_fit and
# record, after every batch, the cumulative training time and the log10 of
# the test-set MAE. One curve per batch size is kept for plotting.
plot_x = []
plot_y = []
plot_labels = []
for batch_size in (1000, 10000, 100000):
    regr = SGDRegressor(random_state=101)
    training_time = 0.0

    elapsed_curve = []
    log_mae_curve = []

    for X_batch, y_batch in get_minibatch(X_train, y_train, batch_size):
        # Only the partial_fit call is timed; evaluation is excluded
        tick = time.time()
        regr.partial_fit(X_batch, y_batch)
        training_time += (time.time() - tick)

        pred = regr.predict(X_test)
        elapsed_curve.append(training_time)
        log_mae_curve.append(np.log10(mean_absolute_error(y_test, pred)))

    print("Report: Mini-batch size", batch_size)
    print("First output after [s]:", elapsed_curve[0])
    print("First model MAE [log10]:", log_mae_curve[0])
    print("Total training time [s]:", elapsed_curve[-1])
    print("Final MAE [log10]: ", log_mae_curve[-1])
    print()

    plot_x.append(elapsed_curve)
    plot_y.append(log_mae_curve)
    plot_labels.append("Batch size: " + str(batch_size))
    
    
In [14]:
    
# Learning curves of the mini-batch runs: full view on the left, and the
# first 0.15 s of training on the right.
pylab.rcParams['figure.figsize'] = 12, 6

plt.subplot(1, 2, 1)
for curve_x, curve_y, curve_label in zip(plot_x, plot_y, plot_labels):
    plt.plot(curve_x, curve_y, label=curve_label)
plt.title('Mini-batch learning')
plt.xlabel('Training time [s]')
# plot_y stores np.log10 of the MAE, so the axis has to say so
plt.ylabel('MAE [log10]')
plt.legend(loc=0)

plt.subplot(1, 2, 2)
for curve_x, curve_y, curve_label in zip(plot_x, plot_y, plot_labels):
    plt.plot(curve_x, curve_y, label=curve_label)
plt.title('Mini-batch learning: ZOOM 0-0.15s')
plt.xlabel('Training time [s]')
plt.ylabel('MAE [log10]')
# Same curves, x axis restricted to the window named in the title
plt.xlim([0, 0.15])
plt.legend(loc=0)
plt.show()
    
    
In [15]:
    
# Back to an 8 x 6 figure (the classic matplotlib default) for the
# single-panel plot that follows
pylab.rcParams['figure.figsize'] = (8, 6)
    
In [16]:
    
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer
# Parts of each post to strip, passed as `remove` to the fetcher
to_remove = ('headers', 'footers', 'quotes')

# Train and test subsets are fetched with identical options
common_fetch_args = dict(random_state=101, remove=to_remove)
data_train = fetch_20newsgroups(subset='train', **common_fetch_args)
data_test = fetch_20newsgroups(subset='test', **common_fetch_args)

labels = data_train.target_names
# Sorted distinct class ids found in the training targets
targets = np.unique(data_train.target)
    
In [17]:
    
def get_minibatch_docs(docs, targets, batch_size):
    """Yield ``(docs_batch, targets_batch)`` slices of at most ``batch_size`` items.

    The two sequences are sliced in parallel, in their original order. When
    ``len(docs)`` is not a multiple of ``batch_size``, a final shorter batch
    holding the remaining items is yielded last.
    """
    total = len(docs)

    # Full-size batches
    full_batches = total // batch_size
    for start in range(0, full_batches * batch_size, batch_size):
        stop = start + batch_size
        yield (docs[start:stop], targets[start:stop])

    # Trailing partial batch, if any items are left over
    leftover = total % batch_size
    if leftover > 0:
        yield (docs[-leftover:], targets[-leftover:])
    
In [18]:
    
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import sys
# Out-of-core text classification with the hashing trick: for each hash-table
# size, train an SGD classifier one mini-batch of documents at a time and
# record the cumulative training time and the test accuracy after every batch.
minibatch_size = 1000

# values_to_plot[hash_table_size] -> {'time': cumulative seconds, 'score': accuracies}
values_to_plot = {}
for hash_table_size in (1000, 5000, 10000, 50000, 100000):

    # NOTE(review): `non_negative` and loss='log' are spellings from older
    # scikit-learn releases - confirm against the installed version.
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=hash_table_size, ngram_range=(1, 1))
    X_test = vectorizer.transform(data_test.data)
    y_test = data_test.target

    # Seeded like the other estimators in this notebook so that the accuracy
    # curves are reproducible from run to run.
    clf = SGDClassifier(loss='log', random_state=101)

    timings = []
    scores = []
    for docs_batch, y_train in get_minibatch_docs(data_train.data,
                                                  data_train.target,
                                                  minibatch_size):
        # Timed region: vectorizing the batch plus the incremental fit.
        # perf_counter is a monotonic, high-resolution clock.
        tick = time.perf_counter()
        X_train = vectorizer.transform(docs_batch)
        # partial_fit needs the full list of classes, since a single batch
        # is not guaranteed to contain every label.
        clf.partial_fit(X_train, y_train, classes=targets)
        timings.append(time.perf_counter() - tick)

        pred = clf.predict(X_test)
        scores.append(accuracy_score(y_test, pred))

    values_to_plot[hash_table_size] = {'time': np.cumsum(timings),
                                       'score': scores}
    
In [19]:
    
# Accuracy against cumulative training time, one curve per hash-table size
# (drawn in ascending size order so the legend is sorted).
for hash_size, curve in sorted(values_to_plot.items()):
    plt.plot(curve['time'], curve['score'], 'x-',
             label='Hash size ' + str(hash_size))
plt.title('Mini-batch learning: 20newsgroups')
plt.xlabel('Training time [s]')
plt.ylabel('Accuracy')
plt.legend(loc=0)
plt.show()
    
    
In [20]:
    
# Show the container type of X_test, which was last assigned from
# vectorizer.transform(data_test.data) in the hashing loop.
type(X_test)
    
    Out[20]:
In [ ]: