Transferring data to and from the JVM can be remarkably costly. In this notebook, we quantify how large the transfer cost is relative to the training cost as a function of training set size. We will use a standard ML problem: predicting the formation energy of inorganic compounds from their composition, using a dataset from the Materials Project.
In [1]:
    
%matplotlib inline
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.featurizers.composition import ElementProperty
from lolopy.learners import RandomForestRegressor
from lolopy.loloserver import find_lolo_jar
from sklearn.ensemble import RandomForestRegressor as SKRFRegressor
from scipy.interpolate import interp1d
from subprocess import PIPE, Popen
from pymatgen import Composition
from time import perf_counter
import pandas as pd
import numpy as np
    
Make a timing function.
In [2]:
    
def time_function(fun, n, *args, **kwargs):
    """Run a function repeatedly and return the timing of each call
    
    Args:
        fun (function): Function to be evaluated
        n (int): Number of times to run the function
        args: Positional arguments passed to the function
        kwargs: Keyword arguments passed to the function
    Returns:
        ([float]) Function run times in seconds
    """
    
    times = []
    for i in range(n):
        st = perf_counter()
        fun(*args, **kwargs)
        times.append(perf_counter() - st)
    return times
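    
As a quick, illustrative check (not part of the benchmark), this helper can time any callable; for example, timing NumPy's sort on a small random array:
    
sort_times = time_function(np.sort, 3, np.random.random(10000))
print('Median sort time: {:.2e} s'.format(np.median(sort_times)))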
    
Pull down the dataset
In [3]:
    
data = load_dataset('mp_nostruct')
    
Eliminate entries without formulas
In [4]:
    
data = data[~ data['formula'].isnull()]
    
Downselect to $10^3$ entries
In [5]:
    
data = data.sample(1000)
    
Generate some features
In [6]:
    
X = np.array(ElementProperty.from_preset('magpie').featurize_many(data['formula'].apply(Composition), pbar=False))
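    
As an optional sanity check (an illustrative aside; the exact number of columns depends on the featurizer preset), the featurizer produces one fixed-length descriptor vector per entry:
    
print('Feature matrix shape:', X.shape)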
    
In [7]:
    
y = data['e_form'].values
    
Make a function to run the Scala benchmark
In [8]:
    
lolojar = find_lolo_jar()  # Locate the Lolo jar bundled with lolopy, so the Scala benchmark uses the same library version
    
In [9]:
    
def get_scala_timings(X, y, X_run):
    """Train a RF with standard settings using Lolo, generate uncertainties for the whole dataset, and report timings
    
    Args:
        X, y (ndarray): Training dataset
        X_run (ndarray): Dataset to evaluate
    Returns:
        train, expected, uncertainty (float): Training time, and the times to evaluate
            the expected values and the uncertainties
    """
    # Write the training and evaluation sets to disk for the Scala benchmark script
    np.savetxt('train.csv', np.hstack((X, y[:, None])), delimiter=',')
    np.savetxt('run.csv', np.hstack((X_run, np.zeros((len(X_run), 1)))), delimiter=',')
    p = Popen('scala -J-Xmx8g -cp {} scala-benchmark.scala train.csv run.csv'.format(lolojar),
              stdout=PIPE, stderr=PIPE, shell=True)
    
    # Reading stdout blocks until the Scala process finishes and closes the pipe
    result = p.stdout.read().decode()
    return map(float, result.split(','))
    
In [10]:
    
scala_train, scala_expect, scala_uncert = get_scala_timings(X, y, X)
    
In [11]:
    
print('Lolo train time:', scala_train)
    
    
In [12]:
    
print('Lolo apply time:', scala_expect + scala_uncert)
    
    
Make the lolopy random forest model, using one tree per training entry
In [13]:
    
model = RandomForestRegressor(num_trees=len(X))
    
Fit the model 16 times and measure the fit times
In [14]:
    
rf_fit = time_function(model.fit, 16, X, y)
print('Average fit time:', np.mean(rf_fit))
    
    
Run only the step of transferring the data to Java and record the time
In [15]:
    
x_java, _ = model._convert_train_data(X, y, None)  # Convert the training arrays into the Java objects lolopy sends to the JVM
    
In [16]:
    
rf_transfer = time_function(model._convert_train_data, 16, X, y)
print('Average transfer time:', np.mean(rf_transfer))
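    
To put these two averages side by side (an illustrative aside; the exact fraction depends on the machine), we can report what share of the lolopy fit time is spent just moving the training data into the JVM:
    
# Share of the average fit time accounted for by data transfer alone
print('Transfer fraction of fit time: {:.1%}'.format(np.mean(rf_transfer) / np.mean(rf_fit)))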
    
    
Compute uncertainties
In [17]:
    
rf_apply = time_function(model.predict, 16, X, return_std=True)
print('Average predict time:', np.mean(rf_apply))
    
    
In [18]:
    
rf_apply_transfer = time_function(model._convert_run_data, 16, X)
print('Average transfer time for prediction:', np.mean(rf_apply_transfer))
    
    
Benchmark scikit-learn's random forest with the same number of trees for comparison
In [19]:
    
sk_model = SKRFRegressor(n_estimators=len(X), n_jobs=-1)
    
In [20]:
    
sk_train = time_function(sk_model.fit, 16, X, y)
    
In [21]:
    
print('Sklearn fitting time:', np.mean(sk_train))
    
    
Repeat the timing measurements over a range of training set sizes
In [22]:
    
results = []
for n in tqdm(np.logspace(1, np.log10(len(X)), 8, dtype=int)):
    # Initialize output
    r = {'n': n}
    
    # Get the training and test set sizes
    X_n = X[:n, :]
    y_n = y[:n]
    
    # Time using lolo via Scala
    scala_train, scala_expect, scala_uncert = get_scala_timings(X_n, y_n, X)
    r['scala_train'] = scala_train
    r['scala_apply'] = scala_expect
    r['scala_apply_wuncert'] = scala_expect + scala_uncert
    
    # Time using lolo via lolopy
    model.set_params(num_trees=len(X_n))
    
    r['lolopy_train'] = np.mean(time_function(model.fit, 16, X_n, y_n))
    r['lolopy_train_transfer'] = np.mean(time_function(model._convert_train_data, 16, X_n, y_n))
    
    r['lolopy_apply'] = np.mean(time_function(model.predict, 16, X, return_std=False))
    r['lolopy_apply_wuncert'] = np.mean(time_function(model.predict, 16, X, return_std=True))
    r['lolopy_apply_transfer'] = np.mean(time_function(model._convert_run_data, 16, X))
    
    model.clear_model()  # To save memory
    
    # Time using RF
    sk_model = SKRFRegressor(n_estimators=n)
    
    r['sklearn_fit'] = np.mean(time_function(sk_model.fit, 16, X_n, y_n))
    r['sklearn_apply'] = np.mean(time_function(sk_model.predict, 16, X))
    
    # Append results and continue
    results.append(r)
    
    
 
 
    
In [23]:
    
results = pd.DataFrame(results)
    
In [24]:
    
results
    
    Out[24]:
Plot the training results. The blue shading indicates the data transfer time.
In [25]:
    
fig, ax = plt.subplots()
ax.fill_between(results['n'], results['lolopy_train_transfer'], 0.001, alpha=0.1)
ax.loglog(results['n'], results['lolopy_train'], 'r', label='lolopy')
ax.loglog(results['n'], results['scala_train'], 'b--', label='lolo')
ax.loglog(results['n'], results['sklearn_fit'], 'g:', label='sklearn')
ax.set_ylim(0.005, max(ax.get_ylim()))
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Train Time (s)')
ax.legend()
fig.set_size_inches(3.5, 2.5)
fig.tight_layout()
fig.savefig('training-performance.png')
    
    
Plot the evaluation speed. Note that the number of trees scales with the training set size, hence evaluation slows as the training set grows.
In [26]:
    
fig, axs = plt.subplots(1, 2)
# Plot results without uncertainties
axs[0].loglog(results['n'], len(X)  / results['lolopy_apply'], 'r', label='lolopy')
axs[0].loglog(results['n'], len(X) / results['scala_apply'], 'b--', label='lolo')
axs[0].loglog(results['n'], len(X) / results['sklearn_apply'], 'g:', label='sklearn')
axs[0].set_title('Without Uncertainties')
# Plot results with uncertainities
axs[1].loglog(results['n'], len(X)  / results['lolopy_apply_wuncert'], 'r', label='lolopy')
axs[1].loglog(results['n'], len(X) / results['scala_apply_wuncert'], 'b--', label='lolo')
axs[1].set_title('With Uncertainties')
for ax in axs:
    ax.set_xlabel('Training Set Size')
    ax.set_ylabel('Evaluation Speed (entry/s)')
    ax.legend()
fig.set_size_inches(6.5, 2.5)
fig.tight_layout()
fig.savefig('evaluation-performance.png')
    
    
Verify that performance is within acceptable bounds: less than a 2x slowdown for model training or evaluation at a training set size of 100 entries.
In [27]:
    
lolopy_timing = interp1d(results['n'], results['lolopy_train'])
lolo_timing = interp1d(results['n'], results['scala_train'])
slowdown = lolopy_timing(100) / lolo_timing(100)
print('Training slowdown: {:.2f}'.format(slowdown))
assert slowdown < 2
    
    
In [28]:
    
lolopy_timing = interp1d(results['n'], results['lolopy_apply'])
lolo_timing = interp1d(results['n'], results['scala_apply'])
slowdown = lolopy_timing(100) / lolo_timing(100)
print('Evaluation without uncertainties slowdown: {:.2f}'.format(slowdown))
assert slowdown < 2
    
    
In [29]:
    
lolopy_timing = interp1d(results['n'], results['lolopy_apply_wuncert'])
lolo_timing = interp1d(results['n'], results['scala_apply_wuncert'])
slowdown = lolopy_timing(100) / lolo_timing(100)
print('Evaluation with uncertainties slowdown: {:.2f}'.format(slowdown))
    
    