In [9]:
# Load libraries
import time

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# scatter_matrix and train_test_split moved to new modules in later
# pandas / scikit-learn releases: prefer the current location and fall back
# to the legacy one so the notebook imports on both old and new versions.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [10]:
# Create a synthetic classification dataset with num_samples rows.
# The seed is defined first and used for BOTH data generation and the split,
# so a fresh "Restart & Run All" reproduces exactly the same arrays.
num_samples = 500000
seed = 5
X, y = make_classification(n_samples=num_samples, random_state=seed)
# Split train and test set: 20% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
In [11]:
# Worker counts to benchmark: 1 through 8 (max 8 cores)
num_cpu_list = list(range(1, 9))
# Training-set sizes, given as fractions of num_samples.
# They are rounded to integers because they are later used as slice bounds,
# and float slice bounds raise a TypeError in NumPy.
# NOTE(review): the largest size equals num_samples, while the training split
# keeps only 80% of the rows (test_size=0.2), so slicing X_train with it
# returns the whole training split -- confirm this is intended.
max_sample_list = [int(round(fraction * num_samples)) for fraction in [0.2, 0.4, 1]]
# Collects one list of timings (one entry per worker count) per training size
training_times_all = []
In [3]:
# Benchmark: time RandomForestClassifier.fit for every combination of
# training-set size (max_sample_list) and worker count (num_cpu_list).
# The default settings for the classifier are used; only random_state is
# pinned so that, for a given training size, every fit grows the same forest
# and the measured differences come from n_jobs alone.
clf = RandomForestClassifier(random_state=seed)
# Start from an empty result list so that re-running this cell does not
# append a second batch of timings after the previous ones.
training_times_all = []
for max_sample in max_sample_list:
    # int() guards against float sizes, which are not valid slice bounds.
    # Slicing stops at len(X_train), so a size larger than the training split
    # simply uses every available training row.
    num_rows = int(max_sample)
    training_times = []
    for num_cpu in num_cpu_list:
        # change number of cores
        clf.set_params(n_jobs=num_cpu)
        start_time = time.time()
        # train classifier on the first num_rows rows of the training data
        # (exactly num_rows, not num_rows + 1)
        clf.fit(X_train[:num_rows], y_train[:num_rows])
        # save the runtime to the list
        training_times.append(time.time() - start_time)
    training_times_all.append(training_times)
In [5]:
# Plot training time against the number of CPU cores, one series per
# training-set size. Styles are paired positionally with max_sample_list.
plot_styles = ['r--', "bs", "g^"]
# Track the slowest plotted measurement so the y-axis fits every series,
# not only the last one produced by the timing loop.
slowest_time = 0
for sample_size, series_times, plot_style in zip(max_sample_list, training_times_all, plot_styles):
    plt.plot(num_cpu_list, series_times, plot_style, label="{0:0.0f}k".format(sample_size/1000))
    slowest_time = max(slowest_time, max(series_times))
plt.axis([0, len(num_cpu_list)+1, 0, slowest_time+1])
plt.title("Training time vs #CPU Cores")
plt.xlabel("#CPU Cores")
plt.ylabel("training time [s]")
# Without this call the label= arguments above are never rendered
plt.legend()
plt.show()
In [6]:
# Build a 4 x 5 multiplication table as a list of lists:
# row i holds the products j * i for j = 0..4.
list_all = []
for i in range(4):
    # One fresh inner list per outer iteration, appended once it is complete
    list_small = [j * i for j in range(5)]
    list_all.append(list_small)
# Show the whole table, then only the row for i == 1
print(list_all)
print(list_all[1])
In [22]:
# Check the legend label produced for the first training size:
# the size is divided by 1000 and shown without decimals, followed by "k".
first_size_in_thousands = max_sample_list[0]/1000
print("{0:0.0f}k".format(first_size_in_thousands))