In [1]:
import numpy as np
import tensorflow as tf
import time
from tqdm import tqdm
In [2]:
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
mnist = input_data.read_data_sets("MNIST_data/")
mnist_images = mnist.train.images
mnist_labels = mnist.train.labels
n_three, n_five = sum(mnist_labels==3), sum(mnist_labels==5)
X_all = np.vstack([
    mnist_images[mnist_labels == 3, :],
    mnist_images[mnist_labels == 5, :]
])
y_all = np.array([1]*n_three + [0]*n_five)
# make it more sparse
X_all = X_all * (np.random.uniform(0, 1, X_all.shape) > 0.8)
print('Dataset shape: {}'.format(X_all.shape))
print('Non-zeros rate: {:.05f}'.format(np.mean(X_all != 0)))
print('Classes balance: {:.03f} / {:.03f}'.format(np.mean(y_all==0), np.mean(y_all==1)))
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=42, test_size=0.3)
In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
for model in [
    LogisticRegression(),
    RandomForestClassifier(n_jobs=-1, n_estimators=200)
]:
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    acc = accuracy_score(y_te, predictions)
    print('model: {}'.format(model))
    print('accuracy: {}'.format(acc))
    print()
In [5]:
from tffm import TFFMClassifier
for order in [2, 3]:
    model = TFFMClassifier(
        order=order,
        rank=10,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
        n_epochs=50,
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='dense',
        seed=42
    )
    model.fit(X_tr, y_tr, show_progress=True)
    predictions = model.predict(X_te)
    print('[order={}] accuracy: {}'.format(order, accuracy_score(y_te, predictions)))
    # this will close the tf.Session and free resources
    model.destroy()
In [6]:
import scipy.sparse as sp
# only CSR format supported
X_tr_sparse = sp.csr_matrix(X_tr)
X_te_sparse = sp.csr_matrix(X_te)
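If the data arrives in another sparse format, it has to be converted before being passed to the model. A minimal sketch using scipy's standard conversion methods (the COO matrix here is hypothetical, just to illustrate the conversion):
# hypothetical: data assembled in COO format, e.g. from (row, col, value) triplets
X_coo = sp.coo_matrix(X_tr)
X_csr = X_coo.tocsr()  # cheap conversion, same data in CSR layout
assert sp.isspmatrix_csr(X_csr)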
In [7]:
order = 3
model = TFFMClassifier(
    order=order,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=50,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    seed=42
)
model.fit(X_tr_sparse, y_tr, show_progress=True)
predictions = model.predict(X_te_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_te, predictions)))
model.destroy()
In [8]:
from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error
model = TFFMRegressor(
    order=order,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=50,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse'
)
# translate Y from {0,1} to {-10, 10}
model.fit(X_tr_sparse, y_tr*20-10, show_progress=True)
predictions = model.predict(X_te_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_te, predictions > 0)))
print('MSE: {}'.format(mean_squared_error(y_te*20-10, predictions)))
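# extra check (not in the original notebook): the continuous regressor outputs
# can also be evaluated as ranking scores with the roc_auc_score imported above
print('AUC: {:.03f}'.format(roc_auc_score(y_te, predictions)))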
model.destroy()
In [9]:
n_features = X_all.shape[1]
used_features = range(100, 1000, 100)
n_repeats = 5
elapsed_mean = []
elapsed_std = []
model_title = ''
for cur_n_feats in tqdm(used_features):
    time_observation = []
    for _ in range(n_repeats):
        active_features = np.random.choice(range(n_features), size=cur_n_feats)
        model = TFFMClassifier(
            order=5,
            rank=50,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
            n_epochs=1,
            batch_size=-1,
            init_std=0.01,
            input_type='dense'
        )
        model_title = str(model)
        # manually initialize the model without calling .fit()
        model.core.set_num_features(cur_n_feats)
        model.core.build_graph()
        model.initialize_session()
        start_time = time.time()
        predictions = model.decision_function(X_all[:, active_features])
        end_time = time.time()
        model.destroy()
        time_observation.append(end_time - start_time)
    elapsed_mean.append(np.mean(time_observation))
    elapsed_std.append(np.std(time_observation))
In [16]:
%pylab inline
errorbar(used_features, elapsed_mean, yerr=elapsed_std)
xlim(0, 1000)
title(model_title)
xlabel('n_features')
ylabel('test time')
grid()
In [11]:
order = 3
model = TFFMClassifier(
    order=order,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    n_epochs=10,
    batch_size=-1,
    init_std=0.001,
    reg=0.001,
    input_type='sparse',
    log_dir='./tmp/logs',
    verbose=1
)
model.fit(X_tr_sparse, y_tr, show_progress=True)
predictions = model.predict(X_te_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_te, predictions)))
In [12]:
model.save_state('./tmp/state.tf')
model.destroy()
In [13]:
model = TFFMClassifier(
    order=3,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    n_epochs=10,
    batch_size=-1,
    init_std=0.001,
    reg=0.001,
    input_type='sparse',
    log_dir='./tmp/logs',
    verbose=1
)
# internally the model needs to allocate memory before loading the saved weights,
# so num_features must be set explicitly
model.core.set_num_features(X_tr.shape[1])
model.load_state('./tmp/state.tf')
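As a quick sanity check (not part of the original notebook), the restored model should reproduce the accuracy reported above without any further training:
predictions = model.predict(X_te_sparse)
print('restored model accuracy: {}'.format(accuracy_score(y_te, predictions)))
model.destroy()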
In [17]:
for optim, title in [(tf.train.AdamOptimizer(learning_rate=0.001), 'Adam'),
                     (tf.train.FtrlOptimizer(0.01, l1_regularization_strength=0.01), 'FTRL')]:
    acc = []
    model = TFFMClassifier(
        order=3,
        rank=10,
        optimizer=optim,
        batch_size=1024,
        init_std=0.001,
        reg=0.1,
        input_type='sparse',
    )
    n_epochs = 5
    anchor_epochs = range(0, 200+1, n_epochs)
    for _ in anchor_epochs:
        # score the result every 5 epochs
        model.fit(X_tr_sparse, y_tr, n_epochs=n_epochs)
        predictions = model.predict(X_te_sparse)
        acc.append(accuracy_score(y_te, predictions))
    plot(anchor_epochs, acc, label=title)
    model.destroy()
xlabel('n_epochs')
ylabel('accuracy')
legend()
grid()
In [18]:
X_all = np.vstack([
    mnist_images[mnist_labels == 3, :],
    mnist_images[mnist_labels == 5, :]
])
y_all = np.array([1]*n_three + [0]*n_five)
# make it more sparse (sparsity is about 97%)
X_all = X_all * (np.random.uniform(0, 1, X_all.shape) > 0.97)
print('Dataset shape: {}'.format(X_all.shape))
print('Non-zeros rate: {}'.format(np.mean(X_all != 0)))
print('Classes balance: {} / {}'.format(np.mean(y_all==0), np.mean(y_all==1)))
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=42, test_size=0.3)
# rebuild the CSR matrices from this new, sparser split (the next cell uses them)
X_tr_sparse = sp.csr_matrix(X_tr)
X_te_sparse = sp.csr_matrix(X_te)
In [19]:
for use_reweight, title in [(False, 'no reweight reg'), (True, 'reweight reg')]:
    acc = []
    model = TFFMClassifier(
        order=3,
        rank=10,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
        batch_size=1024,
        init_std=0.001,
        reg=1.0,
        input_type='sparse',
        reweight_reg=use_reweight
    )
    n_epochs = 2
    anchor_epochs = range(0, 20+1, n_epochs)
    for _ in anchor_epochs:
        # score the result every 2 epochs
        model.fit(X_tr_sparse, y_tr, n_epochs=n_epochs)
        predictions = model.predict(X_te_sparse)
        acc.append(accuracy_score(y_te, predictions))
    plot(anchor_epochs, acc, label=title)
    model.destroy()
xlabel('n_epochs')
ylabel('accuracy')
legend(loc=4)
grid()
When using TFFMClassifier, one can set the parameter sample_weight in order to counter class imbalance. There are several options: pass the string 'balanced' to weight classes inversely proportional to their frequencies, set pos_class_weight to a manual weight for the positive class, or supply explicit per-sample weights.
We will demonstrate the first two approaches.
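For completeness, a minimal sketch of the third option, assuming the constructor accepts an array-like sample_weight aligned with the rows of the training data (check the tffm documentation before relying on this):
# hypothetical sketch: explicit per-sample weights, up-weighting positives 5x
w = np.where(y_tr == 1, 5.0, 1.0)
model = TFFMClassifier(order=2, rank=10, sample_weight=w, input_type='dense')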
In [20]:
from sklearn.metrics import confusion_matrix
# generate imbalanced data:
X_imbalanced = X_all[4000:,:]
y_imbalanced = y_all[4000:]
print('Classes balance: {:.03f} / {:.03f}'.format(np.mean(y_imbalanced==0),
                                                  np.mean(y_imbalanced==1)))
print('Balanced positive weight is {:.03f}.'.format(np.mean(y_imbalanced==0)/np.mean(y_imbalanced==1)))
X_tr, X_te, y_tr, y_te = train_test_split(X_imbalanced, y_imbalanced, random_state=42, test_size=0.3)
In [22]:
# use default weighting
model = TFFMClassifier(
    order=2,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=50,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense',
    seed=42
)
model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
print('accuracy: {}'.format(accuracy_score(y_te, predictions)))
model.destroy()
In [23]:
confusion_matrix(y_te, predictions)
The unweighted loss shows good performance on the prevalent class, but poor performance on the underrepresented class.
In [24]:
# use balanced weighting
model = TFFMClassifier(
    order=2,
    sample_weight='balanced',
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=50,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense',
    seed=42
)
model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
print('accuracy: {}'.format(accuracy_score(y_te, predictions)))
model.destroy()
In [25]:
confusion_matrix(y_te, predictions)
Performance on the underrepresented class improved, at the cost of performance on the prevalent class.
In [26]:
# use manual weighting for the positive class
model = TFFMClassifier(
    order=2,
    pos_class_weight=6.0,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=50,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense',
    seed=42
)
model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
print('accuracy: {}'.format(accuracy_score(y_te, predictions)))
model.destroy()
In [27]:
confusion_matrix(y_te, predictions)
Here we've overdone it, but we're quite accurate on the underrepresented class. In the limiting case, the classifier would put all points into the over-weighted class.
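One way to quantify this trade-off (not shown in the original notebook) is to look at per-class recall instead of overall accuracy:
from sklearn.metrics import recall_score
# recall per class: the fraction of each true class that is recovered
print('recall (class 0): {:.03f}'.format(recall_score(y_te, predictions, pos_label=0)))
print('recall (class 1): {:.03f}'.format(recall_score(y_te, predictions, pos_label=1)))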