Web: https://www.meetup.com/Tel-Aviv-Deep-Learning-Bootcamp/events/241762893/
Notebooks: On GitHub
Shlomo Kashani
Most existing PyTorch examples work with images, whereas here we have a CSV with 21 numerical features. Using Conv1d before or after a Linear layer requires reshaping the tensors, and that reshaping is the whole point of this tutorial.
The CNN architecture is therefore deliberately naive and by no means optimized; I hope to improve it over time, and I am working on a second CNN-based version of the same problem.
This tutorial demonstrates a fully working PyTorch CNN on a real-world use case, namely a binary classification problem.
If you are interested in the sk-learn version of this problem please refer to: https://github.com/QuantScientist/deep-ml-meetups/tree/master/hacking-kaggle/python/numer-ai
For the scientific foundation behind Binary Classification and Logistic Regression, refer to: https://github.com/QuantScientist/Deep-Learning-Boot-Camp/tree/master/Data-Science-Interviews-Book
Every step, from reading the CSV into NumPy arrays and converting to GPU-based tensors through training and validation, is meant to aid newcomers in their first steps with PyTorch.
Additionally, commonly used Kaggle metrics such as ROC AUC and log loss are logged and plotted for both the training set and the validation set.
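For reference, both metrics follow their standard definitions. With N predictions $p_i$ and binary labels $y_i$, the log loss is
$$\mathrm{LogLoss} = -\frac{1}{N}\sum_{i=1}^{N}\left[y_i \log p_i + (1 - y_i)\log(1 - p_i)\right],$$
and ROC AUC is the area under the curve of true-positive rate versus false-positive rate as the decision threshold is varied.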
In [1]:
import sys
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from sklearn import metrics
from sklearn.metrics import roc_auc_score, log_loss, roc_curve, auc
# Note: sklearn.cross_validation is deprecated; the same classes live in sklearn.model_selection
from sklearn.model_selection import StratifiedKFold, ShuffleSplit, cross_val_score, train_test_split
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION')
from subprocess import call
# call(["nvcc", "--version"]) does not work
! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
# call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print('Active CUDA Device: GPU', torch.cuda.current_device())
import numpy as np
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
Tensor = FloatTensor
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO)  # basicConfig returns None, so there is nothing to assign
lgr = logging.getLogger(__name__)
%matplotlib inline
# !pip install psutil
import psutil
import os
def cpuStats():
    print(sys.version)
    print(psutil.cpu_percent())
    print(psutil.virtual_memory())  # physical memory usage
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2. ** 30  # resident set size (RSS), in GB
    print('memory GB:', memoryUse)
cpuStats()
In [53]:
# fix seed
seed=17*19
np.random.seed(seed)
torch.manual_seed(seed)
if use_cuda:
torch.cuda.manual_seed(seed)
In [54]:
# Data params
TARGET_VAR= 'target'
TOURNAMENT_DATA_CSV = 'numerai_tournament_data.csv'
TRAINING_DATA_CSV = 'numerai_training_data.csv'
BASE_FOLDER = 'numerai/'
df_train = pd.read_csv(BASE_FOLDER + TRAINING_DATA_CSV)
df_train.head(5)
Out[54]:
In [55]:
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.decomposition import PCA
def toPCA(X_X, n=16):
pca = PCA(n_components=n)
pca.fit(X_X)
X_reduced = pca.transform(X_X)
print("Reduced dataset shape:", X_reduced.shape)
return X_reduced
def addPolyFeatures(inDF, deg=2):
    print('Generating poly features ...')
    df_copy = inDF.copy(deep=True)
    poly = PolynomialFeatures(degree=deg)
    p_testX = poly.fit(df_copy)
    # Older sklearn versions lack PolynomialFeatures.get_feature_names(), so build the names from poly.powers_
    target_feature_names = ['x'.join(['{}^{}'.format(pair[0], pair[1]) for pair in pairs if pair[1] != 0])
                            for pairs in [zip(df_copy.columns, p) for p in poly.powers_]]
    df_copy = pd.DataFrame(p_testX.transform(df_copy), columns=target_feature_names)
    return df_copy
# Train, Validation, Test Split
def loadDataSplit(poly=False):
df_train = pd.read_csv(BASE_FOLDER + TRAINING_DATA_CSV)
# TOURNAMENT_DATA_CSV has both validation and test data provided by NumerAI
df_test_valid = pd.read_csv(BASE_FOLDER + TOURNAMENT_DATA_CSV)
answers_1_SINGLE = df_train[TARGET_VAR]
df_train.drop(TARGET_VAR, axis=1,inplace=True)
df_train.drop('id', axis=1,inplace=True)
df_train.drop('era', axis=1,inplace=True)
df_train.drop('data_type', axis=1,inplace=True)
# Add polynomial features
if poly:
df_train = addPolyFeatures(df_train)
df_train.to_csv(BASE_FOLDER + TRAINING_DATA_CSV + 'clean.csv', header=False, index = False)
df_train= pd.read_csv(BASE_FOLDER + TRAINING_DATA_CSV + 'clean.csv', header=None, dtype=np.float32)
df_train = pd.concat([df_train, answers_1_SINGLE], axis=1)
feature_cols = list(df_train.columns[:-1])
# print (feature_cols)
target_col = df_train.columns[-1]
trainX, trainY = df_train[feature_cols], df_train[target_col]
# TOURNAMENT_DATA_CSV has both validation and test data provided by NumerAI
# Validation set
df_validation_set=df_test_valid.loc[df_test_valid['data_type'] == 'validation']
df_validation_set=df_validation_set.copy(deep=True)
answers_1_SINGLE_validation = df_validation_set[TARGET_VAR]
df_validation_set.drop(TARGET_VAR, axis=1,inplace=True)
df_validation_set.drop('id', axis=1,inplace=True)
df_validation_set.drop('era', axis=1,inplace=True)
df_validation_set.drop('data_type', axis=1,inplace=True)
# Add polynomial features
if poly:
df_validation_set = addPolyFeatures(df_validation_set)
df_validation_set.to_csv(BASE_FOLDER + TRAINING_DATA_CSV + '-validation-clean.csv', header=False, index = False)
df_validation_set= pd.read_csv(BASE_FOLDER + TRAINING_DATA_CSV + '-validation-clean.csv', header=None, dtype=np.float32)
df_validation_set = pd.concat([df_validation_set, answers_1_SINGLE_validation], axis=1)
feature_cols = list(df_validation_set.columns[:-1])
target_col = df_validation_set.columns[-1]
valX, valY = df_validation_set[feature_cols], df_validation_set[target_col]
# Test set for submission (not labeled)
df_test_set = pd.read_csv(BASE_FOLDER + TOURNAMENT_DATA_CSV)
# df_test_set=df_test_set.loc[df_test_valid['data_type'] == 'live']
df_test_set=df_test_set.copy(deep=True)
df_test_set.drop(TARGET_VAR, axis=1,inplace=True)
tid_1_SINGLE = df_test_set['id']
df_test_set.drop('id', axis=1,inplace=True)
df_test_set.drop('era', axis=1,inplace=True)
df_test_set.drop('data_type', axis=1,inplace=True)
# Add polynomial features
if poly:
df_test_set = addPolyFeatures(df_test_set)
    feature_cols = list(df_test_set.columns)  # must be computed here, before the id column is concatenated back
df_test_set = pd.concat([tid_1_SINGLE, df_test_set], axis=1)
testX = df_test_set[feature_cols].values
return trainX, trainY, valX, valY, testX, df_test_set
trainX, trainY, valX, valY, testX, df_test_set = loadDataSplit()
In [56]:
print (trainX.shape)
print (trainY.shape)
print (valX.shape)
print (valY.shape)
print (testX.shape)
print (df_test_set.shape)
# trainX=toPCA(trainX)
# valX=toPCA(valX)
# testX=toPCA(testX)
# Number of features for the input layer
N_FEATURES=trainX.shape[1]
In [57]:
# Convert the np arrays into the correct dimension and type
# Note that BCELoss requires Float in X as well as in y
def XnumpyToTensor(x_data_np):
x_data_np = np.array(x_data_np, dtype=np.float32)
print(x_data_np.shape)
print(type(x_data_np))
if use_cuda:
lgr.info ("Using the GPU")
X_tensor = Variable(torch.from_numpy(x_data_np).cuda()) # Note the conversion for pytorch
else:
lgr.info ("Using the CPU")
X_tensor = Variable(torch.from_numpy(x_data_np)) # Note the conversion for pytorch
print(type(X_tensor.data)) # should be 'torch.cuda.FloatTensor'
    print(X_tensor.data.shape)  # e.g. torch.Size([108405, 21]) for the raw training features
return X_tensor
# Convert the np arrays into the correct dimension and type
# Note that BCELoss requires Float in X as well as in y
def YnumpyToTensor(y_data_np):
y_data_np=y_data_np.reshape((y_data_np.shape[0],1)) # Must be reshaped for PyTorch!
print(y_data_np.shape)
print(type(y_data_np))
if use_cuda:
lgr.info ("Using the GPU")
# Y = Variable(torch.from_numpy(y_data_np).type(torch.LongTensor).cuda())
Y_tensor = Variable(torch.from_numpy(y_data_np)).type(torch.FloatTensor).cuda() # BCEloss requires Float
else:
lgr.info ("Using the CPU")
# Y = Variable(torch.squeeze (torch.from_numpy(y_data_np).type(torch.LongTensor))) #
Y_tensor = Variable(torch.from_numpy(y_data_np)).type(torch.FloatTensor) # BCEloss requires Float
    print(type(Y_tensor.data))  # should be 'torch.cuda.FloatTensor'
    print(Y_tensor.data.shape)
return Y_tensor
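As a quick sanity check (a small addition, not part of the original notebook), the two converters can be exercised on toy data; the row count 4 is arbitrary and 21 matches the feature count used throughout:
X_demo = XnumpyToTensor(np.random.rand(4, 21))
Y_demo = YnumpyToTensor(np.random.randint(0, 2, size=4).astype(np.float32))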
For example, conv = torch.nn.Conv1d(in_channels=5, out_channels=10, kernel_size=2) creates a 1-D convolution with 5 input channels, 10 output channels, and a kernel of width 2.
Note that view() only works on contiguous tensors, and transposing a tensor makes it non-contiguous; call .contiguous() after transpose() before calling view().
Note that Conv1d expects input of shape (batch, in_channels, in_length).
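A minimal sketch of the reshaping this tutorial revolves around (the sizes here are illustrative, not the ones used below): each CSV feature becomes an input channel of length 1, so a 2-D batch must be viewed as a 3-D tensor before Conv1d and flattened again afterwards.
x = Variable(torch.randn(8, 21))               # a mini-batch of 8 CSV rows with 21 features
x = x.contiguous().view(8, 21, 1)              # (batch, in_channels, in_length) for Conv1d
conv_demo = torch.nn.Conv1d(in_channels=21, out_channels=10, kernel_size=1)
out = conv_demo(x)                             # -> torch.Size([8, 10, 1])
flat = out.view(8, 10)                         # flatten back so a Linear layer can consume it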
In [58]:
import torchvision.models as models
alexnet = models.alexnet(pretrained=False)  # pretrained=False so the weights are not downloaded; printed only as an architectural reference
print(alexnet)
In [59]:
# Columns: n_input | n_cnn_kernel | n_padding | n_input_rows | n_hidden | shape after CNN | stride
# 21 1 1 108405 21*32=672 torch.Size([108405, 672, 3] 1
# 21 1 2 108405 21*32=672 torch.Size([108405, 672, 5]
# 21 1 3 108405 21*32=672 torch.Size([108405, 672, 7]
# 21 1 4 108405 21*32=672 torch.Size([108405, 672, 9]
# 21 2 1 108405 21*32=672 torch.Size([108405, 672, 2]
# 21 3 1 108405 21*32=672 torch.Size([108405, 672, 1]
# 21 4 1 108405 21*32=672 ERROR * ERROR * ERROR
# Given input size: (21 x 1 x 1).
# Calculated output size: (672 x 1 x 0). Output size is too small
# 21 2 2 108405 21*32=672 torch.Size([108405, 672, 4]
# 21 3 2 108405 21*32=672 torch.Size([108405, 672, 3]
# 21 4 2 108405 21*32=672 torch.Size([108405, 672, 2]
# 21 5 2 108405 21*32=672 torch.Size([108405, 672, 1]
# 21 6 2 108405 21*32=672 ERROR * ERROR * ERROR
# 21 1 3 108405 21*32=672 torch.Size([108405, 672, 7]
# 21 2 3 108405 21*32=672 torch.Size([108405, 672, 6]
# 21 3 3 108405 21*32=672 torch.Size([108405, 672, 5]
# 21 4 3 108405 21*32=672 torch.Size([108405, 672, 4]
# 21 5 3 108405 21*32=672 torch.Size([108405, 672, 3]
# 21 6 3 108405 21*32=672 torch.Size([108405, 672, 2]
# 21 7 3 108405 21*32=672 torch.Size([108405, 672, 1]
# 21 4 4 108405 21*32=672 torch.Size([108405, 672, 6]
# 21 5 4 108405 21*32=672 torch.Size([108405, 672, 5]
# 21 6 4 108405 21*32=672 torch.Size([108405, 672, 4]
# 21 7 4 108405 21*32=672 torch.Size([108405, 672, 3]
# 21 8 4 108405 21*32=672 torch.Size([108405, 672, 2]
# 21 9 4 108405 21*32=672 torch.Size([108405, 672, 1]
# 21 4 4 108405 21*32=672 torch.Size([108405, 672, 3] 2
# 21 5 4 108405 21*32=672 torch.Size([108405, 672, 3] 2
# 21 6 4 108405 21*32=672 torch.Size([108405, 672, 2] 2
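The pattern above is just the standard Conv1d output-length formula, $L_{out} = \lfloor (L_{in} + 2 \cdot \text{padding} - \text{kernel}) / \text{stride} \rfloor + 1$ (the table was logged with 672 output channels, i.e. a multiplier of 32, whereas the model below uses n_mult_factor=9; the length arithmetic is identical). A quick check of the configuration used below, kernel 7 and padding 4 on an input of length 1:
L_in, padding, kernel_size, stride = 1, 4, 7, 1
L_out = (L_in + 2 * padding - kernel_size) // stride + 1
print(L_out, L_out // 2)  # 3 before pooling, 1 after MaxPool1d(kernel_size=2)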
In [60]:
loss_func=torch.nn.BCELoss() # Binary cross entropy: http://pytorch.org/docs/nn.html#bceloss
# References:
# https://github.com/vinhkhuc/PyTorch-Mini-Tutorials/blob/master/5_convolutional_net.py
# https://gist.github.com/spro/c87cc706625b8a54e604fb1024106556
X_tensor_train= XnumpyToTensor(trainX) # Conv1d expects (batch, channels, length) for 3-D input, but we start from a 2-D tensor
X_shape=X_tensor_train.data.size()
# Current configuration: n_cnn_kernel=7, n_padding=4, n_max_pool1d=2 -> conv output length 3, pooled length 1
n_mult_factor=9
n_input= trainX.shape[1]
n_hidden= n_input * n_mult_factor
n_output=1
n_input_rows=trainX.shape[0]
n_cnn_kernel=7
n_padding=4
n_max_pool1d=2
DEBUG_ON=True
def debug(msg, x):
    if DEBUG_ON:
        print(msg + ', size(): ' + str(x.size()))
class CNNNumerAI(nn.Module):
def __init__(self, n_input, n_hidden, n_output,n_cnn_kernel, n_mult_factor, n_padding,n_max_pool1d):
super(CNNNumerAI, self).__init__()
self.n_input=n_input
self.n_hidden=n_hidden
self.n_output= n_output
self.n_cnn_kernel=n_cnn_kernel
self.n_mult_factor=n_mult_factor
self.n_padding=n_padding
self.n_max_pool1d=n_max_pool1d
        # Flattened size after Conv1d + MaxPool1d; unused in forward() below because the pooled
        # length is 1 here, so the flattened size equals n_hidden (kept for reference)
        conv_len = (1 + 2 * self.n_padding - self.n_cnn_kernel) + 1
        self.n_l1 = self.n_hidden * (conv_len // self.n_max_pool1d)
self.features = nn.Sequential( # Mimicking AlexNet
torch.nn.Conv1d(self.n_input, self.n_hidden,kernel_size=(self.n_cnn_kernel,), stride=(1,), padding=(self.n_padding,)),
torch.nn.LeakyReLU(),
torch.nn.MaxPool1d(kernel_size=self.n_max_pool1d),
)
        hiddenLayer2Size=int(self.n_hidden)
        linear4=torch.nn.Linear(hiddenLayer2Size, 1)
        torch.nn.init.xavier_uniform(linear4.weight)
        # Dropout and LeakyReLU were experimented with but are not wired into the classifier below
        dropout = torch.nn.Dropout(p=0.05)
        relu = torch.nn.LeakyReLU()
        self.classifier = torch.nn.Sequential(
            linear4
        )
self.sig=nn.Sigmoid()
def forward(self, x):
# debug('raw',x)
        varSize=x.data.shape[0] # must be computed here in forward() since the batch size is dynamic
# for CNN
x=x.contiguous()
x = x.view(varSize,self.n_input,1)
debug('after view',x)
x=self.features(x)
debug('after CNN',x)
# for Linear layer
# x = x.view(varSize,self.n_l1)
x = x.view(varSize,int(self.n_hidden))
debug('after 2nd view',x)
x=self.classifier(x)
debug('after self.out',x)
x=self.sig(x)
return x
net = CNNNumerAI(n_input, n_hidden, n_output,n_cnn_kernel, n_mult_factor, n_padding, n_max_pool1d)
print(net)
if use_cuda:
net=net.cuda()
b = net(X_tensor_train)
print('b.size(): ' + str(b.size()))
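As a small addition (not in the original notebook), counting the trainable parameters is a cheap sanity check of the architecture:
n_params = sum(p.data.numel() for p in net.parameters())
print('trainable parameters:', n_params)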
In [61]:
# NN params
LR = 0.005
MOMENTUM= 0.9
# optimizer = torch.optim.SGD(net.parameters(), lr=0.02)
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# optimizer = optim.SGD(net.parameters(), lr=LR, momentum=MOMENTUM, weight_decay=5e-4)
optimizer = torch.optim.Adam(net.parameters(), lr=LR, weight_decay=5e-5) # weight_decay acts as L2 regularization
if use_cuda:
lgr.info ("Using the GPU")
net.cuda()
loss_func.cuda()
lgr.info (optimizer)
lgr.info (loss_func)
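For reference, the weight_decay argument of Adam implements L2 regularization by adding $\lambda \theta$ to each parameter's gradient before the update step, with $\lambda = 5 \times 10^{-5}$ here; larger values shrink the weights more aggressively.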
In [62]:
torch.backends.cudnn.enabled=False # cuDNN disabled here; Conv1d falls back to the slower default backend
import time
start_time = time.time()
epochs=100
all_losses = []
DEBUG_ON=False
X_tensor_train= XnumpyToTensor(trainX)
Y_tensor_train= YnumpyToTensor(trainY)
print(type(X_tensor_train.data), type(Y_tensor_train.data)) # should be 'torch.cuda.FloatTensor'
# From here onwards, we must only use PyTorch Tensors
for step in range(epochs):
epoch_start_time = time.time()
out = net(X_tensor_train) # input x and predict based on x
cost = loss_func(out, Y_tensor_train) # must be (1. nn output, 2. target), the target label is NOT one-hotted
optimizer.zero_grad() # clear gradients for next train
cost.backward() # backpropagation, compute gradients
optimizer.step() # apply gradients
if step % 10 == 0:
loss = cost.data[0]
all_losses.append(loss)
print(step, cost.data.cpu().numpy())
prediction = (net(X_tensor_train).data).float() # probabilities
pred_y = prediction.cpu().numpy().squeeze()
target_y = Y_tensor_train.cpu().data.numpy()
tu = (log_loss(target_y, pred_y),roc_auc_score(target_y,pred_y ))
epoch_end_time = time.time()
print ('{} {:6.3f} seconds'.format('EP:', epoch_end_time-epoch_start_time))
print ('LOG_LOSS={}, ROC_AUC={} '.format(*tu))
end_time = time.time()
print ('{} {:6.3f} seconds'.format('GPU:', end_time-start_time))
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(all_losses)
plt.show()
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_y,pred_y)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('LOG_LOSS=' + str(log_loss(target_y, pred_y)))
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.6f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
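Note that the loop above performs full-batch gradient descent: the entire training set is pushed through the network at every step. Since DataLoader is already imported, a mini-batch variant is a natural next step. A minimal sketch, not executed in this notebook; BATCH_SIZE is an illustrative choice:
from torch.utils.data import TensorDataset
BATCH_SIZE = 2048
train_ds = TensorDataset(X_tensor_train.data.cpu(), Y_tensor_train.data.cpu())
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
for step in range(epochs):
    for batch_x, batch_y in train_loader:
        bx = Variable(batch_x.cuda() if use_cuda else batch_x)
        by = Variable(batch_y.cuda() if use_cuda else batch_y)
        optimizer.zero_grad()
        cost = loss_func(net(bx), by)
        cost.backward()
        optimizer.step()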
In [64]:
net.eval()
# Validation data
print (valX.shape)
print (valY.shape)
X_tensor_val= XnumpyToTensor(valX)
Y_tensor_val= YnumpyToTensor(valY)
print(type(X_tensor_val.data), type(Y_tensor_val.data)) # should be 'torch.cuda.FloatTensor'
predicted_val = (net(X_tensor_val).data).float() # probabilities
# predicted_val = (net(X_tensor_val).data > 0.5).float() # zero or one
pred_y = predicted_val.cpu().numpy()
target_y = Y_tensor_val.cpu().data.numpy()
print (type(pred_y))
print (type(target_y))
# threshold the probabilities at 0.5 before comparing, otherwise accuracy against hard labels is meaningless
tu = (str(((pred_y > 0.5) == target_y).mean()), log_loss(target_y, pred_y), roc_auc_score(target_y, pred_y))
print ('\n')
print ('acc={} log_loss={} roc_auc={} '.format(*tu))
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_y,pred_y)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('LOG_LOSS=' + str(log_loss(target_y, pred_y)))
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.6f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# print (pred_y)
In [13]:
print (df_test_set.shape)
columns = ['id', 'probability']
df_pred=pd.DataFrame(data=np.zeros((0,len(columns))), columns=columns)
df_pred.id = df_pred.id.astype(int)  # astype() returns a copy, so it must be assigned back
for index, row in df_test_set.iterrows():
    row_no_id = row.drop('id')
    # print (row_no_id.values)
    x_data_np = np.array(row_no_id.values, dtype=np.float32)
if use_cuda:
X_tensor_test = Variable(torch.from_numpy(x_data_np).cuda()) # Note the conversion for pytorch
else:
X_tensor_test = Variable(torch.from_numpy(x_data_np)) # Note the conversion for pytorch
    X_tensor_test = X_tensor_test.view(1, trainX.shape[1]) # the network expects 2-D (batch, features) input, so add a batch dimension of 1
predicted_val = (net(X_tensor_test).data).float() # probabilities
p_test = predicted_val.cpu().numpy().item() # otherwise we get an array, we need a single float
df_pred = df_pred.append({'id':row['id'], 'probability':p_test},ignore_index=True)
df_pred.head(5)
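Iterating row by row keeps memory use small but is slow. A vectorized sketch of the same prediction step (assuming testX and df_test_set returned by loadDataSplit above):
X_tensor_all = XnumpyToTensor(testX)                     # all test rows in one tensor
probs = net(X_tensor_all).data.cpu().numpy().squeeze()   # one probability per row
df_pred_fast = pd.DataFrame({'id': df_test_set['id'], 'probability': probs})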
In [ ]:
# df_pred.id=df_pred.id.astype(int)
def savePred(df_pred, loss):
# csv_path = 'pred/p_{}_{}_{}.csv'.format(loss, name, (str(time.time())))
    csv_path = 'pred/pred_{}_{}.csv'.format(loss, str(time.time()))
    df_pred.to_csv(csv_path, columns=('id', 'probability'), index=False)
print (csv_path)
savePred (df_pred, log_loss(target_y, pred_y))