Web: https://www.meetup.com/Tel-Aviv-Deep-Learning-Bootcamp/events/241762893/
Notebooks: On GitHub
Shlomo Kashani
Indeed, most of the existing PyTorch examples use images, while here we have a CSV with 21 features. Using Conv1d before or after a Linear layer requires reshaping, and this reshaping is the whole point of this tutorial.
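To make that concrete up front, here is a minimal sketch of the Linear -> reshape -> Conv1d round trip (a sketch only; the dimensions are illustrative, not the exact layer sizes used below):
import torch
from torch import nn
from torch.autograd import Variable

batch, n_feature, n_mult = 4, 21, 8
x = Variable(torch.randn(batch, n_feature))      # 2d CSV-style input: (N, 21)
x = nn.Linear(n_feature, n_feature * n_mult)(x)  # (N, 168)
x = x.view(batch, n_feature, n_mult)             # (N, C=21, L=8), the (N, C, L) shape Conv1d expects
x = nn.Conv1d(n_feature, n_feature, kernel_size=3, padding=1)(x)  # (N, 21, 8)
x = x.view(batch, -1)                            # flatten back to 2d for the next Linear layer
print(x.size())                                  # torch.Size([4, 168])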
The CNN architecture itself is naive and by no means optimized; hopefully I will improve it over time, and I am working on a second CNN-based version of the same problem.
This tutorial was written to demonstrate a fully working example of a PyTorch CNN on a real-world use case, namely a Binary Classification problem.
If you are interested in the sk-learn version of this problem, please refer to: https://github.com/QuantScientist/deep-ml-meetups/tree/master/hacking-kaggle/python/numer-ai
For the scientific foundation behind Binary Classification and Logistic Regression, refer to: https://github.com/QuantScientist/Deep-Learning-Boot-Camp/tree/master/Data-Science-Interviews-Book
Every step, from reading the CSV into numpy arrays, converting to GPU-based tensors, training, and validating, is meant to aid newcomers in their first steps in PyTorch.
Additionally, commonly used Kaggle metrics such as ROC_AUC and LOG_LOSS are logged and plotted for both the training set and the validation set.
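Both metrics come straight from scikit-learn; for reference, a tiny sketch on made-up labels (the numbers are illustrative only):
import numpy as np
from sklearn.metrics import log_loss, roc_auc_score

y_true = np.array([0, 0, 1, 1])
y_prob = np.array([0.1, 0.4, 0.35, 0.8])  # predicted probabilities
print(roc_auc_score(y_true, y_prob))      # 0.75
print(log_loss(y_true, y_prob))           # ~0.47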
In [33]:
# !pip install pycuda
%reset -f
# %%timeit
from __future__ import print_function
from __future__ import division

import math
import os
import sys
import time
import logging

import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = (6, 6)  # set the default size of plots

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from torchvision import transforms

from sklearn import metrics
from sklearn.metrics import roc_auc_score, log_loss, roc_curve, auc
from sklearn.cross_validation import StratifiedKFold, ShuffleSplit, cross_val_score, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, PolynomialFeatures

import tensorflow as tf
print("tensorflow:" + tf.__version__)
!set "KERAS_BACKEND=tensorflow"

# Environment report
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
# !pip install http://download.pytorch.org/whl/cu75/torch-0.2.0.post1-cp27-cp27mu-manylinux1_x86_64.whl
# !pip install torchvision
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("PyTorch: ", torch.__version__)
print("Numpy: ", np.__version__)

logging.basicConfig(level=logging.INFO)
lgr = logging.getLogger(__name__)

# !pip install psutil
import psutil

def cpuStats():
    """Print CPU and memory usage of the current process."""
    print(sys.version)
    print(psutil.cpu_percent())
    print(psutil.virtual_memory())  # physical memory usage
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2. ** 30  # resident memory in GB
    print('memory GB:', memoryUse)

cpuStats()

# %%timeit
use_cuda = torch.cuda.is_available()
# use_cuda = False
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
Tensor = FloatTensor
lgr.info("USE CUDA=" + str(use_cuda))
In [34]:
# NN params
LR = 0.005
MOMENTUM = 0.9
# fix the random seed for reproducibility
seed = 17 * 19
np.random.seed(seed)
torch.manual_seed(seed)
if use_cuda:
    torch.cuda.manual_seed(seed)
In [35]:
# Data params
TARGET_VAR = 'target'
TOURNAMENT_DATA_CSV = 'numerai_tournament_data.csv'
TRAINING_DATA_CSV = 'numerai_training_data.csv'
BASE_FOLDER = 'numerai/'
df_train = pd.read_csv(BASE_FOLDER + TRAINING_DATA_CSV)
df_train.head(5)
Out[35]:
In [36]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from collections import defaultdict
import spectrum

def featureFFT(y):
    """Magnitude spectrum of a real FFT, assuming a 400 Hz sampling rate."""
    fs = 400
    n = len(y)
    dt = 1 / float(fs)               # time resolution
    fft_output = np.fft.rfft(y)      # perform a real FFT
    rfreqs = np.fft.rfftfreq(n, dt)  # calculate the frequency bins
    fft_mag = np.abs(fft_output)     # take only the magnitude of the spectrum
    fft_mag = fft_mag * 2 / n
    return fft_mag

def featureAR(ch):
    """Magnitudes of the Yule-Walker AR(8) coefficients."""
    ar_coeffs, dnr, reflection_coeffs = spectrum.aryule(ch, order=8)
    return np.abs(ar_coeffs)

def featureEnt(row, base=2):
    x_ent = -((np.log(row) / np.log(base)) * row).sum(axis=0)
    return x_ent

def featureShannonEnt(row):
    """Shannon entropy of a row, normalized so that it sums to one."""
    row = row.div(row.sum())
    return -sum([p * math.log(p) for p in row if p != 0])

def enrichFeatures(row):
    """Derive FFT, entropy and AR features from a single row."""
    x_fft = featureFFT(row)
    x_ent = featureShannonEnt(row)
    x_ar = featureAR(row)
    s = pd.Series({'x_ent': x_ent, 'ar1': x_ar[0], 'ar2': x_ar[1], 'ar3': x_ar[2], 'ar4': x_ar[3], 'ar5': x_ar[4],
                   'ar6': x_ar[5], 'ar7': x_ar[6], 'ar8': x_ar[7],
                   'x_fft1': x_fft[0], 'x_fft2': x_fft[1], 'x_fft3': x_fft[2], 'x_fft4': x_fft[3],
                   'x_fft5': x_fft[4], 'x_fft6': x_fft[5], 'x_fft7': x_fft[6], 'x_fft8': x_fft[7],
                   'x_fft9': x_fft[8], 'x_fft10': x_fft[9], 'x_fft11': x_fft[10]})
    return s

def genBasicFeatures(inDF):
    """Append the enrichFeatures() columns to the 21 raw NumerAI features."""
    print('Generating basic features ...')
    df_copy = inDF.copy(deep=True)
    magicNumber = 21  # number of raw NumerAI features
    feature_cols = list(inDF.columns)
    # Simple aggregates (mean, median, std, skew, kurtosis, var, max, min) were also tried here
    # and are left out of this version.
    # http://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns
    inDF = inDF.merge(df_copy.ix[:, 0:magicNumber].apply(lambda row: enrichFeatures(row), axis=1),
                      left_index=True, right_index=True)
    return inDF
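A quick way to sanity-check these helpers on synthetic data before wiring them into the pipeline (a sketch; assumes the spectrum package is installed):
row = pd.Series(np.random.rand(21))  # one synthetic row with 21 positive features
s = enrichFeatures(row)
print(len(s))  # 20 derived features: x_ent, ar1..ar8, x_fft1..x_fft11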
# Train, Validation, Test Split
def loadDataSplit():
    df_train = pd.read_csv(BASE_FOLDER + TRAINING_DATA_CSV)
    # TOURNAMENT_DATA_CSV holds both the validation and the test data provided by NumerAI
    df_test_valid = pd.read_csv(BASE_FOLDER + TOURNAMENT_DATA_CSV)

    # Training set: strip the meta columns, then round-trip through CSV to force float32
    answers_1_SINGLE = df_train[TARGET_VAR]
    df_train.drop(TARGET_VAR, axis=1, inplace=True)
    df_train.drop('id', axis=1, inplace=True)
    df_train.drop('era', axis=1, inplace=True)
    df_train.drop('data_type', axis=1, inplace=True)
    # df_train = genBasicFeatures(df_train)
    df_train.to_csv(BASE_FOLDER + TRAINING_DATA_CSV + 'clean.csv', header=False, index=False)
    df_train = pd.read_csv(BASE_FOLDER + TRAINING_DATA_CSV + 'clean.csv', header=None, dtype=np.float32)
    df_train = pd.concat([df_train, answers_1_SINGLE], axis=1)
    feature_cols = list(df_train.columns[:-1])
    target_col = df_train.columns[-1]
    trainX, trainY = df_train[feature_cols], df_train[target_col]

    # Validation set: the rows of the tournament file marked 'validation'
    df_validation_set = df_test_valid.loc[df_test_valid['data_type'] == 'validation']
    df_validation_set = df_validation_set.copy(deep=True)
    answers_1_SINGLE_validation = df_validation_set[TARGET_VAR]
    df_validation_set.drop(TARGET_VAR, axis=1, inplace=True)
    df_validation_set.drop('id', axis=1, inplace=True)
    df_validation_set.drop('era', axis=1, inplace=True)
    df_validation_set.drop('data_type', axis=1, inplace=True)
    # df_validation_set = genBasicFeatures(df_validation_set)
    df_validation_set.to_csv(BASE_FOLDER + TRAINING_DATA_CSV + '-validation-clean.csv', header=False, index=False)
    df_validation_set = pd.read_csv(BASE_FOLDER + TRAINING_DATA_CSV + '-validation-clean.csv', header=None, dtype=np.float32)
    df_validation_set = pd.concat([df_validation_set, answers_1_SINGLE_validation], axis=1)
    feature_cols = list(df_validation_set.columns[:-1])
    target_col = df_validation_set.columns[-1]
    valX, valY = df_validation_set[feature_cols], df_validation_set[target_col]

    # Test set: keep the id column aside for the submission file
    df_test_set = pd.read_csv(BASE_FOLDER + TOURNAMENT_DATA_CSV)
    df_test_set = df_test_set.copy(deep=True)
    df_test_set.drop(TARGET_VAR, axis=1, inplace=True)
    tid_1_SINGLE = df_test_set['id']
    df_test_set.drop('id', axis=1, inplace=True)
    df_test_set.drop('era', axis=1, inplace=True)
    df_test_set.drop('data_type', axis=1, inplace=True)
    # df_test_set = genBasicFeatures(df_test_set)
    feature_cols = list(df_test_set.columns)  # must be computed here; we don't want the id column
    df_test_set = pd.concat([tid_1_SINGLE, df_test_set], axis=1)
    testX = df_test_set[feature_cols].values

    return trainX, trainY, valX, valY, testX, df_test_set
In [37]:
trainX, trainY, valX, valY, testX, df_test_set = loadDataSplit()
print (trainX.shape)
print (trainY.shape)
print (valX.shape)
print (valY.shape)
print (testX.shape)
print (df_test_set.shape)
In [6]:
trainX.head(5)  # first rows of the training features (genBasicFeatures is currently commented out)
Out[6]:
In [38]:
# Convert the np arrays into the correct dimension and type.
# Note that BCELoss requires Float in X as well as in y.
def XnumpyToTensor(x_data_np):
    x_data_np = np.array(x_data_np.values, dtype=np.float32)
    print(x_data_np.shape)
    print(type(x_data_np))
    if use_cuda:
        lgr.info("Using the GPU")
        X_tensor = Variable(torch.from_numpy(x_data_np).cuda())  # Note the conversion for pytorch
    else:
        lgr.info("Using the CPU")
        X_tensor = Variable(torch.from_numpy(x_data_np))  # Note the conversion for pytorch
    print(type(X_tensor.data))  # should be 'torch.cuda.FloatTensor'
    print(X_tensor.data.shape)  # e.g. torch.Size([108405, 21])
    return X_tensor

# Convert the np arrays into the correct dimension and type.
# Note that BCELoss requires Float in X as well as in y.
def YnumpyToTensor(y_data_np):
    # go through np.array() so this also works when a pandas Series is passed in
    y_data_np = np.array(y_data_np).reshape((y_data_np.shape[0], 1))  # Must be reshaped for PyTorch!
    print(y_data_np.shape)
    print(type(y_data_np))
    if use_cuda:
        lgr.info("Using the GPU")
        Y_tensor = Variable(torch.from_numpy(y_data_np)).type(torch.FloatTensor).cuda()  # BCELoss requires Float
    else:
        lgr.info("Using the CPU")
        Y_tensor = Variable(torch.from_numpy(y_data_np)).type(torch.FloatTensor)  # BCELoss requires Float
    print(type(Y_tensor.data))  # should be 'torch.cuda.FloatTensor'
    print(y_data_np.shape)
    print(type(y_data_np))
    return Y_tensor
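Why the reshape to (N, 1)? BCELoss expects the target tensor to have the same shape as the network output, which here is (N, 1). A minimal sketch (uses the imports above):
y = np.array([1., 0., 1.], dtype=np.float32)
print(torch.from_numpy(y).size())                  # torch.Size([3])    -- 1d, shape mismatch
print(torch.from_numpy(y.reshape((3, 1))).size())  # torch.Size([3, 1]) -- matches the net's output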
In [39]:
# References:
# https://github.com/vinhkhuc/PyTorch-Mini-Tutorials/blob/master/5_convolutional_net.py
# https://gist.github.com/spro/c87cc706625b8a54e604fb1024106556
# use_cuda=False
X_tensor_train = XnumpyToTensor(trainX)  # a 3d tensor would be (N, C, L); here we still have a 2d tensor
X_shape = X_tensor_train.data.size()

# Dimensions
N_FEATURES = trainX.shape[1]           # number of features for the input layer
NUM_ROWS_TRAINNING = trainX.shape[0]   # number of rows
N_MULT_FACTOR = 8                      # no meaning except for being divisible by 2; min should be 4
N_HIDDEN = N_FEATURES * N_MULT_FACTOR  # size of the first linear layer
N_CNN_KERNEL = 3                       # CNN kernel size
MAX_POOL_KERNEL = 4

DEBUG_ON = False

def debug(x):
    if DEBUG_ON:
        print('x.size():' + str(x.size()))

class Net2(nn.Module):
    def __init__(self, n_feature, n_hidden, n_output, n_cnn_kernel, n_mult_factor=N_MULT_FACTOR):
        super(Net2, self).__init__()
        self.n_feature = n_feature
        self.n_hidden = n_hidden
        self.n_output = n_output
        self.n_cnn_kernel = n_cnn_kernel
        self.n_mult_factor = n_mult_factor
        # size of the flattened Conv1d output that feeds the last Linear layer
        self.n_l2_hidden = self.n_hidden * (self.n_mult_factor - self.n_cnn_kernel + 3)
        self.l1 = nn.Sequential(
            torch.nn.Linear(self.n_feature, self.n_hidden),
            torch.nn.Dropout(p=1 - .85),
            torch.nn.LeakyReLU(0.1),
            torch.nn.BatchNorm1d(self.n_hidden, eps=1e-05, momentum=0.1, affine=True)
        )
        self.c1 = nn.Sequential(
            torch.nn.Conv1d(self.n_feature, self.n_hidden,
                            kernel_size=(self.n_cnn_kernel,), stride=(1,), padding=(1,)),
            torch.nn.Dropout(p=1 - .75),
            torch.nn.LeakyReLU(0.1),
            torch.nn.BatchNorm1d(self.n_hidden, eps=1e-05, momentum=0.1, affine=True)
        )
        self.out = nn.Sequential(
            torch.nn.Linear(self.n_l2_hidden, self.n_output),
        )
        self.sig = nn.Sigmoid()

    def forward(self, x):
        debug(x)
        varSize = x.data.shape[0]  # batch size; must be read here in forward() since it is dynamic
        x = self.l1(x)
        debug(x)
        # reshape for the CNN: (N, n_feature, n_mult_factor)
        x = x.view(varSize, self.n_feature, self.n_mult_factor)
        debug(x)
        x = self.c1(x)
        debug(x)
        # flatten back for the Linear layer
        x = x.view(varSize, self.n_hidden * (self.n_mult_factor - self.n_cnn_kernel + 3))
        debug(x)
        x = self.out(x)
        debug(x)
        x = self.sig(x)
        return x

net = Net2(n_feature=N_FEATURES, n_hidden=N_HIDDEN, n_output=1, n_cnn_kernel=N_CNN_KERNEL)  # define the network
if use_cuda:
    net = net.cuda()  # very important!
lgr.info(net)

b = net(X_tensor_train)
print('b.size():' + str(b.size()))
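A note on the expression n_mult_factor - n_cnn_kernel + 3 above: it is just the standard Conv1d output-length formula, L_out = L_in + 2*padding - kernel_size + 1, evaluated with padding=1, so with kernel_size=3 the sequence length is preserved. A quick sanity check (a sketch; conv1d_out_len is a hypothetical helper):
def conv1d_out_len(l_in, kernel_size, padding=1, stride=1):
    # standard Conv1d output length (dilation=1)
    return (l_in + 2 * padding - kernel_size) // stride + 1

assert conv1d_out_len(N_MULT_FACTOR, N_CNN_KERNEL) == N_MULT_FACTOR - N_CNN_KERNEL + 3
print(N_HIDDEN * conv1d_out_len(N_MULT_FACTOR, N_CNN_KERNEL))  # equals net.n_l2_hidden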
In [40]:
optimizer = torch.optim.Adam(net.parameters(), lr=LR, weight_decay=5e-4)  # L2 regularization via weight decay
loss_func = torch.nn.BCELoss()  # Binary cross entropy: http://pytorch.org/docs/nn.html#bceloss
if use_cuda:
    lgr.info("Using the GPU")
    net.cuda()
    loss_func.cuda()
lgr.info(optimizer)
lgr.info(loss_func)
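As a quick sanity check of what BCELoss computes (binary cross entropy, -[y*log(p) + (1-y)*log(1-p)] averaged over the batch), a minimal sketch on two hand-picked examples:
p = Variable(torch.FloatTensor([[0.9], [0.2]]))  # predicted probabilities
y = Variable(torch.FloatTensor([[1.0], [0.0]]))  # BCELoss requires Float targets
print(torch.nn.BCELoss()(p, y))  # -(log(0.9) + log(0.8)) / 2 ~= 0.164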
In [ ]:
import time
start_time = time.time()
epochs = 100
all_losses = []

X_tensor_train = XnumpyToTensor(trainX)
Y_tensor_train = YnumpyToTensor(trainY)
print(type(X_tensor_train.data), type(Y_tensor_train.data))  # should be 'torch.cuda.FloatTensor'

# From here onwards, we must only use PyTorch Tensors
for step in range(epochs):
    out = net(X_tensor_train)              # input x and predict based on x
    cost = loss_func(out, Y_tensor_train)  # must be (1. nn output, 2. target); the target label is NOT one-hotted
    optimizer.zero_grad()   # clear gradients for the next step
    cost.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients
    if step % 10 == 0:
        loss = cost.data[0]
        all_losses.append(loss)
        print(step, cost.data.cpu().numpy())
        prediction = (net(X_tensor_train).data).float()  # probabilities
        pred_y = prediction.cpu().numpy().squeeze()
        target_y = Y_tensor_train.cpu().data.numpy().squeeze()  # squeeze so the shape matches pred_y
        # note: exact-equality 'ACC' is only meaningful for thresholded predictions;
        # watch LOG_LOSS and ROC_AUC instead
        tu = ((pred_y == target_y).mean(), log_loss(target_y, pred_y), roc_auc_score(target_y, pred_y))
        print('ACC={}, LOG_LOSS={}, ROC_AUC={} '.format(*tu))

end_time = time.time()
print('{} {:6.3f} seconds'.format('GPU:', end_time - start_time))

plt.plot(all_losses)
plt.show()
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_y,pred_y)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('LOG_LOSS=' + str(log_loss(target_y, pred_y)))
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.6f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
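One PyTorch gotcha before evaluating: the model contains Dropout and BatchNorm1d layers, which behave differently at training and inference time, so the mode must be switched explicitly:
net.train()  # dropout active, batchnorm uses batch statistics (during training)
net.eval()   # dropout off, batchnorm uses running statistics (for validation/test)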
In [45]:
net.eval()
# Validation data
print(valX.shape)
print(valY.shape)

X_tensor_val = XnumpyToTensor(valX)
Y_tensor_val = YnumpyToTensor(valY)
print(type(X_tensor_val.data), type(Y_tensor_val.data))  # should be 'torch.cuda.FloatTensor'

predicted_val = (net(X_tensor_val).data).float()  # probabilities
# predicted_val = (net(X_tensor_val).data > 0.5).float()  # zero or one
pred_y = predicted_val.cpu().numpy()
target_y = Y_tensor_val.cpu().data.numpy()
print(type(pred_y))
print(type(target_y))

# note: with raw probabilities the 'acc' term is only meaningful after thresholding (see the commented line above)
tu = ((pred_y == target_y).mean(), log_loss(target_y, pred_y), roc_auc_score(target_y, pred_y))
print('\n')
print('acc={} log_loss={} roc_auc={} '.format(*tu))
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_y,pred_y)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('LOG_LOSS=' + str(log_loss(target_y, pred_y)))
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.6f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# print (pred_y)
In [46]:
print(df_test_set.shape)
columns = ['id', 'probability']
df_pred = pd.DataFrame(data=np.zeros((0, len(columns))), columns=columns)
df_pred.id = df_pred.id.astype(int)

for index, row in df_test_set.iterrows():
    row_no_id = row.drop('id')
    x_data_np = np.array(row_no_id.values, dtype=np.float32)
    if use_cuda:
        X_tensor_test = Variable(torch.from_numpy(x_data_np).cuda())  # Note the conversion for pytorch
    else:
        X_tensor_test = Variable(torch.from_numpy(x_data_np))  # Note the conversion for pytorch
    X_tensor_test = X_tensor_test.view(1, trainX.shape[1])  # the net does not accept 1d tensors
    predicted_val = (net(X_tensor_test).data).float()  # probabilities
    p_test = predicted_val.cpu().numpy().item()  # otherwise we get an array; we need a single float
    df_pred = df_pred.append({'id': row['id'].astype(int), 'probability': p_test}, ignore_index=True)

df_pred.head(5)
Out[46]:
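Iterating row by row is easy to follow but slow; since the network accepts a whole 2d tensor, predictions can also be produced in one pass. A vectorized sketch (assuming the test set fits in memory; df_pred_fast is a hypothetical name):
x_np = df_test_set.drop('id', axis=1).values.astype(np.float32)
X_all = Variable(torch.from_numpy(x_np).cuda()) if use_cuda else Variable(torch.from_numpy(x_np))
probs = net(X_all).data.cpu().numpy().squeeze()
df_pred_fast = pd.DataFrame({'id': df_test_set['id'].astype(int), 'probability': probs})
print(df_pred_fast.head(5))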
In [47]:
df_pred.id = df_pred.id.astype(int)

def savePred(df_pred, loss):
    # csv_path = 'pred/p_{}_{}_{}.csv'.format(loss, name, (str(time.time())))
    csv_path = 'pred/pred_{}_{}.csv'.format(loss, (str(time.time())))
    df_pred.to_csv(csv_path, columns=('id', 'probability'), index=None)
    print(csv_path)

savePred(df_pred, log_loss(target_y, pred_y))