Imports, Utility Functions, Constants


In [ ]:
from IPython.display import FileLink
import time
import re
import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision.utils import make_grid
from PIL import Image
#from skimage import io #, transform
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# import torch.utils.trainer as trainer
# import torch.utils.trainer.plugins
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import numpy as np
import pandas as pd
import os
import shutil, errno
from tqdm import tqdm_notebook
import data_science.j_utils as j_utils
# from torchsample.modules import ModuleTrainer
# from torchsample.metrics import CategoricalAccuracy

%matplotlib notebook
# %pdb

In [ ]:
# Set some path stuff
path = "data/nature_conservancy_fish/"
# path = "data/statefarm/sample/"
use_cuda = torch.cuda.is_available()
print('Using CUDA:', use_cuda)

traindir = os.path.join(path, 'train')
validdir = os.path.join(path, 'valid')
test1dir = os.path.join(path, 'test_stg1')
test2dir = os.path.join(path, 'test_stg2')

Make validation set


In [ ]:
os.listdir(traindir)

In [ ]:
test1_files = os.listdir(test1dir)
test1_files.sort()
test2_files = os.listdir(test2dir)
test2_files.sort()

In [ ]:
test1_files

In [ ]:
test2_files

In [ ]:
inputpath = traindir
outputpath = validdir

for dirpath, dirnames, filenames in os.walk(inputpath):
    print('__________________________________________')
    classes = dirpath[len(inputpath)+1:]
    structure = os.path.join(outputpath, classes)
    if not os.path.isdir(structure):
        os.mkdir(structure)
        print('For class {0}, moving files to validation set'.format(classes))
        ori_n = len(filenames)
        if ori_n > 10:
            valid_to_move = np.random.choice(filenames, int(len(filenames)/10), replace=False)
            for file in valid_to_move:
                os.rename(os.path.join(dirpath, file),os.path.join(structure, file))
            moved_n = len(os.listdir(structure))
            print('Originally {0} files in {1}, moved {2} to {3}'.format(ori_n,dirpath,moved_n,structure))
        else:
            print('No files to move to validation set for {0}'.format(dirpath))
        
    else:
        print('The folder {0} already exists. Check that it has files moved for validation set'.format(structure))

In [ ]:
# Run this block to move more images from training set to validation set
move_more = False

if move_more:
    inputpath = traindir
    outputpath = validdir

    for dirpath, dirnames, filenames in os.walk(inputpath):
        print('__________________________________________')
        classes = dirpath[len(inputpath)+1:]
        structure = os.path.join(outputpath, classes)
        ori_n = len(filenames)
        if ori_n > 10:
            valid_to_move = np.random.choice(filenames, int(len(filenames)/10), replace=False)
            for file in valid_to_move:
                os.rename(os.path.join(dirpath, file),os.path.join(structure, file))
            moved_n = len(os.listdir(structure))
            print('Originally {0} files in {1}, moved {2} to {3}'.format(ori_n,dirpath,moved_n,structure))
        else:
            print('No files to move to validation set for {0}. Not enough?'.format(dirpath))

In [ ]:
# verifying nothing in validation set is in training set

dirs = [d for d in os.listdir(validdir) if os.path.isdir(os.path.join(validdir, d))]
for dirname in dirs:
    v_set = set(os.listdir(os.path.join(validdir,dirname)))
    t_set = set(os.listdir(os.path.join(traindir,dirname)))
    print(len(v_set), len(t_set))
    if len(v_set.intersection(t_set)) > 0:
        print('Problem')

Make sample folder


In [ ]:
# !rm -rf data/statefarm/sample/

In [ ]:
samp_path = "data/nature_conservancy_fish/sample/"
if not os.path.isdir(samp_path):
    os.mkdir(samp_path)

In [ ]:
inputpath = path
outputpath = samp_path

for dirpath, dirnames, filenames in os.walk(inputpath):
    # skip anything already under the sample folder so we don't copy it into itself
    if os.path.abspath(dirpath).startswith(os.path.abspath(samp_path)):
        continue
    structure = os.path.join(outputpath, dirpath[len(inputpath):])
    if not os.path.isdir(structure):
        os.mkdir(structure)
        if len(filenames) > 0:
            try:
                files_to_copy = np.random.choice(filenames, 20, replace=False)
            except ValueError:  # fewer than 20 files available, copy them all
                files_to_copy = filenames
            for file in files_to_copy:
                j_utils.copyanything(os.path.join(dirpath, file), os.path.join(structure, file))
    else:
        print("Folder {0} already exists!".format(structure))

In [ ]:
os.listdir('data/nature_conservancy_fish/')

In [ ]:
os.listdir('data/nature_conservancy_fish/sample')

In [ ]:
# spot-check one class: the sample images should all exist in the full training set
samp_class = os.listdir(os.path.join(samp_path, 'train'))[0]
set(os.listdir(os.path.join(samp_path, 'train', samp_class))).intersection(
    set(os.listdir(os.path.join(traindir, samp_class))))

Show one image


In [ ]:
train_fullpaths = j_utils.get_image_fullpaths_clf(traindir, '.jpg')
example_image = j_utils.get_example_image(train_fullpaths)

In [ ]:
j_utils.show_image(example_image)

Figure out how to process the example image so the model will accept it

Need the mean and std of the RGB channels to normalize by


In [ ]:
# mean_rgb = j_utils.get_mean_rgb(train_fullpaths)
# std_dev_rgb = j_utils.get_std_dev_rgb(train_fullpaths, mean_rgb)
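
A minimal sketch of how the per-channel mean and std could be estimated from the training images (an assumption about what the j_utils helpers do; it averages per-image statistics for simplicity):


In [ ]:
# Rough per-channel RGB mean/std over a list of image paths (a sketch, not the
# actual j_utils implementation); averages per-image statistics for simplicity
def estimate_mean_std(fullpaths):
    means, stds = [], []
    for fp in fullpaths:
        img = np.asarray(Image.open(fp).convert('RGB'), dtype=np.float32) / 255.
        means.append(img.reshape(-1, 3).mean(axis=0))
        stds.append(img.reshape(-1, 3).std(axis=0))
    return np.mean(means, axis=0), np.mean(stds, axis=0)

# mean_rgb, std_dev_rgb = estimate_mean_std(train_fullpaths)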

Transforms


In [ ]:
# from statefarm sample itself
# mean_rgb = [ 0.314,  0.380,  0.373] 
# std_dev_rgb = [ 0.291,  0.333,  0.335]
# from whatever the pretrained for finetuning was on
mean_rgb = [0.485, 0.456, 0.406]
std_dev_rgb = [0.229, 0.224, 0.225]

# transforms for model input, plus transforms to view the transformed images
comp_tsfm = transforms.Compose([
        transforms.Lambda(lambda img: img.resize((250,250))),
        transforms.Pad(10),
        transforms.RandomCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean_rgb, std=std_dev_rgb)])
tsfm_ed_image = transforms.Compose([transforms.ToPILImage()])
rvrs_tsfm = transforms.Compose([
    j_utils.UnNormalize(mean=mean_rgb, std=std_dev_rgb),
    transforms.ToPILImage()
])
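
j_utils.UnNormalize reverses transforms.Normalize so tensors can be viewed as images again; a minimal sketch of such a transform (hypothetical, for reference only):


In [ ]:
# Sketch of an un-normalize transform (the real one is j_utils.UnNormalize):
# multiply each channel by its std and add back its mean, in place
class UnNormalizeSketch(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        for t, m, s in zip(tensor, self.mean, self.std):
            t.mul_(s).add_(m)
        return tensor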

In [ ]:
tsfm_ed_image(comp_tsfm(example_image))

In [ ]:
rvrs_tsfm(comp_tsfm(example_image))

Get datasets ready


In [ ]:
batch_size = 64
train_dataset = j_utils.get_image_dataset(traindir,tsfm=comp_tsfm)
train_loader = j_utils.get_loader(train_dataset, use_cuda, batch_size=batch_size, shuffle=True)
valid_dataset = j_utils.get_image_dataset(validdir, tsfm=comp_tsfm)
valid_loader = j_utils.get_loader(valid_dataset, use_cuda, batch_size=batch_size, shuffle=False)

In [ ]:
classes = train_dataset.classes
n_classes = len(classes)

With what's best so far in model zoo


In [ ]:
# model = models.resnet152(pretrained=True)
# model_name = 'Resnet152'
model = models.vgg19_bn(pretrained=True)
model_name = 'Vgg19'

Precompute activations up to (but not including) the classifier


In [ ]:
if model_name == 'Resnet152':    
    model.fc = nn.Dropout(p=0.0) # identity
elif model_name == 'Vgg19':
    model.classifier = nn.Dropout(p=0.0)
    
# Freeze all params
for param in model.parameters():
    param.requires_grad = False
    
# Move to gpu
if use_cuda:
    model.cuda()
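
With the classifier replaced by Dropout(p=0.0), the network now outputs the flattened features that fed the classifier; a quick sanity check (the 25088 figure assumes VGG19's 512x7x7 conv output):


In [ ]:
# Sanity check: the Dropout(p=0.0) head should pass the flattened conv features
# through unchanged; for VGG19 that is 512*7*7 = 25088 features per image
model.eval()
dummy = torch.randn(1, 3, 224, 224)
if use_cuda:
    dummy = dummy.cuda()
print(model(Variable(dummy)).size())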

In [ ]:
import importlib; importlib.reload(j_utils)

In [ ]:
test1_dataset = j_utils.TestDataset(test1dir, comp_tsfm)
test1_loader = j_utils.get_loader(test1_dataset, use_cuda, shuffle=False)
test2_dataset = j_utils.TestDataset(test2dir, comp_tsfm)
test2_loader = j_utils.get_loader(test2_dataset, use_cuda, shuffle=False)
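
j_utils.TestDataset has no labels, so it presumably returns each transformed image together with its filename; a minimal sketch under that assumption:


In [ ]:
# Sketch of an unlabeled test Dataset (hypothetical; the real one is
# j_utils.TestDataset): returns (transformed image, filename) so predictions
# can be matched back to files
class TestDatasetSketch(Dataset):
    def __init__(self, root, tsfm):
        self.root = root
        self.tsfm = tsfm
        self.samples = sorted(f for f in os.listdir(root) if f.lower().endswith('.jpg'))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        fname = self.samples[idx]
        img = Image.open(os.path.join(self.root, fname)).convert('RGB')
        return self.tsfm(img), fname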

In [ ]:
verify1 = test1_dataset.samples
verify1.sort()
verify1_ = os.listdir(test1dir)
verify1_.sort()
for a, b in zip(verify1, verify1_):
    if a != b:
        print('problem')
if len(verify1) != len(verify1_):
    print('length problem')
verify2 = test2_dataset.samples
verify2.sort()
verify2_ = os.listdir(test2dir)
verify2_.sort()
for a, b in zip(verify2, verify2_):
    if a != b:
        print('problem')
if len(verify2) != len(verify2_):
    print('length problem')

In [ ]:
j_utils.save_precompute(*j_utils.precompute_vals(model, train_loader), path, model_name, 'train_precomputed.pth')
j_utils.save_precompute(*j_utils.precompute_vals(model, valid_loader), path, model_name, 'valid_precomputed.pth')
j_utils.save_precompute(*j_utils.precompute_vals(model, test1_loader), path, model_name, 'test1_precomputed.pth')
j_utils.save_precompute(*j_utils.precompute_vals(model, test2_loader), path, model_name, 'test2_precomputed.pth')
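
precompute_vals presumably runs the frozen model over a loader once and collects its outputs alongside the labels (or test ids); a minimal sketch under that assumption:


In [ ]:
# Sketch of feature precomputation (hypothetical; the real helper is
# j_utils.precompute_vals): one pass of the frozen model over the loader
def precompute_vals_sketch(model, loader):
    model.eval()
    feats, targets = [], []
    for images, ys in loader:
        if use_cuda:
            images = images.cuda()
        out = model(Variable(images))
        feats.append(out.cpu().data)
        targets.append(ys)  # label tensors for train/valid, filename tuples for the test sets
    return torch.cat(feats), targets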

Make datasets from precomputed


In [ ]:
# load_train = path+'save_precom_Resnet152/train_precomputed.pth'
# load_valid = path+'save_precom_Resnet152/valid_precomputed.pth'
# load_test1 = path+'save_precom_Resnet152/test1_precomputed.pth'
# load_test2 = path+'save_precom_Resnet152/test2_precomputed.pth'

load_train = path+'save_precom_Vgg19/train_precomputed.pth'
load_valid = path+'save_precom_Vgg19/valid_precomputed.pth'
load_test1 = path+'save_precom_Vgg19/test1_precomputed.pth'
load_test2 = path+'save_precom_Vgg19/test2_precomputed.pth'

In [ ]:
X_train, y_train = torch.load(load_train)
X_valid, y_valid = torch.load(load_valid)
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

In [ ]:
p_train_dataset = j_utils.get_dataset(X_train, y_train)
p_valid_dataset = j_utils.get_dataset(X_valid, y_valid)
p_train_loader = j_utils.get_loader(p_train_dataset, use_cuda, 64*4, shuffle=True)
p_valid_loader = j_utils.get_loader(p_valid_dataset, use_cuda, 64*4, shuffle=True)
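
get_dataset and get_loader are custom helpers; with plain PyTorch the precomputed tensors can be wrapped roughly like this (a sketch only; assumes X_train and y_train are tensors):


In [ ]:
# Roughly equivalent to the j_utils helpers in plain PyTorch (for reference)
from torch.utils.data import TensorDataset
alt_train_dataset = TensorDataset(X_train, y_train)
alt_train_loader = DataLoader(alt_train_dataset, batch_size=64*4, shuffle=True, pin_memory=use_cuda)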

Train single layer as classifier


In [ ]:
input_dim = X_train.shape[1]
hly1_n = int(input_dim/4)
hly2_n = int(hly1_n/4)
hly3_n = int(hly2_n/4)

class Classifier(nn.Module):
    
    def __init__(self):
        super(Classifier, self).__init__()
#         self.hl1 = nn.Linear(input_dim, hly1_n)
        self.hl1_bn = nn.BatchNorm1d(input_dim)
#         self.hl2 = nn.Linear(hly1_n, hly2_n)
#         self.hl2_bn = nn.BatchNorm1d(hly2_n)
#         self.hl3 = nn.Linear(hly2_n, hly3_n)
#         self.hl3_bn = nn.BatchNorm1d(hly3_n)
        self.out = nn.Linear(input_dim, n_classes)
#         self.dropout_5 = nn.Dropout(p=.5)
        self.dropout_7 = nn.Dropout(p=.7)
#         self.dropout_9 = nn.Dropout(p=.9)
#         self.dropout_crazy = nn.Dropout(p=.995)
#         self.dropout_1 = nn.Dropout(p=.1)
        
    def forward(self, x):
#         x = F.leaky_relu(self.hl1_bn(self.hl1(x)))
#         x = self.dropout_9(x)
#         x = F.leaky_relu(self.hl2_bn(self.hl2(x)))
#         x = self.dropout_9(x)
#         x = F.leaky_relu(self.hl3_bn(self.hl3(x)))
        x = self.dropout_7(x)
        x = F.leaky_relu(self.out(self.hl1_bn(x)))
        return x
        
model_clf = Classifier()

Define loss and optimizer and savedir


In [ ]:
if use_cuda:
    model_clf.cuda()
criterion = nn.CrossEntropyLoss()
weight_decay = 0.0001
lr = 0.0001
# optimizer = j_utils.Nadam(model.parameters(), lr=lr, weight_decay = weight_decay,)
optimizer = optim.Adam(model_clf.parameters(), lr=lr, weight_decay = weight_decay,)
# optimizer = optim.SGD(model_clf.parameters(), lr=lr, weight_decay = weight_decay, momentum=0.9)
savedir = j_utils.make_savedir(path, model_name)
variance_pct_thrsh = .05

In [ ]:
import importlib; importlib.reload(j_utils)

Train


In [ ]:
# calling train model continually increases memory....
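
The memory growth noted above is usually caused, in this generation of PyTorch, by holding onto loss Variables (and their graphs) between iterations; a general cleanup between runs (not part of the original notebook) looks like this:


In [ ]:
# General cleanup between training runs (not the original train_model code):
# inside training loops, accumulating loss.data[0] instead of the loss Variable
# avoids keeping the autograd graph alive; gc and empty_cache release what's left
import gc
gc.collect()
if use_cuda:
    torch.cuda.empty_cache()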

In [ ]:
try:
    del g_epoch
except:
    pass

In [ ]:
try:
    g_epoch
except NameError:
    g_epoch = 1
g_epoch = j_utils.train_model(model_clf, model_name, p_train_loader, p_valid_loader, optimizer, criterion, n_epochs=250, save_epoch = 20, savedir=savedir, variance_pct_thrsh=variance_pct_thrsh, patience_epoch=10, g_epoch = g_epoch, pct_change=.02, decay_rate=.5, continue_training=True, verbose=False, lr_scheduler=False, early_stop=False)

In [ ]:
# best validation from epoch 4232 to 4771
# load Vgg19_180
save_path = 'data/nature_conservancy_fish/save_Vgg19/Vgg19_best'

In [ ]:
model_clf.load_state_dict(torch.load(save_path))

In [ ]:
j_utils.check_accuracy(p_train_loader, model_clf)

In [ ]:
j_utils.check_accuracy(p_valid_loader, model_clf)

Make predictions from test images; pseudolabel


In [ ]:
import importlib; importlib.reload(j_utils)

In [ ]:
def predict_testset(data_loader, use_cuda, model, softmax=False):
    model.eval()
    ids_list = []
    predictions_list = []
    k = 0
    for i, data in enumerate(data_loader):
        images, ids = data
        k += len(ids)
        if use_cuda:
            images = images.cuda()
        predictions = model(Variable(images))
        ids_list.extend(ids)
        if not softmax:
            # predictions.max(1)[1] returns the indices of the max predictions
            predictions_list.extend(predictions.max(1)[1].cpu().data.numpy())
        else:
            predictions_list.extend(nn.functional.softmax(predictions).cpu().data.numpy())
    print('Finished predicting for {0} images'.format(k))
    return list(zip(ids_list, predictions_list))

def predictions_vs_pics(iterator, use_cuda, model):
    # show a grid of images alongside the model's predictions
    # (rvrs_tsfm_tensor, show and get_prediction_classes_strings are assumed to be defined elsewhere, e.g. in j_utils)
    model.eval()
    images, _ = next(iterator)
    img_list = [rvrs_tsfm_tensor(img) for img in images]
    show(make_grid(img_list, padding=100))
    if use_cuda:
        images = images.cuda()
    predictions = model(Variable(images))
    predictions_string = get_prediction_classes_strings(classes, predictions)
    print('Predictions: ', predictions_string)

In [ ]:
X_test1, y_test1 = torch.load(load_test1)
X_test2, y_test2 = torch.load(load_test2)
print(X_test1.shape, y_test1.shape)
print(X_test2.shape, y_test2.shape)
p_test1_dataset = j_utils.get_dataset(X_test1, y_test1)
p_test2_dataset = j_utils.get_dataset(X_test2, y_test2)
p_test1_loader = j_utils.get_loader(p_test1_dataset, use_cuda, 64*4, shuffle=False)
p_test2_loader = j_utils.get_loader(p_test2_dataset, use_cuda, 64*4, shuffle=False)

In [ ]:
examine_test1 = set(os.listdir(test1dir))

In [ ]:
examine_test2 = set(os.listdir(test2dir))

In [ ]:
examine_test1

In [ ]:
ls data/nature_conservancy_fish/test_stg1/

In [ ]:
'image_00005.jpg' in examine_test1

In [ ]:
'image_00005.jpg' in examine_test2

In [ ]:
examine_test1.intersection(examine_test2)

In [ ]:
testset1_ids = set( _[0] for _ in y_test1)

In [ ]:
testset2_ids = set( _[0] for _ in y_test2)

In [ ]:
len(testset1_ids.intersection(testset2_ids))

In [ ]:
examined = pd.Series([_[0] for _ in y_test2])

In [ ]:
examined[examined=='06845']

In [ ]:
testset1_ids

In [ ]:
'06845' in testset2_ids

In [ ]:


In [ ]:
results1 = predict_testset(p_test1_loader, use_cuda, model_clf, softmax=True)
results2 = predict_testset(p_test2_loader, use_cuda, model_clf, softmax=True)

In [ ]:
results_dict1 = {}
for result1 in results1:
    results_dict1['img_{0}.jpg'.format(str(result1[0][0]))] = np.clip(result1[1], 0.001, 0.999)
    
results_dict2 = {}
for result2 in results2:
    results_dict2['img_{0}.jpg'.format(str(result2[0][0]))] = np.clip(result2[1], 0.001, 0.999)

In [ ]:
results_frame1 = pd.DataFrame.from_dict(results_dict1, orient='index')
results_frame2 = pd.DataFrame.from_dict(results_dict2, orient='index')
# results_frame.sort_values('id', inplace=True)
# results_frame['label'] = np.clip(results_frame['label'], 0.025, 0.975)

In [ ]:
class_mapper = {k: v for k,v in zip(range(len(classes)), classes)}
test1_pseudolabels = results_frame1.idxmax(axis=1)
test1_pseudolabels = test1_pseudolabels.map(class_mapper)
test2_pseudolabels = results_frame2.idxmax(axis=1)
test2_pseudolabels = test2_pseudolabels.map(class_mapper)

export a csv and see how I do on test1


In [ ]:
results_frame = results_frame1.append(results_frame2)

In [ ]:
to_csv = results_frame.reset_index()
to_csv.columns = ['image'] + classes

In [ ]:
to_csv.to_csv(os.path.join(path,'test_combined_train_only_submission.csv'), index=False)

In [ ]:
to_csv

In [ ]:
to_csv['image'].value_counts(dropna=False)

In [ ]:
FileLink(os.path.join(path,'test_combined_train_only_submission.csv'))

Create a test1_pseudolabel folder with class subfolders and place the test images by predicted class (see the sketch below)
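
A minimal sketch (hypothetical, not from the original notebook) of copying each test1 image into a class subfolder named by its pseudolabel; it assumes the filenames in the test1_pseudolabels index match the files in test_stg1:


In [ ]:
# Hypothetical sketch: lay out pseudolabeled test1 images into class subfolders
pseudo_dir = os.path.join(path, 'test1_pseudolabel')
if not os.path.isdir(pseudo_dir):
    os.mkdir(pseudo_dir)
for fname, label in zip(test1_pseudolabels.index, test1_pseudolabels.values):
    label_dir = os.path.join(pseudo_dir, label)
    if not os.path.isdir(label_dir):
        os.mkdir(label_dir)
    src = os.path.join(test1dir, fname)
    if os.path.isfile(src):
        shutil.copy(src, os.path.join(label_dir, fname))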


In [ ]:
sample_submission = pd.read_csv(os.path.join(path,'sample_submission_stg2.csv'))

In [ ]:
sample_submission

In [ ]:
sample_submission[sample_submission.isnull()]

In [ ]:
test1_pseudolabels

In [ ]:
# results_frame.reset_index(inplace=True)
# results_frame.columns = sample_submission.columns

In [ ]:
results_frame.to_csv(path + 'submission_statefarm.csv', index=False, index_label=False)

In [ ]:
from IPython.display import FileLink

In [ ]:
FileLink(path + 'submission_statefarm.csv')

In [ ]:
results_frame.shape

In [ ]:
len(os.listdir(test1dir)) + len(os.listdir(test2dir))
