In [1]:
import pandas as pd
import numpy as np
import cv2
import tifffile as tif
import matplotlib.pyplot as plt
import os, gc, glob
import json
from shapely import wkt
import utils
import global_vars
In [2]:
# Collect every file in the 'sixteen_band' folder whose name ends in 'M.tif'.
# The list is sorted so that later positional indexing into it is deterministic.
m_band_pattern = os.path.join(global_vars.DATA_DIR, 'sixteen_band', '*M.tif')
im_names = glob.glob(m_band_pattern)
im_names.sort()
In [3]:
# Sorted, de-duplicated grid identifiers: the part of each file name that
# precedes the first underscore.
# os.path.basename is used instead of splitting on '/' so the prefix is still
# extracted correctly when paths use a different separator (e.g. on Windows).
grids = sorted({os.path.basename(path).split('_')[0] for path in im_names})
In [4]:
def load_grid(paths, grid_size=5, out_size=(500, 500)):
    """Stitch the tiles of one grid into a single downsampled 3-channel image.

    Parameters
    ----------
    paths : sequence of str
        Tile file paths in row-major order. Only the first
        ``grid_size * grid_size`` entries are used.
    grid_size : int, optional
        Number of tiles per row and per column of the mosaic (default 5).
    out_size : tuple of int, optional
        Target size handed to ``cv2.resize`` (default ``(500, 500)``).

    Returns
    -------
    numpy.ndarray
        The stitched mosaic, resized to ``out_size`` with bicubic interpolation.

    Raises
    ------
    IndexError
        If ``paths`` holds fewer than ``grid_size * grid_size`` entries.
    """
    def read_tile(path):
        # Move the first axis to the end and keep planes 4, 2, 1 of it
        # (presumably the bands used as R, G, B for display -- TODO confirm).
        return tif.imread(path).transpose((1, 2, 0))[:, :, [4, 2, 1]]

    # Every tile is read exactly once and appears exactly once in the mosaic:
    # tiles of a row are joined side by side, then the rows are stacked.
    rows = []
    for j in range(grid_size):
        tiles = [read_tile(paths[j * grid_size + i]) for i in range(grid_size)]
        rows.append(np.concatenate(tiles, axis=1))
    mosaic = np.concatenate(rows, axis=0)

    # cv2.INTER_CUBIC is the named constant for the interpolation code 2.
    return cv2.resize(mosaic, out_size, interpolation=cv2.INTER_CUBIC)
In [5]:
# Display every grid mosaic, three per figure row.
# Each mosaic is built exactly once and placed exactly once in its row.
GRIDS_PER_ROW = 3
for row_start in range(0, len(grids), GRIDS_PER_ROW):
    row_images = []
    for grid in grids[row_start:row_start + GRIDS_PER_ROW]:
        # Select tiles by file-name prefix (the same rule used to derive
        # `grids`) so a grid id occurring elsewhere in the path cannot match.
        grid_paths = [p for p in im_names
                      if os.path.basename(p).split('_')[0] == grid]
        # utils.scl_prc is applied to each mosaic before display
        # (presumably an intensity rescaling -- see utils).
        row_images.append(utils.scl_prc(load_grid(grid_paths)))
    big_im = np.concatenate(row_images, axis=1)
    utils.shw(big_im, 7, 17)
The above shows all the images from the entire provided dataset. Immediately we see that there are only 18 large images, from which all the other images are taken.
Another thing to notice is that not all classes appear to be present in all patches. This fact can be used to reduce the number of pixels at prediction time, and to reduce the false positive rate.
Now let's look at the counts of the various labels, aggregated by grid.
In [6]:
# Load the training annotations table.
train_wkt = pd.read_csv(os.path.join(global_vars.DATA_DIR, 'train_wkt_v4.csv'))
# Start the area column as float: the areas written into it are fractional,
# and an integer column would have to be upcast when they are assigned.
train_wkt['area'] = 0.0
In [7]:
# Fill in the polygon area for every row of the training table.
AREA_SCALE = 10**11  # to make the numbers easier to reason about
EMPTY_WKT = 'MULTIPOLYGON EMPTY'


def _scaled_area(wkt_text):
    """Return the scaled area of a WKT geometry; 0.0 for the empty multipolygon."""
    # Empty geometries are skipped rather than parsed; only non-empty rows
    # need shapely to compute an area.
    if wkt_text == EMPTY_WKT:
        return 0.0
    return wkt.loads(wkt_text).area * AREA_SCALE


# Column 2 holds the WKT text and column 3 receives the area -- the same
# positions the row-by-row version addressed with iloc.
train_wkt[train_wkt.columns[3]] = train_wkt.iloc[:, 2].map(_scaled_area).astype(float)
In [8]:
# Tag each row with its grid id: the text of ImageId before the first underscore.
train_wkt['grid'] = [image_id.partition('_')[0] for image_id in train_wkt['ImageId']]
In [9]:
grid_class_areas = train_wkt.groupby(['grid', 'ClassType'])['area'].sum().unstack().astype(
In [10]: