In [65]:
import numpy as np
import pandas as pd
import os
from skimage import io
import matplotlib.pyplot as plt
import seaborn as sns
from skimage.transform import resize

1. Load data and show basic info


In [13]:
train_df = pd.read_csv(r'..\data\processed\train_v2.csv')
print('dataset size: {}'.format(train_df.shape[0]))
train_df.head()


dataset size: 40479
Out[13]:
image_name tags
0 train_0 haze primary
1 train_1 agriculture clear primary water
2 train_2 clear primary
3 train_3 clear primary
4 train_4 agriculture clear habitation primary road

In [14]:
train_jpg_path = r'..\data\processed\train-jpg'
image_names_sample = os.listdir(train_jpg_path)[:10]
sample_image = io.imread(os.path.join(train_jpg_path, image_names_sample[0]))
plt.imshow(sample_image)
plt.show()
print("image dimensiotn({})".format(sample_image.shape))


image dimensiotn((256, 256, 3))

In [15]:
train_df_separate = train_df.copy()
train_df_separate.tags = train_df_separate.tags.apply(lambda x: x.split(' '))

In [18]:
train_df_separate.head()


Out[18]:
image_name tags
0 train_0 [haze, primary]
1 train_1 [agriculture, clear, primary, water]
2 train_2 [clear, primary]
3 train_3 [clear, primary]
4 train_4 [agriculture, clear, habitation, primary, road]

In [53]:
labels_images = {}
unique_tags = np.unique(np.hstack(train_df_separate.tags.values))
for i in range(train_df_separate.shape[0]):
    for tag in train_df_separate.at[i, 'tags']:
        if tag not in labels_images:
            labels_images[tag] = [train_df_separate.at[i, 'image_name']]
            continue
        labels_images[tag].append(train_df_separate.at[i, 'image_name'])
unique_tags


Out[53]:
array(['agriculture', 'artisinal_mine', 'bare_ground', 'blooming',
       'blow_down', 'clear', 'cloudy', 'conventional_mine', 'cultivation',
       'habitation', 'haze', 'partly_cloudy', 'primary', 'road',
       'selective_logging', 'slash_burn', 'water'], 
      dtype='<U17')

In [59]:
len(labels_images['blooming'])


Out[59]:
332

In [83]:
sample_image = io.imread(os.path.join(train_jpg_path, labels_images['bare_ground'][20]) + '.jpg')
sample_image_resize = resize(sample_image, (128, 128))
plt.imshow(sample_image)
plt.show()
plt.imshow(sample_image_resize)
plt.show()


C:\Users\Lingyu\.conda\envs\tensorflow\lib\site-packages\skimage\transform\_warps.py:84: UserWarning: The default mode, 'constant', will be changed to 'reflect' in skimage 0.15.
  warn("The default mode, 'constant', will be changed to 'reflect' in "

In [ ]:

2. Look in details about the tags


In [4]:
print(train_df.tags.value_counts().head(20))

print('count of different tags: {}'.format(len(train_df.tags.unique())))


clear primary                                            13636
partly_cloudy primary                                     3630
cloudy                                                    2089
clear primary water                                       1850
agriculture clear primary road                            1680
agriculture clear primary                                 1626
haze primary                                              1471
agriculture clear cultivation primary                     1170
agriculture clear habitation primary road                 1125
agriculture clear primary water                            712
agriculture partly_cloudy primary                          692
partly_cloudy primary water                                595
agriculture partly_cloudy primary road                     527
clear primary road water                                   504
clear cultivation primary                                  472
agriculture clear primary road water                       449
agriculture clear cultivation primary road                 381
haze primary water                                         314
agriculture cultivation partly_cloudy primary              309
agriculture clear cultivation habitation primary road      271
Name: tags, dtype: int64
count of different tags: 449

In [5]:
tags = train_df.tags.apply(lambda x: x.split(' '))
labels = {}
for item in tags:
    for label in item:
        if label not in labels:
            labels[label] = 1
        else:
            labels[label] += 1
labels = pd.Series(labels).sort_values(ascending=False)
labels.plot.bar()
plt.show()


3. Image processing

3.1 Color Correction


In [70]:
jpg_list = os.listdir(train_jpg_path)[:20000]
train_tif_path = r'..\data\processed\train-tif-v2' 
tif_list = os.listdir(train_tif_path)
np.random.shuffle(jpg_list)
jpg_list = jpg_list[:200]

In [71]:
ref_colors = [[],[],[]]
for _file in jpg_list:
    # keep only the first 3 bands, RGB
    _img = io.imread(os.path.join(train_jpg_path, _file))[:,:,:3]
    # Flatten 2-D to 1-D
    _data = _img[:, :, [2,1,0]].reshape((-1,3))
    # Dump pixel values to aggregation buckets
    for i in range(3): 
        ref_colors[i] = ref_colors[i] + _data[:,i].tolist()
    
ref_colors = np.array(ref_colors)

In [72]:
for i,color in enumerate(['r','g','b']):
    plt.hist(ref_colors[i], bins=30, range=[0,255], label=color, color=color, histtype='step')
plt.legend()
plt.title('Reference color histograms')
plt.show()



In [73]:
ref_means = [np.mean(ref_colors[i]) for i in range(3)]
ref_stds = [np.std(ref_colors[i]) for i in range(3)]

In [74]:
def calibrate_image(rgb_image, ref_means, ref_stds):
    # Transform test image to 32-bit floats to avoid 
    # surprises when doing arithmetic with it 
    calibrated_img = rgb_image.copy().astype('float32')

    # Loop over RGB
    for i in range(3):
        # Subtract mean 
        calibrated_img[:,:,i] = calibrated_img[:,:,i]-np.mean(calibrated_img[:,:,i])
        # Normalize variance
        calibrated_img[:,:,i] = calibrated_img[:,:,i]/np.std(calibrated_img[:,:,i])
        # Scale to reference 
        calibrated_img[:,:,i] = calibrated_img[:,:,i]*ref_stds[i] + ref_means[i]
        # Clip any values going out of the valid range
        calibrated_img[:,:,i] = np.clip(calibrated_img[:,:,i],0,255)

    # Convert to 8-bit unsigned int
    return calibrated_img.astype('uint8')

In [89]:
bgrn_image = io.imread(os.path.join(train_tif_path, 'train_301.tif'))
rgb_image = bgrn_image[:, :, [2,1,0]]
plt.imshow(calibrate_image(rgb_image, ref_means, ref_stds))
plt.show()



In [91]:
bgrn_image.shape


Out[91]:
(256, 256, 4)

3.2 rotate/flip image


In [92]:
from scipy import ndimage

In [95]:
bgrn_image_flipud = np.flipud(bgrn_image)

plt.imshow(calibrate_image(bgrn_image_flipud[:, :, [2, 1, 0]], ref_means, ref_stds))
plt.show()

bgrn_image_fliplr = np.fliplr(bgrn_image)
plt.imshow(calibrate_image(bgrn_image_fliplr[:, :, [2, 1, 0]], ref_means, ref_stds))
plt.show()



In [106]:
bgrn_roatate = ndimage.rotate(bgrn_image, 45, reshape=False)
plt.imshow(calibrate_image(bgrn_roatate[:, :, [2, 1, 0]], ref_means, ref_stds))
plt.show()

bgrn_roatate = ndimage.rotate(bgrn_image, 45, reshape=False, mode='reflect')
plt.imshow(calibrate_image(bgrn_roatate[:, :, [2, 1, 0]], ref_means, ref_stds))
plt.show()