Pre-processing - making the dataset



In [73]:

    
# -*- coding: utf-8 -*-


import os
from __future__ import print_function
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

Disclaimer: the images are taken from the ImageCEF collaboration (TODO: add a link to the source here).

Plain black-and-white (two level) images should never be converted to JPEG; they violate all of the conditions given above. You need at least about 16 grey levels before JPEG is useful for grey-scale images. It should also be noted that GIF is lossless for grey-scale images of up to 256 levels, while JPEG is not.

References

Plankton Image Classification github



In [74]:

    
# Anchor every path on the current working directory.
# os.getcwd() yields the same string as the IPython-only `%pwd` magic, but it is
# plain Python, so this cell also runs when the notebook is exported to a script.
# NOTE: run this cell before anything calls os.chdir(); after a chdir the
# working directory is no longer the launch directory and proj_dir resolves
# to the wrong folder.
curr_dir = os.getcwd()
# The project root is taken to be the parent of the working directory.
proj_dir = os.path.normpath(os.path.join(curr_dir, os.path.pardir))
image_data_path = "data/raw/ImageData.csv"
input_filepath = os.path.normpath(os.path.join(proj_dir, image_data_path))

Reading the image data



In [75]:

    
# Load the semicolon-separated (filename, label) listing. The file is read
# without a header row, so the two column names are supplied explicitly.
df = pd.read_csv(
    input_filepath,
    sep=";",
    header=None,
    names=["image", "class"],
    encoding="utf-8",
)









    



---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-75-e811eb67764b> in <module>()
----> 1 df = pd.read_csv(input_filepath, header=None, delimiter=";", names=["image", "class"],encoding='utf-8')

~/virtualenvironment/tensorflow/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

~/virtualenvironment/tensorflow/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

~/virtualenvironment/tensorflow/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

~/virtualenvironment/tensorflow/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

~/virtualenvironment/tensorflow/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()

FileNotFoundError: File b'/Users/ericleijonmarck/dev/data/raw/ImageData.csv' does not exist



In [76]:

    
# Show the distinct labels present in the "class" column.
df["class"].unique()









    Out[76]:





array(['Body', 'Head-Neck', 'Lower-Limb', 'Upper-Limb', 'True-Negative'], dtype=object)

Classes

We have now made it possible to attach each image to its corresponding class:

Body - the body of human

Head-Neck - Head and neck

Lower-limb - the lower part of a human body

Upper-limb - upper part of a human body

True-Negative - images taken with the same x-ray camera that contain non-body parts are considered true negatives.



In [21]:

    
# Preview the first rows: image filename and its class label.
df.head()









    Out[21]:







  
    
      
      image
      class
    
  
  
    
      0
      10.1224.176.125.112.16.201109071451150.jpeg
      Body
    
    
      1
      10.1224.176.125.112.16.201109071557120.1.jpeg
      Body
    
    
      2
      10.1224.176.125.112.16.201109081127120.1.jpeg
      Body
    
    
      3
      10.1224.176.125.112.16.201109081127121.1.jpeg
      Body
    
    
      4
      10.1224.176.125.112.16.20110908123224.4.jpeg
      Body

Strip leading and trailing whitespace from the image filenames



In [22]:

    
# Trim surrounding whitespace from every filename.
# The vectorised .str accessor leaves missing values as NaN, whereas
# .map(str.strip) raises TypeError on the first non-string entry.
df["image"] = df["image"].str.strip()

Loading a single image as a sanity check



In [34]:

    
# Folder with the raw images, given relative to the project root ...
raw_images = 'data/raw/Image_Downscaled'
# ... and the same folder as an absolute path.
raw_storage = os.path.abspath(os.path.join(proj_dir, raw_images))



In [35]:

    
import scipy.misc

# Read the first listed image with mode='L' (single-channel grayscale).
# os.path.join replaces the manual '/' concatenation so the path is built
# portably, and .iloc[0] picks the first row by position rather than by label.
first_image_path = os.path.join(raw_storage, str(df['image'].iloc[0]))
image = scipy.misc.imread(first_image_path, mode='L')



In [36]:

    
# Dimensions of the loaded image array as (rows, columns).
image.shape









    Out[36]:





(1189, 1600)



In [78]:

    
# Pixel values of the first row of the image.
image[0,:]









    Out[78]:





array([0, 0, 0, ..., 1, 1, 1], dtype=uint8)



In [38]:

    
# The array is single-channel (mode='L'), so draw it with a gray colormap;
# matplotlib's default colormap would render the x-ray in false colour.
plt.imshow(image, cmap='gray')









    Out[38]:





<matplotlib.image.AxesImage at 0x11696b2e8>

Convert image filenames into np.arrays



In [42]:

    
from functools import partial

# Reader that always decodes to single-channel grayscale.
read_grayscale = partial(scipy.misc.imread, mode='L')

# Replace every filename in the "image" column by its pixel array.
# Each name is resolved against raw_storage instead of os.chdir()-ing into
# that folder: changing the process-wide working directory silently alters
# how every later relative path (and a re-run of the path-setup cell) resolves.
# NOTE: this overwrites the filenames, so the cell must not be run twice
# without re-reading the CSV first.
df['image'] = df['image'].map(
    lambda file_name: read_grayscale(os.path.join(raw_storage, file_name))
)



In [43]:

    
# Preview the frame again: the "image" column now holds pixel arrays.
df.head()









    Out[43]:







  
    
      
      image
      class
    
  
  
    
      0
      [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
      Body
    
    
      1
      [[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2,...
      Body
    
    
      2
      [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
      Body
    
    
      3
      [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2,...
      Body
    
    
      4
      [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
      Body

creation of the train and test sets of the images



In [44]:

    
# Largest pixel value in the image stored under index label 0.
df['image'][0].max()









    Out[44]:





255

Resizing the images to reduce their size



In [45]:

    
# Integer id for every class label, numbered in the order the labels are listed.
class_dict = {
    label: index
    for index, label in enumerate(
        ['Body', 'Head-Neck', 'Lower-Limb', 'Upper-Limb', 'True-Negative']
    )
}



In [47]:

    
# Split every class into a held-out test part and a training part.
# Both result lists stay grouped by class, in class_dict iteration order.
train_shuffled_images = []
test_shuffled_images = []

test_percentage = 0.2

# Fixed seed so that the split is reproducible from run to run.
split_rng = np.random.RandomState(0)

for key in class_dict:
    # .loc selects rows and column in one step (no chained indexing).
    class_images = df.loc[df['class'] == key, 'image'].tolist()
    print(len(class_images))

    # Shuffle within the class before slicing; otherwise the test part is
    # always the first rows of the class in file order.
    order = split_rng.permutation(len(class_images))
    class_images = [class_images[position] for position in order]

    # Number of images of this class that go to the test set (rounded down).
    k = int(len(class_images) * test_percentage)
    print(k)
    # extend() appends in place instead of rebuilding the full list each pass.
    test_shuffled_images.extend(class_images[:k])
    train_shuffled_images.extend(class_images[k:])

preprocessing of the images

util function to create the directory



In [109]:

    
import os
import errno

def make_sure_path_exists(path):
    """Create the directory ``path`` (including parents) if it is missing.

    Calling it for a directory that already exists is a no-op.

    Raises
    ------
    OSError
        If the directory cannot be created, or if ``path`` already exists
        but is not a directory (e.g. a regular file with that name).
    """
    try:
        os.makedirs(path)
    except OSError as exception:
        # EEXIST is only harmless when the existing entry really is a
        # directory; a plain file at that path must not be silently accepted.
        if exception.errno != errno.EEXIST or not os.path.isdir(path):
            raise

Resizing of the images for test and training



In [51]:

    
import scipy.misc



In [52]:

    
# Resize every image of both subsets to the same (100, 50) size.
# The generator yields the resized train list first, then the test list.
train_images, test_images = (
    [scipy.misc.imresize(picture, (100, 50)) for picture in subset]
    for subset in (train_shuffled_images, test_shuffled_images)
)

Creating the train and test folders so that Keras can read them with its `.flow_from_directory` method



In [65]:

    
# Output folders are resolved against proj_dir, so writing the processed
# images neither depends on nor changes the process-wide working directory
# (an os.chdir here makes a re-run of the path-setup cell resolve proj_dir
# from the wrong folder).
test_dir = os.path.join(proj_dir, 'data/processed/test')
train_dir = os.path.join(proj_dir, 'data/processed/train')
# Names of the per-class sub-folders, in class order.
class_feature = ['Body','Head-Neck','Lower-Limb','Upper-Limb','True-Negative']



In [72]:

    
def put_class_images_in_folders(save_dir, image_list, labels=None):
    """Write every image as a JPEG into ``save_dir/<class name>/``.

    Parameters
    ----------
    save_dir : str
        Parent directory; one sub-directory per class is created inside it.
    image_list : sequence
        Images to write, in the order in which they are numbered.
    labels : sequence of str, optional
        Class name of each image, aligned with ``image_list``. When omitted,
        the images are assumed to be grouped by class in ``class_feature``
        order with the same number of images per class, and the class is
        derived from the position in the list. That assumption mislabels
        images when the classes differ in size, so pass ``labels`` whenever
        they are available.

    Raises
    ------
    ValueError
        If ``labels`` is given but its length differs from ``image_list``.
    """
    make_sure_path_exists(save_dir)

    total = len(image_list)
    if labels is None:
        # Equal-sized slices, at least one image wide so that lists shorter
        # than the number of classes do not divide by zero. Clamping to the
        # last class puts any remainder there instead of indexing past the
        # end of class_feature.
        per_class = max(total // len(class_feature), 1)
        last_class = len(class_feature) - 1
        labels = [class_feature[min(position // per_class, last_class)]
                  for position in range(total)]
    elif len(labels) != total:
        raise ValueError('labels and image_list must have the same length')

    # Create each class folder once instead of once per image.
    for class_name in set(labels):
        make_sure_path_exists(os.path.join(save_dir, class_name))

    for position, (image, class_name) in enumerate(zip(image_list, labels)):
        # The running position makes every file name unique; random digits
        # can collide and silently overwrite an earlier image.
        file_name = '{}_{:08d}.jpeg'.format(class_name, position)
        scipy.misc.imsave(os.path.join(save_dir, class_name, file_name), image)



In [71]:

    
def put_test_class_images_in_folders(save_dir, image_list):
    """Write the test images into per-class folders below ``save_dir``.

    The test set is laid out exactly like the training set, so this delegates
    to ``put_class_images_in_folders`` rather than repeating its body.

    Parameters
    ----------
    save_dir : str
        Parent directory that receives one sub-directory per class.
    image_list : sequence
        Images to write.
    """
    put_class_images_in_folders(save_dir, image_list)



In [68]:

    
import itertools


def put_classes_into_separate_folders(parent_dir, images):
    """Ensure that the directory ``parent_dir`` exists.

    NOTE(review): ``images`` is accepted but not used by the current body;
    nothing is written into the directory yet.
    """
    make_sure_path_exists(parent_dir)



In [69]:

    
# One entry per distinct label. When a label repeats, the later image replaces
# the earlier one, so each label ends up paired with its last image.
mydict = {label: picture for label, picture in zip(df['class'], df['image'])}



In [70]:

    
# Write the resized images to disk: first the test set, then the training set.
put_test_class_images_in_folders(test_dir, test_images)
put_class_images_in_folders(train_dir, train_images)



In [ ]:

	image	class
0	10.1224.176.125.112.16.201109071451150.jpeg	Body
1	10.1224.176.125.112.16.201109071557120.1.jpeg	Body
2	10.1224.176.125.112.16.201109081127120.1.jpeg	Body
3	10.1224.176.125.112.16.201109081127121.1.jpeg	Body
4	10.1224.176.125.112.16.20110908123224.4.jpeg	Body

	image	class
0	[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...	Body
1	[[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2,...	Body
2	[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...	Body
3	[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2,...	Body
4	[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...	Body