In [1]:
# Show the directory the notebook is running from; the path setup below derives from os.getcwd().
%pwd


Out[1]:
u'/home/ubuntu/fastai/nbs'

In [2]:
import os, sys
# Both locations are anchored at the directory the kernel was started in:
# the lesson directory is that directory itself, the data lives in data/galaxy below it.
LESSON_HOME_DIR = current_dir = os.getcwd()
DATA_HOME_DIR = '/'.join([current_dir, 'data/galaxy'])

In [3]:
from utils import *


Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)
Using Theano backend.

In [4]:
%cd $DATA_HOME_DIR
%mkdir train
%mkdir valid
%mkdir results
%mkdir model
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results


/home/ubuntu/fastai/nbs/data/galaxy

In [5]:
# Move the `images_test_rev1` folder to `test` inside the data directory.
# NOTE(review): if `test` already exists (e.g. on a re-run), mv would place the folder
# inside it instead of renaming — confirm before re-executing.
%cd $DATA_HOME_DIR
%mv images_test_rev1 test


/home/ubuntu/fastai/nbs/data/galaxy

In [6]:
import pandas as pd
df = pd.read_csv("training_solutions_rev1.csv")
print df.shape


(61578, 38)

In [7]:
# shuffle and create training and validation csvs
import sklearn.utils
# Fixed seed: re-running the notebook reproduces the same row order, and therefore the
# same train/validation split, instead of a new split that no longer matches the
# images already written to disk.
shuf = sklearn.utils.shuffle(df, random_state=42)

In [8]:
num_train = int(0.90 * df.shape[0])
print 'Number of training images', num_train
print 'Number of validation images', df.shape[0] - num_train

train = shuf[:num_train]
valid = shuf[num_train:]

train.to_csv(DATA_HOME_DIR+'/train.csv')
valid.to_csv(DATA_HOME_DIR+'/valid.csv')


Number of training images 55420
Number of validation images 6158

In [9]:
# Work inside the training-image folder so the glob('*.jpg') in the next cell sees bare file names.
%cd $DATA_HOME_DIR/images_training_rev1


/home/ubuntu/fastai/nbs/data/galaxy/images_training_rev1

In [11]:
# move pictures into correct folders and crop to 256x256
import cv2
g = glob('*.jpg')

for i in range(len(g)):
    filename, _ = os.path.splitext(g[i])
    gal_id = int(filename)
    if train['GalaxyID'].isin([gal_id]).any():
        img = cv2.imread(g[i])
        crop_img = img[84:340, 84:340]
        cv2.imwrite(DATA_HOME_DIR+'/train/' + g[i], crop_img)
    elif valid['GalaxyID'].isin([gal_id]).any():
        img = cv2.imread(g[i])
        crop_img = img[84:340, 84:340]
        cv2.imwrite(DATA_HOME_DIR+'/valid/' + g[i], crop_img)
    else:
        print 'ID not found'

In [12]:
# clean up: delete the `images_training_rev1` folder and the `training_solutions_rev1.csv` file.
# NOTE(review): this is irreversible — presumably only safe once the cropped copies and
# train.csv / valid.csv have been written successfully; confirm before running.
%cd $DATA_HOME_DIR
%rm -r images_training_rev1/
%rm training_solutions_rev1.csv


/home/ubuntu/fastai/nbs/data/galaxy

In [13]:
from shutil import copyfile

In [14]:
# create sample
%cd $DATA_HOME_DIR/train
g = glob('*.jpg')

shuf = sklearn.utils.shuffle(train)
samp_train = shuf[:1500]
samp_valid = shuf[1501:2501]
samp_train.to_csv(DATA_HOME_DIR + '/sample/train.csv')
samp_valid.to_csv(DATA_HOME_DIR + '/sample/valid.csv')

num_train = 0
num_valid = 0
for i in range(len(g)):
    filename, _ = os.path.splitext(g[i])
    gal_id = int(filename)
    if samp_train['GalaxyID'].isin([gal_id]).any():
        copyfile(g[i], DATA_HOME_DIR+'/sample/train/' + g[i])
        num_train += 1
    elif samp_valid['GalaxyID'].isin([gal_id]).any():
        copyfile(g[i], DATA_HOME_DIR+'/sample/valid/' + g[i])
        num_valid += 1
    else:
        pass
    
print 'Num train copied:', num_train
print 'Num valid copied:', num_valid


/home/ubuntu/fastai/nbs/data/galaxy/train
Num train copied: 1500
Num valid copied: 1000

In [15]:
%cd $DATA_HOME_DIR/test
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(50): copyfile(shuf[i], DATA_HOME_DIR+'/sample/test/' + shuf[i])


/home/ubuntu/fastai/nbs/data/galaxy/test

In [16]:
# In each of the six image directories, create an `images/` subfolder and move all
# .jpg files into it. Each group below is: cd into the directory, mkdir, mv.
# NOTE(review): presumably this is for a directory-based image loader that expects the
# pictures inside a subfolder — confirm against the code that reads these folders.

# sample/train
%cd $DATA_HOME_DIR/sample/train
%mkdir images
%mv *.jpg images/

# sample/valid
%cd $DATA_HOME_DIR/sample/valid
%mkdir images
%mv *.jpg images/

# sample/test
%cd $DATA_HOME_DIR/sample/test
%mkdir images
%mv *.jpg images/

# full validation set
%cd $DATA_HOME_DIR/valid
%mkdir images
%mv *.jpg images/

# full training set
%cd $DATA_HOME_DIR/train
%mkdir images
%mv *.jpg images/

# full test set (the working directory stays here after the cell finishes)
%cd $DATA_HOME_DIR/test
%mkdir images
%mv *.jpg images/


/home/ubuntu/fastai/nbs/data/galaxy/sample/train
/home/ubuntu/fastai/nbs/data/galaxy/sample/valid
/home/ubuntu/fastai/nbs/data/galaxy/sample/test
/home/ubuntu/fastai/nbs/data/galaxy/valid
/home/ubuntu/fastai/nbs/data/galaxy/train
/home/ubuntu/fastai/nbs/data/galaxy/test