In [1]:
# Show the kernel's current working directory; the next cell derives the
# lesson and data paths from it via os.getcwd().
%pwd
Out[1]:
In [2]:
import os, sys
# Anchor every path used below to the directory the kernel is currently in.
LESSON_HOME_DIR = os.getcwd()
current_dir = LESSON_HOME_DIR
# Folder the later cells cd into to find the CSV and image directories.
DATA_HOME_DIR = ''.join((current_dir, '/data/galaxy'))
In [3]:
from utils import *
In [4]:
%cd $DATA_HOME_DIR
%mkdir train
%mkdir valid
%mkdir results
%mkdir model
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
In [5]:
# Move the test-image folder to `test`. The mkdir cell above creates only
# sample/test, not a top-level `test`, so this acts as a rename unless a `test`
# directory already exists (in which case mv would nest the folder inside it).
%cd $DATA_HOME_DIR
%mv images_test_rev1 test
In [6]:
import pandas as pd
df = pd.read_csv("training_solutions_rev1.csv")
print df.shape
In [7]:
# shuffle and create training and validation csvs
import sklearn.utils
# Fixed seed: without it every run draws a different permutation, so the
# train/validation split (and the CSVs written from it) could not be reproduced.
shuf = sklearn.utils.shuffle(df, random_state=42)
In [8]:
num_train = int(0.90 * df.shape[0])
print 'Number of training images', num_train
print 'Number of validation images', df.shape[0] - num_train
train = shuf[:num_train]
valid = shuf[num_train:]
train.to_csv(DATA_HOME_DIR+'/train.csv')
valid.to_csv(DATA_HOME_DIR+'/valid.csv')
In [9]:
# Work from inside the raw training-image folder: the next cell globs '*.jpg'
# relative to the current directory.
%cd $DATA_HOME_DIR/images_training_rev1
In [11]:
# move pictures into correct folders and crop to 256x256
import cv2
g = glob('*.jpg')
for i in range(len(g)):
filename, _ = os.path.splitext(g[i])
gal_id = int(filename)
if train['GalaxyID'].isin([gal_id]).any():
img = cv2.imread(g[i])
crop_img = img[84:340, 84:340]
cv2.imwrite(DATA_HOME_DIR+'/train/' + g[i], crop_img)
elif valid['GalaxyID'].isin([gal_id]).any():
img = cv2.imread(g[i])
crop_img = img[84:340, 84:340]
cv2.imwrite(DATA_HOME_DIR+'/valid/' + g[i], crop_img)
else:
print 'ID not found'
In [12]:
# clean up
# WARNING: irreversible. Deletes the original (uncropped) training images and
# the original solutions CSV. Only the cropped copies written to train/ and
# valid/ and the train.csv / valid.csv splits remain; any image that was
# reported as 'ID not found' has no copy at all.
%cd $DATA_HOME_DIR
%rm -r images_training_rev1/
%rm training_solutions_rev1.csv
In [13]:
from shutil import copyfile
In [14]:
# create sample
%cd $DATA_HOME_DIR/train
g = glob('*.jpg')
shuf = sklearn.utils.shuffle(train)
samp_train = shuf[:1500]
samp_valid = shuf[1501:2501]
samp_train.to_csv(DATA_HOME_DIR + '/sample/train.csv')
samp_valid.to_csv(DATA_HOME_DIR + '/sample/valid.csv')
num_train = 0
num_valid = 0
for i in range(len(g)):
filename, _ = os.path.splitext(g[i])
gal_id = int(filename)
if samp_train['GalaxyID'].isin([gal_id]).any():
copyfile(g[i], DATA_HOME_DIR+'/sample/train/' + g[i])
num_train += 1
elif samp_valid['GalaxyID'].isin([gal_id]).any():
copyfile(g[i], DATA_HOME_DIR+'/sample/valid/' + g[i])
num_valid += 1
else:
pass
print 'Num train copied:', num_train
print 'Num valid copied:', num_valid
In [15]:
%cd $DATA_HOME_DIR/test
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(50): copyfile(shuf[i], DATA_HOME_DIR+'/sample/test/' + shuf[i])
In [16]:
%cd $DATA_HOME_DIR/sample/train
%mkdir images
%mv *.jpg images/
%cd $DATA_HOME_DIR/sample/valid
%mkdir images
%mv *.jpg images/
%cd $DATA_HOME_DIR/sample/test
%mkdir images
%mv *.jpg images/
%cd $DATA_HOME_DIR/valid
%mkdir images
%mv *.jpg images/
%cd $DATA_HOME_DIR/train
%mkdir images
%mv *.jpg images/
%cd $DATA_HOME_DIR/test
%mkdir images
%mv *.jpg images/