Cleaning data from ImageNet


In [ ]:
import os
from glob import glob
from numpy import random

In [ ]:
#Importer nødvendige libs:
import os, sys
import numpy as np
from glob import glob

from __future__ import division,print_function
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
import json

In [ ]:
# Lag stivariabler
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = current_dir+'/data/suitsglasses'
%pwd

In [ ]:
# Klargjør mappestruktur
%cd $DATA_HOME_DIR
%pwd

# Slett dirs:
%rm -rf ./train/*
%rm -rf ./valid/*


# Opprett nødvendige mapper:
%mkdir -p train
%mkdir -p valid
%mkdir -p test
%cd valid
%mkdir -p suits
%mkdir -p glasses
%cd ../test
%mkdir -p unknown

In [ ]:
# Unzip the archives (extraction below is relative to this working directory):
%cd $DATA_HOME_DIR/train

import zipfile

def unzip_file(file):
    """Extract every member of a zip archive into the current working directory.

    Args:
        file: Path to the zip archive to extract.
    """
    # The context manager closes the archive even when extraction raises.
    with zipfile.ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall()

# Extract both class archives, which sit one level above the current directory.
for archive in ('../suits.zip', '../glasses.zip'):
    unzip_file(archive)

In [ ]:
# Prepare the data by deleting every image that does not validate as a JPEG:
%cd $DATA_HOME_DIR/train
%pwd
import imghdr

def clean_dir(path):
    """Delete every file in ``path`` that imghdr does not identify as a JPEG.

    Only regular files directly inside ``path`` are examined. Subdirectories
    and other non-file entries are skipped, because ``imghdr.what`` has to
    open the entry for reading.

    Args:
        path: Directory whose files are checked.
    """
    for name in os.listdir(path):
        filepath = os.path.join(path, name)
        if not os.path.isfile(filepath):
            continue
        # imghdr.what returns None for unrecognised data, so those are removed too.
        kind = imghdr.what(filepath)
        if kind not in ('jpeg', 'jpg'):
            os.remove(filepath)
            print('Deleting ', name)

# Clean both class directories (paths are relative to the current directory).
for class_dir in ('glasses', 'suits'):
    clean_dir(class_dir)

In [ ]:
# Plukk ut femti tilfeldige bilder som vi plasserer i validation-settet
%cd $DATA_HOME_DIR/train/suits
g = glob('*.jpg')
shuf = random.permutation(g)
for i in range(50):
    os.rename(shuf[i], DATA_HOME_DIR+'/valid/suits/' + shuf[i])

In [ ]:
%cd $DATA_HOME_DIR/train/glasses
g = glob('*.jpg')
shuf = random.permutation(g)
for i in range(50):
    os.rename(shuf[i], DATA_HOME_DIR+'/valid/glasses/' + shuf[i])

In [ ]: