In [1]:
# All required modules must be imported first,
# before running any other instruction.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
%matplotlib inline
First, download the dataset.
In [ ]:
url = 'http://commondatastorage.googleapis.com/books1000/'
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
    """Report download progress, printing the percentage every 5% change."""
    global last_percent_reported
    percent = int(count * blockSize * 100 / totalSize)
    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()
        last_percent_reported = percent

def maybe_download(filename, expected_bytes, force=False):
    """Download a file if it is not present or does not have the expected size."""
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
        print('\nDownload complete!')
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
            'Failed to verify ' + filename + '. Please use a browser to download it.')
    return filename

train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)
Next, extract the dataset.
In [ ]:
num_classes = 10
np.random.seed(133)

def maybe_extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        # force=True may overwrite existing data.
        print('%s already present - skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall()
        tar.close()
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, d))]
    if len(data_folders) != num_classes:
        raise Exception(
            'ERROR: expected %d folders, one per class. Found %d instead.' % (
                num_classes, len(data_folders)))
    print(data_folders)
    return data_folders

train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)
In [ ]:
# Display one randomly chosen sample image from each training class folder.
for root in train_folders:
    list_im = sorted(os.listdir(root))
    indx = np.random.permutation(len(list_im))
    im = os.path.join(root, list_im[indx[0]])
    display(Image(filename=im))
Next, the data is organized into a more manageable format. Depending on the machine's capabilities, it is preferable not to load all the data into memory; instead, each class is saved to its own file and loaded only when needed. Each class is stored as a 3-D array (image index, x pixel, y pixel), and each image is normalized to zero mean and a standard deviation of approximately 0.5, which makes the data easier for the algorithms to process.
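As a quick illustration of this normalization (a minimal sketch, not part of the processing pipeline), each raw pixel value x in [0, 255] is mapped to (x - 255/2) / 255, so the result lies in [-0.5, 0.5]. The 255.0 literal here corresponds to pixel_depth defined in the next cell, and the small array is a made-up example rather than real notMNIST data:
In [ ]:
# Sketch: effect of the normalization on a tiny, made-up pixel array.
raw_pixels = np.array([[0.0, 255.0],
                       [128.0, 64.0]])
normalized = (raw_pixels - 255.0 / 2) / 255.0
print(normalized)         # values now lie in [-0.5, 0.5]
print(normalized.mean())  # roughly centered around zero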
In [ ]:
image_size = 28  # Height and width in pixels.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = (ndimage.imread(image_file).astype(float) -
                          pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, "- it's ok, skipping.")
    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Fewer images than the required minimum: %d < %d' %
                        (num_images, min_num_images))
    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset

def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
            print('%s already present - skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_letter(folder, min_num_images_per_class)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)
    return dataset_names

train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)
Verify that the data still look correct. The student must display 10 randomly selected samples from each class; matplotlib.pyplot can be used for this.
The student must also verify that the number of images is balanced across classes. Show how many samples each class has.
In [ ]:
# Build a mosaic of 10 random samples per class and count the samples in each class.
IM = np.zeros([2, 282])
nbr_samples = []
for root in train_datasets:
    with open(root, 'rb') as f:
        xx = pickle.load(f)
    xx_root = np.zeros([28, 2])
    indx = np.random.permutation(xx.shape[0])
    nbr_samples.append(xx.shape[0])
    for ii in range(10):
        xx_root = np.hstack((xx_root, xx[indx[ii], :, :]))
    IM = np.vstack((IM, xx_root))
plt.imshow(IM, cmap="gray")
plt.show()
print(nbr_samples)
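As a complementary visual check of class balance, the per-class counts printed above can also be plotted. A minimal sketch, assuming nbr_samples and train_datasets from the previous cell (the class names are taken from the pickle file names):
In [ ]:
# Sketch: bar chart of samples per class, using the counts gathered above.
class_names = [os.path.basename(p).split('.')[0] for p in train_datasets]
plt.bar(np.arange(len(nbr_samples)), nbr_samples)
plt.xticks(np.arange(len(nbr_samples)), class_names)
plt.xlabel('Class')
plt.ylabel('Number of samples')
plt.show()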
In [ ]:
def make_arrays(nb_rows, img_size):
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                # Shuffle the samples within the class so that the validation
                # and training splits are drawn at random.
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class

                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels

train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
    train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)
Next, all the data from all classes are shuffled at random. Keeping the data well distributed is important to avoid bias in the different stages of training.
In [ ]:
def randomize(dataset, labels):
    # Apply the same random permutation to the images and to their labels.
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation, :, :]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels

train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)
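To confirm that the shuffle kept the class distribution intact, the labels of each split can be counted per class. A minimal sketch using np.bincount, not part of the original assignment:
In [ ]:
# Sketch: per-class label counts after shuffling; each split should remain balanced.
print('Training:  ', np.bincount(train_labels))
print('Validation:', np.bincount(valid_labels))
print('Testing:   ', np.bincount(test_labels))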
In [ ]:
# Mosaic with 10 samples per class taken from the shuffled training set.
IM = np.zeros([2, 282])
models = np.unique(train_labels)
idx = np.arange(train_labels.size)
for mm in models:
    xx_root = np.zeros([28, 2])
    idx_mm = idx[train_labels == mm]
    for ii in range(10):
        xx_root = np.hstack((xx_root, train_dataset[idx_mm[ii], :, :]))
    IM = np.vstack((IM, xx_root))
plt.imshow(IM, cmap="gray")
plt.show()
In [ ]: