In [1]:
    
# First, import all the required modules;
# this must be done before running any other instruction.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
%matplotlib inline
    
First, download the dataset.
In [ ]:
    
url = 'http://commondatastorage.googleapis.com/books1000/'
last_percent_reported = None
def download_progress_hook(count, blockSize, totalSize):
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)
  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Descargar archivos si no se encuentran o tienen tamano diferente al que deberian tener."""
  if force or not os.path.exists(filename):
    print('Intentando descargar:', filename) 
    filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
    print('\n Descarga completa!')
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Datos encontrados y verificados', filename)
  else:
    raise Exception(
      'No se pudo verificar ' + filename + '. Por favor usar un navegador para descargarlo')
  return filename
train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)
    
Next, extract the dataset.
In [ ]:
    
num_classes = 10
np.random.seed(133)
def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # force=True: may overwrite existing data.
    print('%s already present - skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    tar = tarfile.open(filename)
    sys.stdout.flush()
    tar.extractall()
    tar.close()
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'ERROR: expected %d directories, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders
  
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)
    
In [ ]:
    
# Display one randomly chosen sample image from each training class.
for root in train_folders:
    list_im = sorted(os.listdir(root))
    indx = np.random.permutation(len(list_im))
    im = os.path.join(root, list_im[indx[0]])
    display(Image(filename=im))
    
Next, the data is converted into a more manageable format. Depending on the computer's resources, it is preferable not to load all the data into memory; instead, each class is saved in its own file and loaded only when needed. Each class is stored as a 3-D array (image index, pixel x, pixel y), and every image is normalized to zero mean and a standard deviation of approximately 0.5, which makes the data easier for the algorithms to process.
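As a quick illustration of the normalization used below (a minimal sketch; the array contents are made-up example values, not assignment data), a raw pixel value x in [0, 255] is mapped to (x - 255/2) / 255, so the result always lies in [-0.5, 0.5]:
In [ ]:
    
# Example: normalizing a few raw pixel values to the [-0.5, 0.5] range.
pixel_depth = 255.0
raw = np.array([0.0, 127.5, 255.0])
normalized = (raw - pixel_depth / 2) / pixel_depth
print(normalized)  # -> [-0.5  0.   0.5]
    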
In [ ]:
    
image_size = 28  # Height and width in pixels.
pixel_depth = 255.0  # Number of levels per pixel.
def load_letter(folder, min_num_images):
  """Load the data for a single letter class into a normalized 3-D array."""
  image_files = os.listdir(folder)
  dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)
  print(folder)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)
    try:
      image_data = (ndimage.imread(image_file).astype(float) - 
                    pixel_depth / 2) / pixel_depth
      if image_data.shape != (image_size, image_size):
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      dataset[num_images, :, :] = image_data
      num_images = num_images + 1
    except IOError as e:
      print('Could not read:', image_file, ':', e, "- it's ok, skipping.")
    
  dataset = dataset[0:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Fewer images than required: %d < %d' %
                    (num_images, min_num_images))
    
  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  dataset_names = []
  for folder in data_folders:
    set_filename = folder + '.pickle'
    dataset_names.append(set_filename)
    if os.path.exists(set_filename) and not force:
      
      print('%s already present - skipping pickling.' % set_filename)
    else:
      print('Pickling %s.' % set_filename)
      dataset = load_letter(folder, min_num_images_per_class)
      try:
        with open(set_filename, 'wb') as f:
          pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
      except Exception as e:
        print('Unable to save data to:', set_filename, ':', e)
  
  return dataset_names
train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)
    
Verify that the data still looks good. The student must display 10 randomly selected samples from each class; matplotlib.pyplot can be used for this.
The student must also verify that the number of images is balanced across the classes. Show how many samples each class has.
In [ ]:
    
IM = np.zeros([2, 282])
nbr_samples = []
for root in train_datasets:
    # Pickle files must be opened in binary mode.
    with open(root, 'rb') as f:
        xx = pickle.load(f)
    xx_root = np.zeros([28, 2])
    indx = np.random.permutation(xx.shape[0])
    nbr_samples.append(xx.shape[0])
    # Stack 10 random samples of this class horizontally.
    for ii in range(10):
        xx_root = np.hstack((xx_root, xx[indx[ii], :, :]))
    IM = np.vstack((IM, xx_root))
plt.imshow(IM, cmap="gray")
plt.show()
print(nbr_samples)
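The class balance can also be inspected visually with a bar chart; this is only a minimal sketch and assumes nbr_samples was computed in the previous cell:
In [ ]:
    
# Bar chart of the number of samples per class (letters A through J).
class_names = [chr(ord('A') + i) for i in range(num_classes)]
plt.bar(range(num_classes), nbr_samples)
plt.xticks(range(num_classes), class_names)
plt.xlabel('Class')
plt.ylabel('Number of samples')
plt.show()
    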
    
In [ ]:
    
def make_arrays(nb_rows, img_size):
  if nb_rows:
    dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
    labels = np.ndarray(nb_rows, dtype=np.int32)
  else:
    dataset, labels = None, None
  return dataset, labels
def merge_datasets(pickle_files, train_size, valid_size=0):
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes
    
  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class+tsize_per_class
  for label, pickle_file in enumerate(pickle_files):       
    try:
      with open(pickle_file, 'rb') as f:
        letter_set = pickle.load(f)
        # Shuffle the images of this class so they are picked at random.
        np.random.shuffle(letter_set)
        if valid_dataset is not None:
          valid_letter = letter_set[:vsize_per_class, :, :]
          valid_dataset[start_v:end_v, :, :] = valid_letter
          valid_labels[start_v:end_v] = label
          start_v += vsize_per_class
          end_v += vsize_per_class
                    
        train_letter = letter_set[vsize_per_class:end_l, :, :]
        train_dataset[start_t:end_t, :, :] = train_letter
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      print('Unable to process data from:', pickle_file, ':', e)
      raise
    
  return valid_dataset, valid_labels, train_dataset, train_labels
            
            
train_size = 200000
valid_size = 10000
test_size = 10000
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)
    
Next, the data from all the classes is shuffled at random. It is important to have the data well distributed to avoid bias in the different stages of training.
In [ ]:
    
def randomize(dataset, labels):
  permutation = np.random.permutation(labels.shape[0])
  shuffled_dataset = dataset[permutation,:,:]
  shuffled_labels = labels[permutation]
  return shuffled_dataset, shuffled_labels
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)
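As an additional sanity check that the merged splits remain balanced after shuffling, the labels of each split can be counted per class; this is a minimal sketch that only relies on the arrays built above:
In [ ]:
    
# Number of examples per class in each split; balanced splits should
# show roughly equal counts across the 10 classes.
for name, labels in [('train', train_labels),
                     ('valid', valid_labels),
                     ('test', test_labels)]:
    print(name, np.bincount(labels))
    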
    
In [ ]:
    
# Display 10 samples per class after shuffling to verify that
# images and labels are still correctly paired.
IM = np.zeros([2, 282])
models = np.unique(train_labels)
idx = np.arange(train_labels.size)
for mm in models:
    xx_root = np.zeros([28, 2])
    idx_mm = idx[train_labels == mm]
    for ii in range(10):
        xx_root = np.hstack((xx_root, train_dataset[idx_mm[ii], :, :]))
    IM = np.vstack((IM, xx_root))
plt.imshow(IM, cmap="gray")
plt.show()
    
In [ ]: