In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import glob
import rarfile, os
from collections import Counter
from collections import defaultdict
from PIL import Image
from datetime import datetime
import shutil

def imshow(*args, **kwargs):
    """Display an image with grayscale, un-interpolated defaults.

    Positional arguments are forwarded to ``plt.imshow`` unchanged. Keyword
    arguments are forwarded too and take precedence over the defaults
    ``cmap=plt.cm.gray`` and ``interpolation='nearest'``.
    """
    options = {'cmap': plt.cm.gray, 'interpolation': 'nearest', **kwargs}
    plt.imshow(*args, **options)

def rgb2gray(rgb):
    """Collapse the last axis of ``rgb`` into a single luminance channel.

    Only the first three entries of the last axis are used (so an extra
    fourth channel is ignored); they are weighted 0.299, 0.587 and 0.114.
    """
    luma_weights = [0.299, 0.587, 0.114]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, luma_weights)

def getDate(filename, split='\\'):
    """Parse the date embedded in a comic file name.

    Parameters
    ----------
    filename : str
        File name or path. Only the part after the last occurrence of
        ``split`` is searched; if ``split`` does not occur, the whole string
        is searched.
    split : str
        Path separator used to isolate the final path component. Defaults to
        a single backslash.

    Returns
    -------
    datetime.datetime
        The first run of digits in the final component, parsed as
        ``%Y%m%d``.

    Raises
    ------
    IndexError
        If the final component contains no digits.
    ValueError
        If the first run of digits is not a valid ``%Y%m%d`` date.
    """
    # str.split returns the whole string as its only element when the
    # separator is absent, so a single code path covers both cases.
    basename = filename.split(split)[-1]
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    digits = re.findall(r'\d+', basename)[0]
    return datetime.strptime(digits, '%Y%m%d')

def myround(x, base=2):
    """Round ``x`` to the nearest multiple of ``base`` and return it as an int.

    ``x`` is converted with ``float`` first; ties follow the built-in
    ``round``.
    """
    multiples = round(float(x) / base)
    return int(base * multiples)

Cleaning image directory


In [ ]:
# One-off cleaning pass: when a comic exists as both .gif and .jpg, delete the
# .jpg copy, then count the surviving files per sub-directory.
directories = Counter()
filenames = sorted(glob.iglob('/Users/ajmendez/data/dilbert/images/*/*'))
bad_filenames = []  # .jpg duplicates deleted by this pass
# Paths still on disk. A set gives O(1) membership tests (the sorted list was
# scanned once per file before) and lets us forget files as they are deleted.
remaining = set(filenames)
for filename in filenames:
    # Earlier one-off cleaning steps, kept for reference:
#     # First remove sub directories
#     origdir,filename = os.path.split(filename)
#     dirname = os.path.dirname(origdir)
#     shutil.move(os.path.join(origdir,filename), os.path.join(dirname, filename))
    
#     # Filter BW/Color ones
#     bwfilename = filename.replace('-colour', '')
#     if (filename != bwfilename) and (bwfilename in filenames):
#         os.remove(bwfilename)
#         continue
    
#     # Remove small copies
#     sfilename = filename.replace('-small','')
#     if (filename != sfilename) and (sfilename in filenames):
#         os.remove(filename)
#         continue
    
#     # Files without size
#     if os.stat(filename).st_size == 0:
#         os.remove(filename)
#         print(filename)

    if filename not in remaining:
        # Deleted earlier in this pass as a duplicate; do not count it.
        continue

    # Prefer the .gif: if a .jpg with the same name is present, remove it.
    gfilename = filename.replace('.gif', '.jpg')
    if (filename != gfilename) and (gfilename in remaining):
        os.remove(gfilename)
        remaining.discard(gfilename)
        bad_filenames.append(gfilename)

    # Count the file that is actually kept, keyed by its parent directory name.
    directories[os.path.basename(os.path.dirname(filename))] += 1
directories.most_common(1000)

In [ ]:
# Group every image by its (height, width) rounded with myround (nearest
# multiple of 2 by default).
#   simplesizes[key]  -> file names in that size group
#   simpleshapes[key] -> exact (height, width) of each of those files
simplesizes = defaultdict(list)
simpleshapes = defaultdict(list)
filenames = sorted(glob.iglob('/Users/ajmendez/data/dilbert/images/*/*'))
for filename in filenames:
    try:
        # Only the size is needed; the context manager closes the file.
        with Image.open(filename) as img:
            shape = (img.height, img.width)
    except OSError as e:
        # Missing or unidentifiable file: report and skip it, as the later
        # cells do for unreadable images, instead of aborting the whole scan.
        print(e)
        print(filename)
        continue
    key = (myround(shape[0]), myround(shape[1]))
    simplesizes[key].append(filename)
    simpleshapes[key].append(shape)

In [ ]:
# Ten most populated size groups as ((height, width), file count), largest first.
sorted(((size, len(names)) for size, names in simplesizes.items()),
       key=lambda pair: -pair[1])[:10]

In [ ]:
# NOTE(review): this cell was a bare `np.sort()`, which raises TypeError
# because no array is passed. Presumably the intent was to inspect the
# distribution of group sizes -- confirm, or delete the cell.
np.sort([len(names) for names in simplesizes.values()])[::-1]

In [ ]:
def plotBox(xmin, xmax, ymin, ymax):
    """Draw a closed red rectangle outline (line width 2) via ``plt.plot``."""
    # Walk the four corners and return to the start to close the outline.
    corners = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax), (xmin, ymin)]
    xs = [corner[0] for corner in corners]
    ys = [corner[1] for corner in corners]
    plt.plot(xs, ys, color='r', lw=2)

def threshold(img, thresh=60):
    """Keep pixels below ``thresh``; set every other pixel to 255.

    Returns a new float array with the same shape as ``img``; ``img`` itself
    is not modified.
    """
    dark = img < thresh
    result = np.zeros(img.shape)
    result += 255
    result[dark] = img[dark]
    return result

def getOffsets(im, sep=120, axis=1):
    """Yield ``(start, stop)`` spans between separator lines of an image.

    The mean and standard deviation of ``im`` are taken along ``axis``. A
    position is a separator when its mean is below the 2nd percentile of the
    means or its standard deviation is below the 2nd percentile of the
    standard deviations. Position 0 and the end of the profile are always
    treated as boundaries.

    Parameters
    ----------
    im : array
        Image array; reduced along ``axis``.
    sep : int
        Only gaps between consecutive boundaries that are strictly larger
        than this are yielded.
    axis : int
        Axis to reduce over.

    Yields
    ------
    (start, stop)
        ``start`` is the boundary that opens the gap and ``stop`` is
        ``start`` plus the gap length.
    """
    m = np.mean(im, axis=axis)
    s = np.std(im, axis=axis)
    thresh = np.percentile(m, 2)
    sthresh = np.percentile(s, 2)
    pts = np.where((m < thresh) | (s < sthresh))[0]
    # The profile length is the far boundary. The previous code indexed
    # im.shape with an undefined name `i`.
    lines = np.sort(np.concatenate([[0], pts, [len(m)]]))
    delta = np.diff(lines)
    # Compare against the `sep` parameter (was an undefined name `xm`).
    keep = delta > sep
    for offset, size in zip(lines[:-1][keep], delta[keep]):
        yield offset, offset + size

def carve(img, nx=120, ny=120, linethresh=None):
    """Yield bounding boxes of the cells delimited by dark lines in ``img``.

    Rows whose mean is below ``linethresh`` split the image into horizontal
    bands. Inside each band, columns whose mean is below the 2nd percentile
    of that band's column means split it into boxes. Index 0 and the image
    edge are always treated as boundaries.

    Parameters
    ----------
    img : 2-D array
        Image to split.
    nx, ny : int
        A gap between consecutive boundaries is kept only if it is strictly
        wider than ``nx`` (columns) or strictly taller than ``ny`` (rows).
    linethresh : float, optional
        Row-mean threshold for separator rows. When ``None`` (default) the
        2nd percentile of the row means is used. It applies to rows only;
        the column threshold is always computed per band.

    Yields
    ------
    (xmin, xmax, ymin, ymax)
        One box per kept column gap in each kept row band.
    """
    # A thresholded copy used to be computed here and immediately replaced by
    # the raw image, so the raw image is what has always been analysed.
    Y = img
    ymean = np.mean(Y, axis=1)
    if linethresh is None:
        # Previously a caller-supplied value was always overwritten.
        linethresh = np.percentile(ymean, 2)
    ylines = np.where(ymean < linethresh)[0]
    ybounds = np.sort(np.concatenate([[0], ylines, [img.shape[0]]]))

    ygaps = np.diff(ybounds)
    tall = ygaps > ny

    for yoffset, height in zip(ybounds[:-1][tall], ygaps[tall]):
        band = Y[yoffset:yoffset+height, :]
        xmean = np.mean(band, axis=0)
        xthresh = np.percentile(xmean, 2)
        xlines = np.where(xmean < xthresh)[0]

        xbounds = np.sort(np.concatenate([[0], xlines, [img.shape[1]]]))
        xgaps = np.diff(xbounds)
        wide = xgaps > nx

        for xoffset, width in zip(xbounds[:-1][wide], xgaps[wide]):
            yield xoffset, xoffset+width, yoffset, yoffset+height

            
# plt.figure(figsize=(12,6))
# imshow(image)
# for box in carve(image):
#     plotBox(*box)

In [ ]:
def oned(im, xm=120, axis=1):
    """Plot the 1-D mean/std profiles of ``im`` and shade the detected spans.

    Draws, on the current axes: the mean and the standard deviation of ``im``
    along ``axis``, a horizontal line at the 2nd percentile of each profile
    (orange for the mean, red for the standard deviation), and a grey span
    for every gap between separator positions that is strictly longer than
    ``xm``. A position is a separator when its mean or its standard deviation
    is below the corresponding 2nd-percentile threshold.
    """
    m = np.mean(im, axis=axis)
    plt.plot(m)
    thresh = np.percentile(m, 2)

    s = np.std(im, axis=axis)
    plt.plot(s)
    sthresh = np.percentile(s, 2)

    plt.axhline(thresh, color='orange')
    # Was a second line at `thresh`, leaving the std threshold undrawn.
    plt.axhline(sthresh, color='red')
    pts = np.where((m<thresh) | (s<sthresh))[0]
    # Use the profile length as the far boundary. The previous code read
    # im.shape[i] with `i` leaking in from the notebook's global namespace.
    lines = np.sort(np.concatenate([[0], pts, [len(m)]]))
    delta = np.diff(lines)
    offsets = lines[np.where(delta > xm)[0]]
    heights = delta[np.where(delta > xm)[0]]
    for offset, height in zip(offsets, heights):
        plt.axvspan(offset, offset+height, zorder=2, color='0.5')

# Top panel: profile along axis=1; bottom panel: profile along axis=0.
# NOTE(review): `image` is assigned by the stacking cell further down the
# notebook, so this cell fails on a fresh kernel until that cell has run.
for i, ax in enumerate(plt.subplots(2,1, figsize=(12,6))[1]):
    plt.sca(ax)
    oned(image, axis=1-i)

In [132]:
# For every size group (largest first): average up to `nskip` images into a
# stack, carve the stack into panel boxes, remember the boxes in `params`,
# and save a diagnostic figure with the boxes drawn over the stack.
nskip = 5000  # maximum number of files averaged per group
bad = []
params = {}   # (height, width) -> list of (xmin, xmax, ymin, ymax) boxes
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
#     if len(filenames) < 2:
#         continue
#     if j < 2:
#         continue
#     if j not in bad:
#         continue

    # A figure placed in stacks_bad/ flags this group as bad. Check before
    # reading any image so flagged groups cost nothing.
    # NOTE: as a consequence the global `image` left after this loop is the
    # stack of the last non-flagged group.
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if os.path.exists(badfilename):
        continue

    key = (height,width)
    shapes = simpleshapes[key]
    # Accumulate over the smallest exact shape in the group so that every
    # image can be cropped to it.
    image = np.zeros(np.min(shapes, axis=0))
    nstacked = 0
    # Slice instead of `if i > nskip: continue`, which let nskip+1 files in
    # and kept iterating over the rest of the list.
    for filename in filenames[:nskip]:
        try:
            x = plt.imread(filename)
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        image += x[:image.shape[0],
                   :image.shape[1]]
        nstacked += 1
    if nstacked:
        # Divide by the number of images actually read so failed reads do
        # not darken the mean.
        image /= nstacked

    outfilename = '/Users/ajmendez/data/dilbert/stacks/stack_{:03d}.png'.format(j)
    os.makedirs(os.path.dirname(outfilename), exist_ok=True)
#     plt.imsave(outfilename, image, cmap=plt.cm.gray)

    plt.figure(figsize=(12,6))
    imshow(image)
    params[key] = []
    for box in carve(image):
        plotBox(*box)
        params[key].append(box)
    plt.title((height,width,len(filenames)))
    plt.savefig(outfilename.replace('.png', '.fig.png'))
    plt.close()
#     break

Crop panels for every size group, skipping groups flagged as bad


In [145]:
# Cut every image of every non-flagged size group into panels using the boxes
# stored in `params`, and save each panel as a thumbnail of at most 128x128
# under a parallel 'panels' directory tree.
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    # A figure placed in stacks_bad/ flags this group as bad: skip it.
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if os.path.exists(badfilename):
        continue

    key = (height,width)
    # No stack is built here: the zero image this cell used to allocate per
    # group was never read, so it has been dropped.
    for filename in filenames:
        try:
            x = plt.imread(filename)
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)

        basename = os.path.splitext(os.path.basename(filename))[0]
        dirname = os.path.dirname(filename).replace('images', 'panels')
        # exist_ok avoids the separate exists() check.
        os.makedirs(dirname, exist_ok=True)

        for k, box in enumerate(params[key]):
            outfilename = os.path.join(dirname,
                                       basename+'.{:02d}.png'.format(k))
            xmin,xmax, ymin,ymax = box
            im = Image.fromarray(x[ymin:ymax, xmin:xmax].astype(np.uint8))
            im.thumbnail((128,128))
            im.save(outfilename)
#             plt.imsave(outfilename, x[ymin:ymax, xmin:xmax], 
#                        cmap=plt.cm.gray)
#         break
#     break

In [173]:
# Target size (width, height) and the size group closest to it.
w, h = 620, 425
heights, widths = (np.array(column) for column in zip(*simplesizes.keys()))
# Squared Euclidean distance of every group size from the target.
d = (widths - w)**2 + (heights - h)**2
ii = np.argmin(d)

# Dots: all group sizes; green circle: nearest group; red square: target.
plt.plot(widths, heights, '.')
plt.plot(widths[ii], heights[ii], 'og')
plt.plot(w, h, 'sr')


Out[173]:
[<matplotlib.lines.Line2D at 0x143213710>]

In [183]:
# Scatter every size group: flagged groups in red (marker area grows with the
# number of files), the others in black. Also count the files in flagged
# groups and collect the sizes of the non-flagged groups in heights/widths.
nbad = 0
heights, widths = [], []
for j, ((height, width), filenames) in enumerate(sorted(simplesizes.items(), key=lambda item: -len(item[1]))):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if not os.path.exists(badfilename):
        heights.append(height)
        widths.append(width)
        plt.scatter(width, height, lw=0, alpha=0.5, color='k')
        continue
    nbad += len(filenames)
    plt.scatter(width, height, s=len(filenames)+5, lw=0, alpha=0.5, color='r')
heights, widths = np.array(heights), np.array(widths)
# Panel estimate: 5/6 of the files weighted by 3 plus 1/6 weighted by 8.
print('{:,d} files are still unprocessed.  ~{:0,.0f} panels'.format(nbad, nbad*(3*5/6 + 8*1/6)))


1,241 files are still unprocessed.  ~4,757 panels

In [189]:
# For every size group flagged as bad: build its mean stack, carve it, and
# save a diagnostic figure under stacks_nearest/.
# Relies on `w`, `h`, `heights`, `widths` and `params` from earlier cells.
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if not os.path.exists(badfilename):
        continue

    own_key = (height,width)
    shapes = simpleshapes[own_key]
    image = np.zeros(np.min(shapes, axis=0))

    # Nearest non-flagged size to the target (w, h); sizes smaller than the
    # target in both dimensions are pushed out of contention.
    # The mask used to read `(height<w)`, comparing the scalar loop variable
    # with a width; the element-wise `heights<h` is the parallel form.
    # NOTE(review): the target is the fixed (w, h) from an earlier cell, not
    # this group's own (width, height) -- confirm that is intended.
    d = (widths-w)**2 + (heights-h)**2
    d[(widths<w)&(heights<h)] = 1e6
    nearest = np.argmin(d)
    key = (heights[nearest],widths[nearest])
    # NOTE(review): `boxes` is looked up but never drawn or used below.
    boxes = params[key]

    nstacked = 0
    for filename in filenames:
        try:
            x = plt.imread(filename)
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        image += x[:image.shape[0],
                   :image.shape[1]]
        nstacked += 1
    if nstacked:
        # Mean over the images actually read.
        image /= nstacked

    outfilename = '/Users/ajmendez/data/dilbert/stacks_nearest/stack_{:03d}.png'.format(j)
    os.makedirs(os.path.dirname(outfilename), exist_ok=True)

    plt.figure(figsize=(12,6))
    imshow(image)
    # Store the carved boxes under this group's own key. They used to be
    # written to params[key], which replaced the boxes of the nearest *good*
    # group with boxes carved from this flagged stack.
    params[own_key] = []
    for box in carve(image):
        plotBox(*box)
        params[own_key].append(box)
    plt.title((height,width,len(filenames)))
    plt.savefig(outfilename.replace('.png', '.fig.png'))
    plt.close()
#     break

In [185]:
# Cut images of size groups flagged as bad into panels, borrowing the boxes
# of the non-flagged group whose size is nearest to the target (w, h), and
# save each panel as a thumbnail of at most 128x128 under a 'panels2' tree.
# NOTE(review): relies on `w`, `h`, `heights`, `widths` and `params` left in
# the kernel by earlier cells.
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    # Only groups with a figure in stacks_bad/ are processed here.
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if not os.path.exists(badfilename):
        continue
    
    # Squared distance of every entry in heights/widths from (w, h); the
    # closest one selects which boxes to reuse.
    # NOTE(review): the target is the fixed (w, h), not this group's own
    # (width, height) -- confirm that is intended.
    d = (widths-w)**2 + (heights-h)**2
    i = np.argmin(d)
    key = (heights[i],widths[i])
    boxes = params[key]
    
    # NOTE(review): `shapes` and `image` are assigned but not used below.
    shapes = simpleshapes[key]
    image = np.zeros(np.min(shapes, axis=0))
    # `i` is rebound here; the argmin index above is no longer needed.
    for i,filename in enumerate(filenames):
        try:
            x = plt.imread(filename)
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        
        basename = os.path.splitext(os.path.basename(filename))[0]
        dirname = os.path.dirname(filename).replace('images', 'panels2')
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        
        # One output file per box: <basename>.<box index>.png
        for k, box in enumerate(boxes):
            outfilename = os.path.join(dirname, 
                                       basename+'.{:02d}.png'.format(k))
            xmin,xmax, ymin,ymax = box
            im = Image.fromarray(x[ymin:ymax, xmin:xmax].astype(np.uint8))
            im.thumbnail((128,128))
            im.save(outfilename)
#             plt.imsave(outfilename, x[ymin:ymax, xmin:xmax], 
#                        cmap=plt.cm.gray)
        # NOTE(review): these two breaks stop after the first readable file of
        # the first flagged group -- looks like a trial run; remove both to
        # process everything.
        break
    break

In [ ]:
# from pprint import pformat, pprint

In [ ]:
# pprint(params, width=1000)

In [ ]:
# params2 = {}
# for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
#     params2[j] = params[(height,width)]

In [ ]:
# pprint(params2, width=1000)

In [ ]: