In [ ]:
    
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import glob
import rarfile, os
from collections import Counter
from collections import defaultdict
from PIL import Image
from datetime import datetime
import shutil
def imshow(*args, **kwargs):
    """Display an image through ``plt.imshow`` with notebook-friendly defaults.

    Positional arguments are forwarded unchanged.  Keyword arguments are
    forwarded as well; ``cmap=plt.cm.gray`` and ``interpolation='nearest'``
    are filled in only when the caller did not supply those keywords.
    """
    options = dict(kwargs)
    options.setdefault('cmap', plt.cm.gray)
    options.setdefault('interpolation', 'nearest')
    plt.imshow(*args, **options)
def rgb2gray(rgb):
    """Collapse the colour channels of an image array into a single channel.

    Only the first three entries along the last axis are used (a fourth
    entry, if present, is ignored); they are combined with the weights
    0.299, 0.587 and 0.114.  The result has the input's shape without its
    last axis.
    """
    channel_weights = [0.299, 0.587, 0.114]
    colour = rgb[..., :3]
    return np.dot(colour, channel_weights)
def getDate(filename, split='\\'):
    """Parse the date embedded in a file name.

    The text after the last occurrence of ``split`` (a backslash by default)
    is searched for its first run of digits, which is parsed as ``YYYYMMDD``.
    When ``split`` does not occur in ``filename`` the whole string is
    searched instead.

    Returns a ``datetime``.  Raises ``IndexError`` if no digits are found and
    ``ValueError`` if the digits do not form a ``YYYYMMDD`` date.
    """
    tail = filename.split(split)[-1] if split in filename else filename
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and produces a warning on current Python versions.
    digits = re.findall(r'\d+', tail)[0]
    return datetime.strptime(digits, '%Y%m%d')
def myround(x, base=2):
    """Round ``x`` to a multiple of ``base`` and return it as an ``int``.

    ``x`` is converted with ``float`` first, so numeric strings are accepted.
    Halfway cases follow the built-in ``round``.
    """
    steps = round(float(x) / base)
    nearest_multiple = base * steps
    return int(nearest_multiple)
    
In [ ]:
    
# One-off cleanup of the image tree, followed by a per-directory file count.
directories = Counter()
filenames = sorted(glob.iglob('/Users/ajmendez/data/dilbert/images/*/*'))
bad_filenames = []
# Set copy of the listing: the duplicate check below runs once per file, and
# testing membership against the list itself makes the loop quadratic.
known_filenames = set(filenames)
for i, filename in enumerate(filenames):
#     # First remove sub directories
#     origdir,filename = os.path.split(filename)
#     dirname = os.path.dirname(origdir)
#     shutil.move(os.path.join(origdir,filename), os.path.join(dirname, filename))
    
#     # Filter BW/Color ones
#     bwfilename = filename.replace('-colour', '')
#     if (filename != bwfilename) and (bwfilename in filenames):
#         os.remove(bwfilename)
#         continue
    
#     # Remove small copies
#     sfilename = filename.replace('-small','')
#     if (filename != sfilename) and (sfilename in filenames):
#         os.remove(filename)
#         continue
    
#     # Files without size
#     if os.stat(filename).st_size == 0:
#         os.remove(filename)
#         print(filename)
    
    # When the same name was listed as both .gif and .jpg, delete the .jpg.
    # The .gif is not counted on this iteration; the .jpg path is still in
    # `filenames` and is counted when the loop reaches it, so each pair adds
    # exactly one to its directory's total.
    gfilename = filename.replace('.gif', '.jpg')
    if (filename != gfilename) and (gfilename in known_filenames):
        os.remove(gfilename)
        continue
    
    directories[os.path.basename(os.path.dirname(filename))] += 1
# Cell output: (directory name, count) pairs, most populated first.
directories.most_common(1000)
    
In [ ]:
    
# Group every image by its (height, width) after rounding each side with myround.
#   simplesizes  : rounded (height, width) -> file names
#   simpleshapes : rounded (height, width) -> exact (height, width) per file,
#                  appended in the same order as simplesizes
simplesizes, simpleshapes = defaultdict(list), defaultdict(list)
filenames = sorted(glob.iglob('/Users/ajmendez/data/dilbert/images/*/*'))
for filename in filenames:
    with Image.open(filename) as img:
        exact = (img.height, img.width)
    key = tuple(myround(side) for side in exact)
    simplesizes[key].append(filename)
    simpleshapes[key].append(exact)
    
In [ ]:
    
# Ten most common rounded sizes as ((height, width), file count), largest count first.
sorted(((size, len(names)) for size, names in simplesizes.items()),
       key=lambda pair: -pair[-1])[:10]
    
In [ ]:
    
# FIXME: this cell held a bare `np.sort()` call, which raises TypeError because
# np.sort requires the array to sort as its first argument.  The intended
# argument cannot be recovered from the notebook, so the call is disabled to
# keep a top-to-bottom run from stopping here.
# np.sort()
    
In [ ]:
    
def plotBox(xmin, xmax, ymin, ymax):
    """Draw the outline of an axis-aligned rectangle as a closed red line (width 2)."""
    # Walk the four corners and return to the first one to close the outline.
    xs = [xmin, xmax, xmax, xmin, xmin]
    ys = [ymin, ymin, ymax, ymax, ymin]
    plt.plot(xs, ys, color='r', lw=2)
def threshold(img, thresh=60):
    """Keep pixels strictly below ``thresh`` and set every other pixel to 255.

    Returns a new float array with ``img``'s shape; ``img`` is left untouched.
    """
    dark = img < thresh
    out = np.full(img.shape, 255.0)
    out[dark] = img[dark]
    return out
def getOffsets(im, sep=120, axis=1):
    """Yield ``(start, stop)`` index pairs for the wide gaps between separator lines.

    ``im`` is reduced along ``axis`` to a mean profile and a standard-deviation
    profile.  A position is a separator when its mean is below the 2nd
    percentile of the means, or its standard deviation is below the 2nd
    percentile of the standard deviations.  Both ends of the profile are added
    as separators, and every gap between consecutive separators that is wider
    than ``sep`` is yielded as ``(offset, offset + size)``.

    Assumes the profile is one-dimensional, i.e. ``im`` is 2-D.
    """
    m = np.mean(im, axis=axis)
    s = np.std(im, axis=axis)
    is_line = (m < np.percentile(m, 2)) | (s < np.percentile(s, 2))
    pts = np.where(is_line)[0]
    # The closing sentinel is the profile length.  The previous code indexed
    # im.shape with an undefined name `i`.
    lines = np.sort(np.concatenate([[0], pts, [len(m)]]))
    delta = np.diff(lines)
    # Compare against the `sep` parameter; the previous code used the
    # undefined name `xm` here.
    wide = delta > sep
    for offset, size in zip(lines[:-1][wide], delta[wide]):
        yield offset, offset + size
def carve(img, nx=120, ny=120, linethresh=None):
    """Yield panel boxes ``(xmin, xmax, ymin, ymax)`` lying between dark lines.

    Rows whose mean is below ``linethresh`` are treated as horizontal
    separators.  Together with the top and bottom edges they split the image
    into bands, and only bands taller than ``ny`` are kept.  Inside each band,
    columns whose mean is below the 2nd percentile of that band's column means
    are vertical separators; together with the left and right edges they split
    the band into boxes, and only boxes wider than ``nx`` are yielded.

    Parameters
    ----------
    img : 2-D array
    nx, ny : minimum box width / band height (strictly greater than)
    linethresh : row-mean threshold for horizontal separators.  ``None``
        (the default) uses the 2nd percentile of the row means.  A value
        passed here used to be overwritten by that percentile; it is now
        honoured.
    """
    # The image is used as-is.  A thresholded copy used to be computed here
    # and then discarded, so that dead work has been removed.
    row_mean = np.mean(img, axis=1)
    if linethresh is None:
        linethresh = np.percentile(row_mean, 2)
    row_lines = np.where(row_mean < linethresh)[0]
    row_edges = np.sort(np.concatenate([[0], row_lines, [img.shape[0]]]))
    row_gaps = np.diff(row_edges)
    tall = row_gaps > ny

    for yoffset, height in zip(row_edges[:-1][tall], row_gaps[tall]):
        band = img[yoffset:yoffset+height, :]
        col_mean = np.mean(band, axis=0)
        col_lines = np.where(col_mean < np.percentile(col_mean, 2))[0]
        col_edges = np.sort(np.concatenate([[0], col_lines, [img.shape[1]]]))
        col_gaps = np.diff(col_edges)
        wide = col_gaps > nx

        for xoffset, width in zip(col_edges[:-1][wide], col_gaps[wide]):
            yield xoffset, xoffset+width, yoffset, yoffset+height
            
# plt.figure(figsize=(12,6))
# imshow(image)
# for box in carve(image):
#     plotBox(*box)
    
In [ ]:
    
def oned(im, xm=120, axis=1):
    """Plot the 1-D profiles used for separator detection on the current axes.

    Draws the mean and the standard deviation of ``im`` along ``axis``, a
    horizontal line at the 2nd percentile of each (orange for the mean, red
    for the standard deviation), and a grey span over every gap wider than
    ``xm`` between consecutive separator positions.  A position is a separator
    when its mean or its standard deviation is below the respective
    percentile; both ends of the profile also count.

    Assumes the profile is one-dimensional, i.e. ``im`` is 2-D.
    """
    m = np.mean(im, axis=axis)
    plt.plot(m)
    thresh = np.percentile(m, 2)
    s = np.std(im, axis=axis)
    plt.plot(s)
    sthresh = np.percentile(s, 2)
    plt.axhline(thresh, color='orange')
    # The red line marks the std threshold; it used to be drawn at `thresh`
    # again, exactly on top of the orange one.
    plt.axhline(sthresh, color='red')
    pts = np.where((m<thresh) | (s<sthresh))[0]
    # Closing sentinel is the profile length, so the function no longer needs
    # a global `i` to exist (it used im.shape[i]).
    lines = np.sort(np.concatenate([[0], pts, [len(m)]]))
    delta = np.diff(lines)
    wide = delta > xm
    for offset, height in zip(lines[:-1][wide], delta[wide]):
        plt.axvspan(offset, offset+height, zorder=2, color='0.5')
# Two stacked panels: the first shows the profile taken along axis 1, the
# second the profile taken along axis 0.
# `image` is not assigned anywhere above this cell; the stacking cells further
# down bind it, so one of them has to have run first.
fig, axes = plt.subplots(2, 1, figsize=(12, 6))
for i, ax in enumerate(axes):
    plt.sca(ax)
    oned(image, axis=1 - i)
    
In [132]:
    
# For every size group (largest first): average up to `nskip` images into one
# stack, carve panel boxes out of the stack, remember them in `params`, and
# save a figure of the stack with the boxes drawn on top.
nskip = 5000
bad = []
params = {}
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
#     if len(filenames) < 2:
#         continue
#     if j < 2:
#         continue
#     if j not in bad:
#         continue
    key = (height,width)

    # Groups that already have a figure under stacks_bad are skipped.  Test
    # this before stacking so no images are read for a group that is skipped.
    outfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if os.path.exists(outfilename):
        continue

    shapes = simpleshapes[key]
    # The stack is as large as the smallest image of the group; larger images
    # are cropped to it from the top-left corner.
    image = np.zeros(np.min(shapes, axis=0))
    # Number of images averaged.  Slicing to `nskip` keeps the number of
    # images added equal to this divisor (the old `i > nskip` test let
    # nskip + 1 images through).  Files that fail to load still count here.
    nstack = np.min([nskip, len(shapes)])
    for i,filename in enumerate(filenames[:nskip]):
        try:
            x = plt.imread(filename)
        except Exception as e:
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        image += x[:image.shape[0],
                   :image.shape[1]]*1.0/nstack

    outfilename = '/Users/ajmendez/data/dilbert/stacks/stack_{:03d}.png'.format(j)
#     plt.imsave(outfilename, image, cmap=plt.cm.gray)

    plt.figure(figsize=(12,6))
    imshow(image)
    params[key] = []
    for i,box in enumerate(carve(image)):
        plotBox(*box)
        params[key].append(box)
    plt.title((height,width,len(filenames)))
    plt.savefig(outfilename.replace('.png', '.fig.png'))
    plt.close()
#     break
#     break
    
In [145]:
    
# Cut every image of each accepted size group into panels, using the boxes
# stored in `params` for that group's (height, width) key, and write each
# panel as a PNG thumbnail.
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    # Skip groups that have a figure under stacks_bad (same index j as the
    # stacking cell, because both enumerate the same sorted order).
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if os.path.exists(badfilename):
        continue
    
    key = (height,width)
    # NOTE(review): `shapes` and `image` are assigned here but not read again
    # in this cell.
    shapes = simpleshapes[key]
    image = np.zeros(np.min(shapes, axis=0))
    for i,filename in enumerate(filenames):
        # Unreadable files are reported and skipped.
        try:
            x = plt.imread(filename)
        except Exception as e:
            print(e)
            print(filename)
            continue
        # Arrays with 3 or 4 dimensions are reduced with rgb2gray.
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        
        # Output directory: the input directory with every 'images' in its
        # path replaced by 'panels'; created on first use.
        basename = os.path.splitext(os.path.basename(filename))[0]
        dirname = os.path.dirname(filename).replace('images', 'panels')
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        
        # params[key] must have been filled by the stacking cell; a missing
        # key raises KeyError here.
        for k, box in enumerate(params[key]):
            outfilename = os.path.join(dirname, 
                                       basename+'.{:02d}.png'.format(k))
            xmin,xmax, ymin,ymax = box
            # The crop is cast to uint8 before it is handed to PIL.
            im = Image.fromarray(x[ymin:ymax, xmin:xmax].astype(np.uint8))
            im.thumbnail((128,128))
            im.save(outfilename)
#             plt.imsave(outfilename, x[ymin:ymax, xmin:xmax], 
#                        cmap=plt.cm.gray)
#         break
#     break
    
In [173]:
    
# Reference size (w wide, h high) and the rounded image size closest to it.
w, h = 620, 425
heights, widths = (np.array(column) for column in zip(*simplesizes))
d = (heights - h)**2 + (widths - w)**2
ii = d.argmin()
# Every size as a dot, the closest one in green, the reference as a red square.
plt.plot(widths, heights, '.')
plt.plot(widths[ii], heights[ii], 'og')
plt.plot(w, h, 'sr')
    
    Out[173]:
    
In [183]:
    
# Scatter the size groups: groups with a figure under stacks_bad in red (marker
# area grows with the file count), the others in black.  The sizes of the
# black groups are collected into `heights` / `widths`.
nbad = 0
heights, widths = [], []
by_count = sorted(simplesizes.items(), key=lambda x: -len(x[-1]))
for j, ((height, width), filenames) in enumerate(by_count):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if not os.path.exists(badfilename):
        heights.append(height)
        widths.append(width)
        plt.scatter(width, height, lw=0, alpha=0.5, color='k')
        continue
    nbad += len(filenames)
    plt.scatter(width, height, s=len(filenames)+5, lw=0, alpha=0.5, color='r')
heights, widths = np.array(heights), np.array(widths)
# The panel estimate multiplies the file count by 3*5/6 + 8*1/6.
print('{:,d} files are still unprocessed.  ~{:0,.0f} panels'.format(nbad, nbad*(3*5/6 + 8*1/6)))
    
    
    
In [189]:
    
# For every size group that has a figure under stacks_bad: pick a key among
# the accepted sizes, stack the group's images, carve the stack, and save a
# figure under stacks_nearest.
# Reads module-level `w`, `h`, `widths`, `heights` and `params`, which this
# cell does not assign; earlier cells must have run.
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if not os.path.exists(badfilename):
        continue
    
    shapes = simpleshapes[(height,width)]
    image = np.zeros(np.min(shapes, axis=0))
    
    # NOTE(review): the distance is measured from the fixed (w, h), not from
    # this group's (width, height), so the same key is selected for every
    # group while w/h stay unchanged -- confirm this is intended.
    d = (widths-w)**2 + (heights-h)**2
    # NOTE(review): `height<w` compares this group's scalar height with w, so
    # it contributes a single boolean to the mask; `heights<h` may have been
    # meant -- confirm.
    d[(widths<w)&(height<w)] = 1e6
    i = np.argmin(d)
    key = (heights[i],widths[i])
    # NOTE(review): `boxes` is not read again in this cell.
    boxes = params[key]
    
    
    # Average the group's images into `image`; `i` is reused as the loop index.
    for i,filename in enumerate(filenames):
        try:
            x = plt.imread(filename)
        except Exception as e:
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        image += x[:image.shape[0], 
                   :image.shape[1]]*1.0/len(shapes)
    
    outfilename = '/Users/ajmendez/data/dilbert/stacks_nearest/stack_{:03d}.png'.format(j)
    dirname = os.path.dirname(outfilename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    
    plt.figure(figsize=(12,6))
    imshow(image)
    # NOTE(review): this replaces the boxes stored for the selected accepted
    # size with the boxes carved from this group's stack -- confirm that
    # overwriting params[key] is intended.
    params[key] = []
    for i,box in enumerate(carve(image)):
        plotBox(*box)
        params[key].append(box)
    plt.title((height,width,len(filenames)))
    plt.savefig(outfilename.replace('.png', '.fig.png'))
    plt.close()
#     break
#     break
    
In [185]:
    
# Trial export for groups that have a figure under stacks_bad: crop with the
# boxes of the accepted size closest to (w, h) and write thumbnails under a
# 'panels2' directory.
# Reads module-level `w`, `h`, `widths`, `heights` and `params`, which this
# cell does not assign; earlier cells must have run.
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if not os.path.exists(badfilename):
        continue
    
    # NOTE(review): the distance is measured from the fixed (w, h), not from
    # this group's (width, height) -- confirm.
    d = (widths-w)**2 + (heights-h)**2
    i = np.argmin(d)
    key = (heights[i],widths[i])
    boxes = params[key]
    
    # NOTE(review): `shapes` and `image` are assigned here but not read again
    # in this cell.
    shapes = simpleshapes[key]
    image = np.zeros(np.min(shapes, axis=0))
    for i,filename in enumerate(filenames):
        # Unreadable files are reported and skipped.
        try:
            x = plt.imread(filename)
        except Exception as e:
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        
        # Output directory: the input directory with every 'images' in its
        # path replaced by 'panels2'; created on first use.
        basename = os.path.splitext(os.path.basename(filename))[0]
        dirname = os.path.dirname(filename).replace('images', 'panels2')
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        
        for k, box in enumerate(boxes):
            outfilename = os.path.join(dirname, 
                                       basename+'.{:02d}.png'.format(k))
            xmin,xmax, ymin,ymax = box
            im = Image.fromarray(x[ymin:ymax, xmin:xmax].astype(np.uint8))
            im.thumbnail((128,128))
            im.save(outfilename)
#             plt.imsave(outfilename, x[ymin:ymax, xmin:xmax], 
#                        cmap=plt.cm.gray)
        # These two breaks end the run after the first successfully read file
        # of the first matching group.
        break
    break
    
In [ ]:
    
# from pprint import pformat, pprint
    
In [ ]:
    
# pprint(params, width=1000)
    
In [ ]:
    
# params2 = {}
# for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
#     params2[j] = params[(height,width)]
    
In [ ]:
    
# pprint(params2, width=1000)
    
In [ ]: