In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import glob
import rarfile, os
from collections import Counter
from PIL import Image
from datetime import datetime
import shutil

def rgb2gray(rgb):
    """Collapse a colour image array to a single luminance channel.

    Only the first three entries of the last axis are used, so a fourth
    (alpha) entry, if present, is ignored.  The three channels are combined
    with the weights 0.299, 0.587 and 0.114.
    """
    luma_weights = [0.299, 0.587, 0.114]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, luma_weights)

In [ ]:
# Read the sample strip from disk and reduce it to grayscale; the trailing
# expression displays the shape of the resulting array.
img = rgb2gray(plt.imread('test.jpg'))
img.shape

In [ ]:
# Show the grayscale sample image on a wide figure.
plt.figure(figsize=(12,6))
plt.imshow(img, cmap=plt.cm.gray)

In [ ]:
def plotBox(xmin, xmax, ymin, ymax):
    """Draw the outline of the rectangle [xmin, xmax] x [ymin, ymax] in red.

    The outline is added through ``plt.plot`` with a line width of 2.
    """
    # Walk the four corners in order and return to the start to close the path.
    corners = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax), (xmin, ymin)]
    xs = [cx for cx, _ in corners]
    ys = [cy for _, cy in corners]
    plt.plot(xs, ys, color='r', lw=2)

def threshold(img, thresh=60):
    """Keep pixels strictly darker than `thresh`; set every other pixel to 255.

    Returns a new float array with the same shape as `img`; the input is not
    modified.
    """
    dark = img < thresh
    out = np.full(img.shape, 255.0)
    out[dark] = img[dark]
    return out
    
# Exploratory panel detection on the sample image: find dark separator rows,
# then dark separator columns inside each row band, and outline the boxes.
# linethresh = 180

nx, ny = 90, 90   # minimum gap (pixels) between separator lines to count as a box
Y = threshold(img)

# Rows whose mean brightness is at or below the 4th percentile (capped at 180)
# are treated as horizontal separator lines.
linethresh = np.clip(np.percentile(np.mean(Y, axis=1), 4), 0, 180)
ylines = np.where(np.mean(Y, 1) <= linethresh)[0]
if not len(ylines):
    print(ylines)
    plt.plot(np.mean(Y, 1))
    raise ValueError('Could not figure out boxes')

# two rows
lines = np.concatenate([[0], ylines, [img.shape[0]]])
deltas = np.diff(lines)
tall = np.where(deltas >= ny)[0]
yoffsets = lines[tall]
heights = deltas[tall]
print(lines)
print(deltas)
print(yoffsets)
print(heights)

plt.figure(figsize=(12, 6))
plt.imshow(img, cmap=plt.cm.gray)
plt.xlim(-1, img.shape[1] + 1)
plt.ylim(img.shape[0] + 1, -1)

for yoffset, height in zip(yoffsets, heights):
    X = Y[yoffset:yoffset + height, :]
    # Same idea per band, using the 1st percentile of the column means.
    linethresh = np.clip(np.percentile(np.mean(X, axis=0), 1), 0, 180)
    xlines = np.where(np.mean(X, 0) <= linethresh)[0]

    if not len(xlines):
        print(len(xlines))
        raise ValueError('Not Sure')

    # 4 columns
    lines = np.concatenate([[0], xlines, [img.shape[1]]])
    deltas = np.diff(lines)
    wide = np.where(deltas >= nx)[0]
    xoffsets = lines[wide]
    widths = deltas[wide]
    print(linethresh, len(xlines))
    for xoffset, width in zip(xoffsets, widths):
        plotBox(xoffset, xoffset + width, yoffset, yoffset + height)

# Row-mean profile of the thresholded image, for eyeballing the threshold.
plt.figure(figsize=(6, 6))
plt.plot(np.mean(Y, axis=1))

In [ ]:
# Count how many images of each (height, width, year) exist across the archives.
pattern = '/Users/ajmendez/data/dilbert/raw/*.rar'
sizes = Counter()
for filename in sorted(glob.iglob(pattern)):
    # The first run of digits in the path is taken as the year.
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    year = int(re.findall(r'\d+', filename)[0])

    try:
        with rarfile.RarFile(filename, crc_check=False) as rf:
            files = sorted(rf.namelist())
            # Keep every name containing '.gif'; keep any other name only when
            # swapping '.jpg' for '-colour.jpg' gives a name absent from the
            # archive (i.e. a jpg that has no '-colour' twin).
            goodfiles = [f for f in files
                         if ('.gif' in f) or (not (f.replace('.jpg','-colour.jpg') in files) )]
            for i,f in enumerate(goodfiles):
                try:
                    img = plt.imread(rf.open(f))
                except Exception as e:
                    print('Failed to read {}: {}'.format(f,e))
                    # Skip this entry; without this the previous image (still
                    # bound to `img`) would be counted a second time.
                    continue
                if len(img.shape) in [3,4]:
                    # automatically drops alpha channel
                    img = rgb2gray(img)

                sizes[(img.shape[0], img.shape[1], year)] += 1
    except Exception as e:
        # Best effort: report the problem and move on to the next archive.
        print(e)

In [ ]:
def myround(x, base=10):
    """Round `x` to the nearest multiple of `base` and return it as an int."""
    multiples = round(float(x) / base)
    return int(multiples * base)

# Pool the per-year counts into (height, width) buckets rounded with myround.
generalsizes = Counter()
for (height, width, year), number in sizes.items():
    bucket = (myround(height), myround(width))
    generalsizes[bucket] += number

ncomics = sum(sizes.values())
# Average panels per image: 3 with weight 5/6 plus 8 with weight 1/6
# (presumably 3-panel dailies vs 8-panel Sunday strips -- TODO confirm).
framespercomic = 3*(5/6) + 8*(1/6)
npanels = framespercomic * ncomics
print('Total Shapes: {:,d}, Total Images: {:,d}; Total Panels: {:0,.0f}'
      .format(len(generalsizes), ncomics, npanels))

In [ ]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: rounded size buckets, marker area is the count clipped to [5, 50].
plt.sca(ax1)
for (height, width), number in generalsizes.items():
    marker_area = np.clip(number, 5, 50)
    sca = plt.scatter(width, height, s=marker_area, lw=0, alpha=0.5)
ax1.set(xlabel='width', ylabel='height')


# Right panel: exact sizes, coloured by year, marker area equal to the count.
plt.sca(ax2)
year_style = dict(vmin=1989, vmax=2015, lw=0, alpha=0.5)
for (height, width, year), number in sizes.items():
    sca = plt.scatter(width, height, c=year, s=number, **year_style)
plt.colorbar(sca, label='year')
ax2.set(xlabel='width', ylabel='height')

In [ ]:
# Exact image sizes coloured by year, zoomed to widths 0-1000 and heights 0-700.
for (height,width, year),number in sizes.items():
    sca = plt.scatter(width, height, c=year, vmin=1989, vmax=2015,s=number, lw=0, alpha=0.5)
plt.colorbar(sca, label='year')
plt.gca().set(xlabel='width', xlim=[0, 1000],
              ylabel='height', ylim=[0, 700])

# Average panels per image: 3 with weight 5/6 plus 8 with weight 1/6.
# (A stray trailing 'd' on this line used to make the whole cell a SyntaxError.)
framespercomic = 3*(5/6) + 8*(1/6)
ncomics = sum(sizes.values())
print('Total Images: {:,d}; Total Panels: {:0,.0f}'.format(ncomics, framespercomic*ncomics))

In [ ]:
def carve(img, nx=100, ny=100, linethresh=None):
    """Yield ``(xmin, xmax, ymin, ymax)`` boxes for candidate panels in `img`.

    The image is passed through `threshold`, then split into horizontal bands
    at dark rows, and each band is split at columns whose mean/std ratio is
    below 1.

    Parameters
    ----------
    img : 2-D grayscale array.
    nx : minimum gap (pixels) between separator columns for a box to be kept.
    ny : minimum gap (pixels) between separator rows for a band to be kept.
    linethresh : accepted but ignored; the row threshold is always recomputed
        from the image.
    """
    Y = threshold(img)
    row_mean = np.mean(Y, axis=1)
    # Rows darker than the 4th percentile of row means (capped at 180) are lines.
    linethresh = np.clip(np.percentile(row_mean, 4), 0, 180)
    ylines = np.where(row_mean < linethresh)[0]
    lines = np.concatenate([[0], ylines, [img.shape[0]]])
    deltas = np.diff(lines)
    keep = np.where(deltas > ny)[0]
    yoffsets = lines[keep]
    heights = deltas[keep]

    for yoffset, height in zip(yoffsets, heights):
        X = Y[yoffset:yoffset+height, :]
        # Columns with zero std give inf/nan here and therefore never count as lines.
        xmean = np.mean(X, axis=0)/np.std(X, axis=0)
        xlines = np.where(xmean < 1)[0]

        lines = np.concatenate([[0], xlines, [img.shape[1]]])
        deltas = np.diff(lines)
        # Column gaps are filtered with nx (this used ny, leaving nx unused).
        keep = np.where(deltas > nx)[0]
        xoffsets = lines[keep]
        widths = deltas[keep]

        for xoffset, width in zip(xoffsets, widths):
            yield xoffset, xoffset+width, yoffset, yoffset+height

def genAxes():
    """Create a 5x5 grid of tick-less subplots and yield its axes one by one."""
    _, grid = plt.subplots(5, 5, figsize=(12, 12),
                           subplot_kw={'xticks': [], 'yticks': []})
    yield from grid.flatten()

    

# Plot one example image per rounded size, with the boxes found by carve(),
# until the subplot grid from genAxes() is used up.
# NOTE: getDate is defined in a later cell, so that cell must be run first.
Axes = genAxes()
sz = Counter()
images = {}
pattern = '/Users/ajmendez/data/dilbert/raw/*.rar'
for filename in sorted(glob.iglob(pattern)):
    try:
        with rarfile.RarFile(filename, crc_check=False) as rf:
            files = sorted(rf.namelist())
            goodfiles = [f for f in files
                         if ('.gif' in f) or (not (f.replace('.jpg','-colour.jpg') in files) )]
            for i,f in enumerate(goodfiles):
                try:
                    img = plt.imread(rf.open(f))
                except Exception as e:
                    print('Could not load {}: {}'.format(f,e))
                    # Skip unreadable entries instead of falling through with
                    # a stale (or not yet defined) `img`.
                    continue

                key = (myround(img.shape[0]), myround(img.shape[1]))
                if key in sz:
                    # Only the first image of each rounded size is shown.
                    continue
                sz[key] += 1
                if len(img.shape) in [3,4]:
                    img = rgb2gray(img)

                # next() raises StopIteration once the grid is full; the outer
                # handler turns that into the end of the scan.
                plt.sca(next(Axes))
                title = '{:%Y-%m-%d %a}'.format(getDate(f))
                plt.gca().set(title=title)
                plt.imshow(img, cmap=plt.cm.gray)
                for im in carve(img):
                    plotBox(*im)
                plt.autoscale(enable=True, axis='both', tight=True)

                # save
                images[title] = img
                if i > 1:
                    break
    except StopIteration:
        break
    except Exception as e:
        print('Failed to load: {}: {}'.format(filename,e))
        break

In [ ]:
def getDate(filename, split='\\'):
    """Parse the date encoded in an image file name.

    If `split` occurs in `filename`, only the text after its last occurrence
    is searched; otherwise the whole string is.  The first run of digits found
    is parsed as ``YYYYMMDD``.

    Raises IndexError when there are no digits and ValueError when the digits
    do not form a ``%Y%m%d`` date.
    """
    name = filename.split(split)[-1] if split in filename else filename
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    s = re.findall(r'\d+', name)[0]
    return datetime.strptime(s, '%Y%m%d')
# Quick check of the title format; `f` is not defined in this cell, so it
# relies on the value left bound by an earlier cell.
'{:%Y-%m-%d %a}'.format(getDate(f))

In [ ]:
# data.seek(0)
# # plt.imread(data)
# with open('test2.gif', 'w+b') as f:
#     f.write((data.read()))

In [ ]:
# One dot per rounded size bucket: count on x (log scale), height/width on y.
for (height,width),number in generalsizes.items():
    plt.plot(number, height/width, '.')
_ = plt.gca().set(xscale='log', xlim=[0.2, 1e4])

In [ ]:
def carve(img, nx=None, ny=100, linethresh=None):
    """Yield ``(xmin, xmax, ymin, ymax)`` boxes for candidate panels in `img`.

    Rows are split as before (dark rows of the thresholded image); columns are
    split where the per-column mean/std ratio falls below its 1st percentile.

    Parameters
    ----------
    img : 2-D grayscale array.
    nx : minimum gap (pixels) between separator columns.  When None (default)
        it is derived from the image width as ``floor(width / 3) - 50``.
        Previously any value passed here was overwritten.
    ny : minimum gap (pixels) between separator rows.
    linethresh : accepted but ignored; the row threshold is always recomputed.

    The x coordinates are floats because the column positions are merged with
    the float end points from ``np.linspace``.
    """
    if nx is None:
        nx = np.floor(img.shape[1]/3)-50
    print(nx)
    Y = threshold(img)
    row_mean = np.mean(Y, axis=1)
    linethresh = np.clip(np.percentile(row_mean, 4), 0, 180)
    ylines = np.where(row_mean < linethresh)[0]
    lines = np.concatenate([[0], ylines, [img.shape[0]]])
    deltas = np.diff(lines)
    keep = np.where(deltas > ny)[0]
    yoffsets = lines[keep]
    heights = deltas[keep]

    for yoffset, height in zip(yoffsets, heights):
        X = Y[yoffset:yoffset+height, :]
        xmean = np.mean(X, axis=0)/np.std(X, axis=0)
        xp = np.percentile(xmean, 1)
        xlines = np.where(xmean < xp)[0]

        # Add the left and right image edges, then order all candidate lines.
        lines = np.sort(np.concatenate([xlines, np.linspace(0, img.shape[1], 2)]))
        deltas = np.diff(lines)
        keep = np.where(deltas > nx)[0]
        xoffsets = lines[keep]
        widths = deltas[keep]

        for xoffset, width in zip(xoffsets, widths):
            yield xoffset, xoffset+width, yoffset, yoffset+height

# Show the third saved image (in title order) with its carved boxes.
for i, (d, img) in enumerate(sorted(images.items())):
    if i >= 2:
        plt.imshow(img, cmap=plt.cm.gray)
        for box in carve(img):
            print(box)
            plotBox(*box)
        aspect = img.shape[0]/img.shape[1]
        plt.title('{}: {:0.2f}'.format(d, aspect))
        break

In [ ]:
# Mean along axis 0 of whatever `img` currently holds (one value per column).
plt.plot(np.mean(img, axis=0))

In [ ]:
# Per-column mean divided by per-column standard deviation, on a log y-axis.
plt.plot(np.mean(img, axis=0)/np.std(img, axis=0))
plt.yscale('log')

In [ ]:


In [ ]:
# Count extracted images per directory and delete .jpg files that duplicate a
# .gif of the same name.  WARNING: this cell removes files from disk.
directories = Counter()
filenames = sorted(glob.iglob('/Users/ajmendez/data/dilbert/images/*/*'))
# Set copy for O(1) membership tests; the sorted list is kept for iteration.
known_files = set(filenames)
bad_filenames = []
for i, filename in enumerate(filenames):
#     # First remove sub directories
#     origdir,filename = os.path.split(filename)
#     dirname = os.path.dirname(origdir)
#     shutil.move(os.path.join(origdir,filename), os.path.join(dirname, filename))

#     # Filter BW/Color ones
#     bwfilename = filename.replace('-colour', '')
#     if (filename != bwfilename) and (bwfilename in filenames):
#         os.remove(bwfilename)
#         continue

#     # Remove small copies
#     sfilename = filename.replace('-small','')
#     if (filename != sfilename) and (sfilename in filenames):
#         os.remove(filename)
#         continue

#     # Files without size
#     if os.stat(filename).st_size == 0:
#         os.remove(filename)
#         print(filename)

    # A gif with a same-named jpg in the listing: delete the jpg and skip
    # counting the gif.  The jpg path is still counted when the loop reaches it.
    gfilename = filename.replace('.gif', '.jpg')
    if (filename != gfilename) and (gfilename in known_files):
        os.remove(gfilename)
        continue

    directories[os.path.basename(os.path.dirname(filename))] += 1
directories.most_common(1000)

In [ ]:
from collections import defaultdict

In [ ]:
# Group the extracted images by rounded (height, width): file names go into
# simplesizes and the exact (height, width) pairs into simpleshapes.
simplesizes = defaultdict(list)
simpleshapes = defaultdict(list)
filenames = sorted(glob.iglob('/Users/ajmendez/data/dilbert/images/*/*'))
for filename in filenames:
    with Image.open(filename) as img:
        shape = (img.height, img.width)
        key = (myround(shape[0]), myround(shape[1]))
        simplesizes[key].append(filename)
        simpleshapes[key].append(shape)

In [ ]:
# (size, number of files) pairs, most common size first.
sorted(((size, len(names)) for size, names in simplesizes.items()),
       key=lambda pair: -pair[-1])

In [ ]:
# Average up to `nskip` images of one common size to reveal the shared layout.
nskip = 500
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    # Ignore rare sizes and the two most common ones; use the first size left.
    if len(filenames) < 100:
        continue
    if j < 2:
        continue
    shapes = simpleshapes[(height,width)]
    # Accumulator cropped to the smallest exact shape within this bucket.
    image = np.zeros(np.min(shapes, axis=0))
    nused = np.min([nskip, len(shapes)])
    for i,filename in enumerate(filenames):
        # `>=` so exactly `nused` images are summed; the old `i > nskip` test
        # added nskip+1 images while still dividing by nskip.  `continue` (not
        # `break`) keeps `filename` ending on the last file, as later cells use it.
        if i >= nskip:
            continue
        with Image.open(filename) as img:
            x = np.array(img)
            if len(x.shape) in [3,4]:
                x = rgb2gray(x)
            image += x[:image.shape[0], :image.shape[1]]*1.0/nused

    plt.figure(figsize=(12,6))
    plt.imshow(image, cmap=plt.cm.gray)
    plt.title((height,width,len(filenames)))
    break

In [ ]:
# Display whatever `filename` was left bound by the loops above.
filename

In [ ]:
# Display the file named by `filename`; the context manager closes the handle
# even if plotting raises.
with Image.open(filename) as img:
    plt.imshow(img)

In [ ]:
def segment(image, axis=0, p=0.97, npix=30):
    '''
    Locate break points along one direction of a grayscale image.

    axis == 0 for x
    axis == 1 for y

    The image is averaged along `axis`, and positions whose mean lies more
    than 1.8 standard deviations from the overall mean are collected.  These
    are grouped greedily (a position joins the first group holding a member
    closer than `npix`) and each group is reduced to one break point: the
    minimum for the first group, the maximum for the last, the rounded mean
    for groups in between.

    `p` is currently unused and kept only for interface compatibility.

    Returns (ii, a, pts): the outlier indices, the averaged profile, and the
    sorted break points as plain ints so they can be used as slice bounds.
    '''
    a = np.mean(image, axis=axis)
    centre, spread = np.mean(a), np.std(a)
    ii = np.where((a > centre + 1.8*spread) |
                  (a < centre - 1.8*spread))[0]

    groups = []
    for i in ii:
        for group in groups:
            if any(abs(i - j) < npix for j in group):
                group.append(i)
                break
        else:
            groups.append([i])

    last = len(groups) - 1
    pts = []
    for k, group in enumerate(groups):
        if k == 0:
            pts.append(int(np.min(group)))
        elif k == last:
            pts.append(int(np.max(group)))
        else:
            # np.mean is a float; float slice bounds raise TypeError in NumPy.
            pts.append(int(round(np.mean(group))))
    return ii, a, sorted(pts)



# Two stacked panels: the top one shows the axis=1 profile of `image` with its
# break points, the bottom one shows the axis=0 profile of every band between
# consecutive break points from the top panel.
fig, axes = plt.subplots(2,1, figsize=(12,6))
items = [['height',1],
         ['width', 0]]
for i,(ax,(name,axis)) in enumerate(zip(axes.flatten(),items)):
    plt.sca(ax)
    if name == 'height':
        ii,a,ypts = segment(image, axis)
        plt.plot(a)
        plt.plot(ii, a[ii], 's')
        for p in ypts:
            plt.axvline(p, color='r',lw=2)
    else:
        # Uses `ypts` computed in the previous ('height') iteration, so the
        # order of `items` matters.
        for imin,imax in zip(ypts, ypts[1:]):
            print(imin, imax)
            jj,b,xpts = segment(image[imin:imax,:], axis)
            plt.plot(b, 'b')
            plt.plot(jj, b[jj], 'sg')
            for p in xpts:
                plt.axvline(p, color='r',lw=2)

In [ ]:
def yxsegment(image):
    """Yield ``(xmin, xmax, ymin, ymax)`` for each cell of the grid from `segment`.

    Break points are first found along y (axis=1); each band between two
    consecutive y break points is then segmented along x (axis=0).
    """
    _, _, ypts = segment(image, 1)
    for ymin, ymax in zip(ypts, ypts[1:]):
        # Break points may be averaged (non-integer) positions; slice bounds
        # must be integers, so cast for the slice only.
        band = image[int(ymin):int(ymax), :]
        _, _, xpts = segment(band, 0)
        for xmin, xmax in zip(xpts, xpts[1:]):
            yield xmin, xmax, ymin, ymax

# Outline every detected cell on top of the averaged image.
plt.figure(figsize=(12,6))
plt.imshow(image, cmap=plt.cm.gray)
for box in yxsegment(image):
    plotBox(*box)

In [ ]:
def plotBox(xmin, xmax, ymin, ymax):
    """Plot a closed red rectangle through the four given corner coordinates."""
    # Corner order: start at (xmin, ymin), go round, and finish where we began.
    outline_x = [xmin, xmax, xmax, xmin, xmin]
    outline_y = [ymin, ymin, ymax, ymax, ymin]
    plt.plot(outline_x, outline_y, color='r', lw=2)

def threshold(img, thresh=60):
    """Return a float copy of `img` where every pixel >= `thresh` becomes 255.

    Pixels strictly below `thresh` keep their original value.
    """
    result = np.full(img.shape, 255.0)
    below = img < thresh
    result[below] = img[below]
    return result

def carve(img, nx=None, ny=100, linethresh=None):
    """Yield ``(xmin, xmax, ymin, ymax)`` boxes for candidate panels in `img`.

    Rows are split at dark rows of the thresholded image; columns are split
    where the per-column mean/std ratio exceeds its 90th percentile.

    Parameters
    ----------
    img : 2-D grayscale array.
    nx : minimum gap (pixels) between separator columns.  When None (default)
        it is derived from the image width as ``floor(width / 3) - 50``.
        Previously any value passed here was overwritten.
    ny : minimum gap (pixels) between separator rows.
    linethresh : accepted but ignored; the row threshold is always recomputed.

    The x coordinates are floats because the column positions are merged with
    the float end points from ``np.linspace``.
    """
    if nx is None:
        nx = np.floor(img.shape[1]/3)-50
    print(nx)
    Y = threshold(img)
    row_mean = np.mean(Y, axis=1)
    linethresh = np.clip(np.percentile(row_mean, 4), 0, 180)
    ylines = np.where(row_mean < linethresh)[0]
    lines = np.concatenate([[0], ylines, [img.shape[0]]])
    deltas = np.diff(lines)
    keep = np.where(deltas > ny)[0]
    yoffsets = lines[keep]
    heights = deltas[keep]

    for yoffset, height in zip(yoffsets, heights):
        X = Y[yoffset:yoffset+height, :]
        xmean = np.mean(X, axis=0)/np.std(X, axis=0)
        xp = np.percentile(xmean, 90)
        xlines = np.where(xmean > xp)[0]
        print(xlines)

        # Add the left and right image edges, then order all candidate lines.
        lines = np.sort(np.concatenate([xlines, np.linspace(0, img.shape[1], 2)]))
        deltas = np.diff(lines)
        keep = np.where(deltas > nx)[0]
        xoffsets = lines[keep]
        widths = deltas[keep]

        for xoffset, width in zip(xoffsets, widths):
            yield xoffset, xoffset+width, yoffset, yoffset+height


# Outline the carved boxes on top of the averaged image.
plt.figure(figsize=(12,6))
plt.imshow(image, cmap=plt.cm.gray)
for box in carve(image):
    plotBox(*box)

In [ ]:
# Top: the array `x` on its own.  Bottom: `image` with `x` drawn over it in a
# semi-transparent red colormap.  Both names come from earlier cells.
fig,(ax1,ax2) = plt.subplots(2,1,figsize=(12,12))
ax1.imshow(x, cmap=plt.cm.gray)
ax2.imshow(image, cmap=plt.cm.gray)
ax2.imshow(x, cmap=plt.cm.Reds, alpha=0.6)

In [ ]:
# Exact (height, width) pairs recorded under the rounded size (300, 640).
np.array(simpleshapes[(300,640)])

In [ ]:
# Re-open the file named by `filename`, print its height and width as reported
# by PIL, and show its pixel array.
with Image.open(filename) as img:
    print(img.height, img.width)
    x = np.array(img)
    plt.imshow(x)

In [ ]:
# Compare the size reported by PIL with the shape of the converted array.
img.height, img.width, x.shape

In [ ]: