In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import glob
import rarfile, os
from collections import Counter
from collections import defaultdict
from PIL import Image
from datetime import datetime
import shutil
def imshow(*args, **kwargs):
    """Show an image via ``plt.imshow`` with notebook-friendly defaults.

    The defaults are a gray colormap and nearest-neighbour interpolation;
    any keyword argument given by the caller takes precedence over them.
    Positional arguments are forwarded untouched.
    """
    options = {'cmap': plt.cm.gray, 'interpolation': 'nearest'}
    for name, value in kwargs.items():
        options[name] = value
    plt.imshow(*args, **options)
def rgb2gray(rgb):
    """Collapse the colour axis of an image array into one gray channel.

    Only the first three entries of the last axis are used (a fourth
    channel, if present, is ignored). They are combined with the weights
    0.299, 0.587 and 0.114.
    """
    channel_weights = [0.299, 0.587, 0.114]
    colour = rgb[..., :3]
    return np.dot(colour, channel_weights)
def getDate(filename, split='\\'):
    """Parse the date encoded in a file name.

    The text after the last occurrence of ``split`` (the whole string when
    ``split`` does not occur) is searched for its first run of digits, which
    is then parsed as ``YYYYMMDD``.

    Parameters
    ----------
    filename : str
        File name or path containing an 8-digit date.
    split : str
        Path separator used to isolate the last path component.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    IndexError
        If the inspected text contains no digits.
    ValueError
        If the first digit run is not a valid ``%Y%m%d`` date.
    """
    # str.split returns [filename] when the separator is absent, so no
    # separate "separator not found" branch is needed.
    tail = filename.split(split)[-1]
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    digits = re.findall(r'\d+', tail)[0]
    return datetime.strptime(digits, '%Y%m%d')
def myround(x, base=2):
    """Snap ``x`` to a nearby integer multiple of ``base``.

    ``x`` is converted to float, divided by ``base``, rounded with the
    built-in ``round`` and scaled back; the result is returned as ``int``.
    """
    steps = round(float(x) / base)
    snapped = base * steps
    return int(snapped)
In [ ]:
# Clean-up pass over the downloaded strips: when a .gif has a .jpg twin, delete
# the .jpg, then count the listed files per parent directory.
directories = Counter()
filenames = sorted(glob.iglob('/Users/ajmendez/data/dilbert/images/*/*'))
bad_filenames = []
# Set copy of the listing so the twin lookup below is O(1) instead of scanning
# the whole list for every file.
known_filenames = set(filenames)
for i, filename in enumerate(filenames):
    # Earlier one-off clean-up passes, kept for reference:
    # # First remove sub directories
    # origdir,filename = os.path.split(filename)
    # dirname = os.path.dirname(origdir)
    # shutil.move(os.path.join(origdir,filename), os.path.join(dirname, filename))
    # # Filter BW/Color ones
    # bwfilename = filename.replace('-colour', '')
    # if (filename != bwfilename) and (bwfilename in filenames):
    #     os.remove(bwfilename)
    #     continue
    # # Remove small copies
    # sfilename = filename.replace('-small','')
    # if (filename != sfilename) and (sfilename in filenames):
    #     os.remove(filename)
    #     continue
    # # Files without size
    # if os.stat(filename).st_size == 0:
    #     os.remove(filename)
    #     print(filename)
    gfilename = filename.replace('.gif', '.jpg')
    if (filename != gfilename) and (gfilename in known_filenames):
        # The .gif itself is skipped here; its (now deleted) .jpg twin is
        # still in the listing and is what gets counted for the pair.
        os.remove(gfilename)
        continue
    directories[os.path.basename(os.path.dirname(filename))] += 1
directories.most_common(1000)
In [ ]:
# Bucket every image by its (height, width) after rounding each with myround.
# simplesizes:  bucket -> list of file names
# simpleshapes: bucket -> list of exact (height, width) tuples
simplesizes = defaultdict(list)
simpleshapes = defaultdict(list)
filenames = sorted(glob.iglob('/Users/ajmendez/data/dilbert/images/*/*'))
for filename in filenames:
    with Image.open(filename) as img:
        shape = (img.height, img.width)
    key = tuple(myround(extent) for extent in shape)
    simplesizes[key].append(filename)
    simpleshapes[key].append(shape)
In [ ]:
# Ten most populated size buckets as ((height, width), file count), largest first.
sorted(
    ((size, len(paths)) for size, paths in simplesizes.items()),
    key=lambda entry: -entry[-1],
)[:10]
In [ ]:
# NOTE(review): scratch cell. `np.sort()` called without an array raises
# TypeError and stops "Restart Kernel -> Run All"; it is disabled here because
# the array that was meant to be sorted cannot be determined from the notebook.
# np.sort()
In [ ]:
def plotBox(xmin, xmax, ymin, ymax):
    """Outline the rectangle [xmin, xmax] x [ymin, ymax] on the current axes.

    The outline is a closed red polyline of width 2.
    """
    outline_x = [xmin, xmax, xmax, xmin, xmin]
    outline_y = [ymin, ymin, ymax, ymax, ymin]
    plt.plot(outline_x, outline_y, color='r', lw=2)
def threshold(img, thresh=60):
    """Keep only the pixels below ``thresh``; set everything else to 255.

    Returns a new float array with the shape of ``img``; the input is not
    modified.
    """
    below = img < thresh
    result = np.full(img.shape, 255.0)
    result[below] = img[below]
    return result
def getOffsets(im, sep=120, axis=1):
    """Yield ``(start, stop)`` index pairs of the segments between separator lines.

    The image is reduced along ``axis`` to a mean profile and a standard
    deviation profile. Positions whose mean or standard deviation lies below
    the 2nd percentile of the respective profile are treated as separator
    lines, together with both ends of the profile. Every gap between
    consecutive lines that is longer than ``sep`` is reported.

    Parameters
    ----------
    im : 2-D array
        Image to analyse (the profile is taken along the axis that is *not*
        reduced).
    sep : number
        Minimum gap length for a segment to be reported.
    axis : int
        Axis passed to ``np.mean`` / ``np.std``.

    Yields
    ------
    (start, stop) : pair of integers
        ``stop - start`` is the gap length.
    """
    m = np.mean(im, axis=axis)
    thresh = np.percentile(m, 2)
    s = np.std(im, axis=axis)
    sthresh = np.percentile(s, 2)
    pts = np.where((m < thresh) | (s < sthresh))[0]
    # The closing sentinel is the profile length. The previous code indexed
    # im.shape with an undefined name `i`.
    lines = np.sort(np.concatenate([[0], pts, [m.shape[0]]]))
    delta = np.diff(lines)
    # `sep` is the parameter; the previous code referenced an undefined `xm`.
    keep = np.where(delta > sep)[0]
    for offset, size in zip(lines[keep], delta[keep]):
        yield offset, offset + size
def carve(img, nx=120, ny=120, linethresh=None):
    """Yield panel boxes ``(xmin, xmax, ymin, ymax)`` found in a 2-D image.

    Rows whose mean falls below ``linethresh`` are treated as horizontal
    separator lines (plus the top and bottom edge). Every gap between
    consecutive lines taller than ``ny`` is a band. Within each band the
    same is done on the column means, using the band's own 2nd percentile as
    the threshold and ``nx`` as the minimum width.

    Parameters
    ----------
    img : 2-D array
        Image to split.
    nx, ny : number
        Minimum panel width / height.
    linethresh : number or None
        Row-mean threshold for horizontal separators. ``None`` (default)
        uses the 2nd percentile of the row means.

    Yields
    ------
    (xmin, xmax, ymin, ymax)
        Box limits as integer indices into ``img``.
    """
    # Changes from the previous version: the result of threshold(img) was
    # discarded on the next line, so that call is gone; the row mean is
    # computed once; an explicitly passed linethresh is now honoured instead
    # of being overwritten.
    Y = img
    ymean = np.mean(Y, axis=1)
    if linethresh is None:
        linethresh = np.percentile(ymean, 2)
    ylines = np.where(ymean < linethresh)[0]
    lines = np.sort(np.concatenate([[0], ylines, [img.shape[0]]]))
    deltas = np.diff(lines)
    tall = np.where(deltas > ny)[0]
    for yoffset, height in zip(lines[tall], deltas[tall]):
        X = Y[yoffset:yoffset + height, :]
        xmean = np.mean(X, axis=0)
        xthresh = np.percentile(xmean, 2)
        xlines = np.where(xmean < xthresh)[0]
        vlines = np.sort(np.concatenate([[0], xlines, [img.shape[1]]]))
        vdeltas = np.diff(vlines)
        wide = np.where(vdeltas > nx)[0]
        for xoffset, width in zip(vlines[wide], vdeltas[wide]):
            yield xoffset, xoffset + width, yoffset, yoffset + height
# plt.figure(figsize=(12,6))
# imshow(image)
# for box in carve(image):
# plotBox(*box)
In [ ]:
def oned(im, xm=120, axis=1):
    """Plot the 1-D profiles used for panel detection on the current axes.

    Draws the mean and standard deviation of ``im`` along ``axis``, a
    horizontal line at the 2nd percentile of each profile (orange: mean,
    red: standard deviation), and shades every gap longer than ``xm``
    between consecutive separator positions. Separator positions are those
    where either profile is below its own threshold, plus both profile ends.

    Parameters
    ----------
    im : 2-D array
        Image to analyse.
    xm : number
        Minimum gap length to shade.
    axis : int
        Axis passed to ``np.mean`` / ``np.std``.
    """
    m = np.mean(im, axis=axis)
    plt.plot(m)
    thresh = np.percentile(m, 2)
    s = np.std(im, axis=axis)
    plt.plot(s)
    sthresh = np.percentile(s, 2)
    plt.axhline(thresh, color='orange')
    # The red line marks the std threshold; it used to repeat `thresh`.
    plt.axhline(sthresh, color='red')
    pts = np.where((m < thresh) | (s < sthresh))[0]
    # The closing sentinel is the profile length. The previous code read
    # im.shape[i], with `i` leaking in from the calling cell's loop.
    lines = np.sort(np.concatenate([[0], pts, [m.shape[0]]]))
    delta = np.diff(lines)
    keep = np.where(delta > xm)[0]
    for offset, height in zip(lines[keep], delta[keep]):
        plt.axvspan(offset, offset + height, zorder=2, color='0.5')
# Two stacked panels: the profile along axis 1 on top, along axis 0 below.
# `image` is not created in this cell; it must already exist in the kernel
# (the stacking cells assign it).
panels = plt.subplots(2, 1, figsize=(12, 6))[1]
for i, ax in enumerate(panels):
    plt.sca(ax)
    oned(image, axis=1 - i)
In [132]:
# For every size bucket (most populated first): average up to `nskip` strips
# into one stacked image, carve panel boxes out of it, remember the boxes in
# `params[(height, width)]` and save an annotated figure for visual inspection.
nskip = 5000
bad = []
params = {}
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    # if len(filenames) < 2:
    #     continue
    # if j < 2:
    #     continue
    # if j not in bad:
    #     continue

    # Buckets that already have a figure under stacks_bad/ are skipped. The
    # check is done up front so their images are not read and stacked first.
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if os.path.exists(badfilename):
        continue

    key = (height,width)
    shapes = simpleshapes[key]
    # Stack at the smallest exact shape in the bucket so every crop fits.
    image = np.zeros(np.min(shapes, axis=0))
    norm = np.min([nskip, len(shapes)])
    # Slice instead of `if i > nskip: continue`: that form stacked nskip+1
    # images while normalising by nskip, and kept looping over the rest.
    for i,filename in enumerate(filenames[:nskip]):
        try:
            x = plt.imread(filename)
        except Exception as e:
            # Best effort: report unreadable files and keep going.
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        image += x[:image.shape[0],
                   :image.shape[1]]*1.0/norm

    outfilename = '/Users/ajmendez/data/dilbert/stacks/stack_{:03d}.png'.format(j)
    # Make sure the output directory exists before saving.
    outdir = os.path.dirname(outfilename)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # plt.imsave(outfilename, image, cmap=plt.cm.gray)
    plt.figure(figsize=(12,6))
    imshow(image)
    params[key] = []
    for i,box in enumerate(carve(image)):
        plotBox(*box)
        params[key].append(box)
    plt.title((height,width,len(filenames)))
    plt.savefig(outfilename.replace('.png', '.fig.png'))
    plt.close()
    # break
In [145]:
# Cut every strip of each accepted size bucket into panels using the boxes
# stored in `params`, and write each panel as a thumbnail of at most 128x128
# into a directory whose path has 'images' replaced by 'panels'.
by_population = sorted(simplesizes.items(), key=lambda x: -len(x[-1]))
for j, ((height, width), filenames) in enumerate(by_population):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if os.path.exists(badfilename):
        # Buckets with a figure under stacks_bad/ are left out.
        continue
    key = (height, width)
    shapes = simpleshapes[key]
    image = np.zeros(np.min(shapes, axis=0))
    for i, filename in enumerate(filenames):
        try:
            x = plt.imread(filename)
        except Exception as e:
            # Report unreadable files and move on to the next one.
            print(e)
            print(filename)
            continue
        if x.ndim in (3, 4):
            x = rgb2gray(x)
        basename = os.path.splitext(os.path.basename(filename))[0]
        dirname = os.path.dirname(filename).replace('images', 'panels')
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        for k, box in enumerate(params[key]):
            xmin, xmax, ymin, ymax = box
            panel_name = basename + '.{:02d}.png'.format(k)
            outfilename = os.path.join(dirname, panel_name)
            crop = x[ymin:ymax, xmin:xmax].astype(np.uint8)
            im = Image.fromarray(crop)
            im.thumbnail((128, 128))
            im.save(outfilename)
            # plt.imsave(outfilename, x[ymin:ymax, xmin:xmax],
            #            cmap=plt.cm.gray)
        # break
    # break
In [173]:
# Locate the size bucket closest (squared Euclidean distance) to a target
# width/height and show it against all buckets:
# dots = every bucket, green circle = closest bucket, red square = target.
w, h = 620, 425
heights, widths = (np.array(column) for column in zip(*simplesizes.keys()))
d = np.square(widths - w) + np.square(heights - h)
ii = np.argmin(d)
plt.plot(widths, heights, '.')
plt.plot(widths[ii], heights[ii], 'og')
plt.plot(w, h, 'sr')
Out[173]:
In [183]:
# Scatter all size buckets: red (marker area grows with file count) when a
# figure exists under stacks_bad/, black otherwise. `heights` / `widths` end up
# holding only the buckets without such a figure; `nbad` counts the files in
# the flagged buckets.
nbad = 0
heights, widths = [], []
for j, ((height, width), filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if not os.path.exists(badfilename):
        heights.append(height)
        widths.append(width)
        plt.scatter(width, height, lw=0, alpha=0.5, color='k')
        continue
    nbad += len(filenames)
    plt.scatter(width, height, s=len(filenames)+5, lw=0, alpha=0.5, color='r')
heights = np.array(heights)
widths = np.array(widths)
# The panel estimate scales the file count by 3*5/6 + 8*1/6.
print('{:,d} files are still unprocessed. ~{:0,.0f} panels'.format(nbad, nbad*(3*5/6 + 8*1/6)))
In [189]:
# For every size bucket that has a figure under stacks_bad/: stack its strips,
# carve the stack and save an annotated figure under stacks_nearest/.
# Relies on kernel state from other cells: `w`, `h`, `widths`, `heights`,
# `params`, `simplesizes`, `simpleshapes`.
for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    # Only the flagged buckets are handled here.
    if not os.path.exists(badfilename):
        continue
    shapes = simpleshapes[(height,width)]
    # Accumulator sized to the smallest exact shape in this bucket.
    image = np.zeros(np.min(shapes, axis=0))
    # NOTE(review): the distance is measured from the fixed (w, h), not from
    # this bucket's (width, height), so every bucket selects the same key --
    # confirm whether (width, height) was intended.
    d = (widths-w)**2 + (heights-h)**2
    # NOTE(review): `height<w` compares this bucket's scalar height with w;
    # this looks like a typo (perhaps `heights<h`) -- confirm.
    d[(widths<w)&(height<w)] = 1e6
    i = np.argmin(d)
    key = (heights[i],widths[i])
    # `boxes` is looked up but not used again in this cell.
    boxes = params[key]
    # The loop below rebinds `i`.
    for i,filename in enumerate(filenames):
        try:
            x = plt.imread(filename)
        except Exception as e:
            # Report unreadable files and continue with the next one.
            print(e)
            print(filename)
            continue
        if len(x.shape) in [3,4]:
            x = rgb2gray(x)
        image += x[:image.shape[0],
                   :image.shape[1]]*1.0/len(shapes)
    outfilename = '/Users/ajmendez/data/dilbert/stacks_nearest/stack_{:03d}.png'.format(j)
    dirname = os.path.dirname(outfilename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    plt.figure(figsize=(12,6))
    imshow(image)
    # NOTE(review): `key` is the size selected above, not this bucket's own
    # size, so this replaces that entry of `params` with boxes carved from
    # this bucket's stack -- confirm that overwriting is intended.
    params[key] = []
    for i,box in enumerate(carve(image)):
        plotBox(*box)
        params[key].append(box)
    plt.title((height,width,len(filenames)))
    plt.savefig(outfilename.replace('.png', '.fig.png'))
    plt.close()
    # break
In [185]:
# Trial run: take the first size bucket that has a figure under stacks_bad/,
# borrow the boxes stored for the size closest to (w, h), and crop the first
# readable strip into a directory whose path has 'images' replaced by
# 'panels2'. The two `break`s stop after that single strip.
# Relies on `w`, `h`, `widths`, `heights` and `params` from other cells.
ranked_sizes = sorted(simplesizes.items(), key=lambda x: -len(x[-1]))
for j, ((height, width), filenames) in enumerate(ranked_sizes):
    badfilename = '/Users/ajmendez/data/dilbert/stacks_bad/stack_{:03d}.fig.png'.format(j)
    if not os.path.exists(badfilename):
        continue
    # Closest stored size to the fixed target (w, h).
    d = np.square(widths - w) + np.square(heights - h)
    i = np.argmin(d)
    key = (heights[i], widths[i])
    boxes = params[key]
    shapes = simpleshapes[key]
    image = np.zeros(np.min(shapes, axis=0))
    for i, filename in enumerate(filenames):
        try:
            x = plt.imread(filename)
        except Exception as e:
            # Unreadable file: report it and try the next one.
            print(e)
            print(filename)
            continue
        if x.ndim in (3, 4):
            x = rgb2gray(x)
        basename = os.path.splitext(os.path.basename(filename))[0]
        dirname = os.path.dirname(filename).replace('images', 'panels2')
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        for k, box in enumerate(boxes):
            xmin, xmax, ymin, ymax = box
            panel_name = basename + '.{:02d}.png'.format(k)
            outfilename = os.path.join(dirname, panel_name)
            crop = x[ymin:ymax, xmin:xmax].astype(np.uint8)
            im = Image.fromarray(crop)
            im.thumbnail((128, 128))
            im.save(outfilename)
            # plt.imsave(outfilename, x[ymin:ymax, xmin:xmax],
            #            cmap=plt.cm.gray)
        break
    break
In [ ]:
# from pprint import pformat, pprint
In [ ]:
# pprint(params, width=1000)
In [ ]:
# params2 = {}
# for j,((height,width),filenames) in enumerate(sorted(simplesizes.items(), key=lambda x: -len(x[-1]))):
# params2[j] = params[(height,width)]
In [ ]:
# pprint(params2, width=1000)
In [ ]: