In [1]:
import cv2
import numpy
import glob
import os
import numpy
import collections
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
In [3]:
import skimage.measure
import skimage.color
In [4]:
# Sample captchas (captcha1.png .. captcha6.png) used by the demonstration cells, in sorted order.
files = sorted(glob.glob('images/captcha[1-6].png'))
In [5]:
# Output file-name suffix for each OpenCV threshold type.
sufixes = {
    cv2.THRESH_BINARY: 'binary',
    cv2.THRESH_BINARY_INV: 'binary_inv',
    cv2.THRESH_TRUNC: 'trunc',
    cv2.THRESH_TOZERO: 'tozero',
    cv2.THRESH_TOZERO_INV: 'tozero_inv'
}
# Read the image; cv2.imread returns the colour channels in BGR order.
image = cv2.imread('images/captcha1.png')
# Convert it from colour to grayscale (BGR2GRAY matches imread's channel order).
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Apply every thresholding type and save each result.
for thresh in [cv2.THRESH_BINARY, cv2.THRESH_BINARY_INV, cv2.THRESH_TRUNC, cv2.THRESH_TOZERO, cv2.THRESH_TOZERO_INV]:
    # The second parameter, 127, is ignored: THRESH_OTSU computes the threshold.
    (limiar, bw) = cv2.threshold(gray, 127, 255, thresh | cv2.THRESH_OTSU)
    cv2.imwrite('bw_captcha_' + sufixes[thresh] + '.png', bw)
    # Single-argument print(...) prints the same text on Python 2 and Python 3.
    print(limiar)
In [6]:
# Grey-level histogram of one captcha, with max - min + 1 equal-width bins.
img = cv2.imread('images/captcha6.png')
# cv2.imread returns BGR data, so BGR2GRAY is the matching conversion.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Do the bin-count arithmetic on Python ints: the extremes are uint8 scalars and
# 255 + 1 would not fit in uint8.
hist, bins = numpy.histogram(gray, int(gray.max()) - int(gray.min()) + 1)
In [7]:
# Plot the grey-level histogram computed in the previous cell.
# plt.figaspect only *returns* a (width, height) pair; it has to be handed to
# plt.figure for the 0.5 height/width ratio to take effect.
plt.figure(figsize=plt.figaspect(0.5))
plt.bar(bins[:-1], hist, width=1)
plt.xlim(min(bins), max(bins))
plt.show()
In [8]:
def binarize(file):
    """Read an image from disk and binarize it with Otsu's threshold.

    Parameters
    ----------
    file : str
        Path of the image to read.

    Returns
    -------
    numpy.ndarray
        Single-channel image produced by ``cv2.threshold`` with
        ``THRESH_BINARY`` and a maximum value of 255.

    Raises
    ------
    IOError
        If OpenCV cannot read ``file``.
    """
    image = cv2.imread(file)
    if image is None:
        # cv2.imread reports an unreadable file by returning None.
        raise IOError('could not read image: %s' % file)
    # cv2.imread returns the channels in BGR order, so convert with BGR2GRAY.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU the threshold is computed from the image; 127 is ignored.
    (_, bw) = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return bw
In [9]:
# Binarize every sample captcha and write it to the working directory,
# with 'captcha' replaced by 'bw_captcha' in the file name.
for path in files:
    out_name = os.path.basename(path).replace('captcha', 'bw_captcha')
    cv2.imwrite(out_name, binarize(path))
In [10]:
# Binarize the whole captcha first and only then keep columns 30..179;
# the result is saved with a '_crop' suffix.
for path in files:
    cropped = binarize(path)[:, 30:180]
    out_name = os.path.basename(path).replace('captcha', 'bw_captcha')
    out_name = out_name.replace('.png', '_crop.png')
    cv2.imwrite(out_name, cropped)
In [11]:
def cut_and_binarize(file, left=30, right=180):
    """Read an image, crop a column range and binarize the crop with Otsu.

    The crop is applied to the grayscale image *before* thresholding, so the
    Otsu threshold is computed from the cropped region only.

    Parameters
    ----------
    file : str
        Path of the image to read.
    left, right : int, optional
        Column range ``[left, right)`` to keep. The defaults (30, 180) are the
        values the notebook originally hard-coded.

    Returns
    -------
    numpy.ndarray
        Single-channel image produced by ``cv2.threshold`` with
        ``THRESH_BINARY`` and a maximum value of 255.

    Raises
    ------
    IOError
        If OpenCV cannot read ``file``.
    """
    image = cv2.imread(file)
    if image is None:
        # cv2.imread reports an unreadable file by returning None.
        raise IOError('could not read image: %s' % file)
    # cv2.imread returns the channels in BGR order, so convert with BGR2GRAY.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = gray[:, left:right]
    # With THRESH_OTSU the threshold is computed from the image; 127 is ignored.
    (_, bw) = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return bw
In [12]:
# Crop first and binarize afterwards (cut_and_binarize); saved with a '_crop2' suffix.
for path in files:
    out_name = os.path.basename(path).replace('captcha', 'bw_captcha')
    out_name = out_name.replace('.png', '_crop2.png')
    cv2.imwrite(out_name, cut_and_binarize(path))
In [13]:
def labelize(file):
    """Colour the connected components of a captcha and box the large ones.

    Parameters
    ----------
    file : str
        Path of the captcha image; it is cropped and binarized with
        ``cut_and_binarize``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` colour image built by ``skimage.color.label2rgb`` (white
        background), with a 1-pixel rectangle drawn around every component
        that has more than 25 pixels.
    """
    bw = cut_and_binarize(file)
    labels = skimage.measure.label(bw, background=0, connectivity=2)
    # Read the component ids from the foreground pixels themselves instead of
    # assuming range(total): depending on the scikit-image version the
    # background is labelled 0 or -1, and range(total) would then pick up the
    # background and/or miss the last component.
    images = []
    for i in numpy.unique(labels[bw != 0]):
        mask = labels == i
        # Components with 25 pixels or fewer are treated as noise.
        if numpy.count_nonzero(mask) > 25:
            images.append(numpy.uint8(mask) * 255)
    img = skimage.color.label2rgb(labels, bg_color=[1, 1, 1])
    for label in images:
        # [-2] selects the contour list whether findContours returns
        # (contours, hierarchy) or (image, contours, hierarchy).
        contours = cv2.findContours(label.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
        (x, y, w, h) = cv2.boundingRect(contours[0])
        # img is a float image at this point, hence the colour (1, 0, 0).
        cv2.rectangle(img, (x, y), (x + w, y + h), (1, 0, 0), 1)
    img2 = img * 255.0
    return img2.astype(numpy.uint8)
In [14]:
# Save the labelled/boxed view of every sample captcha with a '_label' suffix.
for path in files:
    out_name = os.path.basename(path).replace('captcha', 'bw_captcha')
    out_name = out_name.replace('.png', '_label.png')
    cv2.imwrite(out_name, labelize(path))
In [15]:
def cleanup(file):
    """Remove small components from a captcha and box the remaining ones.

    Parameters
    ----------
    file : str
        Path of the captcha image; it is cropped and binarized with
        ``cut_and_binarize``.

    Returns
    -------
    numpy.ndarray
        3-channel ``uint8`` image containing only the components with more
        than 25 pixels (drawn in white), each surrounded by a 1-pixel
        rectangle of colour ``(255, 0, 0)``.
    """
    bw = cut_and_binarize(file)
    labels = skimage.measure.label(bw, background=0, connectivity=2)
    # Read the component ids from the foreground pixels themselves instead of
    # assuming range(total): depending on the scikit-image version the
    # background is labelled 0 or -1, and range(total) would then pick up the
    # background and/or miss the last component.
    images = []
    for i in numpy.unique(labels[bw != 0]):
        mask = labels == i
        # Components with 25 pixels or fewer are treated as noise.
        if numpy.count_nonzero(mask) > 25:
            images.append(numpy.uint8(mask) * 255)
    cleaned = numpy.zeros(bw.shape, numpy.uint8)
    # Merge all accepted components into one image.
    for label in images:
        cleaned = numpy.maximum(cleaned, label)
    cleaned = cv2.cvtColor(cleaned, cv2.COLOR_GRAY2RGB)
    # Draw the bounding rectangle of each component.
    for label in images:
        # [-2] selects the contour list whether findContours returns
        # (contours, hierarchy) or (image, contours, hierarchy).
        contours = cv2.findContours(label.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
        (x, y, w, h) = cv2.boundingRect(contours[0])
        cv2.rectangle(cleaned, (x, y), (x + w, y + h), (255, 0, 0), 1)
    return cleaned
In [16]:
# Save the noise-free, boxed view of every sample captcha with a '_cleaned' suffix.
for path in files:
    out_name = os.path.basename(path).replace('captcha', 'bw_captcha')
    out_name = out_name.replace('.png', '_cleaned.png')
    cv2.imwrite(out_name, cleanup(path))
In [17]:
def simple_cleanup(file):
    """Extract the large connected components of a captcha with their metrics.

    Parameters
    ----------
    file : str
        Path of the captcha image; it is cropped and binarized with
        ``cut_and_binarize``.

    Returns
    -------
    list of tuple
        One ``(x, y, w, h, area, surface, label)`` tuple per component with
        more than 25 pixels, where ``x, y, w, h`` is the bounding rectangle,
        ``area`` is ``w * h``, ``surface`` is the number of pixels in the
        component and ``label`` is the component mask (values 0/255) cropped
        to the bounding rectangle.
    """
    bw = cut_and_binarize(file)
    labels = skimage.measure.label(bw, background=0, connectivity=2)
    dimensions = []
    # Read the component ids from the foreground pixels themselves instead of
    # assuming range(total): depending on the scikit-image version the
    # background is labelled 0 or -1, and range(total) would then pick up the
    # background and/or miss the last component.
    for i in numpy.unique(labels[bw != 0]):
        mask = labels == i
        # Counting pixels directly keeps the surface an exact Python int
        # (sum / 255 becomes a float under true division).
        surface = int(numpy.count_nonzero(mask))
        if surface <= 25:
            # Components with 25 pixels or fewer are treated as noise.
            continue
        label = numpy.uint8(mask) * 255
        # [-2] selects the contour list whether findContours returns
        # (contours, hierarchy) or (image, contours, hierarchy).
        contours = cv2.findContours(label.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
        (x, y, w, h) = cv2.boundingRect(contours[0])
        area = w * h
        label = label[y:y + h, x:x + w]
        dimensions.append((x, y, w, h, area, surface, label))
    return dimensions
In [18]:
# Extract the candidate glyphs of every captcha, keep their metrics in `stats`
# and dump each crop to crop/NNNN.png.
if not os.path.isdir('crop'):
    # cv2.imwrite does not create directories and fails silently without them.
    os.makedirs('crop')
i = 1
stats = []
# Sorted so that the crop numbering is the same on every run and file system.
for file in sorted(glob.glob('images/*.png')):
    _stats = simple_cleanup(file)
    stats.extend(_stats)
    for crop in _stats:
        # crop[6] is the component mask cropped to its bounding rectangle.
        cv2.imwrite('crop/%04d.png' % (i), crop[6])
        i = i + 1
In [19]:
def histogram(points):
    """Show a bar histogram of ``points`` using ``max - min + 1`` equal-width bins.

    Parameters
    ----------
    points : numpy.ndarray
        Non-empty array of values to plot.
    """
    # numpy.histogram only accepts an integer bin count; casting also keeps the
    # "+ 1" out of narrow numpy integer types, where it could overflow.
    bin_count = int(points.max() - points.min()) + 1
    hist, bins = numpy.histogram(points, bin_count)
    # bins holds the bin edges; each bar starts at its left edge.
    plt.bar(bins[:-1], hist, width=1)
    plt.xlim(min(bins), max(bins))
    plt.show()
In [20]:
# Split the per-component tuples in `stats` into one array per measurement:
#   0 -> xs        initial x of the bounding box
#   1 -> ys        initial y of the bounding box
#   2 -> widths    bounding-box widths
#   3 -> heights   bounding-box heights
#   4 -> areas     bounding-box areas
#   5 -> surfaces  number of pixels of each letter
xs, ys, widths, heights, areas, surfaces = [
    numpy.array([row[column] for row in stats]) for column in range(6)
]
In [21]:
# Distribution of the initial x coordinates, then (median, mean, std, min, max, count).
histogram(xs)
numpy.median(xs), xs.mean(), xs.std(), xs.min(), xs.max(), xs.size
Out[21]:
In [22]:
# Distribution of the initial y coordinates, then (median, mean, std, min, max, count).
histogram(ys)
numpy.median(ys), ys.mean(), ys.std(), ys.min(), ys.max(), ys.size
Out[22]:
In [23]:
# Distribution of the bounding-box widths, then (median, mean, std, min, max, count).
histogram(widths)
numpy.median(widths), widths.mean(), widths.std(), widths.min(), widths.max(), widths.size
Out[23]:
In [24]:
# Distribution of the bounding-box heights, then (median, mean, std, min, max, count).
histogram(heights)
numpy.median(heights), heights.mean(), heights.std(), heights.min(), heights.max(), heights.size
Out[24]:
In [25]:
# Distribution of the bounding-box areas, then (median, mean, std, min, max, count).
histogram(areas)
numpy.median(areas), areas.mean(), areas.std(), areas.min(), areas.max(), areas.size
Out[25]:
In [26]:
# Distribution of the per-letter pixel counts, then (median, mean, std, min, max, count).
histogram(surfaces)
numpy.median(surfaces), surfaces.mean(), surfaces.std(), surfaces.min(), surfaces.max(), surfaces.size
Out[26]:
In [27]:
def compute_outliers(data):
    """Compute the quartiles, the IQR and the 1.5 * IQR outlier bounds of ``data``.

    Parameters
    ----------
    data : array-like
        Non-empty collection of numbers (a list or a numpy array).

    Returns
    -------
    tuple
        ``(q1, q2, q3, iqr, lower_bound, upper_bound)``. The bounds are
        ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr``, clamped so that they never
        fall outside the observed minimum and maximum.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    # Accept plain sequences too; .min()/.max() below need an ndarray.
    data = numpy.asarray(data)
    if data.size == 0:
        raise ValueError('compute_outliers() requires at least one value')
    q1, q2, q3 = numpy.percentile(data, [25, 50, 75])
    iqr = q3 - q1
    lower_bound = max(data.min(), q1 - (1.5 * iqr))
    upper_bound = min(data.max(), q3 + (1.5 * iqr))
    return (q1, q2, q3, iqr, lower_bound, upper_bound)
In [28]:
# Quartiles, IQR and outlier bounds of the initial x coordinates.
compute_outliers(xs)
Out[28]:
In [29]:
# Quartiles, IQR and outlier bounds of the initial y coordinates.
compute_outliers(ys)
Out[29]:
In [30]:
# Quartiles, IQR and outlier bounds of the bounding-box widths.
compute_outliers(widths)
Out[30]:
In [31]:
# Quartiles, IQR and outlier bounds of the bounding-box heights.
compute_outliers(heights)
Out[31]:
In [32]:
# Quartiles, IQR and outlier bounds of the bounding-box areas.
compute_outliers(areas)
Out[32]:
In [33]:
# Quartiles, IQR and outlier bounds of the per-letter pixel counts.
compute_outliers(surfaces)
Out[33]:
In [34]:
# Horizontal box plot of the initial x coordinates; assigning to _ hides the returned artists.
_ = plt.boxplot(xs, vert=0)
In [35]:
# Horizontal box plot of the initial y coordinates; assigning to _ hides the returned artists.
_ = plt.boxplot(ys, vert=0)
In [36]:
# Horizontal box plot of the bounding-box widths; assigning to _ hides the returned artists.
_ = plt.boxplot(widths, vert=0)
In [37]:
# Horizontal box plot of the bounding-box heights; assigning to _ hides the returned artists.
_ = plt.boxplot(heights, vert=0)
In [38]:
# Horizontal box plot of the bounding-box areas; assigning to _ hides the returned artists.
_ = plt.boxplot(areas, vert=0)
In [39]:
# Horizontal box plot of the per-letter pixel counts; assigning to _ hides the returned artists.
_ = plt.boxplot(surfaces, vert=0)
In [40]:
# Keep only the components whose measurements fall inside the chosen limits:
# y < 20, 6 < width < 46, height > 9, bounding-box area < 1785, pixel count < 727.
filtered = [
    s for s in stats
    if s[1] < 20 and 6 < s[2] < 46 and s[3] > 9 and s[4] < 1785 and s[5] < 727
]
# Rebuild the per-measurement arrays from the filtered components:
#   0 -> xs        initial x of the bounding box
#   1 -> ys        initial y of the bounding box
#   2 -> widths    bounding-box widths
#   3 -> heights   bounding-box heights
#   4 -> areas     bounding-box areas
#   5 -> surfaces  number of pixels of each letter
xs, ys, widths, heights, areas, surfaces = [
    numpy.array([row[column] for row in filtered]) for column in range(6)
]
In [41]:
# Same view as before, now for the filtered components: initial x coordinates.
histogram(xs)
numpy.median(xs), xs.mean(), xs.std(), xs.min(), xs.max(), xs.size
Out[41]:
In [42]:
# Same view as before, now for the filtered components: initial y coordinates.
histogram(ys)
numpy.median(ys), ys.mean(), ys.std(), ys.min(), ys.max(), ys.size
Out[42]:
In [43]:
# Same view as before, now for the filtered components: bounding-box widths.
histogram(widths)
numpy.median(widths), widths.mean(), widths.std(), widths.min(), widths.max(), widths.size
Out[43]:
In [44]:
# Same view as before, now for the filtered components: bounding-box heights.
histogram(heights)
numpy.median(heights), heights.mean(), heights.std(), heights.min(), heights.max(), heights.size
Out[44]:
In [45]:
# Same view as before, now for the filtered components: bounding-box areas.
histogram(areas)
numpy.median(areas), areas.mean(), areas.std(), areas.min(), areas.max(), areas.size
Out[45]:
In [46]:
# Same view as before, now for the filtered components: per-letter pixel counts.
histogram(surfaces)
numpy.median(surfaces), surfaces.mean(), surfaces.std(), surfaces.min(), surfaces.max(), surfaces.size
Out[46]:
In [47]:
def normalize(image, size=64):
    """Scale a glyph to fit a square box and centre it on a black background.

    The aspect ratio is preserved: the longer side is scaled to ``size`` and
    the shorter side is padded with zeros.

    Parameters
    ----------
    image : numpy.ndarray
        2-D (single-channel) image.
    size : int, optional
        Side of the output box; defaults to 64, the value the notebook
        originally hard-coded.

    Returns
    -------
    numpy.ndarray
        ``(size, size)`` ``uint8`` array with the resized glyph centred in it.
    """
    (h, w) = image.shape
    fx = w / float(size)
    fy = h / float(size)
    f = max(fx, fy)
    # Never let a very thin glyph collapse to 0 pixels: cv2.resize rejects it.
    w_ = max(1, int(w / f))
    h_ = max(1, int(h / f))
    resized = cv2.resize(image, (w_, h_))
    box = numpy.zeros((size, size), dtype=numpy.uint8)
    # Floor division keeps the offsets integers on Python 3 as well; float
    # offsets cannot be used as slice bounds.
    x0 = (size - w_) // 2
    y0 = (size - h_) // 2
    box[y0:y0 + h_, x0:x0 + w_] = resized
    return box
In [48]:
# Pair every filtered glyph's normalized image with its original crop (s[6]).
normalized = [(normalize(s[6]), s[6]) for s in filtered]
In [49]:
# Order the pairs by the pixel sum of the normalized image, ascending.
normalized = sorted(normalized, key=lambda x: x[0].sum())
In [50]:
def mse(image, image2):
    """Mean squared error between two images, with pixel values scaled by 1/255.

    Parameters
    ----------
    image, image2 : numpy.ndarray
        Images to compare; the error is averaged over
        ``image.shape[0] * image.shape[1]`` pixels.

    Returns
    -------
    float
        0.0 for identical images; 1.0 when every pixel differs by 255.
    """
    # numpy.float was only an alias of the builtin float and no longer exists
    # in current NumPy releases; float64 is the same dtype.
    diff = (image.astype(numpy.float64) - image2.astype(numpy.float64)) / 255.0
    error = numpy.sum(diff ** 2)
    error /= float(image.shape[0] * image.shape[1])
    return error
In [51]:
def group(limiar, normalized):
    """Greedily cluster images whose MSE to a seed image is below ``limiar``.

    Images are visited in order. Each image not yet assigned becomes a seed
    and collects every later, still unassigned image whose ``mse`` to the seed
    is below ``limiar``.

    Parameters
    ----------
    limiar : float
        MSE threshold (exclusive).
    normalized : list
        Sequence of pairs whose first element is the image to compare.

    Returns
    -------
    collections.defaultdict
        Maps each seed index to the list of indexes it collected. The seed
        itself is not in that list, except when it collected nothing, in
        which case the list is ``[seed]``.
    """
    members = collections.defaultdict(list)
    assigned = set()
    total = len(normalized)
    for seed in range(total):
        if seed in assigned:
            continue
        seed_image = normalized[seed][0]
        close = [
            other for other in range(seed + 1, total)
            if other not in assigned and mse(seed_image, normalized[other][0]) < limiar
        ]
        assigned.update(close)
        # A seed without neighbours is recorded as a group holding only itself.
        members[seed].extend(close if close else [seed])
    return members
In [52]:
# Cluster the normalized glyphs with an MSE threshold of 0.05.
groups = group(0.05, normalized)
In [53]:
# Cluster the normalized glyphs with an MSE threshold of 0.06.
groups6 = group(0.06, normalized)
In [54]:
# Cluster the normalized glyphs with an MSE threshold of 0.07.
groups7 = group(0.07, normalized)
In [55]:
# Cluster the normalized glyphs with an MSE threshold of 0.08.
groups8 = group(0.08, normalized)
In [56]:
# Cluster the normalized glyphs with an MSE threshold of 0.09.
groups9 = group(0.09, normalized)
In [57]:
# Cluster the normalized glyphs with an MSE threshold of 0.1.
groups10 = group(0.1, normalized)
In [58]:
# Cluster the normalized glyphs with an MSE threshold of 0.15.
groups15 = group(0.15, normalized)
In [59]:
def save(directory, groups, normalized):
    """Write every group with at least three entries to its own sub-directory.

    For a group keyed by index ``i`` the sub-directory ``<directory>/<i>``
    (zero-padded to four digits) receives the normalized and original images
    of ``i`` and, for each member ``j``, the normalized and original images
    of ``j`` with the MSE distance to ``i`` embedded in the file name.

    Parameters
    ----------
    directory : str
        Root output directory; sub-directories are created when missing.
    groups : dict
        Mapping from a group index to the list of member indexes.
    normalized : list
        Sequence of ``(normalized_image, original_image)`` pairs.
    """
    for leader, members in groups.items():
        if len(members) < 3:
            continue
        subdirectory = os.path.join(directory, '%04d' % (leader))
        if not os.path.isdir(subdirectory):
            os.makedirs(subdirectory)
        leader_image = normalized[leader][0]
        cv2.imwrite(os.path.join(subdirectory, '%04d.png' % (leader)), leader_image)
        cv2.imwrite(os.path.join(subdirectory, '%04d_original.png' % (leader)), normalized[leader][1])
        for j in members:
            dist = mse(leader_image, normalized[j][0])
            # Shared file-name stem: member index plus its distance to the leader.
            stem = '%04d_%0.3f' % (j, dist)
            cv2.imwrite(os.path.join(subdirectory, stem + '.png'), normalized[j][0])
            cv2.imwrite(os.path.join(subdirectory, stem + '_original.png'), normalized[j][1])
In [60]:
# Write the groups found with the 0.1 threshold under the 'groups10' directory.
save('groups10', groups10, normalized)
In [61]:
def load_templates(directory):
    """Load the template images stored as ``<directory>/<letter>/*_original.png``.

    Parameters
    ----------
    directory : str
        Root directory holding one sub-directory per letter.

    Returns
    -------
    collections.defaultdict
        Maps each sub-directory name (the letter) to a list of grayscale
        template images.
    """
    pattern = os.path.join(directory, '*', '*_original.png')
    templates = collections.defaultdict(list)
    # Sorted so that the template order does not depend on the file system.
    for image in sorted(glob.glob(pattern)):
        # Skip files with '0.' in their *name* (presumably the per-member
        # files whose name embeds a distance - verify against the template
        # layout). Testing the base name keeps a '0.' elsewhere in the path
        # from discarding every template.
        if '0.' in os.path.basename(image):
            continue
        letter = os.path.basename(os.path.dirname(image))
        # cv2.imread returns a 3-channel BGR image, so the template has to be
        # converted to grayscale.
        template = cv2.imread(image)
        if template is None:
            # Unreadable file: cv2.imread returned None, nothing to add.
            continue
        template = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
        templates[letter].append(template)
    return templates
In [62]:
# Letter templates, keyed by the name of their sub-directory under 'templates/'.
templates = load_templates('templates/')
In [63]:
def search_for_letter(image, letter, templates, method):
    """Find the best match of any template of one letter inside ``image``.

    Parameters
    ----------
    image : numpy.ndarray
        Image to search in.
    letter : str
        Letter the templates represent; copied into the result.
    templates : list
        Template images for that letter.
    method : int
        ``cv2.matchTemplate`` comparison method. For ``TM_SQDIFF`` and
        ``TM_SQDIFF_NORMED`` the lowest score wins; for every other method the
        highest score wins and only scores above 0.0 are accepted.

    Returns
    -------
    dict or None
        ``{'error': score, 'location': (x, y), 'letter': letter}`` for the
        best match, or ``None`` when no template produced an acceptable score.
    """
    lower_is_better = method in [cv2.TM_SQDIFF, cv2.TM_SQDIFF_NORMED]
    # An infinite sentinel cannot be undercut by a large TM_SQDIFF score the
    # way a fixed 2 ** 32 could.
    best = float('inf') if lower_is_better else 0.0
    pos = None
    for template in templates:
        if template.shape[0] > image.shape[0] or template.shape[1] > image.shape[1]:
            # matchTemplate cannot slide a template that is larger than the image.
            continue
        match = cv2.matchTemplate(image, template, method)
        minVal, maxVal, minLoc, maxLoc = cv2.minMaxLoc(match)
        if lower_is_better:
            score, location, improved = minVal, minLoc, minVal < best
        else:
            score, location, improved = maxVal, maxLoc, maxVal > best
        if improved:
            best = score
            pos = {
                'error': score,
                'location': location,
                'letter': letter
            }
    return pos
def search(file, templates, method):
    """Return the four best letter matches of a captcha, ordered left to right.

    Parameters
    ----------
    file : str
        Path of the captcha image; it is cropped and binarized with
        ``cut_and_binarize``.
    templates : dict
        Mapping from letter to its list of template images.
    method : int
        ``cv2.matchTemplate`` comparison method.

    Returns
    -------
    list of dict
        At most four results of ``search_for_letter`` (one candidate per
        letter), chosen by score and sorted by the x coordinate of the match.
    """
    image = cut_and_binarize(file)
    candidates = [
        search_for_letter(image, letter, templates[letter], method)
        for letter in templates
    ]
    matches = [candidate for candidate in candidates if candidate is not None]
    # The squared-difference methods rank ascending; all others descending.
    higher_is_better = method not in [cv2.TM_SQDIFF, cv2.TM_SQDIFF_NORMED]
    matches.sort(key=lambda x: x['error'], reverse=higher_is_better)
    return sorted(matches[:4], key=lambda x: x['location'][0])
In [64]:
def validation(directory, templates, method):
    """Break every captcha in ``directory`` and print the hit rates.

    The expected answer is the file name without ``.png``, upper-cased. A
    predicted letter counts as correct when it occurs anywhere in the
    expected answer (position is not checked); a captcha counts as found when
    four predicted letters are correct.

    Parameters
    ----------
    directory : str
        Directory holding the ``*.png`` captchas.
    templates : dict
        Mapping from letter to its list of template images.
    method : int
        ``cv2.matchTemplate`` comparison method.
    """
    captchas = glob.glob(os.path.join(directory, '*.png'))
    if not captchas:
        # Nothing to validate; the percentages below would divide by zero.
        print("No captchas found in %s" % directory)
        return
    corrects = 0
    found = 0
    for file in captchas:
        # Try to break the captcha.
        matches = search(file, templates, method)
        letters = [match['letter'] for match in matches]
        # Check whether the result matches the file name.
        filename = os.path.basename(file)
        captcha = filename.replace('.png', '').upper()
        correct = sum(1 for letter in letters if letter in captcha)
        if correct == 4:
            found += 1
        corrects += correct
    # One pre-formatted string per print(...) gives the same output on
    # Python 2 and Python 3.
    print("Letters: %s from %s ( %s %%)" % (
        corrects, 4 * len(captchas), corrects / (0.04 * len(captchas))))
    print("Captchas: %s from %s ( %s %%)" % (
        found, len(captchas), found / (0.01 * len(captchas))))
In [65]:
# Validate the templates against the 'teste/' captchas with every matching method.
for method in [cv2.TM_SQDIFF, cv2.TM_SQDIFF_NORMED, cv2.TM_CCORR, cv2.TM_CCORR_NORMED, cv2.TM_CCOEFF, cv2.TM_CCOEFF_NORMED]:
    # Single-argument print(...) prints the same text on Python 2 and Python 3.
    print(method)
    validation('teste/', templates, method)
In [ ]: