In [1]:
from time import time
from itertools import permutations
from collections import defaultdict
import numpy as np
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline
Считаем simhash-и.
In [2]:
%%time
with open("simhash_sorted.txt") as f:
simhashes = [int(line[:-1]) for line in f.readlines()]
simhashes = np.array(simhashes, dtype=np.uint64) # found out before that simhash fits uint64
In [3]:
SIMHASH_SIZE = 64  # width of a simhash in bits (the hashes fit in uint64, see the load cell)
num_samples = len(simhashes)
# Python 2 print statements: dataset size and one sample hash in binary.
print "Number of samples:", num_samples
print "SimHash example:", format(simhashes[0], "b")
print "SimHash size:", SIMHASH_SIZE
Поделим simhash-и на 4 части для индексирования.
In [4]:
MAX_DISTANCE = 3              # maximum Hamming distance at which two hashes count as similar
NUM_PARTS = MAX_DISTANCE + 1  # pigeonhole: at distance <= 3, at least one of 4 parts matches exactly
# Floor division keeps PART_SIZE an int (identical result in Python 2;
# plain / would yield a float under Python 3 and break the mask strings below).
PART_SIZE = SIMHASH_SIZE // NUM_PARTS
In [5]:
# Bit masks selecting each of the NUM_PARTS consecutive PART_SIZE-bit slices
# of a simhash; part 0 is the most significant slice.
zeros_run = "0" * PART_SIZE
ones_run = "1" * PART_SIZE
mask_strings = []
for part_id in range(NUM_PARTS):
    mask_strings.append(zeros_run * part_id + ones_run + zeros_run * (NUM_PARTS - part_id - 1))
masks = np.array([int(bits, 2) for bits in mask_strings], dtype=np.uint64)

def get_part(simhash, part_id):
    """Return slice `part_id` of `simhash` as a plain int (0 = most significant)."""
    shift = PART_SIZE * (NUM_PARTS - part_id - 1)
    return int(simhash & masks[part_id]) >> shift
In [6]:
%%time
simhashes_parts = np.zeros((len(simhashes), NUM_PARTS), dtype=np.int32)
for simhash_id, simhash in enumerate(simhashes):
for part_id in xrange(NUM_PARTS):
simhashes_parts[simhash_id][part_id] = get_part(simhash, part_id)
Построим индексы.
In [7]:
%%time
indices = [[list() for __ in xrange(2 ** PART_SIZE)] for _ in xrange(NUM_PARTS)]
for simhash_id in xrange(num_samples):
simhash_parts = simhashes_parts[simhash_id]
for part_id in xrange(NUM_PARTS):
indices[part_id][simhash_parts[part_id]].append(simhash_id)
Прокластеризуем хеши.
In [38]:
def ones_positions(num_ones, size=SIMHASH_SIZE):
if num_ones == 0:
yield []
return
for position in range(size):
for positions in ones_positions(num_ones - 1, size):
yield [position] + positions
accepted_xors = set()
for num_ones in xrange(MAX_DISTANCE + 1):
for positions in ones_positions(num_ones):
xor = ["0"] * SIMHASH_SIZE
for pos in positions:
xor[pos] = "1"
accepted_xors.add(np.uint64(int("".join(xor), 2)))
print len(accepted_xors)
In [45]:
def similar(hash1, hash2):
    """True iff the two hashes differ in at most MAX_DISTANCE bits,
    checked against the precomputed set of accepted XOR patterns."""
    diff = hash1 ^ hash2
    return diff in accepted_xors
In [46]:
%%time
# Greedy single-pass clustering: for each not-yet-assigned simhash, absorb
# every unassigned hash within Hamming distance MAX_DISTANCE into one group.
# Candidates come from the inverted part indices: any hash at distance <= 3
# must share at least one of the 4 parts exactly (pigeonhole principle).
groups_sizes = []
assigned = [False] * num_samples  # indicators of simhashes assigned to any of the considered groups
num_assigned = 0
start = time()
for simhash_id, simhash in enumerate(simhashes):
    if assigned[simhash_id]:
        continue  # already absorbed by an earlier group
    group_size = 0
    simhash_parts = simhashes_parts[simhash_id]
    for part_id, part in enumerate(simhash_parts):
        # Every hash sharing this exact part value is a candidate member.
        for candidate_id in indices[part_id][part]:
            if assigned[candidate_id]:
                continue
            if similar(simhash, simhashes[candidate_id]):
                # The seed hash always matches itself here (XOR == 0 is in
                # accepted_xors), so group_size >= 1 and num_assigned > 0
                # by the time the progress report below divides by it.
                group_size += 1
                assigned[candidate_id] = True
                num_assigned += 1
    groups_sizes.append(group_size)
    if simhash_id % 3000 == 0:
        # Progress report with a linear extrapolation of remaining time.
        spent = time() - start
        clear_output()
        print "assigned: {}\tRemained time: {:.2f} days".format(
            num_assigned,(float(num_samples) / num_assigned - 1) * spent / 60 / 60 / 24)
Отработав 6 часов, скрипт обещал работать еще около 9 суток, что слишком долго. Поэтому проведём анализ уже полученных данных как репрезентативной выборки.
In [39]:
groups_sizes = np.array(groups_sizes)
In [54]:
# Group size in order of discovery — checks for a trend over time.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(groups_sizes)
ax.set_xlabel("Group ID")
ax.set_ylabel("Group size");
Как видим, с течением времени в размере групп не наблюдается какого-либо тренда.
In [55]:
# Distribution of group sizes with log-scaled counts.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(groups_sizes, bins=100, log=True)
ax.set_xlabel("Group size")
ax.set_ylabel("Number of groups");
На логарифмированной гистограмме распределения размеров групп видно, что тотальное преимущество за группами размера 1, и чем больше размер группы, тем меньше вероятность её встретить.