In [1]:
from time import time
from itertools import permutations
from collections import defaultdict

import numpy as np
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

Считаем simhash-и.


In [2]:
%%time
with open("simhash_sorted.txt") as f:
    simhashes = [int(line[:-1]) for line in f.readlines()]
simhashes = np.array(simhashes, dtype=np.uint64)  # found out before that simhash fits uint64


CPU times: user 18.5 s, sys: 1.04 s, total: 19.6 s
Wall time: 19.5 s

In [3]:
SIMHASH_SIZE = 64
num_samples = len(simhashes)
print "Number of samples:", num_samples
print "SimHash example:", format(simhashes[0], "b")
print "SimHash size:", SIMHASH_SIZE


Number of samples: 33015945
SimHash example: 1000101011000111001001000001000101110101110110111111101010100101
SimHash size: 64

Поделим simhash-и на 4 части для индексирования.


In [4]:
MAX_DISTANCE = 3
NUM_PARTS = MAX_DISTANCE + 1
PART_SIZE = SIMHASH_SIZE / NUM_PARTS

In [5]:
neg_part_mask = "0" * PART_SIZE
pos_part_mask = "1" * PART_SIZE
masks = [neg_part_mask * part_id + pos_part_mask + neg_part_mask * (NUM_PARTS - part_id - 1)\
         for part_id in range(NUM_PARTS)]
masks = np.array([int(mask, 2) for mask in masks], dtype=np.uint64)

def get_part(simhash, part_id):
    return int(simhash & masks[part_id]) >> (PART_SIZE * (NUM_PARTS - part_id - 1))

In [6]:
%%time
simhashes_parts = np.zeros((len(simhashes), NUM_PARTS), dtype=np.int32)
for simhash_id, simhash in enumerate(simhashes):
    for part_id in xrange(NUM_PARTS):
        simhashes_parts[simhash_id][part_id] = get_part(simhash, part_id)


CPU times: user 1min 45s, sys: 316 ms, total: 1min 45s
Wall time: 1min 45s

Построим индексы.


In [7]:
%%time
indices = [[list() for __ in xrange(2 ** PART_SIZE)] for _ in xrange(NUM_PARTS)]
for simhash_id in xrange(num_samples):
    simhash_parts = simhashes_parts[simhash_id]
    for part_id in xrange(NUM_PARTS):
        indices[part_id][simhash_parts[part_id]].append(simhash_id)


CPU times: user 1min 3s, sys: 500 ms, total: 1min 4s
Wall time: 1min 3s

Прокластеризуем хеши.


In [38]:
def ones_positions(num_ones, size=SIMHASH_SIZE):
    if num_ones == 0:
        yield []
        return
    for position in range(size):
        for positions in ones_positions(num_ones - 1, size):
            yield [position] + positions

accepted_xors = set()

for num_ones in xrange(MAX_DISTANCE + 1):
    for positions in ones_positions(num_ones):
        xor = ["0"] * SIMHASH_SIZE
        for pos in positions:
            xor[pos] = "1"
        accepted_xors.add(np.uint64(int("".join(xor), 2)))

print len(accepted_xors)


43745

In [45]:
def similar(hash1, hash2):
    return (hash1 ^ hash2) in accepted_xors

In [46]:
%%time
groups_sizes = []
assigned = [False] * num_samples  # indicators of simhashes assigned to any of the considered groups
num_assigned = 0

start = time()

for simhash_id, simhash in enumerate(simhashes):
    if assigned[simhash_id]:
        continue
    
    group_size = 0
    simhash_parts = simhashes_parts[simhash_id]
    for part_id, part in enumerate(simhash_parts):
        for candidate_id in indices[part_id][part]:
            if assigned[candidate_id]:
                continue
            if similar(simhash, simhashes[candidate_id]):
                group_size += 1
                assigned[candidate_id] = True #.add(candidate_id)
                num_assigned += 1
    
    groups_sizes.append(group_size)
    
    if simhash_id % 3000 == 0:
        spent = time() - start
        clear_output()
        print "assigned: {}\tRemained time: {:.2f} days".format(
            num_assigned,(float(num_samples) / num_assigned - 1) * spent / 60 / 60 / 24)


assigned: 626430	Remained time: 8.98 days
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-46-6944819795ad> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u'groups_sizes = []\nassigned = [False] * num_samples  # id-s of simhashes assigned to any of the considered groups\nnum_assigned = 0\n\nstart = time()\n\nfor simhash_id, simhash in enumerate(simhashes):\n    if assigned[simhash_id]:\n        continue\n    \n    group_size = 0\n    simhash_parts = simhashes_parts[simhash_id]\n    for part_id, part in enumerate(simhash_parts):\n        for candidate_id in indices[part_id][part]:\n            if assigned[candidate_id]:\n                continue\n            if similar(simhash, simhashes[candidate_id]):\n                group_size += 1\n                assigned[candidate_id] = True #.add(candidate_id)\n                num_assigned += 1\n    \n    groups_sizes.append(group_size)\n    \n    if simhash_id % 3000 == 0:\n        spent = time() - start\n        clear_output()\n        print "assigned: {}\\tRemained time: {:.2f} days".format(\n            num_assigned,(float(num_samples) / num_assigned - 1) * spent / 60 / 60 / 24)')

/home/ilivans/.virtualenvs/cmn/local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2113             magic_arg_s = self.var_expand(line, stack_depth)
   2114             with self.builtin_trap:
-> 2115                 result = fn(magic_arg_s, cell)
   2116             return result
   2117 

<decorator-gen-59> in time(self, line, cell, local_ns)

/home/ilivans/.virtualenvs/cmn/local/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

/home/ilivans/.virtualenvs/cmn/local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1183         else:
   1184             st = clock2()
-> 1185             exec(code, glob, local_ns)
   1186             end = clock2()
   1187             out = None

<timed exec> in <module>()

KeyboardInterrupt: 

Отработав 6 часов, скрипт обещал работать еще около 9 суток, что слишком долго. Поэтому проведём анализ уже полученных данных как репрезентативной выборки.


In [39]:
groups_sizes = np.array(groups_sizes)

In [54]:
plt.figure(figsize=(12,8))
plt.plot(groups_sizes);
plt.xlabel("Group ID")
plt.ylabel("Group size");


Как видим, с течением времени в размере групп не наблюдается какого-либо тренда.


In [55]:
plt.figure(figsize=(12,8))
plt.hist(groups_sizes, bins=100, log=True)
plt.xlabel("Group size")
plt.ylabel("Number of groups");


На логарифмированной гистограмме распределения размеров групп видно, что тотальное преимущество за группами размера 1 и чем больше размер группы, тем меньше вероятность её встретить.