In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt

import mmap
from mmap import mmap as fileview, ACCESS_READ

from collections import defaultdict
import hashlib

from lib.mypywidgets import ProgressBar, ProgressBarF, Slider, SliderF

In [3]:
import comparedirs
# Single Comparison instance shared by the scan / match / classify cells below.
comparison = comparedirs.Comparison()

In [4]:
# Scan the G: drive. The exclusion list mixes bare names (VCS / IDE / venv
# folders) with full paths; how each entry is matched is decided inside
# comparedirs -- NOTE(review): confirm name-vs-path matching rules there.
comparison.scan(
    paths=("G:\\",),
    excludes=(
        ".svn",
        ".git",
        "G:\\SEM",
        "G:\\$RECYCLE.BIN",
        ".idea",
        ".venv",
        'Allo Allo',
    ),
)


[WinError 5] Access is denied: 'G:\\System Volume Information'

In [5]:
# Number of files recorded under each distinct file size.
file_counts = {size: len(files) for size, files in comparison.sizes.items()}
total_files = sum(file_counts.values())
total_data = sum(size * count for size, count in file_counts.items())

print("Files:", total_files, "Data:", total_data)

# min()/max() raise ValueError on an empty iterable, so an empty scan must not
# reach them unguarded; fall back to None when no sizes were recorded.
min_size = min(file_counts, default=None)
max_size = max(file_counts, default=None)

if total_files > 0:
    print("Min size:", min_size, "Max size:", max_size)
    print("Min files:", min(file_counts.values()), "Max files:", max(file_counts.values()))

# Map every distinct size to its rank in ascending order (smallest size -> 0).
sizes = {size: rank for rank, size in enumerate(sorted(file_counts))}


Files: 91517 Data: 262346753419
Min size: 1 Max size: 34159919104
Min files: 2 Max files: 2258

In [6]:
import time

# Progress widgets: data processed, files processed, position within the
# ranked size list, and cache hit ratio.
db = ProgressBarF("Data", total_data, 1024)
fb = ProgressBarF("Files", total_files)
sz = Slider("Size", len(sizes), min=0, default=0, readout=False)
ch = SliderF("Cache", 1.0, default=1.0, readout=False)

# Show the widgets in the same top-to-bottom order they were created.
for widget in (db, fb, sz, ch):
    widget.display()

# Work accumulated since the bars were last refreshed.
loaded = 0
nfiles = 0

# Timestamp of the most recent refresh attempt; 0.0 means "never".
last_tick = 0.0

# Refresh thresholds: 5% of the respective total, with a floor of 64 KiB of
# data and of one file.
size_step = max(64 * 1024, int(total_data * 0.05))
file_step = max(1, int(total_files * 0.05))

def update_bars(size, force=False):
    """Push accumulated progress to the widgets.

    The data/file bars and the size slider are refreshed when ``force`` is
    true, when either pending counter has reached its step threshold, or when
    ``size`` maps to a different slider position than the one currently shown.
    The pending counters ``loaded`` and ``nfiles`` are reset after a refresh.

    The cache gauge is refreshed whenever the (truncated) hit ratio changed.
    If no cache lookups have been recorded yet the ratio is undefined, so the
    gauge is left untouched instead of dividing by zero.

    Args:
        size: File size in bytes currently being processed; looked up in
            ``sizes`` to get the slider position (-1 when unknown).
        force: Refresh the bars regardless of the thresholds.
    """
    global loaded, nfiles
    size_num = sizes.get(size, -1)
    if force or nfiles >= file_step or loaded >= size_step or size_num != sz.data.value:
        db.increment(loaded, "KB")
        fb.increment(nfiles)
        loaded, nfiles = 0, 0
        sz.data.value = size_num
        sz.label.value = "{:,} byte{:s}" . format(size, "" if size == 1 else "s")

    lookups = comparison.cache_hit + comparison.cache_miss
    if lookups == 0:
        # Nothing has gone through the cache yet; keep the gauge as it is.
        return

    # Truncate (not round) to four decimals so tiny changes don't churn the widget.
    ratio = int(comparison.cache_hit / lookups * 10000) / 10000
    if ch.data.value != ratio:
        ch.data.value = ratio
        ch.label.value = "{:.2f}%".format(ratio * 100)
    
def update_files(file_info, amount_read):
    """Progress callback: account for a file and refresh the bars on a timer.

    A file is added to the pending ``loaded`` / ``nfiles`` counters only when
    ``amount_read`` is at least ``file_info.size``. Independently of that, the
    bars are refreshed at most once every 0.3 seconds, and the refresh is
    forced when the previous one is a full second or more in the past.
    """
    global last_tick, nfiles, loaded

    if amount_read >= file_info.size:
        nfiles += 1
        loaded += amount_read

    current = time.time()
    if last_tick > current - 0.3:
        # Refreshed too recently; skip this tick.
        return

    overdue = last_tick <= current - 1.0
    update_bars(file_info.size, overdue)
    last_tick = current

# Run the matching pass, handing update_files in as the progress callback.
# NOTE(review): reverse=True presumably processes sizes in descending order -- confirm in comparedirs.
comparison.match(filecallback=update_files, reverse=True)
# Forced final refresh so whatever is still pending in loaded/nfiles reaches the bars.
update_bars(0, True)



In [7]:
# Classification step; the cells below read comparison.folders afterwards.
# NOTE(review): presumably this is what populates comparison.folders -- confirm in comparedirs.
comparison.classify()

In [8]:
# Spot check: hash_matches for one file in one folder (displays a list of FileInfo objects).
comparison.folders['g:\\luggage\\home\\osmith\\virtualbox vms\\precise'].hash_matches['precise-disk1.vmdk']


Out[8]:
[<FileInfo('g:\luggage\home\osmith\virtualbox vms\precise\precise-disk1.vmdk')>,
 <FileInfo('g:\wispa\oliver\vmimages\virtualboxes\precise\precise-disk1.vmdk')>]

In [9]:
# Spot check: name_matches for the same folder (displays a mapping of file name -> set of folder paths).
comparison.folders['g:\\luggage\\home\\osmith\\virtualbox vms\\precise'].name_matches


Out[9]:
defaultdict(set,
            {'precise-disk1.vmdk': {'g:\\wispa\\oliver\\vmimages\\virtualboxes\\precise'},
             'precise.vbox': {'g:\\wispa\\oliver\\vmimages\\virtualboxes\\precise'},
             'precise.vbox-prev': {'g:\\wispa\\oliver\\vmimages\\virtualboxes\\precise'}})

TODO: Iterate over `comparison.folders` and build a table with the following columns:

folder : len(files) : num-files-with-matches : folders matched to

Then determine which folders have 100% bi-directional matches.


In [ ]: