In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import mmap
from mmap import mmap as fileview, ACCESS_READ
from collections import defaultdict
import hashlib
from lib.mypywidgets import ProgressBar, ProgressBarF, Slider, SliderF
In [3]:
# Load the project's directory-comparison module and create the Comparison
# object that the following cells scan, match, classify and inspect.
import comparedirs
comparison = comparedirs.Comparison()
In [4]:
# Roots to walk and entries to leave out of the scan.  The excludes mix bare
# names and full paths -- presumably comparedirs matches either form; verify
# against Comparison.scan.
scan_roots = ("G:\\",)
scan_excludes = (
    ".svn",
    ".git",
    "G:\\SEM",
    "G:\\$RECYCLE.BIN",
    ".idea",
    ".venv",
    'Allo Allo',
)
comparison.scan(paths=scan_roots, excludes=scan_excludes)
In [5]:
# Summarise the scan: number of files recorded under each distinct size,
# plus the overall file count and byte total.
file_counts = {size: len(files) for size, files in comparison.sizes.items()}
total_files = sum(file_counts.values())
total_data = sum(size * count for size, count in file_counts.items())
print("Files:", total_files, "Data:", total_data)

# default=None keeps an empty scan from raising ValueError here; the original
# called min()/max() on the (possibly empty) key view before checking whether
# anything had been found.
min_size = min(file_counts, default=None)
max_size = max(file_counts, default=None)
if total_files > 0:
    print("Min size:", min_size, "Max size:", max_size)
    print("Min files:", min(file_counts.values()), "Max files:", max(file_counts.values()))

# Map each distinct size to its rank in ascending order (0 = smallest size).
# Later cells use this rank as the position of the "Size" slider.
sizes = {size: rank for rank, size in enumerate(sorted(comparison.sizes.keys()))}
In [6]:
import time

# Progress widgets, created and shown in this order: data volume, file count,
# current size rank, cache hit ratio.
# NOTE(review): the third ProgressBarF argument (1024) looks like a unit
# divisor matching the "KB" label passed to increment() -- confirm in
# lib.mypywidgets.
db = ProgressBarF("Data", total_data, 1024)
fb = ProgressBarF("Files", total_files)
sz = Slider("Size", len(sizes), min=0, default=0, readout=False)
ch = SliderF("Cache", 1.0, default=1.0, readout=False)
for widget in (db, fb, sz, ch):
    widget.display()

# Work accumulated since the widgets were last refreshed, and the timestamp
# of the last refresh attempt (0.0 = never).
loaded = 0
nfiles = 0
last_tick = 0.0

# Refresh thresholds: 5% of the total, but never less than 64 KiB of data
# or one file.
size_step = max(64 * 1024, int(total_data * 0.05))
file_step = max(1, int(total_files * 0.05))
def update_bars(size, force=False):
    """Flush the pending counters into the progress widgets.

    The widgets are refreshed only when ``force`` is true, when the pending
    file count or byte count has reached ``file_step`` / ``size_step``, or
    when ``size`` maps to a different rank than the "Size" slider shows.

    Args:
        size: File size in bytes currently being processed; looked up in the
            module-level ``sizes`` rank table (-1 when absent).
        force: Refresh even if no threshold has been reached.

    Side effects:
        Resets the module-level ``loaded`` and ``nfiles`` counters to zero
        after handing them to the data and file bars, and updates the size
        and cache sliders and their labels.
    """
    global loaded, nfiles
    size_num = sizes.get(size, -1)
    refresh = force or nfiles >= file_step or loaded >= size_step or size_num != sz.data.value
    if not refresh:
        return

    db.increment(loaded, "KB")
    fb.increment(nfiles)
    loaded, nfiles = 0, 0

    sz.data.value = size_num
    sz.label.value = "{:,} byte{:s}".format(size, "" if size == 1 else "s")

    # No cache lookups recorded yet (e.g. nothing was matched): there is no
    # ratio to show, and dividing would raise ZeroDivisionError.
    lookups = comparison.cache_hit + comparison.cache_miss
    if lookups == 0:
        return

    # Truncate to four decimal places so the slider is only touched when the
    # displayed percentage (two decimals) can actually change.
    ratio = int(comparison.cache_hit / lookups * 10000) / 10000
    if ch.data.value != ratio:
        ch.data.value = ratio
        ch.label.value = "{:.2f}%".format(ratio * 100)
def update_files(file_info, amount_read):
    """Per-file progress callback handed to ``comparison.match``.

    Adds a file to the pending counters once ``amount_read`` has reached
    ``file_info.size``, then asks ``update_bars`` for a refresh at most once
    every 0.3 seconds; the refresh is forced when at least a full second has
    passed since the previous one.

    Args:
        file_info: Object exposing a ``size`` attribute (bytes).
        amount_read: Bytes read so far for this file.
    """
    global last_tick, nfiles, loaded
    file_done = amount_read >= file_info.size
    if file_done:
        loaded += amount_read
        nfiles += 1

    stamp = time.time()
    if last_tick > stamp - 0.3:
        # Too soon since the last refresh attempt; keep accumulating.
        return

    overdue = last_tick <= stamp - 1.0
    update_bars(file_info.size, overdue)
    last_tick = stamp
# Run the matching pass, reporting per-file progress through update_files.
comparison.match(filecallback=update_files, reverse=True)
# Final forced refresh so counters still pending after the last callback
# reach the widgets.
update_bars(0, True)
In [7]:
# Classification step; run after the matching cell above.  The cells below
# read comparison.folders[...].hash_matches / .name_matches.
comparison.classify()
In [8]:
# Spot check: display the hash_matches entry for one file in one folder.
# The folder key is a hard-coded, lower-cased path from the scanned G: drive.
comparison.folders['g:\\luggage\\home\\osmith\\virtualbox vms\\precise'].hash_matches['precise-disk1.vmdk']
Out[8]:
In [9]:
# Spot check: display the name_matches recorded for the same hard-coded folder.
comparison.folders['g:\\luggage\\home\\osmith\\virtualbox vms\\precise'].name_matches
Out[9]:
In [ ]: