In [122]:
import time
import re
from qumulo.rest_client import RestClient
class Path:
    """Record describing one filesystem path and its share of the total.

    Attributes:
        level: depth value supplied by the caller (``EasySplit.walk_dir``
            uses 1 for the directory the walk starts from).
        path: the path string.
        sz: size of the path expressed as a fraction; ``Bucket`` multiplies
            it by 100 to print a percentage.  Mutable: the walker subtracts
            the share of separately recorded children from it.
        bucket: stored as given; nothing in this class reads it.
    """

    def __init__(self, level, path, sz, bucket):
        self.level = level
        self.path = path
        self.sz = sz
        self.bucket = bucket

    def __repr__(self):
        # Debug-friendly representation; shows every stored field.
        return "Path(level=%r, path=%r, sz=%r, bucket=%r)" % (
            self.level,
            self.path,
            self.sz,
            self.bucket,
        )
class Bucket:
    """A group of paths that together form one share of the split.

    Attributes:
        sz: running sum of the ``sz`` of every path added with ``add_path``.
        include_paths: dict of path string -> path object held by this bucket.
        exclude_paths: dict of path string -> path object that is held by a
            different bucket but lies underneath one of this bucket's
            included paths (filled in by ``add_exclusions``).
    """

    def __init__(self):
        self.sz = 0
        self.include_paths = {}
        self.exclude_paths = {}

    def add_path(self, path_obj):
        """Add ``path_obj`` (needs ``.path`` and ``.sz``) to this bucket."""
        self.sz += path_obj.sz
        self.include_paths[path_obj.path] = path_obj

    def show_contents(self):
        """Print the bucket size, its included paths and its exclusions."""
        print("# Bucket size: %.1f%%" % (self.sz * 100,))
        # .items() works on both Python 2 and 3; iteritems() is Python 2 only.
        for path, p in self.include_paths.items():
            print(" %-90s # %.2f%%" % (path, p.sz * 100))
        for path in self.exclude_paths:
            print("-%-90s " % (path,))

    @staticmethod
    def _is_descendant(parent, candidate):
        """Return True when ``candidate`` lies strictly below ``parent``."""
        if candidate == parent:
            return False
        # Compare against "parent/" so that "/data" does not claim
        # "/database"; a parent that already ends in "/" (the root) is
        # used as the prefix unchanged.
        prefix = parent if parent.endswith("/") else parent + "/"
        return candidate.startswith(prefix)

    def add_exclusions(self, buckets):
        """Record paths of other buckets that sit below this bucket's paths.

        Args:
            buckets: iterable of ``Bucket`` objects; ``None`` entries are
                skipped (the caller uses ``None`` as a stand-in for this
                bucket itself).
        """
        for other in buckets:
            if other is None:
                continue
            for candidate, path_obj in other.include_paths.items():
                if candidate in self.include_paths:
                    continue
                # Plain string comparison instead of a regex built from the
                # path: paths may contain regex metacharacters.
                if any(self._is_descendant(mine, candidate)
                       for mine in self.include_paths):
                    self.exclude_paths[candidate] = path_obj
class EasySplit:
    """Walk a directory tree via a REST client and split it into buckets.

    ``walk_dir`` collects ``Path`` records whose ``sz`` is a fraction of the
    root total; ``process_paths`` distributes those records over
    ``bucket_count`` ``Bucket`` objects and prints the result.

    The REST client must provide ``fs.read_dir_aggregates(path=...)``
    returning a mapping with ``total_capacity``, ``total_meta`` and a
    ``files`` list whose entries carry ``type``, ``name``, ``data_usage``
    and ``meta_usage``.
    """

    # Root totals recorded by walk_dir() on the level-1 call.
    root_cap = None   # never assigned by this class; kept for compatibility
    root_data = None
    root_meta = None
    # Class-level placeholder only; every instance gets its own dict in
    # __init__, so this shared dict is never written to by instances.
    paths = {}

    def __init__(self, rest_client, split_type='capacity', bucket_count=4,
                 min_dir_size=0.003, only_dirs=False):
        """Configure the splitter.

        Args:
            rest_client: object exposing ``fs.read_dir_aggregates``.
            split_type: ``'files'`` sizes entries by their meta totals; any
                other value sizes them by their capacity totals.
            bucket_count: number of buckets to produce (at least 1).
            min_dir_size: minimum fraction of the root total an entry needs
                in order to be recorded on its own.
            only_dirs: when True only directories are recorded; when False
                sufficiently large non-directory entries are recorded too.

        Raises:
            ValueError: if ``bucket_count`` is smaller than 1.
        """
        if bucket_count < 1:
            raise ValueError(
                "bucket_count must be at least 1, got %r" % (bucket_count,))
        self.paths = {}
        self.rc = rest_client
        self.split_type = split_type
        self.only_dirs = only_dirs
        self.bucket_count = bucket_count
        self.bucket_sz = 1.0 / bucket_count
        self.min_dir_size = min_dir_size

    def _fraction(self, record, data_key, meta_key):
        """Return ``record``'s share of the root total for the split type."""
        if self.split_type == 'files':
            value, total = record[meta_key], self.root_meta
        else:
            value, total = record[data_key], self.root_data
        # A root total of zero would otherwise raise ZeroDivisionError;
        # nothing can hold a share of an empty total.
        if total == 0:
            return 0.0
        return float(value) / total

    def _build_buckets(self):
        """Distribute the collected paths over the buckets and return them."""
        # Deepest paths first; within one level, smallest share first.
        ordered = sorted(self.paths.values(), key=lambda p: (-p.level, p.sz))
        buckets = [Bucket() for _ in range(self.bucket_count)]
        last = self.bucket_count - 1
        current = 0
        for p in ordered:
            if p.level == 1:
                # The walk root always lands in the final bucket.
                buckets[last].add_path(p)
                continue
            if p.sz < self.min_dir_size:
                # Too small to list on its own; it must not influence how
                # full the current bucket is considered to be.
                continue
            if current < last and buckets[current].sz + p.sz > self.bucket_sz:
                current += 1
            buckets[current].add_path(p)
        # The final bucket is reported as whatever the others do not cover.
        buckets[last].sz = 1 - sum(b.sz for b in buckets[:-1])
        for index, bucket in enumerate(buckets):
            # None marks the bucket's own slot so it is not compared
            # against itself.
            others = [other if j != index else None
                      for j, other in enumerate(buckets)]
            bucket.add_exclusions(others)
        return buckets

    def process_paths(self):
        """Assign the walked paths to buckets and print every bucket."""
        for index, bucket in enumerate(self._build_buckets()):
            print("------------ %s -----------" % index)
            bucket.show_contents()

    def walk_dir(self, path, level=1):
        """Recursively record ``path`` and its sufficiently large children.

        Each recorded child's share is subtracted from its parent's ``sz``
        so that a parent only accounts for what is not listed separately.
        Directories above ``min_dir_size`` are walked recursively;
        non-directory entries above ``min_dir_size`` are recorded as leaves
        unless ``only_dirs`` is set.

        Args:
            path: directory to read.
            level: depth of ``path``; the level-1 call defines the root
                totals every share is measured against.

        Returns:
            The ``paths`` dict (path string -> ``Path``).
        """
        data = self.rc.fs.read_dir_aggregates(path=path)
        if level == 1:
            self.root_data = float(data['total_capacity'])
            self.root_meta = float(data['total_meta'])
        own_share = self._fraction(data, 'total_capacity', 'total_meta')
        self.paths[path] = Path(level=level, path=path, sz=own_share,
                                bucket=None)
        # Avoid a doubled separator when the path already ends in "/".
        prefix = path if path.endswith('/') else path + '/'
        for entry in data['files']:
            is_dir = entry['type'] == 'FS_FILE_TYPE_DIRECTORY'
            if self.only_dirs and not is_dir:
                continue
            share = self._fraction(entry, 'data_usage', 'meta_usage')
            if share <= self.min_dir_size:
                continue
            self.paths[path].sz -= share
            child = prefix + entry['name']
            if is_dir:
                self.walk_dir(child, level + 1)
            else:
                # Non-directories cannot be listed further; record a leaf.
                self.paths[child] = Path(level=level + 1, path=child,
                                         sz=share, bucket=None)
        return self.paths
In [123]:
# Connect to the cluster's REST endpoint and authenticate.
rc = RestClient("<qumulo-cluster>", 8000)
rc.login("<qumulo-user>", "<qumulo-password>")

# Capacity-based split into four buckets, directories only; a directory
# must exceed 1% of the root total (min_dir_size=0.01) to be walked.
es = EasySplit(
    rest_client=rc,
    split_type='capacity',
    bucket_count=4,
    min_dir_size=0.01,
    only_dirs=True,
)
es.walk_dir("/")
es.process_paths()
In [119]:
# Connect to the cluster's REST endpoint and authenticate.
rc = RestClient("<qumulo-cluster>", 8000)
rc.login("<qumulo-user>", "<qumulo-password>")

# Same capacity-based, directories-only split into four buckets, with a
# finer threshold: a directory must exceed 0.5% of the root total
# (min_dir_size=0.005) to be walked.
es = EasySplit(
    rest_client=rc,
    split_type='capacity',
    bucket_count=4,
    min_dir_size=0.005,
    only_dirs=True,
)
es.walk_dir("/")
es.process_paths()
In [ ]: