Compute all empirical results for the PhD thesis

This requires:

  • the tom toolkit to be compiled and installed
  • Python 3, numpy, scipy, ipython, ipyparallel, jupyter
  • A running ipyparallel cluster with 3 engines (best started from this directory; see the startup sketch after this list)
  • the Tools.ipynb notebook and the benchmark data
  • 8GB memory + 8GB fast swap, or 16GB memory
  • 5GB free disk space
  • Approx. 2.5 days of computation time on my MacBook (2 GHz Intel Core i7, 4 cores):
    • ~ 12 hours for the comparison results
    • ~ 52 hours for the missing values results
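
A minimal sketch (assuming the default ipyparallel profile) for starting the required
3-engine cluster before launching this notebook; the shell equivalent is simply
`ipcluster start -n 3`:

    import subprocess
    # Launch 3 ipyparallel engines plus controller in the background:
    cluster = subprocess.Popen(['ipcluster', 'start', '-n', '3'])  # stop via cluster.terminate()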

In [1]:
# Initialize parallel computation (requires an ipyparallel cluster with 3 engines)
research_directory = %pwd
tools_script = research_directory + '/Tools.ipynb'
%run $tools_script
from ipyparallel import Client
rc = Client(); dview = rc[:]
if len(dview) != 3: raise RuntimeError('Expected an ipyparallel cluster with exactly 3 engines.')
for i in range(3): rc[i]['ID'] = i
ID = 0
dview['research_directory'] = research_directory
%px %run $tools_script
# Limit threading of the numerical libraries on the engines (three engines share the machine):
%px tom.util.mkl_set_num_threads(4)
%px tom.util.setNbThreads(4)
%px tom.util.omp_set_num_threads(2)
%px exec("try: import mkl; mkl.set_num_threads(2)\nexcept: pass")
assert(dview['tom.version'] == [tom.version]*3)
# The main process may use all 8 hardware threads:
tom.util.mkl_set_num_threads(8)
tom.util.setNbThreads(8)
tom.util.omp_set_num_threads(8)
display_width(90)



In [2]:
# Global settings:
%px regularization = (2,3)
regularization = (2,3)
import os
os.makedirs(research_directory + '/results', exist_ok=True)
os.makedirs(research_directory + '/results/RANDOM27_32_max_data', exist_ok=True)

In [3]:
# Print some properties (alphabet size, word length, resulting matrix sizes)
def print_benchmark_properties():
    for oomName in BenchData.OOMs + BenchData.IOOOMs:
        bd = BenchData(oomName)
        Σₒ, Σᵢ = bd.nO(), max(1, bd.nU())
        Σ = Σₒ * Σᵢ
        L = int(np.log(1200)/np.log(Σ))  # largest L such that Σ**L <= 1200
        print('%12s: Σ=%2d L=%d  Σ^L=%5d Σ^(L+1)=%5d Σ^{≤L}=%5d' % (oomName, Σ, L, Σ**L, Σ**(L+1), sum([Σ**l for l in range(L+1)])))
print_benchmark_properties()


   RANDOM4_7: Σ= 4 L=5  Σ^L= 1024 Σ^(L+1)= 4096 Σ^{≤L}= 1365
  RANDOM4_32: Σ= 4 L=5  Σ^L= 1024 Σ^(L+1)= 4096 Σ^{≤L}= 1365
  RANDOM27_7: Σ=27 L=2  Σ^L=  729 Σ^(L+1)=19683 Σ^{≤L}=  757
 RANDOM27_32: Σ=27 L=2  Σ^L=  729 Σ^(L+1)=19683 Σ^{≤L}=  757
       TIGER: Σ= 6 L=3  Σ^L=  216 Σ^(L+1)= 1296 Σ^{≤L}=  259
       PAINT: Σ= 8 L=3  Σ^L=  512 Σ^(L+1)= 4096 Σ^{≤L}=  585
      BRIDGE: Σ=60 L=1  Σ^L=   60 Σ^(L+1)= 3600 Σ^{≤L}=   61
     NETWORK: Σ= 8 L=3  Σ^L=  512 Σ^(L+1)= 4096 Σ^{≤L}=  585
     SHUTTLE: Σ=15 L=2  Σ^L=  225 Σ^(L+1)= 3375 Σ^{≤L}=  241
     MAZE4X3: Σ=24 L=2  Σ^L=  576 Σ^(L+1)=13824 Σ^{≤L}=  601
      CHEESE: Σ=28 L=2  Σ^L=  784 Σ^(L+1)=21952 Σ^{≤L}=  813
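
As a quick sanity check, the first row can be recomputed by hand (a minimal,
self-contained sketch):

    import numpy as np
    Σ = 4                            # RANDOM4_7: four observation symbols, no inputs
    L = int(np.log(1200)/np.log(Σ))  # largest L with Σ**L <= 1200, here L = 5
    assert (Σ**L, Σ**(L+1), sum(Σ**l for l in range(L+1))) == (1024, 4096, 1365)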

In [4]:
# Compute and save the `tom.Data` objects for the RANDOM27_32 training sequences of length 10**8 (~0:20h and 3.2GB on disk)
def pre_compute_RANDOM27_32_long_Data():
    bd = BenchData('RANDOM27_32')
    RANDOM27_32_max_data = [{}, {}, {}]
    for ID in range(3):
        print(':', end='', flush=True)
        data = tom.Data()
        data.sequence = bd.getSequence(ID)
        wordSettings = [(0, 0, 1, maxWords) for maxWords in [32, 64, 128, 256, 512, 1024]]
        wordSettings += [(0, 0, 'o_min', 1024), (1, 1, 1, 0), (2, 2, 1, 0), (0, 1, 1, 0), (0, 2, 1, 0)]
        for wordSetting in wordSettings:
            print('.', end='', flush=True)
            wS = list(wordSetting)
            if wS[2] == 'o_min': wS[2] = bd.o_min(10**8)
            data.regularization = (2,3)
            data.X = wS
            data.Y = data.X
            data.pre_compute()
            data.V_YX(regularization=(0,0))
            data.V_YX(regularization=(2,0))
            with open(research_directory + '/results/RANDOM27_32_max_data/' + str(ID) + str(wS) + '.p', 'wb') as f:
                pickle.dump({'cache': data._cache, 'X': data.X, 'Y': data.Y}, f)
pre_compute_RANDOM27_32_long_Data()


:...........:...........:...........

In [5]:
%%px
# Compute word setting results (~1:15h)
def compute_word_setting_results(oomName):
    bd = BenchData(oomName)
    print(time.strftime("%Y-%m-%d %H:%M:%S"), str(oomName), file = f, flush=True)
    Σₒ, Σᵢ = bd.nO(), max(1, bd.nU())
    Σ = Σₒ * Σᵢ
    wordSettings = [(0, 0, 1, maxWords) for maxWords in [32, 64, 128, 256, 512, 1024]]
    wordSettings += [(0, 0, 'o_min', 1024)]
    wordSettings += [(L, L, 1, 0) for L in range(1, int(np.log(1200)/np.log(Σ))+1)]
    wordSettings += [(0, L, 1, 0) for L in range(1, int(np.log(1200)/np.log(Σ))+1)]
    res[oomName] = {wS: {bd.dim(): {'GLS' : {regularization : []},
                                    'SPEC': {None: []}} } for wS in wordSettings}
    train = bd.getSequence(ID)
    data = tom.Data()
    data.sequence = train.sub(0)
    data.regularization = regularization
    for tl in bd.trainLengths():
        print(time.strftime("    %Y-%m-%d %H:%M:%S"), "%8d " % tl, file = f, end='', flush=True)
        if tl < 10**8:
            data.sequence = train.sub(tl)
        for wordSetting in wordSettings:
            print(".", file = f, end='', flush=True)
            wS = list(wordSetting)
            if wS[2] == 'o_min': wS[2] = bd.o_min(tl)
            if tl < 10**8:
                data.X = wS
                data.Y = data.X
                data.pre_compute()
            else:
                data._stree = None # This ensures that we rely only on the cache!
                with open(research_directory + '/results/RANDOM27_32_max_data/' + str(ID) + str(wS) + '.p', 'rb') as f2:
                    data_save = pickle.load(f2)
                data.X = data_save['X']
                data.Y = data_save['Y']
                data._cache = data_save['cache']
            spec = tom.learn.model_estimate(data, bd.dim(), method='SPEC')
            res[oomName][wordSetting][bd.dim()]['SPEC'][None].append(bd.evaluate(spec))            
            gls = tom.learn.model_estimate(data, bd.dim(), method='GLS')
            res[oomName][wordSetting][bd.dim()]['GLS'][regularization].append(bd.evaluate(gls))
            data.cache = []
        print("done!", file = f, flush=True)

res = {}       
with open(research_directory + '/results/px_log' + str(ID), 'w') as f:
    for oomName in BenchData.OOMs + BenchData.IOOOMs: compute_word_setting_results(oomName)

In [6]:
# Save word setting results to 'resultsWords.p'
res = dview['res']
with open(research_directory + '/results/resultsWords.p', 'wb') as f:
    pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res

In [7]:
%%px
# Compute dimension setting results (~2:35h)
def compute_dimension_setting_results(oomName):
    bd = BenchData(oomName)
    print(time.strftime("%Y-%m-%d %H:%M:%S"), str(oomName), file = f, flush=True)
    Σₒ, Σᵢ = bd.nO(), max(1, bd.nU())
    Σ = Σₒ * Σᵢ
    lenX = int(np.log(1200)/np.log(Σ))
    wordSettings = [(0, 0, 'o_min', 1024), (0, lenX, 1, 0)]
    if bd.dim() < 12:
        dimSettings = list(range(1, bd.dim() + 1)) + [bd.dim() + 1, bd.dim() + 3, bd.dim() + 10]
    else:
        dimSettings =  [1,2,4,8,12,16,20,23,26,28,30,31,32,33,35,42]
    res[oomName] = {wS: {d: {'GLS' : {regularization : []},
                             'SPEC': {None: []}} for d in dimSettings} for wS in wordSettings}
    train = bd.getSequence(ID)
    data = tom.Data()
    data.sequence = train.sub(0)
    data.regularization = regularization
    for tl in bd.trainLengths():
        print(time.strftime("    %Y-%m-%d %H:%M:%S"), "%8d " % tl, file = f, end='', flush=True)
        if tl < 10**8:
            data.sequence = train.sub(tl)
        for wordSetting in wordSettings:
            print(":", file = f, end='', flush=True)
            wS = list(wordSetting)
            if wS[2] == 'o_min': wS[2] = bd.o_min(tl)
            if tl < 10**8:
                data.X = wS
                data.Y = data.X
                data.pre_compute()
            else:
                data._stree = None # This ensures that we rely only on the cache!
                with open(research_directory + '/results/RANDOM27_32_max_data/' + str(ID) + str(wS) + '.p', 'rb') as f2:
                    data_save = pickle.load(f2)
                data.X = data_save['X']
                data.Y = data_save['Y']
                data._cache = data_save['cache']
            for dim in dimSettings:
                spec = tom.learn.model_estimate(data, dim, method='SPEC')
                res[oomName][wordSetting][dim]['SPEC'][None].append(bd.evaluate(spec))            
                gls = tom.learn.model_estimate(data, dim, method='GLS')
                res[oomName][wordSetting][dim]['GLS'][regularization].append(bd.evaluate(gls))
                
            data.cache = []
        print("done!", file = f, flush=True)

res = {}
with open(research_directory + '/results/px_log' + str(ID), 'w') as f:
    for oomName in BenchData.OOMs + BenchData.IOOOMs: compute_dimension_setting_results(oomName)

In [8]:
# Save dimension setting results to 'resultsDim.p'
res = dview['res']
with open(research_directory + '/results/resultsDim.p', 'wb') as f:
    pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res

In [9]:
%%px
# Compute dimension estimates (~0:20h)
def compute_dimension_estimates(oomName, wordSettings = [(0, 0, 'o_min', 1024), (0, 'lenX', 1, 0)], pqrs = [(0,0), (1, 1, (2,0))]):
    bd = BenchData(oomName)
    print(time.strftime("%Y-%m-%d %H:%M:%S"), str(oomName), file = f, flush=True)
    Σₒ, Σᵢ = bd.nO(), max(1, bd.nU())
    Σ = Σₒ * Σᵢ
    lenX = int(np.log(1200)/np.log(Σ))
    res[oomName] = {tl: {wS: {pqr: {} for pqr in pqrs} for wS in wordSettings} for tl in bd.trainLengths()}
    train = bd.getSequence(ID)
    data = tom.Data()
    data.sequence = train.sub(0)
    for tl in bd.trainLengths():
        print(time.strftime("    %Y-%m-%d %H:%M:%S"), "%8d " % tl, file = f, end='', flush=True)
        data.sequence = train.sub(tl)
        for wordSetting in wordSettings:
            print(":", file = f, end='', flush=True)
            wS = list(wordSetting)
            if wS[0] == 'lenX': wS[0] = lenX
            if wS[1] == 'lenX': wS[1] = lenX                
            if wS[2] == 'o_min': wS[2] = bd.o_min(tl)
            data.X = wS
            data.Y = data.X
            data.regularization = (0,0)
            data.pre_compute(only_F_and_V=True)
            res[oomName][tl][wordSetting]['shape'] = (len(data.Y), len(data.X))
            F = data.F_YX()
            for pqr in pqrs:
                v_Y, v_X = tom.learn.v_Y_v_X_from_data(data, *pqr)
                U, s, VT = tom.linalg.cached_wsvd(v_Y**-0.5, F, v_X**-0.5)
                res[oomName][tl][wordSetting][pqr]['spectrum'] = s
                res_for_norm = res[oomName][tl][wordSetting][pqr]
                print(".", file = f, end='', flush=True)
                for norm in ['frob', 'spec', 'exspec', 'relative', 'avspec']:
                    res_for_norm[norm] = tom.learn.numerical_rank(F, data.V_YX(), v_Y, v_X, norm, True)
                mid_spec = (res_for_norm['avspec'][1] * res_for_norm['exspec'][1])**0.5
                mid_spec_dim = 0
                while mid_spec_dim < len(s) and s[mid_spec_dim] > mid_spec: mid_spec_dim += 1
                res_for_norm['mid_spec'] = (mid_spec_dim, mid_spec)
                frob = res_for_norm['frob']
                res_for_norm['frob_mid_spec'] = (max(mid_spec_dim, frob[0]), min(mid_spec, frob[1]))
            data.cache = []
        print("done!", file = f, flush=True)

# Dimension estimation uses sampling to estimate the expectation of a spectral norm.
# For reproducibility, we therefore seed the numpy random number generator. It is only used for this purpose.
np.random.seed(123456789 + ID)
res = {}
with open(research_directory + '/results/px_log' + str(ID), 'w') as f:
    for oomName in BenchData.OOMs + BenchData.IOOOMs: compute_dimension_estimates(oomName)
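
For context, a minimal sketch of the general idea behind such a sampling estimate (an
illustration only, not tom's actual implementation): the expected spectral norm of a
random error matrix with given element-wise standard deviations S is approximated by
averaging over sampled instances, which is why a fixed seed makes the result reproducible.

    import numpy as np

    def expected_spectral_norm(S, n_samples=20):
        """Monte Carlo estimate of E[||E||_2] for E with element-wise std. dev. S."""
        return np.mean([np.linalg.norm(np.random.standard_normal(S.shape) * S, ord=2)
                        for _ in range(n_samples)])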

In [10]:
# Save dimension estimates to 'resultsDimEstimation.p'
res = dview['res']
with open(research_directory + '/results/resultsDimEstimation.p', 'wb') as f:
    pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res

In [11]:
%%px --local
# Algorithm variant comparison incl. appropriate dimension estimates
def compute_variant_comparison(oomName, wordSetting = (0, 0, 'o_min', 1024)):
    bd = BenchData(oomName)
    print(time.strftime("%Y-%m-%d %H:%M:%S"), str(oomName), file = f, flush=True)
    Σₒ, Σᵢ = bd.nO(), max(1, bd.nU())
    Σ = Σₒ * Σᵢ
    
    res[oomName] = {wordSetting: {'dim': [], 0: {algo: {'eval': []} for algo in ['SPEC', 'RCW', 'ES', 'WLS', 'GLS'] },
                                  bd.dim(): {algo: {'eval': []} for algo in ['SPEC', 'RCW', 'ES', 'WLS', 'GLS'] }}}
    train = bd.getSequence(ID)
    data = tom.Data()
    data.regularization = regularization
    data.sequence = train.sub(0)
    for tl in bd.trainLengths():
        print(time.strftime("    %Y-%m-%d %H:%M:%S"), "%8d " % tl, file = f, end='', flush=True)
        if tl < 10**8:
            data.sequence = train.sub(tl)
        print(".", file = f, end='', flush=True)
        wS = list(wordSetting)
        if wS[2] == 'o_min': wS[2] = bd.o_min(tl)
        if tl < 10**8:
            data.X = wS
            data.Y = data.X
            data.regularization = regularization
            data.pre_compute()
        else:
            data._stree = None # This ensures that we rely only on the cache!
            with open(research_directory + '/results/RANDOM27_32_max_data/' + str(ID) + str(wS) + '.p', 'rb') as f2:
                data_save = pickle.load(f2)
            data.X = data_save['X']
            data.Y = data_save['Y']
            data._cache = data_save['cache']
        v_Y, v_X = tom.learn.v_Y_v_X_from_data(data)
        for d in set([0, bd.dim()]):
            print(":", file = f, end='', flush=True)
            if d != 0:
                dim = bd.dim()
            else:
                dim = tom.learn.numerical_rank(data.F_YX(), data.V_YX(regularization=(0,0)), v_Y, v_X)
                res[oomName][wordSetting]['dim'].append(dim)
            print(".", file = f, end='', flush=True)

            spec, subspace = tom.learn.model_estimate(data, dim, method='SPEC', return_subspace=True)
            res[oomName][wordSetting][d]['SPEC']['eval'].append(bd.evaluate(spec))
            print(".", file = f, end='', flush=True)

            es = tom.learn.model_estimate(data, spec, v = (v_Y, v_X), method='ES')
            res[oomName][wordSetting][d]['ES']['eval'].append(bd.evaluate(es))
            print(".", file = f, end='', flush=True)

            rcw = tom.learn.model_estimate(data, dim, method='RCW', v=(v_Y, v_X))
            res[oomName][wordSetting][d]['RCW']['eval'].append(bd.evaluate(rcw))
            print(".", file = f, end='', flush=True)

            wls, subspace = tom.learn.model_estimate(data, subspace, method='WLS', return_subspace=True)
            res[oomName][wordSetting][d]['WLS']['eval'].append(bd.evaluate(wls))
            print(".", file = f, end='', flush=True)

            if dim > 192:
                subspace = tom.learn.subspace_by_alternating_projections(data.F_YX(), 192, data.V_YX(), ls_method='iCholesky')
            gls = tom.learn.model_by_weighted_equations(data, subspace, ls_method='iCholesky')
            res[oomName][wordSetting][d]['GLS']['eval'].append(bd.evaluate(gls))
            print(".", file = f, end='', flush=True)
        data.cache = []
        print("done!", file = f, flush=True)

In [12]:
%%px
# Compute dimension estimates and comparison results for synthetic benchmarks (~0:40h)
# Dimension estimation uses sampling to estimate the expectation of a spectral norm.
# For reproducibility, we therefore seed the numpy random number generator. It is only used for this purpose.
np.random.seed(123456789 + ID)
res = {}
with open(research_directory + '/results/px_log' + str(ID), 'w') as f:
    for oomName in BenchData.OOMs + BenchData.IOOOMs: compute_variant_comparison(oomName)

In [13]:
# Save comparison results to 'resultsSynthetic.p'
res = dview['res']
with open(research_directory + '/results/resultsSynthetic.p', 'wb') as f:
    pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res

In [14]:
# Compute and save dimension estimates and comparison results for real-world data (~ 7 h)

# Dimension estimation uses sampling to estimate the expectation of a spectral norm.
# For reproducibility, we therefore seed the numpy random number generator. It is only used for this purpose.
np.random.seed(123456789)
res = {}
with open(research_directory + '/results/px_log' + str(ID), 'w') as f:
    for oomName in BenchData.realWorldData: compute_variant_comparison(oomName)
with open(research_directory + '/results/resultsRealworld.p', 'wb') as f:
    pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res

Learning with missing values

In the following experiments, missingness is encoded via the input alphabet: an input
symbol of 1 marks a position whose observation is missing (cf. `MissingRelevance` and
`cx_validate_missing` below).


In [15]:
%%px --local
# Settings for learning with missing values
class MissingRelevance(tom.stree.PositionRelevance):
    def __init__(self, missingFactor = 0, notMissingFactor = 1):
        self.missingFactor = missingFactor
        self.notMissingFactor = notMissingFactor
        super().__init__()
    def compute(self, position):
        seq = position.sequence()
        nMissing = seq.inputSum()
        nNotMissing = seq.length() - nMissing
        return super().compute(position) * self.missingFactor**nMissing * self.notMissingFactor**nNotMissing
def o_min(l, Σₒ, Σᵢ):
    Σᵢ = max(1, Σᵢ)
    Σ = Σₒ * Σᵢ
    l = np.log(l) / (np.log(Σ) + np.log(Σᵢ))
    return (l+1) * Σᵢ**l
def get_dimensions_for_missing_benchmarks():
    dims = {}
    for oomName in BenchData.OOMs:
        bd = BenchData(oomName)
        dims[oomName] = len(bd.trainLengths()) * [bd.dim()]
    with open(research_directory + '/results/resultsRealworld.p', 'rb') as f:
        res = pickle.load(f)['res']
    for oomName in BenchData.realWorldData: dims[oomName] = res[oomName][(0, 0, 'o_min', 1024)]['dim']
    return dims
def cx_validate_missing(loom, cx_test, stabilization=(0.0002, 0.03, 5, 1e-8)):
    # Lift the learned output-only OOM `loom` to an input-output OOM `moom` in which
    # input 1 marks a missing observation: observed symbols keep their operators,
    # while the added "missing" output symbol gets the sum of all observation
    # operators, i.e., it marginalizes over the unobserved symbol.
    moom = tom.Oom(loom.dimension(), loom.nOutputSymbols() + 1, 2)
    moom.sig(loom.sig())
    for o in range(loom.nOutputSymbols()):
        moom.tau(o, 0, loom.tau(o))
        moom.tau(o, 1, np.zeros((loom.dimension(), loom.dimension())))
    moom.tau(loom.nOutputSymbols(), 0, np.zeros((loom.dimension(), loom.dimension())))
    moom.tau(loom.nOutputSymbols(), 1, sum([loom.tau(o) for o in range(loom.nOutputSymbols())]))
    moom.w0(loom.w0())
    moom.initialize()
    moom.stabilization(*stabilization)
    # Average log-likelihood on `cx_test`, normalized by the number of
    # non-missing observations (`inputSum()` counts the missing positions).
    return moom.l2l(cx_test)*cx_test.length()/(cx_test.length()-cx_test.inputSum())
def compute_missing_value_results(oomName, dims, regular=False, stabilization=(0.0002, 0.03, 5, 1e-8)):
    print(time.strftime("%Y-%m-%d %H:%M:%S"), str(oomName), file = f, flush=True)
    bd = BenchData(oomName)
    Σₒ = bd.nO()
    res[oomName] = {mP: {wordSettingAndWildcard: {reg: {'SPEC': {'cx' : [], 'eval' : []}, 'Weighted' : {'cx' : [], 'eval' : []}}
                                                  for reg in regularizations}
                         for wordSettingAndWildcard in wordSettingsAndWildcards} for mP in missingProbs}
    for mP in missingProbs:
        print(time.strftime("   %Y-%m-%d %H:%M:%S"), str(oomName), str(mP), file = f, flush=True)
        data = tom.Data()
        train = bd.getSequence(ID, missingProb=mP, regular=regular, nU=1)
        data.sequence = train.sub(0)
        for tli, tl in enumerate(bd.trainLengths()):
            print(time.strftime("      %Y-%m-%d %H:%M:%S"), "%8d " % tl, file = f, end='', flush=True)
            data.sequence = train.sub(tl)
            data.nInputSymbols = 1
            cx_test = train.sub(tl)[-int(2e5):]
            for wordSetting, wildcard in wordSettingsAndWildcards:
                wS = []
                for i in wordSetting:
                    wS.append(eval(i) if type(i) is str else i)
                data.X = wS
                data.Y = data.X
                data._estimator = tom.EstimatorMCAR(data.stree) if wildcard else tom.Estimator(data.stree)
                for regularization in regularizations:
                    reg = []
                    for i in regularization: # Note: a plain loop, since `eval` inside a list comprehension cannot see the function's locals!
                        reg.append(eval(i) if type(i) is str else i)
                    data.regularization = reg
                    data.pre_compute()
                    print(".", file = f, end='', flush=True)
                    dim = dims[oomName][tli]
                    spec, subspace = tom.learn.model_estimate(data, dim, method='SPEC', return_subspace=True)
                    spec.setIO(False)
                    res[oomName][mP][(wordSetting, wildcard)][regularization]['SPEC']['eval'].append(bd.evaluate(spec, stabilization=stabilization))
                    res[oomName][mP][(wordSetting, wildcard)][regularization]['SPEC']['cx'].append(cx_validate_missing(spec, cx_test, stabilization))
                    print(".", file = f, end='', flush=True)

                    weighted = tom.learn.model_estimate(data, subspace, method='WLS' if dim > 192 else 'GLS', ls_method='iCholesky')
                    weighted.setIO(False)
                    res[oomName][mP][(wordSetting, wildcard)][regularization]['Weighted']['eval'].append(bd.evaluate(weighted, stabilization=stabilization))
                    res[oomName][mP][(wordSetting, wildcard)][regularization]['Weighted']['cx'].append(cx_validate_missing(weighted, cx_test, stabilization))                        
                    print(".", file = f, end='', flush=True)

                    del spec, subspace, weighted
                    data.cache = []
                del wS
            print("done!", file = f, flush=True)

regularizations = [(2,3), (2,'3/tl**2')]
missingProbs = [1/9.5, 1/4.5, 1/2.5]
wordSettingsAndWildcards = [((0, 0, 'o_min(tl,Σₒ+1,1)', 1024, False, False, 'MissingRelevance((Σₒ)**-1,1)'), False),
                            ((0, 0, 'o_min(tl,Σₒ+1,1)', 1024, False, False, 'MissingRelevance((Σₒ)**-1,1)'), True),
                            ((0, 0, 'o_min(tl,Σₒ+1,1)', 1024, False, False, 'MissingRelevance(0,1)'), False)]
dims_for_missing = get_dimensions_for_missing_benchmarks()
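
For illustration, the string entries of such a word setting are expanded like this
before use (hypothetical example values for tl and Σₒ; o_min and MissingRelevance are
the helpers defined above):

    tl, Σₒ = 10**6, 4   # hypothetical example values
    wS = []
    for i in wordSettingsAndWildcards[0][0]:   # a loop: inside a function, `eval` cannot see locals from a comprehension
        wS.append(eval(i) if type(i) is str else i)
    # wS now contains the numeric o_min threshold and a MissingRelevance instance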

In [16]:
# Compute and save results with missing values for OOM benchmarks (~10:00 h)
%px res = {}
%px with open(research_directory + '/results/px_log' + str(ID), 'w') as f: [compute_missing_value_results(oomName, dims_for_missing, regular=False) for oomName in BenchData.OOMs]
res = dview['res']
with open(research_directory + '/results/resultsMissingRandom.p', 'wb') as f: pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res
%px res = {}
%px with open(research_directory + '/results/px_log' + str(ID), 'w') as f: [compute_missing_value_results(oomName, dims_for_missing, regular=True) for oomName in BenchData.OOMs]
res = dview['res']
with open(research_directory + '/results/resultsMissingRegular.p', 'wb') as f: pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res

In [17]:
# Compute and save results with random missing values for real-world benchmarks (~19 h)
res = {}
with open(research_directory + '/results/px_log' + str(ID), 'w') as f: [compute_missing_value_results(oomName, dims_for_missing, regular=False) for oomName in BenchData.realWorldData]
with open(research_directory + '/results/resultsMissingRandomRW.p', 'wb') as f: pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res

In [18]:
# Compute and save results with regular missing values for real-world benchmarks (~17 h)
res = {}
with open(research_directory + '/results/px_log' + str(ID), 'w') as f: [compute_missing_value_results(oomName, dims_for_missing, regular=True) for oomName in BenchData.realWorldData]
with open(research_directory + '/results/resultsMissingRegularRW.p', 'wb') as f: pickle.dump({'res':res, 'tom_version': tom.version}, f)
del res

In [19]:
# Compute and save results with regular missing values for BIBLE27 benchmark with tweaked dimension (~6h)
missingProbs = [1/4.5]
wordSettingsAndWildcards = [((0, 0, 'o_min(tl,Σₒ+1,1)', 1024, False, False, 'MissingRelevance((Σₒ)**-1,1)'), True)]
res = {}
with open(research_directory + '/results/px_log' + str(ID), 'w') as f:
    compute_missing_value_results('BIBLE27', {'BIBLE27': [8, 16, 32, 64, 96, 128, 160, 192]}, regular=True, stabilization=(0.0002, 0.01, 5, 1e-8))
with open(research_directory + '/results/resultsMissingRegularBIBLE27tweakedDim.p', 'wb') as f:
    pickle.dump({'res':res, 'tom_version': tom.version}, f)

In [3]:
# Note: the remaining cells stem from a later session (hence the restarted execution
# counts); they reload the tools and re-define the helper needed to display the model
# dimensions that were used for the missing-value benchmarks above.
%run Tools.ipynb
research_directory = '.'
def get_dimensions_for_missing_benchmarks():
    dims = {}
    for oomName in BenchData.OOMs:
        bd = BenchData(oomName)
        dims[oomName] = len(bd.trainLengths()) * [bd.dim()]
    with open(research_directory + '/results/resultsRealworld.p', 'rb') as f:
        res = pickle.load(f)['res']
    for oomName in BenchData.realWorldData: dims[oomName] = res[oomName][(0, 0, 'o_min', 1024)]['dim']
    return dims

In [4]:
get_dimensions_for_missing_benchmarks()


Out[4]:
{'BIBLE27': [10, 17, 47, 93, 147, 204, 279, 372],
 'ECOLI': [2, 3, 4, 5, 9, 14, 28, 53],
 'RANDOM27_32': [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32],
 'RANDOM27_7': [7, 7, 7, 7, 7, 7, 7, 7, 7],
 'RANDOM4_32': [32, 32, 32, 32, 32, 32, 32, 32, 32],
 'RANDOM4_7': [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]}
