In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, '../python/')
import pml
import util
In [17]:
S = 10_000 # support set size
p = np.ones(S)/S # distribution
# iterate over S unknown, known
S_known_list = [False,True]
# make sample size list n_list
n_min = np.sqrt(S)
n_max = 100*S
num_n_points = 21
n_list = np.logspace(np.log10(n_min), np.log10(n_max), num_n_points).astype(int)
num_trials = 100 # number of trials for each value of n
record_num_clumps = {S_known: [[] for n in n_list] for S_known in S_known_list} # num clumps by trial, by n
record_inferred_S = {S_known: [[] for n in n_list] for S_known in S_known_list} # inferred support set size by trial, by n
record_has_continuous_part = {S_known: [[] for n in n_list] for S_known in S_known_list} # true iff inferred distribution has continuous part
for S_known in S_known_list:
print('S known:',S_known)
for (i,n) in enumerate(n_list):
print('n:',n)
for trial_num in range(num_trials):
# draw empirical histogram
hist = util.draw_histogram_from_multinomial(p, n)
S_empirical = np.count_nonzero(hist)
# get approximate PML
if S_known:
(p_pml, F0, _, _) = pml.PML_distribution_approximate(hist, S, warn_on_continuous_part=False)
else:
(p_pml, F0, _, _) = pml.PML_distribution_approximate(hist, warn_on_continuous_part=False)
# record num clumps
record_num_clumps[S_known][i].append(np.unique(p_pml).size)
# record estimated support set size
if not np.isinf(F0):
record_inferred_S[S_known][i].append(S_empirical + F0)
# record whether has continuous part
record_has_continuous_part[S_known][i].append(np.isinf(F0))
In [18]:
colors_by_num_clumps = {1:[0,0,0], 2:[0.6,0,0], 3:[1,0,0]}
for S_known in S_known_list:
fig = plt.figure(figsize=(15,5))
# plot probability to have certain number of clumps
# case (1,2,>=3)
for (num_clumps, strict_equal) in zip([1,2,3],[True,True,False]):
if strict_equal:
y = np.array([np.sum(np.array(record_num_clumps[S_known][i])==num_clumps)/num_trials for i in range(n_list.size)])
label = 'num clumps == {}'.format(num_clumps)
else:
y = np.array([np.sum(np.array(record_num_clumps[S_known][i])>=num_clumps)/num_trials for i in range(n_list.size)])
label = 'num clumps >= {}'.format(num_clumps)
yerr = 2*np.sqrt(y*(1-y)/num_trials)
plt.errorbar(n_list,y,yerr=yerr,fmt='o-',color=colors_by_num_clumps[num_clumps], label=label)
if not S_known:
# plot fraction of trials that result in approx. PML distribution having continuous part
y = np.array([np.count_nonzero(np.array(record_has_continuous_part[S_known][i]))/num_trials for i in range(n_list.size)])
yerr = 2*np.sqrt(y*(1-y)/num_trials)
plt.errorbar(n_list,y,yerr=yerr,fmt='x:',color=[0,0,1], label='has continuous part')
plt.plot([S, S], plt.gca().get_ylim(), '--', label='S = {}'.format(S), color=[0,0.5,0])
plt.plot(plt.gca().get_xlim(),[1,1],'-',linewidth=1, color=[0.5,0.5,0.5])
plt.plot(plt.gca().get_xlim(),[0,0],'-',linewidth=1, color=[0.5,0.5,0.5])
plt.gca().set_xscale('log')
plt.legend(fontsize=20)
plt.title('$S$ {}known ($S$ = {})\nFraction of trials vs. n'.format({True:'',False:'un'}[S_known], S),fontsize=20)
plt.xlabel('$n$',fontsize=20)
plt.show()
fig = plt.figure(figsize=(15,5))
# plot mean, standard error of inferred support set size
y = np.array([np.mean(record_inferred_S[False][i]) for i in range(n_list.size)])
yerr = np.array([2*np.std(record_inferred_S[False][i]) for i in range(n_list.size)])
plt.errorbar(n_list, y, yerr=yerr, fmt='ro', label='mean$\pm 2 \sigma$')
plt.plot(plt.gca().get_xlim(),[S,S],'--', label='$S$={}'.format(S),color=[0,0.5,0])
plt.gca().set_xscale('log')
plt.legend(fontsize=20)
plt.title('Inferred $S$ distribution vs. $n$\n($S$ = {})'.format(S),fontsize=20)
plt.xlabel('$n$',fontsize=20)
plt.show()
In [19]:
S = 10_000 # support set size
p = np.append(np.ones(S)/S, np.zeros(S)) # distribution
# make sample size list n_list
n_min = np.sqrt(S)
n_max = 100*S
num_n_points = 21
n_list = np.logspace(np.log10(n_min), np.log10(n_max), num_n_points).astype(int)
num_trials = 100 # number of trials for each value of n
record_num_clumps = [[] for n in n_list] # num clumps by trial, by n
for (i,n) in enumerate(n_list):
print('n:',n)
for trial_num in range(num_trials):
# draw empirical histogram
hist = util.draw_histogram_from_multinomial(p, n)
S_empirical = np.count_nonzero(hist)
# get approximate PML
(p_pml, F0, _, _) = pml.PML_distribution_approximate(hist, 2*S, warn_on_continuous_part=False)
# record num clumps
record_num_clumps[i].append(np.unique(p_pml).size)
In [20]:
colors_by_num_clumps = {1:[0,0,0], 2:[0.6,0,0], 3:[1,0,0]}
fig = plt.figure(figsize=(15,5))
# plot probability to have certain number of clumps
# case (1,2,>=3)
for (num_clumps, strict_equal) in zip([1,2,3],[True,True,False]):
if strict_equal:
y = np.array([np.sum(np.array(record_num_clumps[i])==num_clumps)/num_trials for i in range(n_list.size)])
label = 'num clumps == {}'.format(num_clumps)
else:
y = np.array([np.sum(np.array(record_num_clumps[i])>=num_clumps)/num_trials for i in range(n_list.size)])
label = 'num clumps >= {}'.format(num_clumps)
yerr = 2*np.sqrt(y*(1-y)/num_trials)
plt.errorbar(n_list,y,yerr=yerr,fmt='o-',color=colors_by_num_clumps[num_clumps], label=label)
plt.plot([S, S], plt.gca().get_ylim(), '--', label='S = {}'.format(S), color=[0,0.5,0])
plt.plot(plt.gca().get_xlim(),[1,1],'-',linewidth=1, color=[0.5,0.5,0.5])
plt.plot(plt.gca().get_xlim(),[0,0],'-',linewidth=1, color=[0.5,0.5,0.5])
plt.gca().set_xscale('log')
plt.legend(fontsize=20)
plt.title('Fraction of trials vs. n',fontsize=20)
plt.xlabel('$n$',fontsize=20)
plt.show()