In [2]:
source("../experiment_aws/sampling_utils.R")
In [19]:
n=10;
m=20;
R=runif(n);#as.numeric(1:n);
w=R;
norm = sum(w);
w=w/norm;#!!!
n_repl=10;
#The following works for NORMALISED weights and SUM or AVG
#TODO: other aggregation functions
estimates = sort(sapply(1:n_repl,
function(x) {
S=sample(1:n,prob=w,m,replace=TRUE);
sum_est = sum(R[S]*((w[S])^-1))/m;#Hansen-Hurwitz
return(sum_est);
}
));
print(sum(R))
print(mean(estimates))
plot(estimates,col="red",pch=3)
abline(h=sum(R),lty=2,col="blue")
abline(h=sum(R)*1.01,lty=3,col="blue")
abline(h=sum(R)*0.99,lty=3,col="blue")
abline(h=mean(estimates),lty=1,col="red")
rel_errs=sort(abs(estimates-sum(R))/sum(R))
show_sigma_levels(rel_errs)
#This is a good estimator, weighing w with the value of R is the best
#However, we cannot estimate the norm of w exactly. Would precomputing it make sense?
In [359]:
#Test with strata
n=10000;
m=200;
n_strata = 10;
R=as.numeric(1:n);#NOTE: for strata we assume sorted R
#w=c(sapply(runif(n_strata),function(x) replicate(n/n_strata,x)));
w=c(sapply((n_strata+1-1:n_strata)^2,function(x) replicate(n/n_strata,x)));
strata_boundaries = (0:n_strata)*n/n_strata;
strata_weights=sapply(1:n_strata,function(p) w[p*n/n_strata-1]);
strata_sizes = replicate(n_strata,n/n_strata);
total_strata_weight = sum(strata_weights*strata_sizes);
n_repl=100;
#TODO: other aggregation functions
estimates = sort(sapply(1:n_repl,
function(x) {
S=sort(sample(R,prob=w,m,replace=TRUE));
sum_est = 0;
for(i in 1:n_strata) {
stratum_min_index = strata_boundaries[i]+1;
stratum_max_index = strata_boundaries[i+1];
stratum_sample = S[S>=R[stratum_min_index]&S<=R[stratum_max_index]];
if(length(stratum_sample) == 0) {
next;
}
sum_est_term = mean(stratum_sample)*strata_sizes[i];
sum_est = sum_est + sum_est_term;
}
return(sum_est);
}
));
print(paste("True sum: ",sum(R)));
print(paste("estimate: ",mean(estimates)));
plot(estimates,col="red",pch=3)
abline(h=sum(R),lty=2,col="blue")
abline(h=sum(R)*1.01,lty=3,col="blue")
abline(h=sum(R)*0.99,lty=3,col="blue")
abline(h=mean(estimates),lty=1,col="red")
#This estimator works if the weights are not too extreme
#The weights are "too extreme" if some strata are undersampled
#I expect that the jumps in the estimate stem from the sample size in a small (and important) stratum
In [360]:
#This is just a slight reformulation of the above, too make it look like equation (5.1) (page 91) in Cochran.
#Test with strata
n=10000;
m=200;
n_strata = 10;
R=as.numeric(1:n);#NOTE: for strata we assume sorted R
#w=c(sapply(runif(n_strata),function(x) replicate(n/n_strata,x)));
w=c(sapply((n_strata+1-1:n_strata)^1.5,function(x) replicate(n/n_strata,x)));
strata_boundaries = (0:n_strata)*n/n_strata;
strata_weights=sapply(1:n_strata,function(p) w[p*n/n_strata-1]);
strata_sizes = replicate(n_strata,n/n_strata);
total_strata_weight = sum(strata_weights*strata_sizes);
n_repl=100;
#TODO: other aggregation functions
estimates = sort(sapply(1:n_repl,
function(x) {
S=sort(sample(R,prob=w,m,replace=TRUE));
avg_est = 0;
for(i in 1:n_strata) {
stratum_min_index = strata_boundaries[i]+1;
stratum_max_index = strata_boundaries[i+1];
stratum_sample = S[S>=R[stratum_min_index]&S<=R[stratum_max_index]];
if(length(stratum_sample) == 0) {
next;
}
#avg_est_term = mean(stratum_sample)*strata_sizes[i]*strata_weights[i]/total_strata_weight;
avg_est_term = mean(stratum_sample)*strata_sizes[i]/n;
avg_est = avg_est + avg_est_term;
}
sum_est = avg_est * n;
return(sum_est);
}
));
print(paste("True sum: ",sum(R)));
print(paste("estimate: ",mean(estimates)));
plot(estimates,col="red",pch=3)
abline(h=sum(R),lty=2,col="blue")
abline(h=sum(R)*1.01,lty=3,col="blue")
abline(h=sum(R)*0.99,lty=3,col="blue")
abline(h=mean(estimates),lty=1,col="red")