Database:
1. All RCTs registered at WHO ICTRP by Jan 1st 2016,
2. with start date between 2006 and 2015
3. with study type and design corresponding to RCT
4. with at least one country location among the 187 countries included in the GBD2010 study
5. with sample size information and sample size between 10 and 150,000
We will:
1. General numbers of Nb Patients across regions and over time
2. Create replicates of the mapping of Patients across diseases and evaluate the uncertainty intervals of the local share of patients across diseases within regions
In [17]:
#Upload database
data <- read.table("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Flowchart/database_all_diseases_final_ok.txt")
data <- data[!is.na(data$Sample),]
data <- data[data$Sample>=10 & data$Sample<=150000,]
N <- nrow(data)
names(data)
In [18]:
#Upload traduction names/label categories
Mgbd <- read.table("/home/igna/Desktop/Programs GBD/Classifier_Trial_GBD/Databases/Taxonomy_DL/GBD_data/GBD_ICD.txt")
grep("Injur",Mgbd$cause_name)
In [19]:
#We supress from GBD28 the label 28
GBD27 <- sapply(strsplit(as.character(data$GBD28),"&"),function(x){paste(x[x!="28"],collapse="&")})
data$GBD27 <- GBD27
#Number of trials relevant to the burden of diseases
table(GBD27=="")
In [20]:
regs <- sort(unique(unlist(strsplit(as.character(data$Regions),"&"))))
In [21]:
nb_ctrs <- lapply(strsplit(as.character(data$Nb_ctr_per_reg),'&'),as.numeric)
RGs <-strsplit(as.character(data$Regions),'&')
pats <- data.frame(TrialID = rep(data$TrialID,sapply(nb_ctrs,length)),
Nb_ctrs = unlist(nb_ctrs),
Region = unlist(RGs),
Tot_sample = rep(data$Sample,sapply(nb_ctrs,length)))
In [22]:
pats$tot_ctrs <- rep(sapply(nb_ctrs,sum),sapply(nb_ctrs,length))
pats$sample_per_reg <- pats$Tot_sample*pats$Nb_ctrs/pats$tot_ctrs
In [23]:
Lgbd <- lapply(as.character(data$GBD27),function(x){as.numeric(unlist(strsplit(x,"&")))})
In [24]:
tot <- tapply(pats$sample_per_reg,pats$Region,sum)
tot
sum(tot)
sum(data$Sample)
In [25]:
#Distribution sample sizes
summary(data$Sample)
In [41]:
spl_qt <- c(10,20,40,60,100,200,400,1000,2000,10000,20000,100000,200000)
In [42]:
data$Sple_cl <- cut(data$Sample,spl_qt,right=F)
data$Sple_cl <- as.character(data$Sple_cl)
#Base$IF_classe<-factor(Base$IF_classe,levels=c("[10,56)","[5,10)","[0,5)","No IF"),labels=c("IF greater or equal than 10","IF between 5 and 10","IF less than 5","No IF"))
data$Sple_cl <- as.factor(data$Sple_cl)
In [36]:
library(gdata)
In [43]:
levels(data$Sple_cl)
In [44]:
data$Sple_cl <- reorder(data$Sple_cl,new.order=c('[10,20)',
'[20,40)',
'[40,60)',
'[60,100)',
'[100,200)',
'[200,400)',
'[400,1e+03)',
'[1e+03,2e+03)',
'[2e+03,1e+04)',
'[1e+04,2e+04)',
'[2e+04,1e+05)',
'[1e+05,2e+05)'
))
In [47]:
barplot(table(data$Sple_cl))
In [ ]:
In [ ]:
In [6]:
DRY <- do.call('cbind',tapply(regs,data$year,function(x){table(unlist(x))}))
In [7]:
DRY <- DRY[order(apply(DRY,1,sum)),]
In [8]:
DRY
In [9]:
barplot(DRY[rownames(DRY)!="High-income",])
For each disease, we simulate what would have been the mapping of RCTs within regions if the misclassification of RCTs towards groups of diseases was corrected, given the sensitivities and specificities of the classifier to identify each group of disease.
To estimate the performances of the classifier for each group of diseases, we dispose a test set with 2,763 trials manually classified towards the 27-class grouping of diseases used in this work. The test set is described at Atal et al. BMC Bioinformatics 2016.
The method used is based on the method presented at Fox et al. Int J Epidemiol 2005.
To do so, for each disease for which we found a local research gap we will:
In [10]:
regs <- sort(unique(unlist(strsplit(as.character(data$Regions),"&"))))
LR <- lapply(regs,function(x){1:nrow(data)%in%grep(x,data$Regions)})
LR <- do.call('cbind',LR)
In [11]:
Lgbd <- lapply(as.character(data$GBD28),function(x){as.numeric(unlist(strsplit(x,"&")))})
Lgbd <- lapply(Lgbd,function(x){x[x!=28]})
In [12]:
PERF <- read.csv('Tables/Performances_per_27disease_data.csv')
In [13]:
NK <- 10000
set.seed(7212)
In [14]:
#For all diseases, we will simulate the mapping across regions of trials concerning
#the disease or concerning other diseases
dis <- 1:27
In [15]:
#For each disease
t0 <- proc.time()
for(g in dis){
PERF_g <- PERF[PERF$dis==g,]
#which trials concern the disease
is_dis <- sapply(Lgbd,function(x){g%in%x})
#which trials concern another disease
is_oth <- sapply(Lgbd,function(x){sum(setdiff(1:27,g)%in%x)>0})
#PPV et NPVs for finding the disease
sens_r <- PERF_g$TP_Dis
sens_n <- PERF_g$TP_Dis + PERF_g$FN_Dis
spec_r <- PERF_g$TN_Dis
spec_n <- PERF_g$TN_Dis + PERF_g$FP_Dis
sens <- rbeta(NK,sens_r+1,sens_n-sens_r+1)
spec <- rbeta(NK,spec_r+1,spec_n-spec_r+1)
a_dis <- sum(is_dis)
b_dis <- N-a_dis
As <- (a_dis-(1-spec)*N)/(sens - (1-spec))
Bs <- N-As
T1 <- sens*As
T0 <- spec*Bs
F1 <- (1-spec)*Bs
F0 <- (1-sens)*As
PPV_dis <- T1/(T1+F1)
NPV_dis <- T0/(T0+F0)
#PPV and NPVs for finding another disease
sens_r <- PERF_g$TP_Oth
sens_n <- PERF_g$TP_Oth + PERF_g$FN_Oth
spec_r <- PERF_g$TN_Oth
spec_n <- PERF_g$TN_Oth + PERF_g$FP_Oth
sens <- rbeta(NK,sens_r+1,sens_n-sens_r+1)
spec <- rbeta(NK,spec_r+1,spec_n-spec_r+1)
a_oth <- sum(is_oth)
b_oth <- N-a_oth
As <- (a_oth-(1-spec)*N)/(sens - (1-spec))
Bs <- N-As
T1 <- sens*As
T0 <- spec*Bs
F1 <- (1-spec)*Bs
F0 <- (1-sens)*As
PPV_oth <- T1/(T1+F1)
NPV_oth <- T0/(T0+F0)
#Some values of sens and spec may lead to impossible values of PPV or NPV (>1 or <0)
#We supress and count them. If the total of suppressed iterations is higher than 10% of total iterations we
#will modify the distributions for Specificity and Sensitivity
false_it <- PPV_dis<0 | PPV_dis>1 |
NPV_dis<0 | NPV_dis>1 |
PPV_oth<0 | PPV_oth>1 |
NPV_oth<0 | NPV_oth>1
print(paste(c(g,"has",sum(false_it),"suppressed false iterations"
),collapse=" "))
PPV_dis <- PPV_dis[!false_it]
NPV_dis <- NPV_dis[!false_it]
PPV_oth <- PPV_oth[!false_it]
NPV_oth <- NPV_oth[!false_it]
L <- list()
#Simulation: reclassifying each trial
for(k in 1:length(PPV_dis)){
AR <- matrix(0, nrow=length(regs)+1, ncol=2)
tp_dis <- runif(a_dis)
tn_dis <- runif(b_dis)
recl_dis <- is_dis
recl_dis[recl_dis==TRUE][tp_dis>PPV_dis[k]] <- FALSE
recl_dis[recl_dis==FALSE][tn_dis>NPV_dis[k]] <- TRUE
#Rq: we count all trials (even those with more than 3 diseases)
#it is a conservative choice
rt <- as.numeric(recl_dis)
if(sum(recl_dis)==0) AR[,1] <- c(rep(0,length(regs)+1))
else{ if(sum(recl_dis)==1) AR[,1] <- c(as.numeric(LR[recl_dis,]),1)
else AR[,1] <- c(apply(LR[recl_dis,],2,sum),sum(recl_dis))
}
#Oth_dis
tp_oth <- runif(a_oth)
tn_oth <- runif(b_oth)
recl_oth <- is_oth
recl_oth[recl_oth==TRUE][tp_oth>PPV_oth[k]] <- FALSE
recl_oth[recl_oth==FALSE][tn_oth>NPV_oth[k]] <- TRUE
rt <- rt + as.numeric(recl_oth)
if(sum(rt)==0) AR[,2] <- c(rep(0,length(regs)+1))
else{ if(sum(rt)==1) AR[,2] <- c(as.numeric(LR[rt!=0,]),1)
else AR[,2] <- c(apply(LR[rt!=0,],2,sum),sum(rt))
}
L[[k]] <- AR
}
T <- do.call('rbind',L)
write.table(T,paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Total_simulation_",as.character(PERF_g$dis),".txt"),collapse=""))
}
t1 <- proc.time()
print(t1-t0)/60
It took 5h
In [18]:
# Diseases with more than 10% of suppressed iterations:
Mgbd$cause_name[c(9,11,21,23,27)]
We will re-simulate only for diseases corresponding to more than 1% of local burden in a region
In [37]:
set.seed(7212)
g <- 0
#For all diseases, we estimate number of RCTs relevant to the burden
PERF_g <- PERF[PERF$dis==0,]
#which trials are relevant to the burden
is_dis <- sapply(Lgbd,length)==1
#PPV et NPVs for finding the disease
sens_r <- PERF_g$TP_Dis
sens_n <- PERF_g$TP_Dis + PERF_g$FN_Dis
spec_r <- PERF_g$TN_Dis
spec_n <- PERF_g$TN_Dis + PERF_g$FP_Dis
sens <- rbeta(NK,sens_r+1,sens_n-sens_r+1)
spec <- rbeta(NK,spec_r+1,spec_n-spec_r+1)
a_dis <- sum(is_dis)
b_dis <- N-a_dis
As <- (a_dis-(1-spec)*N)/(sens - (1-spec))
Bs <- N-As
T1 <- sens*As
T0 <- spec*Bs
F1 <- (1-spec)*Bs
F0 <- (1-sens)*As
PPV_dis <- T1/(T1+F1)
NPV_dis <- T0/(T0+F0)
false_it <- PPV_dis<0 | PPV_dis>1 |
NPV_dis<0 | NPV_dis>1
print(paste(c(g,"has",sum(false_it),"suppressed false iterations"
),collapse=" "))
PPV_dis <- PPV_dis[!false_it]
NPV_dis <- NPV_dis[!false_it]
L <- data.frame()
#Simulation: reclassifying each trial
for(k in 1:length(PPV_dis)){
AR <- rep(0,length(regs)+1)
tp_dis <- runif(a_dis)
tn_dis <- runif(b_dis)
recl_dis <- is_dis
recl_dis[recl_dis==TRUE][tp_dis>PPV_dis[k]] <- FALSE
recl_dis[recl_dis==FALSE][tn_dis>NPV_dis[k]] <- TRUE
if(sum(recl_dis)==0) AR <- c(rep(0,length(regs)+1))
else{ if(sum(recl_dis)==1) AR <- c(as.numeric(LR[recl_dis,]),1)
else AR <- c(apply(LR[recl_dis,],2,sum),sum(recl_dis))
}
L <- rbind(L,AR)
}
write.table(L,paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Total_simulation_",as.character(PERF_g$dis),".txt"),collapse=""))
t1 <- proc.time()-t0
t1/60
In [38]:
SM <- data.frame(Region = rep(c(regs,"All"),each=nrow(Mgbd)+1),
Disease = rep(c(as.character(Mgbd$cause_name),"All"),times=length(regs)+1))
In [39]:
SM$SimMn_NbRCTs <- NA
SM$SimMed_NbRCTs <- NA
SM$Sim95low_NbRCTs <- NA
SM$Sim95up_NbRCTs <- NA
SM$SimMn_PrRCTs <- NA
SM$SimMed_PrRCTs <- NA
SM$Sim95low_PrRCTs <- NA
SM$Sim95up_PrRCTs <- NA
In [40]:
for(g in dis){
T <- tryCatch(read.table(paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Total_simulation_",
as.character(g),".txt"),collapse="")),error=NULL)
if(length(T)!=0){
#Mean, median and 95% uncertainty interval for the number of RCTs
M <- matrix(T[,1],ncol=8,byrow=TRUE)
SM$Sim95up_NbRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.975)})
SM$Sim95low_NbRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.025)})
SM$SimMed_NbRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.5)})
SM$SimMn_NbRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,mean)
#Mean and 95% upper-bound proportion of RCTs by simulation
M <- matrix(T[,1]/T[,2],ncol=8,byrow=TRUE)
SM$Sim95up_PrRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.975)})
SM$Sim95low_PrRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.025)})
SM$SimMed_PrRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.5)})
SM$SimMn_PrRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,mean)
}
}
#All diseases
g <- 0
T <- tryCatch(read.table(paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Total_simulation_",
as.character(g),".txt"),collapse="")),error=NULL)
SM$Sim95up_NbRCTs[SM$Disease=="All"] <- apply(T,2,function(x){quantile(x,0.975)})
SM$Sim95low_NbRCTs[SM$Disease=="All"] <- apply(T,2,function(x){quantile(x,0.025)})
SM$SimMed_NbRCTs[SM$Disease=="All"] <- apply(T,2,function(x){quantile(x,0.5)})
SM$SimMn_NbRCTs[SM$Disease=="All"] <- apply(T,2,mean)
In [43]:
SM[SM$Dis=="All",]
In [44]:
write.table(SM,'Data/Simulations_Alldis_NbProp_MedMn95Int_RCTs.txt')
In [ ]: