Mapping health research effort - RCTs

Database:

      1. All RCTs registered at WHO ICTRP by Jan 1st 2016, 
      2. with start date between 2006 and 2015
      3. with study type and design corresponding to RCT
      4. with at least one country location among the 187 countries included in the GBD2010 study
      5. with sample size information and sample size between 10 and 150,000

We will:

      1. General numbers of Nb Patients across regions and over time
      2. Create replicates of the mapping of Patients across diseases and evaluate the uncertainty intervals of the local share of patients across diseases within regions

In [17]:
#Upload database
data <- read.table("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Flowchart/database_all_diseases_final_ok.txt")
data <- data[!is.na(data$Sample),]
data <- data[data$Sample>=10 & data$Sample<=150000,]
N <- nrow(data)
names(data)


  1. 'TrialID'
  2. 'brief_title'
  3. 'official_title'
  4. 'Primary_sponsor'
  5. 'Source_Register'
  6. 'Recruitment_Status'
  7. 'other_records'
  8. 'Target_size'
  9. 'Study_type'
  10. 'Study_design'
  11. 'Phase'
  12. 'Countries'
  13. 'condition'
  14. 'Secondary_ID'
  15. 'Source_Support'
  16. 'Secondary_Sponsor'
  17. 'year'
  18. 'Interv'
  19. 'Regions'
  20. 'Nb_ctr_per_reg'
  21. 'Sample'
  22. 'PMID'
  23. 'GBD28'
  24. 'GBD171'
  25. 'Infectious'
  26. 'MNN'
  27. 'Cancer'
  28. 'Chronic'
  • TrialID: unique trial ID from WHOICTRP
  • Regions: 7 epidemiological regions from GBD 2010 study
  • GBD28: classification according to 28 categories defined in Atal et al. BMC Bioinformatics (2016): This classification includes the injuries category, we exclude it

In [18]:
#Upload traduction names/label categories
Mgbd <- read.table("/home/igna/Desktop/Programs GBD/Classifier_Trial_GBD/Databases/Taxonomy_DL/GBD_data/GBD_ICD.txt")
grep("Injur",Mgbd$cause_name)


28

In [19]:
#We supress from GBD28 the label 28
GBD27 <- sapply(strsplit(as.character(data$GBD28),"&"),function(x){paste(x[x!="28"],collapse="&")})
data$GBD27 <- GBD27
#Number of trials relevant to the burden of diseases
table(GBD27=="")


FALSE  TRUE 
84069 23479 

1- Number Patients per region and over time


In [20]:
regs <- sort(unique(unlist(strsplit(as.character(data$Regions),"&"))))

In [21]:
nb_ctrs <- lapply(strsplit(as.character(data$Nb_ctr_per_reg),'&'),as.numeric)
RGs <-strsplit(as.character(data$Regions),'&')
pats <- data.frame(TrialID = rep(data$TrialID,sapply(nb_ctrs,length)),
                   Nb_ctrs = unlist(nb_ctrs),
                   Region = unlist(RGs),
                   Tot_sample = rep(data$Sample,sapply(nb_ctrs,length)))

In [22]:
pats$tot_ctrs <- rep(sapply(nb_ctrs,sum),sapply(nb_ctrs,length))
pats$sample_per_reg <- pats$Tot_sample*pats$Nb_ctrs/pats$tot_ctrs

In [23]:
Lgbd <- lapply(as.character(data$GBD27),function(x){as.numeric(unlist(strsplit(x,"&")))})

In [24]:
tot <- tapply(pats$sample_per_reg,pats$Region,sum)
tot
sum(tot)
sum(data$Sample)


Central Europe, Eastern Europe, and Central Asia
1854018.87710663
High-income
27840647.5551797
Latin America and Caribbean
1088822.16191074
North Africa and Middle East
1159502.1621575
South Asia
2555774.10916648
Southeast Asia, East Asia and Oceania
3964834.87028215
Sub-Saharian Africa
3643540.26419678
42107140
42107140

In [25]:
#Distribution sample sizes
summary(data$Sample)


    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    10.0     48.0    100.0    391.5    220.0 145000.0 

In [41]:
spl_qt <- c(10,20,40,60,100,200,400,1000,2000,10000,20000,100000,200000)

In [42]:
data$Sple_cl <- cut(data$Sample,spl_qt,right=F)
data$Sple_cl <- as.character(data$Sple_cl)
#Base$IF_classe<-factor(Base$IF_classe,levels=c("[10,56)","[5,10)","[0,5)","No IF"),labels=c("IF greater or equal than 10","IF between 5 and 10","IF less than 5","No IF"))
data$Sple_cl <- as.factor(data$Sple_cl)

In [36]:
library(gdata)


gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.

Attaching package: ‘gdata’

The following object is masked from ‘package:stats’:

    nobs

The following object is masked from ‘package:utils’:

    object.size

The following object is masked from ‘package:base’:

    startsWith


In [43]:
levels(data$Sple_cl)


  1. '[10,20)'
  2. '[100,200)'
  3. '[1e+03,2e+03)'
  4. '[1e+04,2e+04)'
  5. '[1e+05,2e+05)'
  6. '[20,40)'
  7. '[200,400)'
  8. '[2e+03,1e+04)'
  9. '[2e+04,1e+05)'
  10. '[40,60)'
  11. '[400,1e+03)'
  12. '[60,100)'

In [44]:
data$Sple_cl <- reorder(data$Sple_cl,new.order=c('[10,20)',
                                                 '[20,40)',
                                                 '[40,60)',
                                                 '[60,100)',
                                                 '[100,200)',
                                                 '[200,400)',
                                                 '[400,1e+03)',
                                                 '[1e+03,2e+03)',
                                                 '[2e+03,1e+04)',
                                                 '[1e+04,2e+04)',
                                                 '[2e+04,1e+05)',
                                                 '[1e+05,2e+05)'
                                                 ))

In [47]:
barplot(table(data$Sple_cl))



In [ ]:


In [ ]:


In [6]:
DRY <- do.call('cbind',tapply(regs,data$year,function(x){table(unlist(x))}))

In [7]:
DRY <- DRY[order(apply(DRY,1,sum)),]

In [8]:
DRY


2006200720082009201020112012201320142015
Sub-Saharian Africa222227199239242367368394344256
Latin America and Caribbean377414481506555767710716578494
South Asia281335417592794911989811576537
Central Europe, Eastern Europe, and Central Asia734823848805739894812786586491
North Africa and Middle East 223 328 503 769109312901873190415911628
Southeast Asia, East Asia and Oceania 429 561 822 955106013441407164016011761
High-income7244774785359134916094979587957878417966

In [9]:
barplot(DRY[rownames(DRY)!="High-income",])


2- Estimation of number RCTs per region and disease

For each disease, we simulate what would have been the mapping of RCTs within regions if the misclassification of RCTs towards groups of diseases was corrected, given the sensitivities and specificities of the classifier to identify each group of disease.

To estimate the performances of the classifier for each group of diseases, we dispose a test set with 2,763 trials manually classified towards the 27-class grouping of diseases used in this work. The test set is described at Atal et al. BMC Bioinformatics 2016.

The method used is based on the method presented at Fox et al. Int J Epidemiol 2005.

To do so, for each disease for which we found a local research gap we will:

  1. Calculating sensitivity and specificity of the classifier to identify the disease and other studies relevant to the burden of diseases, and the number of success and number of trials to derive beta distributions
  2. Doing N=10k times the following simulation
    • Randomly choose a sens and spec based on beta distribution for identifying the disease and identifying another disease (no correlation between sens and spec, neither between disease and another disease both)
    • Derive Positive and Negative Predictive Values (PPV and NPV) for each.
    • Simulate the correction of the classification based on PPVs and NPVs
    • Derive the proportion of RCTs concerning the disease among all RCTs concerning the burden of disease in the region
  3. Derive 95% upper-bond simulation interval of the proportion of RCTs concerning the disease among all RCTs concerning the burden of diseases

Construction of replicates


In [10]:
regs <- sort(unique(unlist(strsplit(as.character(data$Regions),"&"))))
LR <- lapply(regs,function(x){1:nrow(data)%in%grep(x,data$Regions)})
LR <- do.call('cbind',LR)

In [11]:
Lgbd <- lapply(as.character(data$GBD28),function(x){as.numeric(unlist(strsplit(x,"&")))})
Lgbd <- lapply(Lgbd,function(x){x[x!=28]})

In [12]:
PERF <- read.csv('Tables/Performances_per_27disease_data.csv')

In [13]:
NK <- 10000
set.seed(7212)

In [14]:
#For all diseases, we will simulate the mapping across regions of trials concerning
#the disease or concerning other diseases
dis <- 1:27

In [15]:
#For each disease
t0 <- proc.time()

for(g in dis){

    PERF_g <- PERF[PERF$dis==g,]
    
    #which trials concern the disease
    is_dis <- sapply(Lgbd,function(x){g%in%x})
    #which trials concern another disease
    is_oth <- sapply(Lgbd,function(x){sum(setdiff(1:27,g)%in%x)>0})

    #PPV et NPVs for finding the disease
    sens_r <- PERF_g$TP_Dis
    sens_n <- PERF_g$TP_Dis + PERF_g$FN_Dis
    spec_r <- PERF_g$TN_Dis
    spec_n <- PERF_g$TN_Dis + PERF_g$FP_Dis
    sens <- rbeta(NK,sens_r+1,sens_n-sens_r+1)
    spec <- rbeta(NK,spec_r+1,spec_n-spec_r+1)

    a_dis <- sum(is_dis)
    b_dis <- N-a_dis
    As <- (a_dis-(1-spec)*N)/(sens - (1-spec))
    Bs <- N-As
    T1 <- sens*As
    T0 <- spec*Bs
    F1 <- (1-spec)*Bs
    F0 <- (1-sens)*As
    PPV_dis <- T1/(T1+F1)
    NPV_dis <- T0/(T0+F0)

    #PPV and NPVs for finding another disease
    sens_r <- PERF_g$TP_Oth
    sens_n <- PERF_g$TP_Oth + PERF_g$FN_Oth
    spec_r <- PERF_g$TN_Oth
    spec_n <- PERF_g$TN_Oth + PERF_g$FP_Oth
    sens <- rbeta(NK,sens_r+1,sens_n-sens_r+1)
    spec <- rbeta(NK,spec_r+1,spec_n-spec_r+1)

    a_oth <- sum(is_oth)
    b_oth <- N-a_oth
    As <- (a_oth-(1-spec)*N)/(sens - (1-spec))
    Bs <- N-As
    T1 <- sens*As
    T0 <- spec*Bs
    F1 <- (1-spec)*Bs
    F0 <- (1-sens)*As
    PPV_oth <- T1/(T1+F1)
    NPV_oth <- T0/(T0+F0)

    #Some values of sens and spec may lead to impossible values of PPV or NPV (>1 or <0)
    #We supress and count them. If the total of suppressed iterations is higher than 10% of total iterations we
    #will modify the distributions for Specificity and Sensitivity
    false_it <- PPV_dis<0 | PPV_dis>1 | 
                NPV_dis<0 | NPV_dis>1 | 
                PPV_oth<0 | PPV_oth>1 | 
                NPV_oth<0 | NPV_oth>1
    
    print(paste(c(g,"has",sum(false_it),"suppressed false iterations"
                                                    ),collapse=" "))  

    PPV_dis <- PPV_dis[!false_it]
    NPV_dis <- NPV_dis[!false_it]
    PPV_oth <- PPV_oth[!false_it]
    NPV_oth <- NPV_oth[!false_it]

    L <- list()
    #Simulation: reclassifying each trial
        for(k in 1:length(PPV_dis)){

            AR <- matrix(0, nrow=length(regs)+1, ncol=2)
            tp_dis <- runif(a_dis)
            tn_dis <- runif(b_dis)
            recl_dis <- is_dis
            recl_dis[recl_dis==TRUE][tp_dis>PPV_dis[k]] <- FALSE
            recl_dis[recl_dis==FALSE][tn_dis>NPV_dis[k]] <- TRUE
            #Rq: we count all trials (even those with more than 3 diseases)
            #it is a conservative choice
            rt <- as.numeric(recl_dis)

            if(sum(recl_dis)==0) AR[,1] <- c(rep(0,length(regs)+1))
            else{   if(sum(recl_dis)==1) AR[,1] <- c(as.numeric(LR[recl_dis,]),1)
                    else AR[,1] <- c(apply(LR[recl_dis,],2,sum),sum(recl_dis))
            }
                
            #Oth_dis
            tp_oth <- runif(a_oth)
            tn_oth <- runif(b_oth)
            recl_oth <- is_oth
            recl_oth[recl_oth==TRUE][tp_oth>PPV_oth[k]] <- FALSE
            recl_oth[recl_oth==FALSE][tn_oth>NPV_oth[k]] <- TRUE
            rt <- rt + as.numeric(recl_oth)

            if(sum(rt)==0) AR[,2] <- c(rep(0,length(regs)+1))
            else{    if(sum(rt)==1) AR[,2] <- c(as.numeric(LR[rt!=0,]),1)
                     else AR[,2] <- c(apply(LR[rt!=0,],2,sum),sum(rt))
            }

            L[[k]] <- AR

        }
   
    T <- do.call('rbind',L)
    write.table(T,paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Total_simulation_",as.character(PERF_g$dis),".txt"),collapse=""))

}

                
t1 <- proc.time()
    
print(t1-t0)/60


[1] "1 has 90 suppressed false iterations"
[1] "2 has 0 suppressed false iterations"
[1] "3 has 0 suppressed false iterations"
[1] "4 has 3 suppressed false iterations"
[1] "5 has 0 suppressed false iterations"
[1] "6 has 0 suppressed false iterations"
[1] "7 has 5 suppressed false iterations"
[1] "8 has 0 suppressed false iterations"
[1] "9 has 1019 suppressed false iterations"
[1] "10 has 0 suppressed false iterations"
[1] "11 has 3108 suppressed false iterations"
[1] "12 has 0 suppressed false iterations"
[1] "13 has 0 suppressed false iterations"
[1] "14 has 0 suppressed false iterations"
[1] "15 has 3 suppressed false iterations"
[1] "16 has 0 suppressed false iterations"
[1] "17 has 0 suppressed false iterations"
[1] "18 has 0 suppressed false iterations"
[1] "19 has 0 suppressed false iterations"
[1] "20 has 0 suppressed false iterations"
[1] "21 has 3355 suppressed false iterations"
[1] "22 has 0 suppressed false iterations"
[1] "23 has 1324 suppressed false iterations"
[1] "24 has 0 suppressed false iterations"
[1] "25 has 0 suppressed false iterations"
[1] "26 has 0 suppressed false iterations"
[1] "27 has 9347 suppressed false iterations"
     user    system   elapsed 
17872.129    19.203 17911.586 
     user    system   elapsed 
297.86882   0.32005 298.52643 

It took 5h


In [18]:
# Diseases with more than 10% of suppressed iterations:
Mgbd$cause_name[c(9,11,21,23,27)]


  1. Sexually transmitted diseases excluding HIV
  2. Leprosy
  3. Hemoglobinopathies and hemolytic anemias
  4. Congenital anomalies
  5. Sudden infant death syndrome

We will re-simulate only for diseases corresponding to more than 1% of local burden in a region


In [37]:
set.seed(7212)
g <- 0
#For all diseases, we estimate number of RCTs relevant to the burden
PERF_g <- PERF[PERF$dis==0,]
    
    #which trials are relevant to the burden
    is_dis <- sapply(Lgbd,length)==1

    #PPV et NPVs for finding the disease
    sens_r <- PERF_g$TP_Dis
    sens_n <- PERF_g$TP_Dis + PERF_g$FN_Dis
    spec_r <- PERF_g$TN_Dis
    spec_n <- PERF_g$TN_Dis + PERF_g$FP_Dis
    sens <- rbeta(NK,sens_r+1,sens_n-sens_r+1)
    spec <- rbeta(NK,spec_r+1,spec_n-spec_r+1)

    a_dis <- sum(is_dis)
    b_dis <- N-a_dis
    As <- (a_dis-(1-spec)*N)/(sens - (1-spec))
    Bs <- N-As
    T1 <- sens*As
    T0 <- spec*Bs
    F1 <- (1-spec)*Bs
    F0 <- (1-sens)*As
    PPV_dis <- T1/(T1+F1)
    NPV_dis <- T0/(T0+F0)

false_it <- PPV_dis<0 | PPV_dis>1 | 
                NPV_dis<0 | NPV_dis>1
    
    print(paste(c(g,"has",sum(false_it),"suppressed false iterations"
                                                    ),collapse=" "))  

    PPV_dis <- PPV_dis[!false_it]
    NPV_dis <- NPV_dis[!false_it]

L <- data.frame()
    #Simulation: reclassifying each trial
        for(k in 1:length(PPV_dis)){

            AR <- rep(0,length(regs)+1)
            tp_dis <- runif(a_dis)
            tn_dis <- runif(b_dis)
            recl_dis <- is_dis
            recl_dis[recl_dis==TRUE][tp_dis>PPV_dis[k]] <- FALSE
            recl_dis[recl_dis==FALSE][tn_dis>NPV_dis[k]] <- TRUE

            if(sum(recl_dis)==0) AR <- c(rep(0,length(regs)+1))
            else{   if(sum(recl_dis)==1) AR <- c(as.numeric(LR[recl_dis,]),1)
                    else AR <- c(apply(LR[recl_dis,],2,sum),sum(recl_dis))
            }
                
            L <- rbind(L,AR)

        }

    write.table(L,paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Total_simulation_",as.character(PERF_g$dis),".txt"),collapse=""))
t1 <- proc.time()-t0
t1/60


[1] "0 has 0 suppressed false iterations"
        user       system      elapsed 
 320.5417167    0.3311833 1140.9930833 

Deriving 95% uncertainty intervals


In [38]:
SM <- data.frame(Region = rep(c(regs,"All"),each=nrow(Mgbd)+1),
                Disease = rep(c(as.character(Mgbd$cause_name),"All"),times=length(regs)+1))

In [39]:
SM$SimMn_NbRCTs <- NA
SM$SimMed_NbRCTs <- NA
SM$Sim95low_NbRCTs <- NA
SM$Sim95up_NbRCTs <- NA
SM$SimMn_PrRCTs <- NA
SM$SimMed_PrRCTs <- NA
SM$Sim95low_PrRCTs <- NA
SM$Sim95up_PrRCTs <- NA

In [40]:
for(g in dis){

    T <- tryCatch(read.table(paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Total_simulation_",
                            as.character(g),".txt"),collapse="")),error=NULL)

    if(length(T)!=0){

        #Mean, median and 95% uncertainty interval for the number of RCTs
        M <- matrix(T[,1],ncol=8,byrow=TRUE)
        SM$Sim95up_NbRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.975)})
        SM$Sim95low_NbRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.025)})
        SM$SimMed_NbRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.5)})
        SM$SimMn_NbRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,mean)

        #Mean and 95% upper-bound proportion of RCTs by simulation
        M <- matrix(T[,1]/T[,2],ncol=8,byrow=TRUE)
        SM$Sim95up_PrRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.975)})
        SM$Sim95low_PrRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.025)})
        SM$SimMed_PrRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,function(x){quantile(x,0.5)})
        SM$SimMn_PrRCTs[SM$Disease==as.character(Mgbd$cause_name[g])] <- apply(M,2,mean)

       
    }   
}

#All diseases
g <- 0
    T <- tryCatch(read.table(paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Total_simulation_",
                            as.character(g),".txt"),collapse="")),error=NULL)

        SM$Sim95up_NbRCTs[SM$Disease=="All"] <- apply(T,2,function(x){quantile(x,0.975)})
        SM$Sim95low_NbRCTs[SM$Disease=="All"] <- apply(T,2,function(x){quantile(x,0.025)})
        SM$SimMed_NbRCTs[SM$Disease=="All"] <- apply(T,2,function(x){quantile(x,0.5)})
        SM$SimMn_NbRCTs[SM$Disease=="All"] <- apply(T,2,mean)

In [43]:
SM[SM$Dis=="All",]


RegionDiseaseSimMn_NbRCTsSimMed_NbRCTsSim95low_NbRCTsSim95up_NbRCTsSimMn_PrRCTsSimMed_PrRCTsSim95low_PrRCTsSim95up_PrRCTs
29Central Europe, Eastern Europe, and Central AsiaAll 6000.2223 6005 5767.975 6206 NA NA NA NA
58High-incomeAll 60599.9094 60631 58034.95 62973.075 NA NA NA NA
87Latin America and CaribbeanAll 4132.2241 4135 3960 4288 NA NA NA NA
116North Africa and Middle EastAll 7860.4472 7863 7520.975 8176 NA NA NA NA
145South AsiaAll 4339.9904 4343 4145 4517 NA NA NA NA
174Southeast Asia, East Asia and OceaniaAll 8442.5907 8448 8087 8763 NA NA NA NA
203Sub-Saharian AfricaAll 2211.0041 2213 2119 2294 NA NA NA NA
232All All 82137.495782179 78661.775 85358.2 NA NA NA NA

In [44]:
write.table(SM,'Data/Simulations_Alldis_NbProp_MedMn95Int_RCTs.txt')

In [ ]: