In [1]:
library(gdata)
library(data.table)


gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.

Attaching package: ‘gdata’

The following object is masked from ‘package:stats’:

    nobs

The following object is masked from ‘package:utils’:

    object.size

The following object is masked from ‘package:base’:

    startsWith


Attaching package: ‘data.table’

The following objects are masked from ‘package:gdata’:

    first, last


In [2]:
data <- read.table("/media/igna/Elements/HotelDieu/Cochrane/MappingRCTs_vs_Burden/database_RCTs_regions_27diseases.txt")
nrow(data)


117180

In [3]:
table(is.na(data$Sample))
sum(data$Sample,na.rm=TRUE)
max(data$Sample,na.rm=TRUE)


 FALSE   TRUE 
109211   7969 
77618020
20121212

In [4]:
#Shrinking sample sizes
table(data$Sample < 10)
table(data$Sample > 200000)
sum(data$Sample[data$Sample>=10 & data$Sample<=200000],na.rm=TRUE)
table(is.na(data$Sample) | data$Sample<10 | data$Sample > 200000)


 FALSE   TRUE 
107582   1629 
 FALSE   TRUE 
109187     24 
43952638
 FALSE   TRUE 
107558   9622 

In [5]:
data <- data[!is.na(data$Sample) & data$Sample<=200000 & data$Sample>=10,]

In [6]:
#Distribution sample sizes
summary(data$Sample)


    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    10.0     48.0    100.0    408.6    220.0 200000.0 

In [7]:
spl_qt <- c(0,10,20,40,60,100,200,400,1000,2000,10000,20000,100000,200000)

data$Sple_cl <- cut(data$Sample,spl_qt,right=F)
data$Sple_cl <- as.character(data$Sple_cl)
data$Sple_cl <- as.factor(data$Sple_cl)
levels(data$Sple_cl)


  1. '[10,20)'
  2. '[100,200)'
  3. '[1e+03,2e+03)'
  4. '[1e+04,2e+04)'
  5. '[1e+05,2e+05)'
  6. '[20,40)'
  7. '[200,400)'
  8. '[2e+03,1e+04)'
  9. '[2e+04,1e+05)'
  10. '[40,60)'
  11. '[400,1e+03)'
  12. '[60,100)'

In [8]:
levels(data$Sple_cl) <- c(levels(data$Sple_cl),'[0,10)')

In [9]:
table(data$Sple_cl)


      [10,20)     [100,200) [1e+03,2e+03) [1e+04,2e+04) [1e+05,2e+05) 
         4598         22308          2861           270            31 
      [20,40)     [200,400) [2e+03,1e+04) [2e+04,1e+05)       [40,60) 
        14304         15898          1968           212         14124 
  [400,1e+03)      [60,100)        [0,10) 
        10348         20633             0 

In [10]:
data$Sple_cl <- reorder(data$Sple_cl,new.order=c('[0,10)',
                                                 '[10,20)',
                                                 '[20,40)',
                                                 '[40,60)',
                                                 '[60,100)',
                                                 '[100,200)',
                                                 '[200,400)',
                                                 '[400,1e+03)',
                                                 '[1e+03,2e+03)',
                                                 '[2e+03,1e+04)',
                                                 '[1e+04,2e+04)',
                                                 '[2e+04,1e+05)',
                                                 '[1e+05,2e+05)'
                                                 ))
levels(data$Sple_cl) <- c('[0,10)',
                         '[10,20)',
                         '[20,40)',
                         '[40,60)',
                         '[60,100)',
                         '[100,200)',
                         '[200,400)',
                         '[400,1e3)',
                         '[1e3,2e3)',
                         '[2e3,1e4)',
                         '[1e4,2e4)',
                         '[2e4,1e5)',
                         '[1e5,2e5)'
                        )

In [11]:
table(data$Sple_cl)


   [0,10)   [10,20)   [20,40)   [40,60)  [60,100) [100,200) [200,400) [400,1e3) 
        0      4598     14304     14124     20633     22308     15898     10348 
[1e3,2e3) [2e3,1e4) [1e4,2e4) [2e4,1e5) [1e5,2e5) 
     2861      1968       270       212        31 

In [12]:
regs <- sort(unique(unlist(strsplit(as.character(data$Regions),"&"))))
LR <- lapply(regs,function(x){1:nrow(data)%in%grep(x,data$Regions)})
LR <- do.call('cbind',LR)
LR <- data.table(LR)
LR$TrialID <- data$TrialID

#Nb of patients per region per trial
#Nb countries per region per trial to distribute sample size equally across countries
nb_ctrs <- lapply(strsplit(as.character(data$Nb_ctr_per_reg),'&'),as.numeric)
RGs <-strsplit(as.character(data$Regions),'&')
pats <- data.frame(TrialID = rep(data$TrialID,sapply(nb_ctrs,length)),
                   Nb_ctrs = unlist(nb_ctrs),
                   Region = unlist(RGs),
                   Tot_sample = rep(data$Sample,sapply(nb_ctrs,length)))

pats$tot_ctrs <- rep(sapply(nb_ctrs,sum),sapply(nb_ctrs,length))
pats$sample_per_reg <- pats$Tot_sample*pats$Nb_ctrs/pats$tot_ctrs
pats <- data.table(pats)

In [13]:
pats$Sple_cl <- cut(pats$sample_per_reg,spl_qt,right=F)
pats$Sple_cl <- as.character(pats$Sple_cl)
pats$Sple_cl <- as.factor(pats$Sple_cl)

levels(pats$Sple_cl)


  1. '[0,10)'
  2. '[10,20)'
  3. '[100,200)'
  4. '[1e+03,2e+03)'
  5. '[1e+04,2e+04)'
  6. '[1e+05,2e+05)'
  7. '[20,40)'
  8. '[200,400)'
  9. '[2e+03,1e+04)'
  10. '[2e+04,1e+05)'
  11. '[40,60)'
  12. '[400,1e+03)'
  13. '[60,100)'

In [14]:
pats$Sple_cl <- reorder(pats$Sple_cl,new.order=c('[0,10)',
                                                 '[10,20)',
                                                 '[20,40)',
                                                 '[40,60)',
                                                 '[60,100)',
                                                 '[100,200)',
                                                 '[200,400)',
                                                 '[400,1e+03)',
                                                 '[1e+03,2e+03)',
                                                 '[2e+03,1e+04)',
                                                 '[1e+04,2e+04)',
                                                 '[2e+04,1e+05)',
                                                 '[1e+05,2e+05)'
                                                 ))
levels(pats$Sple_cl) <- c('[0,10)',
                         '[10,20)',
                         '[20,40)',
                         '[40,60)',
                         '[60,100)',
                         '[100,200)',
                         '[200,400)',
                         '[400,1e3)',
                         '[1e3,2e3)',
                         '[2e3,1e4)',
                         '[1e4,2e4)',
                         '[2e4,1e5)',
                         '[1e5,2e5)'
                        )

In [15]:
regs <- names(sort(table(pats$Region),decreasing=TRUE))
regs


  1. 'High-income'
  2. 'North Africa and Middle East'
  3. 'Southeast Asia, East Asia and Oceania'
  4. 'Central Europe, Eastern Europe, and Central Asia'
  5. 'South Asia'
  6. 'Latin America and Caribbean'
  7. 'Sub-Saharian Africa'

In [16]:
reg_labs <- c('High-income',
              'North Africa\nand Middle East',
              'Southeast and East Asia\nand Oceania',
              'Eastern Europe\nand Central Asia',
              'South Asia',
              'Latin America\nand Caribbean',
              'Sub-Saharian Africa')

In [17]:
options(repr.plot.width=10, repr.plot.height=5)
par(mfrow=c(2,4),mar=c(5,4.9,3,0),las=2)

barplot(100*table(data$Sple_cl)/nrow(data),ylim=c(0,35),main="Worldwide",cex.names=0.8)
mtext("Distribution of RCTs\nacross sample sizes (%)",side=2,line=2.5,las=3,cex=0.8)

#barplot(table(pats$Sple_cl)/nrow(pats),ylim=c(0,0.5))

for(r in 1:length(regs)){barplot(100*table(pats$Sple_cl[pats$Region==regs[r]])/sum(pats$Region==regs[r]),
                                                   ylim=c(0,35),main=reg_labs[r],cex.names=0.8)
                    if(r==4) mtext("Distribution of RCTs\nacross sample sizes (%)",side=2,line=2.5,las=3,cex=0.8)
                        }



In [18]:
pdf("../Figures/Sample_size_distribution_within_regions.pdf",width=10,height=5)
options(repr.plot.width=10, repr.plot.height=5)
par(mfrow=c(2,4),mar=c(5,4.9,3,0),las=2)

barplot(100*table(data$Sple_cl)/nrow(data),ylim=c(0,35),main="Worldwide",cex.names=0.8)
mtext("Distribution of RCTs\nacross sample sizes (%)",side=2,line=2.5,las=3,cex=0.8)

#barplot(table(pats$Sple_cl)/nrow(pats),ylim=c(0,0.5))

for(r in 1:length(regs)){barplot(100*table(pats$Sple_cl[pats$Region==regs[r]])/sum(pats$Region==regs[r]),
                                                   ylim=c(0,35),main=reg_labs[r],cex.names=0.8)
                    if(r==4) mtext("Distribution of RCTs\nacross sample sizes (%)",side=2,line=2.5,las=3,cex=0.8)
                        }
dev.off()


PNG: 2

In [19]:
head(pats)


TrialIDNb_ctrsRegionTot_sampletot_ctrssample_per_regSple_cl
1EUCTR2007-006175-36-FI3 High-income 105 3 105 [100,200)
2NCT005901741 High-income1175 1 1175 [1e3,2e3)
3NCT005903561 High-income532 1 532 [400,1e3)
4NCT005914871 High-income60 1 60 [60,100)
5NCT010725261 High-income20 1 20 [20,40)
6NCT005899661 High-income62 1 62 [60,100)

In [26]:
hist(log10(pats$sample_per_reg),breaks=log10(c(0.1,10,50,100,200,500,1000,5000,10000,20000,100000,200000)))



In [ ]:


In [ ]:


In [ ]:


In [ ]: