notebook.community

Edit and run



In [1]:

    
library(gdata)
library(data.table)









    



gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.

Attaching package: ‘gdata’

The following object is masked from ‘package:stats’:

    nobs

The following object is masked from ‘package:utils’:

    object.size

The following object is masked from ‘package:base’:

    startsWith


Attaching package: ‘data.table’

The following objects are masked from ‘package:gdata’:

    first, last



In [2]:

    
data <- read.table("/media/igna/Elements/HotelDieu/Cochrane/MappingRCTs_vs_Burden/database_RCTs_regions_27diseases.txt")
nrow(data)



In [3]:

    
table(is.na(data$Sample))
sum(data$Sample,na.rm=TRUE)
max(data$Sample,na.rm=TRUE)









    





 FALSE   TRUE 
109211   7969 






    




77618020






    




20121212



In [4]:

    
#Shrinking sample sizes
table(data$Sample < 10)
table(data$Sample > 200000)
sum(data$Sample[data$Sample>=10 & data$Sample<=200000],na.rm=TRUE)
table(is.na(data$Sample) | data$Sample<10 | data$Sample > 200000)









    





 FALSE   TRUE 
107582   1629 






    





 FALSE   TRUE 
109187     24 






    




43952638






    





 FALSE   TRUE 
107558   9622



In [5]:

    
data <- data[!is.na(data$Sample) & data$Sample<=200000 & data$Sample>=10,]



In [6]:

    
#Distribution sample sizes
summary(data$Sample)









    





    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    10.0     48.0    100.0    408.6    220.0 200000.0



In [7]:

    
spl_qt <- c(0,10,20,40,60,100,200,400,1000,2000,10000,20000,100000,200000)

data$Sple_cl <- cut(data$Sample,spl_qt,right=F)
data$Sple_cl <- as.character(data$Sple_cl)
data$Sple_cl <- as.factor(data$Sple_cl)
levels(data$Sple_cl)









    





	'[10,20)'
	'[100,200)'
	'[1e+03,2e+03)'
	'[1e+04,2e+04)'
	'[1e+05,2e+05)'
	'[20,40)'
	'[200,400)'
	'[2e+03,1e+04)'
	'[2e+04,1e+05)'
	'[40,60)'
	'[400,1e+03)'
	'[60,100)'



In [8]:

    
levels(data$Sple_cl) <- c(levels(data$Sple_cl),'[0,10)')



In [9]:

    
table(data$Sple_cl)









    





      [10,20)     [100,200) [1e+03,2e+03) [1e+04,2e+04) [1e+05,2e+05) 
         4598         22308          2861           270            31 
      [20,40)     [200,400) [2e+03,1e+04) [2e+04,1e+05)       [40,60) 
        14304         15898          1968           212         14124 
  [400,1e+03)      [60,100)        [0,10) 
        10348         20633             0



In [10]:

    
data$Sple_cl <- reorder(data$Sple_cl,new.order=c('[0,10)',
                                                 '[10,20)',
                                                 '[20,40)',
                                                 '[40,60)',
                                                 '[60,100)',
                                                 '[100,200)',
                                                 '[200,400)',
                                                 '[400,1e+03)',
                                                 '[1e+03,2e+03)',
                                                 '[2e+03,1e+04)',
                                                 '[1e+04,2e+04)',
                                                 '[2e+04,1e+05)',
                                                 '[1e+05,2e+05)'
                                                 ))
levels(data$Sple_cl) <- c('[0,10)',
                         '[10,20)',
                         '[20,40)',
                         '[40,60)',
                         '[60,100)',
                         '[100,200)',
                         '[200,400)',
                         '[400,1e3)',
                         '[1e3,2e3)',
                         '[2e3,1e4)',
                         '[1e4,2e4)',
                         '[2e4,1e5)',
                         '[1e5,2e5)'
                        )



In [11]:

    
table(data$Sple_cl)









    





   [0,10)   [10,20)   [20,40)   [40,60)  [60,100) [100,200) [200,400) [400,1e3) 
        0      4598     14304     14124     20633     22308     15898     10348 
[1e3,2e3) [2e3,1e4) [1e4,2e4) [2e4,1e5) [1e5,2e5) 
     2861      1968       270       212        31



In [12]:

    
regs <- sort(unique(unlist(strsplit(as.character(data$Regions),"&"))))
LR <- lapply(regs,function(x){1:nrow(data)%in%grep(x,data$Regions)})
LR <- do.call('cbind',LR)
LR <- data.table(LR)
LR$TrialID <- data$TrialID

#Nb of patients per region per trial
#Nb countries per region per trial to distribute sample size equally across countries
nb_ctrs <- lapply(strsplit(as.character(data$Nb_ctr_per_reg),'&'),as.numeric)
RGs <-strsplit(as.character(data$Regions),'&')
pats <- data.frame(TrialID = rep(data$TrialID,sapply(nb_ctrs,length)),
                   Nb_ctrs = unlist(nb_ctrs),
                   Region = unlist(RGs),
                   Tot_sample = rep(data$Sample,sapply(nb_ctrs,length)))

pats$tot_ctrs <- rep(sapply(nb_ctrs,sum),sapply(nb_ctrs,length))
pats$sample_per_reg <- pats$Tot_sample*pats$Nb_ctrs/pats$tot_ctrs
pats <- data.table(pats)



In [13]:

    
pats$Sple_cl <- cut(pats$sample_per_reg,spl_qt,right=F)
pats$Sple_cl <- as.character(pats$Sple_cl)
pats$Sple_cl <- as.factor(pats$Sple_cl)

levels(pats$Sple_cl)









    





	'[0,10)'
	'[10,20)'
	'[100,200)'
	'[1e+03,2e+03)'
	'[1e+04,2e+04)'
	'[1e+05,2e+05)'
	'[20,40)'
	'[200,400)'
	'[2e+03,1e+04)'
	'[2e+04,1e+05)'
	'[40,60)'
	'[400,1e+03)'
	'[60,100)'



In [14]:

    
pats$Sple_cl <- reorder(pats$Sple_cl,new.order=c('[0,10)',
                                                 '[10,20)',
                                                 '[20,40)',
                                                 '[40,60)',
                                                 '[60,100)',
                                                 '[100,200)',
                                                 '[200,400)',
                                                 '[400,1e+03)',
                                                 '[1e+03,2e+03)',
                                                 '[2e+03,1e+04)',
                                                 '[1e+04,2e+04)',
                                                 '[2e+04,1e+05)',
                                                 '[1e+05,2e+05)'
                                                 ))
levels(pats$Sple_cl) <- c('[0,10)',
                         '[10,20)',
                         '[20,40)',
                         '[40,60)',
                         '[60,100)',
                         '[100,200)',
                         '[200,400)',
                         '[400,1e3)',
                         '[1e3,2e3)',
                         '[2e3,1e4)',
                         '[1e4,2e4)',
                         '[2e4,1e5)',
                         '[1e5,2e5)'
                        )



In [15]:

    
regs <- names(sort(table(pats$Region),decreasing=TRUE))
regs









    





	'High-income'
	'North Africa and Middle East'
	'Southeast Asia, East Asia and Oceania'
	'Central Europe, Eastern Europe, and Central Asia'
	'South Asia'
	'Latin America and Caribbean'
	'Sub-Saharian Africa'



In [16]:

    
reg_labs <- c('High-income',
              'North Africa\nand Middle East',
              'Southeast and East Asia\nand Oceania',
              'Eastern Europe\nand Central Asia',
              'South Asia',
              'Latin America\nand Caribbean',
              'Sub-Saharian Africa')



In [17]:

    
options(repr.plot.width=10, repr.plot.height=5)
par(mfrow=c(2,4),mar=c(5,4.9,3,0),las=2)

barplot(100*table(data$Sple_cl)/nrow(data),ylim=c(0,35),main="Worldwide",cex.names=0.8)
mtext("Distribution of RCTs\nacross sample sizes (%)",side=2,line=2.5,las=3,cex=0.8)

#barplot(table(pats$Sple_cl)/nrow(pats),ylim=c(0,0.5))

for(r in 1:length(regs)){barplot(100*table(pats$Sple_cl[pats$Region==regs[r]])/sum(pats$Region==regs[r]),
                                                   ylim=c(0,35),main=reg_labs[r],cex.names=0.8)
                    if(r==4) mtext("Distribution of RCTs\nacross sample sizes (%)",side=2,line=2.5,las=3,cex=0.8)
                        }



In [18]:

    
pdf("../Figures/Sample_size_distribution_within_regions.pdf",width=10,height=5)
options(repr.plot.width=10, repr.plot.height=5)
par(mfrow=c(2,4),mar=c(5,4.9,3,0),las=2)

barplot(100*table(data$Sple_cl)/nrow(data),ylim=c(0,35),main="Worldwide",cex.names=0.8)
mtext("Distribution of RCTs\nacross sample sizes (%)",side=2,line=2.5,las=3,cex=0.8)

#barplot(table(pats$Sple_cl)/nrow(pats),ylim=c(0,0.5))

for(r in 1:length(regs)){barplot(100*table(pats$Sple_cl[pats$Region==regs[r]])/sum(pats$Region==regs[r]),
                                                   ylim=c(0,35),main=reg_labs[r],cex.names=0.8)
                    if(r==4) mtext("Distribution of RCTs\nacross sample sizes (%)",side=2,line=2.5,las=3,cex=0.8)
                        }
dev.off()









    




PNG: 2



In [19]:

    
head(pats)









    





TrialID Nb_ctrs Region Tot_sample tot_ctrs sample_per_reg Sple_cl

	1 EUCTR2007-006175-36-FI 3                     High-income           105                   3                     105                   [100,200)             
	2 NCT00590174 1          High-income 1175       1          1175       [1e3,2e3)  
	3 NCT00590356 1          High-income 532        1          532        [400,1e3)  
	4 NCT00591487 1          High-income 60         1          60         [60,100)   
	5 NCT01072526 1          High-income 20         1          20         [20,40)    
	6 NCT00589966 1          High-income 62         1          62         [60,100)



In [26]:

    
hist(log10(pats$sample_per_reg),breaks=log10(c(0.1,10,50,100,200,500,1000,5000,10000,20000,100000,200000)))



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	TrialID	Nb_ctrs	Region	Tot_sample	tot_ctrs	sample_per_reg	Sple_cl
1	EUCTR2007-006175-36-FI	3	High-income	105	3	105	[100,200)
2	NCT00590174	1	High-income	1175	1	1175	[1e3,2e3)
3	NCT00590356	1	High-income	532	1	532	[400,1e3)
4	NCT00591487	1	High-income	60	1	60	[60,100)
5	NCT01072526	1	High-income	20	1	20	[20,40)
6	NCT00589966	1	High-income	62	1	62	[60,100)