In [1]:
library(gdata)
library(data.table)
In [2]:
# Read the RCT database from disk and report its number of rows.
data <- read.table(
  file = "/media/igna/Elements/HotelDieu/Cochrane/MappingRCTs_vs_Burden/database_RCTs_regions_27diseases.txt"
)
nrow(data)
In [3]:
# How many rows lack a sample size, and the total / largest reported value.
table(is.na(data$Sample))
sum(data$Sample, na.rm = TRUE)
max(data$Sample, na.rm = TRUE)
In [4]:
# Trimming sample sizes: count rows below 10 and above 200000.
table(data$Sample < 10)
table(data$Sample > 200000)
# Sum of Sample over rows inside the [10, 200000] window
# (which() skips rows where Sample is NA).
sum(data$Sample[which(data$Sample >= 10 & data$Sample <= 200000)])
# Rows that fall outside the window or have no sample size at all.
table(!(!is.na(data$Sample) & data$Sample >= 10 & data$Sample <= 200000))
In [5]:
# Keep rows whose Sample is known and lies in [10, 200000];
# which() discards both FALSE and NA comparisons.
data <- data[which(data$Sample >= 10 & data$Sample <= 200000), ]
In [6]:
# Distribution of the retained sample sizes (quartiles, mean, extremes).
summary(object = data$Sample)
In [7]:
# Bin edges of the sample-size classes (left-closed, right-open intervals).
spl_qt <- c(0, 10, 20, 40, 60, 100, 200, 400, 1000, 2000,
            10000, 20000, 100000, 200000)
# Bin Sample, then rebuild the factor from its character labels so that only
# the classes actually present remain as levels.
data$Sple_cl <- factor(
  as.character(cut(data$Sample, breaks = spl_qt, right = FALSE))
)
levels(data$Sple_cl)
In [8]:
# Append the '[0,10)' class to the set of levels so it appears in tabulations.
levels(data$Sple_cl) <- append(levels(data$Sple_cl), '[0,10)')
In [9]:
# Row counts per sample-size class (NA classes are not counted).
table(data$Sple_cl, useNA = "no")
In [10]:
# Assign every row of `data` to its sample-size class, with the 13 classes in
# increasing order and compact labels.
# The classes are computed directly from data$Sample with explicit labels
# rather than by reordering and positionally renaming the levels generated by
# cut(); the level set and order are therefore fixed whatever the data contain.
# include.lowest = TRUE closes the last interval on the right, so a Sample of
# exactly 200000 (the upper edge of spl_qt) falls in the top class instead of
# becoming NA; the top label is written '[1e5,2e5]' accordingly.
data$Sple_cl <- cut(
  data$Sample,
  breaks = spl_qt,
  right = FALSE,
  include.lowest = TRUE,
  labels = c('[0,10)',
             '[10,20)',
             '[20,40)',
             '[40,60)',
             '[60,100)',
             '[100,200)',
             '[200,400)',
             '[400,1e3)',
             '[1e3,2e3)',
             '[2e3,1e4)',
             '[1e4,2e4)',
             '[2e4,1e5)',
             '[1e5,2e5]')
)
In [11]:
# Row counts per sample-size class after ordering (NA classes are not counted).
table(data$Sple_cl, useNA = "no")
In [12]:
# Split the '&'-separated region list and per-region country counts of each row.
RGs <- strsplit(as.character(data$Regions), "&", fixed = TRUE)
nb_ctrs <- lapply(
  strsplit(as.character(data$Nb_ctr_per_reg), "&", fixed = TRUE),
  as.numeric
)

# Both columns must list the same number of entries per row; otherwise the
# long table built below would pair country counts with the wrong region.
n_regs <- lengths(RGs)
if (!identical(n_regs, lengths(nb_ctrs))) {
  stop("Regions and Nb_ctr_per_reg have different numbers of entries ",
       "for at least one row", call. = FALSE)
}

# All distinct region names, sorted.
regs <- sort(unique(unlist(RGs)))

# One logical column per region (in the order of `regs`): TRUE when the row
# lists that region. Membership is tested on the split names, so a region whose
# name is contained in another one (or contains regex metacharacters) is not
# matched by mistake.
LR <- lapply(regs, function(reg) {
  vapply(RGs, function(trial_regs) reg %in% trial_regs, logical(1))
})
LR <- do.call('cbind', LR)
LR <- data.table(LR)
LR$TrialID <- data$TrialID

# Long table with one row per (trial, region) pair.
# The trial's sample is distributed across its regions proportionally to the
# number of countries in each region (i.e. equally across countries).
pats <- data.frame(TrialID = rep(data$TrialID, n_regs),
                   Nb_ctrs = unlist(nb_ctrs),
                   Region = unlist(RGs),
                   Tot_sample = rep(data$Sample, n_regs))
pats$tot_ctrs <- rep(vapply(nb_ctrs, sum, numeric(1)), n_regs)
pats$sample_per_reg <- pats$Tot_sample * pats$Nb_ctrs / pats$tot_ctrs
pats <- data.table(pats)
In [13]:
# Bin the per-region sample sizes with the same edges as the trial-level ones,
# then rebuild the factor from its labels so only observed classes are levels.
pats$Sple_cl <- factor(
  as.character(cut(pats$sample_per_reg, breaks = spl_qt, right = FALSE))
)
levels(pats$Sple_cl)
In [14]:
# Assign every row of `pats` to its sample-size class, with the 13 classes in
# increasing order and compact labels.
# The classes are computed directly from pats$sample_per_reg with explicit
# labels rather than by reordering and positionally renaming the levels
# generated by cut(); the level set and order are therefore fixed whatever the
# data contain.
# include.lowest = TRUE closes the last interval on the right, so a value of
# exactly 200000 (the upper edge of spl_qt) falls in the top class instead of
# becoming NA; the top label is written '[1e5,2e5]' accordingly.
pats$Sple_cl <- cut(
  pats$sample_per_reg,
  breaks = spl_qt,
  right = FALSE,
  include.lowest = TRUE,
  labels = c('[0,10)',
             '[10,20)',
             '[20,40)',
             '[40,60)',
             '[60,100)',
             '[100,200)',
             '[200,400)',
             '[400,1e3)',
             '[1e3,2e3)',
             '[2e3,1e4)',
             '[1e4,2e4)',
             '[2e4,1e5)',
             '[1e5,2e5]')
)
In [15]:
# Region names ordered from the most to the least frequent in `pats`.
regs <- local({
  region_counts <- table(pats$Region)
  names(region_counts[order(region_counts, decreasing = TRUE)])
})
regs
In [16]:
# Display labels for the regional panels (with line breaks for long names).
# NOTE(review): the plotting loops use reg_labs[r] as the title for regs[r],
# so this order is assumed to match `regs` (regions by decreasing frequency)
# -- confirm against the printed value of `regs`.
reg_labs <- c('High-income',
              'North Africa\nand Middle East',
              'Southeast and East Asia\nand Oceania',
              'Eastern Europe\nand Central Asia',
              'South Asia',
              'Latin America\nand Caribbean',
              'Sub-Saharan Africa')
In [17]:
# Draw a 2 x 4 grid of bar charts: the first panel is the distribution of rows
# of `data` across sample-size classes, followed by one panel per region
# computed from `pats`.
options(repr.plot.width = 10, repr.plot.height = 5)
par(mfrow = c(2, 4), mar = c(5, 4.9, 3, 0), las = 2)
barplot(100 * table(data$Sple_cl) / nrow(data),
        ylim = c(0, 35), main = "Worldwide", cex.names = 0.8)
mtext("Distribution of RCTs\nacross sample sizes (%)",
      side = 2, line = 2.5, las = 3, cex = 0.8)
#barplot(table(pats$Sple_cl)/nrow(pats),ylim=c(0,0.5))
# seq_along() performs no iteration when `regs` is empty (1:length() would
# iterate over 1 and 0).
for (r in seq_along(regs)) {
  in_region <- pats$Region == regs[r]
  barplot(100 * table(pats$Sple_cl[in_region]) / sum(in_region),
          ylim = c(0, 35), main = reg_labs[r], cex.names = 0.8)
  # The 4th regional panel is the first one of the second row: repeat the
  # y-axis title there.
  if (r == 4) {
    mtext("Distribution of RCTs\nacross sample sizes (%)",
          side = 2, line = 2.5, las = 3, cex = 0.8)
  }
}
In [18]:
# Write the same 2 x 4 grid of bar charts (first panel from `data`, then one
# panel per region from `pats`) to a PDF file.
pdf("../Figures/Sample_size_distribution_within_regions.pdf", width = 10, height = 5)
# The plotting code runs inside tryCatch() so that the PDF device is closed
# even if one of the plotting calls fails.
invisible(tryCatch({
  options(repr.plot.width = 10, repr.plot.height = 5)
  par(mfrow = c(2, 4), mar = c(5, 4.9, 3, 0), las = 2)
  barplot(100 * table(data$Sple_cl) / nrow(data),
          ylim = c(0, 35), main = "Worldwide", cex.names = 0.8)
  mtext("Distribution of RCTs\nacross sample sizes (%)",
        side = 2, line = 2.5, las = 3, cex = 0.8)
  #barplot(table(pats$Sple_cl)/nrow(pats),ylim=c(0,0.5))
  # seq_along() performs no iteration when `regs` is empty.
  for (r in seq_along(regs)) {
    in_region <- pats$Region == regs[r]
    barplot(100 * table(pats$Sple_cl[in_region]) / sum(in_region),
            ylim = c(0, 35), main = reg_labs[r], cex.names = 0.8)
    # The 4th regional panel starts the second row: repeat the y-axis title.
    if (r == 4) {
      mtext("Distribution of RCTs\nacross sample sizes (%)",
            side = 2, line = 2.5, las = 3, cex = 0.8)
    }
  }
}, finally = dev.off()))
In [19]:
# Preview the first six rows of the per-region table.
head(pats, n = 6L)
In [26]:
# Histogram of the per-region sample sizes on a log10 scale, with hand-picked
# (unequal-width) bin edges expressed on the original scale.
hist(
  log10(pats$sample_per_reg),
  breaks = log10(c(0.1, 10, 50, 100, 200, 500, 1000, 5000,
                   10000, 20000, 100000, 200000))
)
In [ ]:
In [ ]:
In [ ]: