Get file information
In [1]:
# Locate the episode text files for 2010-2013 in the release directory and
# show the size and modification time of each one.
path <- "//CHSE/DataRepository/APAC/Raw/LimitedDatasets/Release20150902"
all_files <- list.files(path)
# Case-insensitive match on "episodes...201[0-3].txt" in the bare file name.
is_episode_file <- grepl("episodes.*201[0-3]\\.txt", all_files, ignore.case = TRUE)
# `f` holds the full paths of the matching files; later cells read from it.
f <- file.path(path, all_files[is_episode_file])
file.info(f)[c("size", "mtime")]
Out[1]:
Specify file characteristics
In [2]:
library(data.table)
# Read only the header of the first file (nrows = 0) to get the column names.
varnames <- names(fread(f[1], nrows = 0))
# Label each column name with its 1-based position so that the numeric
# indices in `select` can be looked up by eye. seq_along() is used instead of
# 1:length() so an empty header does not produce the sequence c(1, 0).
names(varnames) <- seq_along(varnames)
# Positions (within `varnames`) of the columns to keep when reading the files.
select <- c(7, 13, 14, 15, 16, 23, 24, 35, 36:48, 62:74, 75:79, 98)
# Read every column as character by default, except the identifier / year
# columns listed below, which are read as integer.
colClasses <- rep("character", length(varnames))
colClasses[varnames %in% c("patid", "personkey", "yob", "megnum")] <- "integer"
names(colClasses) <- varnames
# Strings in the raw files that are to be interpreted as missing values.
na.strings <- c("NA", "*N", "*NU", "*NUL", "*NULL", "*NULL*", "")
Parallelize the file reads
In [3]:
library("doParallel")
# Read the files in parallel, one worker per file, then stack the results
# into a single data.table `D`.
years <- length(f)
# makeCluster(0) and an empty index sequence are both errors waiting to
# happen, so fail early with a clear message when no input files were found.
if (years < 1L) {
  stop("No input files found in `f`; nothing to read.", call. = FALSE)
}
cl <- makeCluster(years)
registerDoParallel(cl, cores = years)
# tryCatch(..., finally =) guarantees the worker processes are shut down even
# if one of the reads fails.
L <- tryCatch(
  # .packages attaches data.table on every worker and errors if it is
  # unavailable (require() would only return FALSE and carry on).
  foreach(i = seq_len(years), .packages = "data.table") %dopar% {
    # NOTE: nrows = 1e5 limits each read to the first 100,000 rows of the file.
    Di <- fread(f[i], nrows = 1e5, select = select, colClasses = colClasses,
                na.strings = na.strings)
    # Keep records whose gender is not "M" and whose year of birth falls in a
    # window that shifts forward by one year per file (1955-2000 for the first
    # file). NOTE(review): this assumes `f` is ordered by year -- confirm.
    Di <- Di[gender != "M" & between(yob, 1955 + (i - 1), 2000 + (i - 1))]
    # Convert the character date columns to Date.
    Di <- Di[,
             `:=` (fromdate = as.Date(fromdate),
                   todate = as.Date(todate))]
    Di
  },
  finally = stopCluster(cl)
)
D <- rbindlist(L)
# Sanity check: row counts and year-of-birth range by episode year and gender.
D[, .(.N, minYOB = min(yob), maxYOB = max(yob)), .(year(fromdate), gender)]
Out[3]:
Write to Stata .dta file
In [4]:
library(haven)
# Export the combined data.table `D` as a Stata .dta file, then report the
# size of the file on disk and of the object in memory.
path <- "F:/OD11ObGynHardStop/Data"
f <- file.path(path, "APAC_episodes_childbearingwomen.dta")
write_dta(D, f)
# Size and modification time of the file that was just written.
file.info(f)[c("size", "mtime")]
# In-memory footprint of D, expressed in GB (bytes / 1e9).
size_gb <- object.size(D) / 1e9
message(sprintf("Size of data.table() object D is %.3g GB", size_gb))
# Column-by-column structure of the exported table.
str(D)
Out[4]: