Reproduce Ifeoma's APAC_shrink_append.do script

Get file information


In [1]:
# Locate the yearly episode extracts (2010-2013) in the raw data release
# and report the size and modification time of each file found.
path <- "//CHSE/DataRepository/APAC/Raw/LimitedDatasets/Release20150902"
f <- list.files(
    path,
    pattern = "episodes.*201[0-3]\\.txt",
    ignore.case = TRUE,
    full.names = TRUE
)
file.info(f)[, c("size", "mtime")]


Out[1]:
                                                                                         size         mtime
//CHSE/DataRepository/APAC/Raw/LimitedDatasets/Release20150902/Mcconn_episodes2010.txt   30869374063  2015-08-31 15:27:02
//CHSE/DataRepository/APAC/Raw/LimitedDatasets/Release20150902/Mcconn_episodes2011.txt   43410281308  2015-08-31 16:30:25
//CHSE/DataRepository/APAC/Raw/LimitedDatasets/Release20150902/Mcconn_episodes2012.txt   45270624481  2015-08-31 18:58:27
//CHSE/DataRepository/APAC/Raw/LimitedDatasets/Release20150902/Mcconn_episodes2013.txt   45723795636  2015-09-01 14:35:55

Specify file characteristics

  • Column names
  • Column classes
  • Columns to select
  • NA strings

In [2]:
library(data.table)

# Read only the header row of the first file to obtain the column names.
varnames <- names(fread(f[1], nrows = 0))
# Label each column name with its position so the indices in `select`
# can be looked up by eye; seq_along() stays correct for an empty header.
names(varnames) <- seq_along(varnames)

# Column positions to keep. Per the str() output of the combined result
# these are: payer/patient/demographic/date/hcg fields, dx1-dx13,
# px1-px13, proccode and mod1-mod4, and megnum.
select <- c(7, 13, 14, 15, 16, 23, 24, 35, 36:48, 62:74, 75:79, 98)

# Everything is read as character except the listed identifier/year
# columns, which are read as integer.
colClasses <- rep("character", length(varnames))
colClasses[varnames %in% c("patid", "personkey", "yob", "megnum")] <- "integer"
names(colClasses) <- varnames

# Strings to treat as missing: "NA", the empty string, and "*NULL*"
# together with its truncated prefixes.
na.strings <- c("NA", "*N", "*NU", "*NUL", "*NULL", "*NULL*", "")

Parallelize the file reads


In [3]:
library("doParallel")

# Read the yearly files in parallel, one worker per file, then stack the
# filtered results into a single data.table `D`.
years <- length(f)
stopifnot(years > 0)  # fail early with a clear error if no files were found
cl <- makeCluster(years)
# `cores` is not needed when an explicit cluster object is registered.
registerDoParallel(cl)
L <- tryCatch(
    # .packages loads data.table on every worker and errors if it is
    # unavailable, unlike require(), which only returns FALSE.
    foreach(i = seq_len(years), .packages = "data.table") %dopar% {
        # NOTE(review): nrows = 1e5 reads only the first 100,000 rows of each
        # file -- looks like a trial-run cap; confirm before a full run.
        Di <- fread(f[i], nrows = 1e5, select = select,
                    colClasses = colClasses, na.strings = na.strings)
        # Keep non-male records whose year of birth lies in a 1955-2000
        # window that shifts forward by one year for each successive file.
        # Assumes f[i] is ordered by year starting at 2010 -- TODO confirm.
        Di <- Di[gender != "M" & between(yob, 1955 + (i - 1), 2000 + (i - 1))]
        # Convert the date strings to Date class.
        Di <- Di[,
                 `:=` (fromdate = as.Date(fromdate),
                       todate = as.Date(todate))]
        Di
    },
    # Always release the worker processes, even if a read fails.
    finally = stopCluster(cl)
)
D <- rbindlist(L)
# Sanity check: row count and year-of-birth range by service year and gender.
D[, .(.N, minYOB = min(yob), maxYOB = max(yob)), .(year(fromdate), gender)]


Loading required package: foreach
Loading required package: iterators
Loading required package: parallel
Out[3]:
   year  gender  N      minYOB  maxYOB
1  2010  F       31257  1955    2000
2  2011  F        7605  1956    2001
3  2012  F       24186  1957    2002
4  2013  F       12366  1958    2003

Write to Stata .dta file


In [4]:
library(haven)

# Save the combined data.table as a Stata file, then report the size of the
# file on disk, the in-memory size of D, and the structure of D.
path <- "F:/OD11ObGynHardStop/Data"
f <- file.path(path, "APAC_episodes_childbearingwomen.dta")
write_dta(D, f)
file.info(f)[, c("size", "mtime")]
message(
    sprintf(
        "Size of data.table() object D is %.3g GB",
        as.numeric(object.size(D)) / 1e9
    )
)
str(D)


Out[4]:
                                                                 size      mtime
F:/OD11ObGynHardStop/Data/APAC_episodes_childbearingwomen.dta   11997422  2016-05-20 06:50:15
Size of data.table() object D is 0.0236 GB
Classes 'data.table' and 'data.frame':	75414 obs. of  40 variables:
 $ apac_payer: chr  "MCO" "MCO" "MCO" "MCO" ...
 $ patid     : int  2336687 3998961 3998961 3998961 3998961 4107598 3024345 1911465 3998961 1959472 ...
 $ personkey : int  7892947 7251101 7251101 7251101 7251101 1798306 4164607 7258740 7251101 7243618 ...
 $ gender    : chr  "F" "F" "F" "F" ...
 $ yob       : int  1999 1986 1986 1986 1986 1994 1985 2000 1986 1963 ...
 $ fromdate  : Date, format: "2010-01-07" "2010-04-20" ...
 $ todate    : Date, format: "2010-01-07" "2010-04-20" ...
 $ hcg       : chr  "P99c" "P99c" "P99c" "P99c" ...
 $ dx1       : chr  NA NA NA NA ...
 $ dx2       : chr  NA NA NA NA ...
 $ dx3       : chr  NA NA NA NA ...
 $ dx4       : chr  NA NA NA NA ...
 $ dx5       : chr  NA NA NA NA ...
 $ dx6       : chr  NA NA NA NA ...
 $ dx7       : chr  NA NA NA NA ...
 $ dx8       : chr  NA NA NA NA ...
 $ dx9       : chr  NA NA NA NA ...
 $ dx10      : chr  NA NA NA NA ...
 $ dx11      : chr  NA NA NA NA ...
 $ dx12      : chr  NA NA NA NA ...
 $ dx13      : chr  NA NA NA NA ...
 $ px1       : chr  NA NA NA NA ...
 $ px2       : chr  NA NA NA NA ...
 $ px3       : chr  NA NA NA NA ...
 $ px4       : chr  NA NA NA NA ...
 $ px5       : chr  NA NA NA NA ...
 $ px6       : chr  NA NA NA NA ...
 $ px7       : chr  NA NA NA NA ...
 $ px8       : chr  NA NA NA NA ...
 $ px9       : chr  NA NA NA NA ...
 $ px10      : chr  NA NA NA NA ...
 $ px11      : chr  NA NA NA NA ...
 $ px12      : chr  NA NA NA NA ...
 $ px13      : chr  NA NA NA NA ...
 $ proccode  : chr  "15813" "16036" "15805" "15812" ...
 $ mod1      : chr  "0" "0" "0" "0" ...
 $ mod2      : chr  "0" "0" "0" "0" ...
 $ mod3      : chr  NA NA NA NA ...
 $ mod4      : chr  NA NA NA NA ...
 $ megnum    : int  11391713 27566498 27566498 27566498 27566498 5384946 16270743 27703374 27566498 27424467 ...
 - attr(*, ".internal.selfref")=<externalptr>