In [1]:
### Read in data file
In [2]:
library(data.table)
In [3]:
datafile <- "data/export_all.csv"
# The export may or may not start with a header line; a header is recognised
# by the literal "pid1" in its first field. Only the first row is parsed to
# find out, so the full file is read exactly once. isTRUE() keeps the check
# from failing when that first field is empty (NA).
first.row <- fread(datafile, header = FALSE, nrows = 1,
                   stringsAsFactors = FALSE, data.table = FALSE)
has.header <- isTRUE(first.row[1, 1] == "pid1")
df <- fread(datafile, header = has.header,
            stringsAsFactors = FALSE, data.table = FALSE)
In [4]:
# Each record carries the same five attributes for the trainee and for the
# mentor, followed by four columns describing the relationship itself.
fields <- c("ID", "FirstName", "MiddleName", "LastName", "Institution")
columnNames <- c(
  paste0("Trainee.", fields),
  paste0("Mentor.", fields),
  "relationshipCode", "relationshipType", "startYear", "stopYear"
)
names(df) <- columnNames
# Show the first rows to check that the names line up with the data.
head(df)
Out[4]:
In [5]:
# Print the number of rows and columns of the mentorship table.
dim(df)
Out[5]:
In [6]:
# Load the additional company records; this file always has a header line.
comp <- fread("data/schultz_companies.csv", header = TRUE,
              stringsAsFactors = FALSE, data.table = FALSE)
# Print its size (rows, columns) and the first rows for inspection.
c(nrow(comp), ncol(comp))
head(comp)
Out[6]:
Out[6]:
In [7]:
# Give the company records the same column names as the main table so the
# two can be stacked, with the company rows placed first.
names(comp) <- names(df)
df <- do.call(rbind, list(comp, df))
head(df)
Out[7]:
In [8]:
### In cases where the start year is missing, guess it or select randomly.
#
# x: one record as a named vector with elements "startYear" and "stopYear"
#    (numeric, or text that as.numeric() can parse).
#
# A year counts as missing when it is 0 or NA. When the start year is missing:
#   - if the stop year is known, the start year becomes stopYear - 5;
#   - otherwise it is drawn at random from 1985:2016.
# Records with a start year are returned unchanged.
#
# Returns x with "startYear" filled in; all other elements are untouched.
setYear <- function(x) {
  # NA must be tested before the comparison, otherwise if() fails on NA.
  is.missing <- function(year) is.na(year) || year == 0

  if (is.missing(as.numeric(x[["startYear"]]))) {
    stop.year <- as.numeric(x[["stopYear"]])
    x[["startYear"]] <- if (is.missing(stop.year)) {
      sample(1985:2016, 1)
    } else {
      stop.year - 5
    }
  }
  x
}
# Fill in missing start years record by record. Only the two year columns are
# handed to setYear(): running apply() over the whole data frame would pass it
# through as.matrix(), turning every column into text (numeric columns padded
# with spaces), which then breaks ID comparisons and year ordering later on.
df$startYear <- vapply(seq_len(nrow(df)), function(i) {
  years <- c(startYear = df$startYear[i], stopYear = df$stopYear[i])
  as.numeric(setYear(years)[["startYear"]])
}, numeric(1))
In [9]:
### define root person ID
# Exactly one assignment is active; the commented-out lines are alternative
# roots kept for testing. The value is printed at the end of the cell and is
# later passed to getPerson() as the person the tree starts from.
root.ID <- 52763 ### Peter Schultz
#root.ID <- 62876 ### test on Virginia Cornish
#root.ID <- 63525 ### test on Hening Lin
#root.ID <- 54496 ### test on Chris Walsh
#root.ID <- 4338 ### Robert Woodward
#root.ID <- 9005 ### George Whitesides (161 direct children)
root.ID
Out[9]:
In [10]:
# Build the record (a named list) for one person in the mentorship table.
#
# Arguments:
#   ID               trainee ID to look up in df$Trainee.ID
#   df               mentorship table with the columns Trainee.ID,
#                    Trainee.FirstName, Trainee.MiddleName, Trainee.LastName,
#                    Mentor.ID and startYear
#   mentor.ID        when given, only rows with this Mentor.ID are considered
#   include.children when TRUE, the person's trainees are attached as
#                    $children (built recursively through getChildren)
#   level            depth stored on the record; children get level + 1
#
# Returns a list with ID, FirstName, MiddleName, LastName, level, startYear
# and name, plus url (only when a last name is known) and children (only when
# include.children is TRUE). All person fields come from the first matching
# row. When no row matches, ID falls back to the requested ID and the other
# person fields are NA.
getPerson <- function(ID, df, mentor.ID = NULL, include.children=TRUE, level=1 ) {
  wanted <- c("Trainee.ID", "Trainee.FirstName", "Trainee.MiddleName",
              "Trainee.LastName", "startYear")
  hit <- df$Trainee.ID == ID
  if (!is.null(mentor.ID)) {
    hit <- hit & df$Mentor.ID == mentor.ID
  }
  # which() drops NA comparisons, which would otherwise select all-NA rows.
  p <- df[which(hit), wanted, drop = FALSE]

  person <- list()
  # Keep the requested ID for people without a trainee record so that their
  # trainees can still be found through Mentor.ID.
  person$ID <- if (nrow(p) > 0) as.character(p$Trainee.ID[1]) else as.character(ID)[1]
  person$FirstName <- as.character(p$Trainee.FirstName[1])
  person$MiddleName <- as.character(p$Trainee.MiddleName[1])
  person$LastName <- as.character(p$Trainee.LastName[1])
  person$level <- level
  # Take the first match like every other field, so this stays a single value
  # even when the person has several records.
  person$startYear <- as.character(p$startYear[1])

  if (is.na(person$LastName)) {
    # No last name available: label the entry with its ID and give it no URL.
    person$name <- person$ID
  } else {
    # The display name is the initials; missing name parts are skipped instead
    # of showing up as the text "NA".
    initials <- substr(c(person$FirstName, person$MiddleName, person$LastName), 1, 1)
    person$name <- paste(initials[!is.na(initials)], collapse = "")
    person$url <- paste0('http://academictree.org/chemistry/peopleinfo.php?pid=', person$ID)
  }

  if (include.children) {
    # An NA ID cannot have trainees; skipping the lookup avoids recursing on it.
    person$children <- if (is.na(person$ID)) {
      list()
    } else {
      getChildren(person$ID, df, level = level + 1)
    }
  }
  person
}
# Collect the trainees of one mentor.
#
# Arguments:
#   root.ID  ID of the mentor; matched against df$Mentor.ID
#   df       mentorship table with the columns Mentor.ID, Trainee.ID,
#            relationshipType and startYear
#   level    level value passed on to getPerson for every trainee
#
# Returns a list with one getPerson() record per matching row, ordered by
# startYear from latest to earliest; each record additionally carries $type,
# the relationshipType of that row. A trainee with several rows under the
# same mentor therefore appears once per row. The list is empty when the
# mentor has no trainees or root.ID is NA.
getChildren <- function(root.ID, df, level = 0) {
  # Match on the ID directly; which() drops NA comparisons so that an unknown
  # or NA mentor yields no rows rather than a block of all-NA rows.
  rows <- which(df$Mentor.ID == root.ID)
  if (length(rows) == 0) {
    return(list())
  }
  children.df <- df[rows, , drop = FALSE]
  children.df <- children.df[order(children.df$startYear, decreasing = TRUE), , drop = FALSE]

  lapply(seq_len(nrow(children.df)), function(i) {
    # Restrict the lookup to this mentor so the record reflects this relationship.
    child <- getPerson(children.df[i, "Trainee.ID"], df, level = level, mentor.ID = root.ID)
    child$type <- children.df[i, "relationshipType"]
    child
  })
}
In [11]:
# Sanity check on a record with a non-numeric ID: print the rows whose
# Trainee.ID is "GNF", then whether the first of them has a missing last name
# (the case in which getPerson labels an entry with its ID instead of initials).
# NOTE(review): "GNF" presumably comes from the companies file - confirm.
df[df$Trainee.ID=="GNF",]
is.na(df[df$Trainee.ID=="GNF",][1,"Trainee.LastName"])
Out[11]:
Out[11]:
In [12]:
# Build the full tree below root.ID. The date() calls before and after print
# the wall-clock time so the duration of the recursive build can be read off.
date()
root <- getPerson(root.ID, df)
date()
Out[12]:
Out[12]:
In [13]:
library(jsonlite)
# Serialise the nested tree; auto_unbox writes length-one vectors as plain
# JSON values instead of one-element arrays.
json <- jsonlite::toJSON(root, auto_unbox = TRUE, pretty = TRUE)
write(json, file = "output/output_PGS.json")