In [1]:
### Read in data file

In [2]:
library(data.table)


Warning message:
: package 'data.table' was built under R version 3.2.5

In [3]:
# Path of the CSV export that holds the relationship table.
datafile <- "data/export_all.csv"
# First pass: read without a header so that a header row, if the file has one,
# shows up as ordinary data in row 1.
df <- as.data.frame(fread(datafile,header=FALSE,stringsAsFactors=FALSE))

# If the first cell is the literal "pid1", row 1 is a header row: read the file
# again with header=TRUE so that row is consumed as column names, not data.
if( df[1,1] == "pid1" ) {
    df <- as.data.frame(fread(datafile,header=TRUE,stringsAsFactors=FALSE))
}


Read 529264 rows and 14 (of 14) columns from 0.074 GB file in 00:00:03

In [4]:
# Attributes recorded for each person on either side of a relationship.
fields <- c("ID", "FirstName", "MiddleName", "LastName", "Institution")

# Full set of column labels: the trainee's attributes, then the mentor's, then
# the four columns describing the relationship itself.
columnNames <- c(
    sprintf("Trainee.%s", fields),
    sprintf("Mentor.%s", fields),
    "relationshipCode",
    "relationshipType",
    "startYear",
    "stopYear"
)

# Label the columns positionally with the names built above, then preview the
# first rows.
colnames(df) <- columnNames
head(df)


Out[4]:
Trainee.IDTrainee.FirstNameTrainee.MiddleNameTrainee.LastNameTrainee.InstitutionMentor.IDMentor.FirstNameMentor.MiddleNameMentor.LastNameMentor.InstitutionrelationshipCoderelationshipTypestartYearstopYear
12BenjaminYHaydenUniversity of Rochester3JackLGallantUniversity of California, Berkeley1student20002005
24BenjaminWillmoreUniversity of Oxford3JackLGallantUniversity of California, Berkeley2postdoc20032006
36RyanPrengerLawrence Livermore Laboratory3JackLGallantUniversity of California, Berkeley1student20022008
418761AlanPKoretskyNational Institute of Neurological Disorders and Stroke9MelvinP.KleinUniversity of California, Berkeley1student00001984
510CEdwardConnorJohns Hopkins University16DavidCVan EssenWashington University, Saint Louis2postdoc00000000
63JackLGallantUniversity of California, Berkeley16DavidCVan EssenWashington University, Saint Louis2postdoc00000000

In [5]:
dim(df)


Out[5]:
  1. 529263
  2. 14

In [6]:
# Load the company table; header=TRUE takes its column names from the first row.
comp <- as.data.frame(fread("data/schultz_companies.csv",header=TRUE,stringsAsFactors=FALSE))
# Show its dimensions and preview the first rows.
dim(comp)
head(comp)


Out[6]:
  1. 29
  2. 14
Out[6]:
Trainee.IDCompany.NameTrainee.MiddleNameTrainee.LastNameTrainee.InstitutionFounder.IDFounder.FirstNameFounder.MiddleNameFounder.LastNameFounder.InstitutionrelationshipCoderelationshipTypeYearFoundedstopYear
1ADScientist.com (formerly Assay Depot)NANANA633266AndrewBMartinNANAcompany2007NA
2XPXenoPort IncNANANA663570MarkAGallopNANAcompany1999NA
3ETEnsemble TherapeuticsNANANA57073DavidRLiuNANAcompany2004NA
4PERPermeon BiologicsNANANA57073DavidRLiuNANAcompany2011NA
5EDTEditas MedicineNANANA57073DavidRLiuNANAcompany2014NA
6SYRSyros PharmaceuticalsNANANA477556NathanaelS.GrayNANAcompany2012NA

In [7]:
# Relabel the company columns positionally with the relationship-table names so
# the two tables can be stacked; the company rows are placed first.
colnames(comp) <- colnames(df)
df <- rbind(comp,df)
head(df)


Out[7]:
Trainee.IDTrainee.FirstNameTrainee.MiddleNameTrainee.LastNameTrainee.InstitutionMentor.IDMentor.FirstNameMentor.MiddleNameMentor.LastNameMentor.InstitutionrelationshipCoderelationshipTypestartYearstopYear
1ADScientist.com (formerly Assay Depot)NANANA633266AndrewBMartinNANAcompany2007NA
2XPXenoPort IncNANANA663570MarkAGallopNANAcompany1999NA
3ETEnsemble TherapeuticsNANANA57073DavidRLiuNANAcompany2004NA
4PERPermeon BiologicsNANANA57073DavidRLiuNANAcompany2011NA
5EDTEditas MedicineNANANA57073DavidRLiuNANAcompany2014NA
6SYRSyros PharmaceuticalsNANANA477556NathanaelS.GrayNANAcompany2012NA

In [8]:
### in cases where year is missing, guess it or select randomly

# Fill in a missing start year for one relationship record.
#
# x: a named vector with "startYear" and "stopYear" entries, as passed for each
#    row by apply(df, 1, setYear).
#
# Returns x with "startYear" replaced only when it is missing. A year counts as
# missing when it is 0 or NA. A missing start year becomes stopYear - 5 when
# the stop year is known, otherwise a year drawn at random from 1985:2016.
setYear <- function(x) {
    start.year <- as.numeric(x["startYear"])
    stop.year <- as.numeric(x["stopYear"])

    # NA must be tested before the comparison: `NA == 0` is NA, and an NA
    # condition makes `if` stop with an error.
    is.missing <- function(year) is.na(year) || year == 0

    if (is.missing(start.year)) {
        x["startYear"] <- if (is.missing(stop.year)) {
            sample(1985:2016, 1)
        } else {
            stop.year - 5
        }
    }
    x
}
# Run setYear on every row. apply() converts the data frame to a matrix first,
# so setYear receives each row as a named vector; the result is transposed back
# to one row per record and rebuilt as a data frame.
df <- as.data.frame(t(apply(df,1,setYear)))

In [9]:
### define root person ID
root.ID <- 52763 ### Peter Schultz

### alternative roots, kept commented out for testing
#root.ID <- 62876 ### test on Virginia Cornish
#root.ID <- 63525 ### test on Hening Lin
#root.ID <- 54496  ### test on Chris Walsh 
#root.ID <- 4338 ### Robert Woodward
#root.ID <- 9005 ### George Whitesides (161 direct children)
root.ID


Out[9]:
52763

In [10]:
# Build the record for one person, optionally with their whole trainee subtree.
#
# ID:               trainee ID matched against df$Trainee.ID.
# df:               relationship table; the Trainee.* name columns, Trainee.ID,
#                   Mentor.ID and startYear are read here.
# mentor.ID:        when given, only rows pairing this trainee with that mentor
#                   are used; when NULL, every row for the trainee is used.
# include.children: when TRUE, attach the list returned by getChildren().
# level:            depth stored on the record; children are fetched at level + 1.
#
# Returns a list with, in order: ID, FirstName, MiddleName, LastName, level,
# startYear (one entry per matching row), name, url (only when a last name is
# known) and, if requested, children. `name` is the person's initials, or the ID
# when no last name is available.
getPerson <- function(ID, df, mentor.ID = NULL, include.children=TRUE, level=1 ) {
    wanted <- c("Trainee.ID", "Trainee.FirstName", "Trainee.MiddleName",
                "Trainee.LastName", "startYear")

    hit <- df$Trainee.ID == ID
    if (!is.null(mentor.ID)) {
        hit <- hit & df$Mentor.ID == mentor.ID
    }
    # which() keeps only definite matches. Indexing with the raw logical vector
    # would turn every NA comparison (e.g. a missing Mentor.ID) into an all-NA
    # row, which could end up first and hide the real record.
    p <- df[which(hit), wanted, drop = FALSE]

    person <- list()
    # Fall back to the requested ID when no row matches, so the record and the
    # search for its children are never keyed on NA.
    person$ID <- if (nrow(p) > 0) {
        as.character(p$Trainee.ID[1])
    } else {
        as.character(ID)
    }
    person$FirstName <- as.character(p$Trainee.FirstName[1])
    person$MiddleName <- as.character(p$Trainee.MiddleName[1])
    person$LastName <- as.character(p$Trainee.LastName[1])
    person$level <- level
    # Deliberately not reduced to the first row: every matching row contributes
    # its start year.
    person$startYear <- as.character(p$startYear)

    if (is.na(person$LastName)) {
        # No personal name available (e.g. no matching row): label by ID.
        person$name <- person$ID
    } else {
        # A missing name part contributes nothing rather than the text "NA".
        initial <- function(part) if (is.na(part)) "" else substr(part, 1, 1)
        person$name <- paste0(
            initial(person$FirstName),
            initial(person$MiddleName),
            initial(person$LastName))
        person$url <- paste0('http://academictree.org/chemistry/peopleinfo.php?pid=', person$ID)
    }

    if (include.children) {
        person$children <- getChildren(person$ID, df, level = level + 1)
    }

    person
}

# Collect the trainees of one mentor, each as a getPerson() record.
#
# root.ID: mentor ID matched against df$Mentor.ID.
# df:      relationship table; Mentor.ID, Trainee.ID, relationshipType and
#          startYear are read here.
# level:   depth passed on to every child record.
#
# Returns a list with one entry per matching row, ordered by startYear
# descending. Each entry is the getPerson() record for that trainee/mentor pair
# with an added `type` field holding the row's relationshipType. An empty list
# is returned when no row names this mentor.
getChildren <- function(root.ID, df, level = 0) {
    # Match on root.ID directly. Looking the mentor up as a trainee first costs
    # a full table scan per node and yields an NA ID for a mentor who has no
    # trainee row of their own. which() also drops rows whose Mentor.ID is NA
    # instead of returning them as all-NA rows.
    children.df <- df[which(df$Mentor.ID == root.ID), , drop = FALSE]
    if (nrow(children.df) == 0) {
        return(list())
    }
    newest.first <- order(children.df$startYear, decreasing = TRUE)
    children.df <- children.df[newest.first, , drop = FALSE]

    # One record per relationship row, so a trainee listed twice under the same
    # mentor (e.g. two relationship types) appears twice.
    lapply(seq_len(nrow(children.df)), function(i) {
        child <- getPerson(children.df[i, "Trainee.ID"], df,
                           level = level, mentor.ID = root.ID)
        child$type <- children.df[i, "relationshipType"]
        child
    })
}

In [11]:
# Spot check: show the rows whose Trainee.ID is "GNF", then test whether the
# first of them has an NA Trainee.LastName.
df[df$Trainee.ID=="GNF",]
is.na(df[df$Trainee.ID=="GNF",][1,"Trainee.LastName"])


Out[11]:
Trainee.IDTrainee.FirstNameTrainee.MiddleNameTrainee.LastNameTrainee.InstitutionMentor.IDMentor.FirstNameMentor.MiddleNameMentor.LastNameMentor.InstitutionrelationshipCoderelationshipTypestartYearstopYear
22GNFGenomics Institute of the Novartis Research FoundationNANANA52763SchultzGPeterNANAcompany1999NA
Out[11]:
22: TRUE

In [12]:
# Build the tree for the root person. The date() calls before and after print
# timestamps so the elapsed time can be read off the output.
date()
root <- getPerson(root.ID, df)
date()


Out[12]:
"Mon Jul 25 15:17:11 2016"
Out[12]:
"Mon Jul 25 15:40:33 2016"

In [13]:
# Serialise the tree as pretty-printed JSON, with length-one vectors written as
# scalars (auto_unbox), and save it to the output file.
library(jsonlite)
json <- jsonlite::toJSON(root, pretty = TRUE, auto_unbox = TRUE)
write(json, "output/output_PGS.json")