notebook.community

Edit and run



In [1]:

    
### Read in data file



In [2]:

    
library(data.table)









    



Warning message:
: package 'data.table' was built under R version 3.2.5



In [3]:

    
### Read the mentorship export, using its first row as the header when present.
datafile <- "data/export_all.csv"

# Peek at the first row only, so the full file is not parsed twice just to
# find out whether it starts with a header line.
firstRow <- as.data.frame(fread(datafile,header=FALSE,nrows=1,stringsAsFactors=FALSE))

# A header row is recognised by its first field being "pid1". isTRUE() keeps
# the test well-defined when that field is NA or the peek returned no rows.
hasHeader <- isTRUE(firstRow[1,1] == "pid1")

df <- as.data.frame(fread(datafile,header=hasHeader,stringsAsFactors=FALSE))









    



Read 529264 rows and 14 (of 14) columns from 0.074 GB file in 00:00:03



In [4]:

    
### Label the 14 columns: five descriptive fields for the trainee, the same
### five for the mentor, then the four relationship columns.
fields <- c("ID", "FirstName", "MiddleName", "LastName", "Institution")
columnNames <- c(
    paste0("Trainee.", fields),
    paste0("Mentor.", fields),
    "relationshipCode",
    "relationshipType",
    "startYear",
    "stopYear"
)

names(df) <- columnNames
head(df, n = 6)









    Out[4]:





Trainee.ID Trainee.FirstName Trainee.MiddleName Trainee.LastName Trainee.Institution Mentor.ID Mentor.FirstName Mentor.MiddleName Mentor.LastName Mentor.Institution relationshipCode relationshipType startYear stopYear

	1 2 Benjamin Y Hayden University of Rochester 3 Jack L Gallant University of California, Berkeley 1 student 2000 2005
	2 4 Benjamin Willmore University of Oxford 3 Jack L Gallant University of California, Berkeley 2 postdoc 2003 2006
	3 6 Ryan Prenger Lawrence Livermore Laboratory 3 Jack L Gallant University of California, Berkeley 1 student 2002 2008
	4 18761 Alan P Koretsky National Institute of Neurological Disorders and Stroke 9 Melvin P. Klein University of California, Berkeley 1 student 0000 1984
	5 10 C Edward Connor Johns Hopkins University 16 David C Van Essen Washington University, Saint Louis 2 postdoc 0000 0000
	6 3 Jack L Gallant University of California, Berkeley 16 David C Van Essen Washington University, Saint Louis 2 postdoc 0000 0000



In [5]:

    
# Show the table size as (rows, columns).
c(nrow(df), ncol(df))









    Out[5]:





	529263
	14



In [6]:

    
### Read the company table (the file has a header row) and inspect it.
comp <- fread("data/schultz_companies.csv", header = TRUE, stringsAsFactors = FALSE)
comp <- as.data.frame(comp)
dim(comp)
head(comp, n = 6)









    Out[6]:





	29
	14








    Out[6]:





Trainee.ID Company.Name Trainee.MiddleName Trainee.LastName Trainee.Institution Founder.ID Founder.FirstName Founder.MiddleName Founder.LastName Founder.Institution relationshipCode relationshipType YearFounded stopYear

	1 AD Scientist.com (formerly Assay Depot) NA NA NA 633266 Andrew B Martin NA NA company 2007 NA
	2 XP XenoPort Inc NA NA NA 663570 Mark A Gallop NA NA company 1999 NA
	3 ET Ensemble Therapeutics NA NA NA 57073 David R Liu NA NA company 2004 NA
	4 PER Permeon Biologics NA NA NA 57073 David R Liu NA NA company 2011 NA
	5 EDT Editas Medicine NA NA NA 57073 David R Liu NA NA company 2014 NA
	6 SYR Syros Pharmaceuticals NA NA NA 477556 Nathanael S. Gray NA NA company 2012 NA



In [7]:

    
### Give the company table the same column names as the mentorship table so
### the two can be stacked; company rows go first.
names(comp) <- names(df)
df <- rbind(comp, df)
head(df, n = 6)









    Out[7]:





Trainee.ID Trainee.FirstName Trainee.MiddleName Trainee.LastName Trainee.Institution Mentor.ID Mentor.FirstName Mentor.MiddleName Mentor.LastName Mentor.Institution relationshipCode relationshipType startYear stopYear

	1 AD Scientist.com (formerly Assay Depot) NA NA NA 633266 Andrew B Martin NA NA company 2007 NA
	2 XP XenoPort Inc NA NA NA 663570 Mark A Gallop NA NA company 1999 NA
	3 ET Ensemble Therapeutics NA NA NA 57073 David R Liu NA NA company 2004 NA
	4 PER Permeon Biologics NA NA NA 57073 David R Liu NA NA company 2011 NA
	5 EDT Editas Medicine NA NA NA 57073 David R Liu NA NA company 2014 NA
	6 SYR Syros Pharmaceuticals NA NA NA 477556 Nathanael S. Gray NA NA company 2012 NA



In [8]:

    
### in cases where the start year is missing (coded as 0), guess it from the
### stop year or select one at random

# Fill in a missing start year on one record.
#
# Args:
#   x: a named vector (one table row) with "startYear" and "stopYear" entries.
#
# Returns:
#   x, with "startYear" replaced when it was 0:
#     - stop year minus 5 when the stop year is known,
#     - otherwise a year drawn at random from 1985:2016.
#   A startYear that is NA or non-zero is returned unchanged.
setYear <- function(x){
    startYear <- as.numeric(x["startYear"])
    # NA must be tested first: `if (NA == 0)` is an error in R.
    if( is.na(startYear) || startYear != 0 ) {
        return(x)
    }
    stopYear <- as.numeric(x["stopYear"])
    # A stop year of NA (e.g. company rows) counts as unknown, same as 0.
    if( is.na(stopYear) || stopYear == 0 ) {
        x["startYear"] <- sample(1985:2016,1)
    } else {
        x["startYear"] <- stopYear - 5
    }
    x
}
# Run setYear over the two year columns only and write back just startYear.
# Handing the whole data frame to apply() converts it to a character matrix,
# so every column loses its original type on the way back.
if( nrow(df) > 0 ) {
    yearColumns <- cbind(
        startYear = as.character(df$startYear),
        stopYear = as.character(df$stopYear))
    df$startYear <- apply(yearColumns, 1, function(row) setYear(row)[["startYear"]])
}



In [9]:

    
### define root person ID (the person whose tree is built)
root.ID <- 52763 ### Peter Schultz

### alternative roots kept for testing; uncomment exactly one to use it
#root.ID <- 62876 ### test on Virginia Cornish
#root.ID <- 63525 ### test on Hening Lin
#root.ID <- 54496  ### test on Chris Walsh 
#root.ID <- 4338 ### Robert Woodward
#root.ID <- 9005 ### George Whitesides (161 direct children)
root.ID









    Out[9]:




52763



In [10]:

    
# Build the record for one person, optionally with their subtree of trainees.
#
# Args:
#   ID: value matched against df$Trainee.ID.
#   df: relationship table; must provide Trainee.ID, Trainee.FirstName,
#       Trainee.MiddleName, Trainee.LastName and startYear (and Mentor.ID
#       when mentor.ID is given).
#   mentor.ID: when not NULL, only rows whose Mentor.ID equals it are used.
#   include.children: when TRUE, attach `children` built by getChildren().
#   level: depth stored on the record; children are built with level + 1.
#
# Returns:
#   A list with ID, FirstName, MiddleName, LastName, level, startYear and
#   name, plus url when a last name is present and children when requested.
#   When no row matches, the name fields are NA and children is an empty list.
getPerson <- function(ID, df, mentor.ID = NULL, include.children=TRUE, level=1 ) {
    matches <- df$Trainee.ID == ID
    if( !is.null(mentor.ID) ) {
        matches <- matches & df$Mentor.ID == mentor.ID
    }
    # which() drops rows where the comparison is NA; plain logical indexing
    # would return an all-NA row for each of them.
    p <- df[which(matches),c("Trainee.ID","Trainee.FirstName","Trainee.MiddleName","Trainee.LastName","startYear")]

    person <- list()
    person$ID <- as.character(p$Trainee.ID[1])
    person$FirstName <- as.character(p$Trainee.FirstName[1])
    person$MiddleName <- as.character(p$Trainee.MiddleName[1])
    person$LastName <- as.character(p$Trainee.LastName[1])
    person$level <- level
    # Every matching row contributes its start year, so this may hold
    # several values (the other fields come from the first match only).
    person$startYear <- as.character(p$startYear)
    if( is.na(person$LastName) ) {
        # No last name (e.g. company rows): label the node with its ID.
        person$name <- person$ID
    } else {
        # Label the node with initials, skipping missing name parts so a
        # literal "NA" never ends up in the label.
        initials <- substr(c(person$FirstName, person$MiddleName, person$LastName), 1, 1)
        person$name <- paste(initials[!is.na(initials)], collapse="")
        person$url <- paste0('http://academictree.org/chemistry/peopleinfo.php?pid=',person$ID)
    }
    if( include.children ) {
        if( is.na(person$ID) ) {
            # Nothing matched, so there is no ID to look up children for.
            person$children <- list()
        } else {
            person$children <- getChildren( person$ID, df, level = level + 1 )
        }
    }

    person
}

# Build the list of person records for everyone mentored by root.ID.
#
# Args:
#   root.ID: mentor identifier, matched against df$Mentor.ID.
#   df: relationship table; must provide Mentor.ID, Trainee.ID,
#       relationshipType and startYear, plus whatever getPerson() reads.
#   level: depth value passed on to getPerson() for every child.
#
# Returns:
#   An unnamed list with one getPerson() record per matching row, ordered by
#   startYear descending, each carrying the row's relationshipType as `type`.
#   An empty list when nobody matches or root.ID is missing.
getChildren <- function(root.ID, df, level = 0) {
    if( length(root.ID) != 1 || is.na(root.ID) ) { return(list()) }

    # Match the mentor directly on its ID. Looking the mentor up with
    # getPerson() first costs an extra scan of df per call, only to get the
    # same ID back as a string.
    rootKey <- if( is.numeric(root.ID) ) {
        # Avoid scientific notation such as "1e+05" for round numeric IDs.
        format(root.ID, scientific=FALSE, trim=TRUE)
    } else {
        as.character(root.ID)
    }

    # which() ignores rows whose Mentor.ID is NA instead of returning them
    # as all-NA rows.
    rows <- which(df$Mentor.ID == rootKey)
    if( length(rows) == 0 ) { return(list()) }

    children.df <- df[rows,]
    children.df <- children.df[order(children.df$startYear,decreasing = TRUE),]

    # One record per relationship row, so a trainee with several rows under
    # this mentor appears once per row.
    lapply(seq_len(nrow(children.df)), function(i) {
        child <- getPerson(children.df[i,"Trainee.ID"], df, level=level, mentor.ID=root.ID)
        child$type <- children.df[i,"relationshipType"]
        child
    })
}



In [11]:

    
### Spot check: show the rows whose Trainee.ID is "GNF" and test whether the
### first of them has a missing last name.
df[df[["Trainee.ID"]] == "GNF", ]
is.na(df[df[["Trainee.ID"]] == "GNF", ][1, "Trainee.LastName"])









    Out[11]:





Trainee.ID Trainee.FirstName Trainee.MiddleName Trainee.LastName Trainee.Institution Mentor.ID Mentor.FirstName Mentor.MiddleName Mentor.LastName Mentor.Institution relationshipCode relationshipType startYear stopYear

	22 GNF Genomics Institute of the Novartis Research Foundation NA NA NA 52763 Schultz G Peter NA NA company 1999 NA









    Out[11]:




22: TRUE



In [12]:

    
### Build the full tree below the root person; the two date() calls show the
### wall-clock time before and after the run.
date()
root <- getPerson(ID = root.ID, df = df)
date()









    Out[12]:




"Mon Jul 25 15:17:11 2016"






    Out[12]:




"Mon Jul 25 15:40:33 2016"



In [13]:

    
### Serialise the tree to pretty-printed JSON (length-one vectors are written
### as scalars) and save it to the output folder.
library(jsonlite)
json <- jsonlite::toJSON(root, auto_unbox = TRUE, pretty = TRUE)
write(json, file = "output/output_PGS.json")

	Trainee.ID	Trainee.FirstName	Trainee.MiddleName	Trainee.LastName	Trainee.Institution	Mentor.ID	Mentor.FirstName	Mentor.MiddleName	Mentor.LastName	Mentor.Institution	relationshipCode	relationshipType	startYear	stopYear
1	2	Benjamin	Y	Hayden	University of Rochester	3	Jack	L	Gallant	University of California, Berkeley	1	student	2000	2005
2	4	Benjamin		Willmore	University of Oxford	3	Jack	L	Gallant	University of California, Berkeley	2	postdoc	2003	2006
3	6	Ryan		Prenger	Lawrence Livermore Laboratory	3	Jack	L	Gallant	University of California, Berkeley	1	student	2002	2008
4	18761	Alan	P	Koretsky	National Institute of Neurological Disorders and Stroke	9	Melvin	P.	Klein	University of California, Berkeley	1	student	0000	1984
5	10	C	Edward	Connor	Johns Hopkins University	16	David	C	Van Essen	Washington University, Saint Louis	2	postdoc	0000	0000
6	3	Jack	L	Gallant	University of California, Berkeley	16	David	C	Van Essen	Washington University, Saint Louis	2	postdoc	0000	0000

	Trainee.ID	Company.Name	Trainee.MiddleName	Trainee.LastName	Trainee.Institution	Founder.ID	Founder.FirstName	Founder.MiddleName	Founder.LastName	Founder.Institution	relationshipCode	relationshipType	YearFounded	stopYear
1	AD	Scientist.com (formerly Assay Depot)	NA	NA	NA	633266	Andrew	B	Martin	NA	NA	company	2007	NA
2	XP	XenoPort Inc	NA	NA	NA	663570	Mark	A	Gallop	NA	NA	company	1999	NA
3	ET	Ensemble Therapeutics	NA	NA	NA	57073	David	R	Liu	NA	NA	company	2004	NA
4	PER	Permeon Biologics	NA	NA	NA	57073	David	R	Liu	NA	NA	company	2011	NA
5	EDT	Editas Medicine	NA	NA	NA	57073	David	R	Liu	NA	NA	company	2014	NA
6	SYR	Syros Pharmaceuticals	NA	NA	NA	477556	Nathanael	S.	Gray	NA	NA	company	2012	NA