In [1]:
# Class Session 2
# this is the URL of the SIF file at Pathway Commons
sif_file_url <- "http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.sif.gz"
# for starters, we only want three possible interaction types
interaction_types_ppi <- c("interacts-with","in-complex-with","neighbor-of")
Read the remote compressed file into a data frame using readr::read_delim_chunked; time it using system.time; use head to print the first six lines.
In [5]:
# do the filtering while reading the data; reduce disk space and memory usage
library(readr)  # provides read_delim_chunked and DataFrameCallback
system.time(interactions_df <- read_delim_chunked(sif_file_url,
callback=DataFrameCallback$new(function(df_chunk, pos){
subset(df_chunk, interaction_type %in% interaction_types_ppi)}),
chunk_size=10000,
delim="\t",
quote="",
comment="",
col_names=c("species1","interaction_type","species2"),
progress=FALSE))
# sanity check the resulting data frame
head(interactions_df)
Count the number of rows that correspond to protein-protein interactions
In [5]:
# how many rows are there in the protein-protein interaction edge-list?
nrow(interactions_df)
Count the number of proteins that participate in protein-protein interactions, using unique
In [6]:
# how many unique proteins are there in the interaction network?
length(unique(c(interactions_df$species1, interactions_df$species2)))
Count the number of unique interacting protein pairs, regardless of interaction type
In [7]:
# how many unique interacting protein pairs are there in the interaction network?
length(unique(apply(interactions_df[,c(1,3)], 1, function(my_pair) {
paste(c(min(my_pair), "-", max(my_pair)),collapse="")
})))
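Row-wise apply is slow on large data frames; here is a vectorized sketch (not from the original session; pair_keys is a hypothetical name) that builds the same canonical pair keys with pmin and pmax:

In [ ]:
# sketch: pmin/pmax return the lexicographically smaller/larger protein of each
# pair, so both orderings of an undirected pair map to the same key
pair_keys <- paste(pmin(interactions_df$species1, interactions_df$species2),
                   pmax(interactions_df$species1, interactions_df$species2),
                   sep="-")
length(unique(pair_keys))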
Use table to count the number of each type of interaction in the database
In [12]:
# really quick-and-dirty approach using cURL and gunzip
system.time({
system(paste(c("curl -s --compressed ", sif_file_url, " | gunzip > pc.sif"), collapse=""),
intern=TRUE)
my_df <- read.table(file="pc.sif",
sep="\t",
comment.char="",
quote="",
col.names=c("species1",
"interaction_type",
"species2"),
stringsAsFactors=FALSE)
interactions_df5 <- subset(my_df,
interaction_type %in% interaction_types_ppi)})
all(interactions_df5 == interactions_df)
print(sort(table(as.factor(my_df$interaction_type)), decreasing=TRUE))
# clean up the temporary file written by cURL
unlink("pc.sif")
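If you would rather not shell out to cURL, readr can tabulate the interaction types too; this sketch (not part of the original notes) re-reads the unfiltered file, assuming readr is loaded:

In [ ]:
# sketch: read the full, unfiltered SIF file; read_tsv fetches and decompresses
# the remote .gz file transparently, then table() tallies the interaction types
full_df <- read_tsv(sif_file_url,
                    col_names=c("species1","interaction_type","species2"),
                    quote="",
                    comment="",
                    progress=FALSE)
print(sort(table(full_df$interaction_type), decreasing=TRUE))
rm(full_df)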
In [6]:
# alternative approach which is kind of slow and (transiently) memory-hungry; read the entire file into memory and
# then process the text contents of the file, line by line
system.time({
alltext <- readLines(gzcon(url(sif_file_url)))
interactions_df4 <- data.frame(do.call(rbind, lapply(seq_along(alltext),
function(i) {
split_line <- strsplit(alltext[i], "\t")[[1]]
names(split_line) <- c("species1","interaction_type","species2")
split_line
})))
interactions_df4 <- subset(interactions_df4, interaction_type %in% interaction_types_ppi)
})
# sanity check that this slow approach at least gave us correct results
all(interactions_df4 == interactions_df)
rm(interactions_df4)
rm(alltext)
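The per-line lapply above is the main bottleneck; a vectorized sketch (an alternative, not the session's method) splits every line in one strsplit call and reshapes the result, assuming each line has exactly three tab-separated fields:

In [ ]:
# sketch: split all lines at once, reshape into a three-column matrix, then filter
alltext <- readLines(gzcon(url(sif_file_url)))
fields_mat <- matrix(unlist(strsplit(alltext, "\t", fixed=TRUE)),
                     ncol=3, byrow=TRUE,
                     dimnames=list(NULL, c("species1","interaction_type","species2")))
interactions_df4b <- subset(as.data.frame(fields_mat, stringsAsFactors=FALSE),
                            interaction_type %in% interaction_types_ppi)
rm(alltext)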
In [7]:
# quick and dirty approach which doesn't require looking up any newfangled commands, but transiently eats up
# disk space and memory
fname <- tempfile()
gzfname <- paste(c(fname, ".gz"),collapse="")
system.time({
download.file(sif_file_url, destfile=gzfname)
interactions_df3 <- subset(read.table(file=gzfile(gzfname),
sep="\t",
comment.char="",
quote="",
col.names=c("species1","interaction_type", "species2"),
stringsAsFactors=FALSE),
interaction_type %in% interaction_types_ppi)})
unlink(gzfname)
# sanity check the results
all(interactions_df3 == interactions_df)
# clean up
rm(interactions_df3)
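For completeness, base R can also stream the remote gzipped file through a connection, with no temporary file at all; a sketch (interactions_df2 is a hypothetical name):

In [ ]:
# sketch: gzcon() wraps the URL connection so read.table sees decompressed text
interactions_df2 <- subset(read.table(gzcon(url(sif_file_url)),
                                      sep="\t",
                                      comment.char="",
                                      quote="",
                                      col.names=c("species1","interaction_type","species2"),
                                      stringsAsFactors=FALSE),
                           interaction_type %in% interaction_types_ppi)
# sanity check against the readr result, then clean up
all(interactions_df2 == interactions_df)
rm(interactions_df2)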