This notebook uses the modules gzip, timeit, pandas, urllib.request, collections and operator.
The data file is fetched with urlopen and decompressed on the fly with gzip.GzipFile.
In [2]:
import gzip
import timeit
from collections import defaultdict
from urllib.request import urlopen

# Location of the gzip-compressed Pathway Commons SIF file.
baseURL = "http://www.pathwaycommons.org/archives/PC2/v9/"
filename = "PathwayCommons9.All.hgnc.sif.gz"
outFilePath = "pc.sif"

# Interaction type names that are treated as protein-protein interactions.
interaction_types_ppi = {"interacts-with", "in-complex-with", "neighbor-of"}

# The timer covers opening the URL, decompressing, and parsing every line.
start_time = timeit.default_timer()

intctr = 0                      # lines whose type is a PPI type
linectr = 0                     # lines read so far
interactions = set()            # order-independent "A-B" keys for PPI pairs
proteins = set()                # names appearing in at least one PPI line
intnamectr = defaultdict(int)   # interaction type -> number of lines

# Closing the GzipFile does not close the underlying response object, so
# both handles get their own context manager.
with urlopen(baseURL + filename) as zfd:
    with gzip.GzipFile(fileobj=zfd, mode="r") as fd:
        for line in fd:
            # Echo the first six raw (bytes) lines as a sanity check.
            if linectr < 6:
                print(line)
            linectr += 1

            # Each SIF line is: participant A <TAB> type <TAB> participant B
            prot1, interaction_type, prot2 = (
                line.decode("utf-8").rstrip("\n").split("\t")
            )
            intnamectr[interaction_type] += 1

            if interaction_type in interaction_types_ppi:
                intctr += 1
                proteins.update((prot1, prot2))
                # Sort the two names so (A, B) and (B, A) share one key.
                # NOTE(review): a name that itself contains "-" could make
                # two different pairs produce the same key - confirm this
                # is acceptable for the identifiers in this file.
                interactions.add(min(prot1, prot2) + "-" + max(prot1, prot2))

elapsed = timeit.default_timer() - start_time
How long does your program take to run?
In [5]:
# Seconds elapsed between starting the timer (just before the URL was opened)
# and finishing the parse loop.
print(elapsed)
How many protein-protein interactions are there in the data file?
In [10]:
# Number of lines whose interaction type is one of the PPI types; the same
# protein pair is counted once per line it appears on.
print(intctr)
How many unique protein names are there in the data file?
In [11]:
# Distinct names that took part in at least one PPI-type line (names that
# only appear with other interaction types are not in this set).
len(proteins)
Out[11]:
How many unique pairs of proteins (regardless of interaction type name) are there that interact?
In [12]:
# Distinct protein pairs among the PPI-type lines; each pair was stored with
# its two names in sorted order, so direction does not matter.
len(interactions)
Out[12]:
How many interactions are there of each type, in PC2?
In [13]:
from operator import itemgetter

# Rank the (interaction type, line count) pairs from most to least frequent.
sorted(
    intnamectr.items(),
    key=itemgetter(1),  # order by the count, not by the type name
    reverse=True,
)
Out[13]:
Read from the decompressed data stream and parse it into a data frame, using pandas.read_csv
In [7]:
import pandas

# Download the archive again and let pandas parse the decompressed stream.
# The file has no header row, so the three column names are supplied here.
# Both handles are closed as soon as the frame has been built; closing the
# GzipFile alone would leave the HTTP response open.
with urlopen(baseURL + filename) as zfd:
    with gzip.GzipFile(fileobj=zfd, mode="r") as fd:
        df = pandas.read_csv(
            fd,
            sep="\t",
            names=["species1", "interaction_type", "species2"],
        )
Use the head
method on the data frame, to print out the first six lines
In [8]:
# head() returns only five rows by default; ask for six explicitly so the
# output matches the six lines echoed by the streaming version.
print(df.head(6))
Print the unique types of interactions in the data frame, using the unique
method:
In [9]:
# Distinct values of the interaction-type column, in order of first appearance.
df["interaction_type"].unique()
Out[9]:
Subset the data frame by interaction type (using the isin method) to include only the protein-protein interactions, then count the matching rows
In [63]:
# Boolean mask: True for every row whose type is one of the PPI types.
ppirows = df["interaction_type"].isin(interaction_types_ppi)
# Each True counts as 1, so the sum is the number of PPI rows.
int(ppirows.sum())
Out[63]:
Make a list of all proteins that occur in a protein-protein interaction, and count the unique protein names by putting them in a set
and calling len
on the set
In [64]:
# Gather both endpoint columns of the PPI rows into one list, then let a set
# collapse the duplicates.
first_endpoints = df.loc[ppirows, "species1"].tolist()
second_endpoints = df.loc[ppirows, "species2"].tolist()
newlist = first_endpoints + second_endpoints
len(set(newlist))
Out[64]:
Count unique protein-protein interaction pairs (specific type of interaction irrelevant), again using set
and len
In [68]:
# Count unordered pairs: plain "species1-species2" concatenation would treat
# (A, B) and (B, A) as two different pairs. Put the two names in sorted order
# first, which is the same key the streaming loop stores in `interactions`.
ppi_pairs = df.loc[ppirows, ["species1", "species2"]]
len({
    min(a, b) + "-" + max(a, b)
    for a, b in zip(ppi_pairs["species1"], ppi_pairs["species2"])
})
Out[68]:
Count each type of interaction in the database, by subsetting to the interaction
column and using value_counts
In [70]:
# Tally how many rows carry each interaction type (all types, not just PPI).
df["interaction_type"].value_counts()
Out[70]: