This notebook gives some examples in Python for programmatic access to http://genomenexus.org. You can run these examples after installing Jupyter. Easiest way for using Jupyter is installing the Python 3 version of anaconda: https://www.anaconda.com/download/. After having that you can install Jupyter with:
conda install jupyter
For these exampels we also require the Swagger API client reader Bravado. Unfortunately not yet available in anaconda, but you can get it through pip
:
conda install pip
pip install bravado
Let's try connecting to the Genome Nexus API now:
In [23]:
from bravado.client import SwaggerClient
client = SwaggerClient.from_url('https://www.genomenexus.org/v2/api-docs',
config={"validate_requests":False,"validate_responses":False})
print(client)
In [24]:
dir(client)
Out[24]:
In [25]:
for a in dir(client):
client.__setattr__(a[:-len('-controller')], client.__getattr__(a))
In [26]:
variant = client.annotation.fetchVariantAnnotationGET(variant='17:g.41242962_41242963insGA').result()
In [27]:
dir(variant)
Out[27]:
In [28]:
tc1 = variant.transcript_consequences[0]
In [29]:
dir(tc1)
Out[29]:
In [30]:
print(tc1)
cBioPortal also uses Swagger for their API.
In [31]:
import seaborn as sns
%matplotlib inline
sns.set_style("white")
sns.set_context('talk')
import matplotlib.pyplot as plt
In [32]:
cbioportal = SwaggerClient.from_url('https://www.cbioportal.org/api/api-docs',
config={"validate_requests":False,"validate_responses":False})
print(cbioportal)
In [33]:
for a in dir(cbioportal):
cbioportal.__setattr__(a.replace(' ', '_').lower(), cbioportal.__getattr__(a))
In [34]:
dir(cbioportal)
Out[34]:
In [35]:
muts = cbioportal.mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
molecularProfileId="msk_impact_2017_mutations", # {study_id}_mutations gives default mutations profile for study
sampleListId="msk_impact_2017_all", # {study_id}_all includes all samples
projection="DETAILED" # include gene info
).result()
In [36]:
import pandas as pd
In [37]:
mdf = pd.DataFrame([dict(m.__dict__['_Model__dict'],
**m.__dict__['_Model__dict']['gene'].__dict__['_Model__dict']) for m in muts])
In [38]:
mdf.groupby('uniqueSampleKey').studyId.count().plot(kind='hist', bins=400, xlim=(0,30))
plt.xlabel('Number of mutations in sample')
plt.ylabel('Number of samples')
plt.title('Number of mutations across samples in MSK-IMPACT (2017)')
sns.despine(trim=True)
In [39]:
mdf.variantType.astype(str).value_counts().plot(kind='bar')
plt.title('Types of mutations in MSK-IMPACT (2017)')
sns.despine(trim=False)
In [40]:
snvs = mdf[(mdf.variantType == 'SNP') & (mdf.variantAllele != '-') & (mdf.referenceAllele != '-')].copy()
In [41]:
# need query string like 9:g.22125503G>C
snvs['hgvs_for_gn'] = snvs.chromosome.astype(str) + ":g." + snvs.startPosition.astype(str) + snvs.referenceAllele + '>' + snvs.variantAllele
In [42]:
assert(snvs['hgvs_for_gn'].isnull().sum() == 0)
In [43]:
import time
qvariants = list(set(snvs.hgvs_for_gn))
gn_results = []
chunk_size = 500
print("Querying {} variants".format(len(qvariants)))
for n, qvar in enumerate([qvariants[i:i + chunk_size] for i in range(0, len(qvariants), chunk_size)]):
try:
gn_results += client.annotation.fetchVariantAnnotationPOST(variants=qvar,fields=['hotspots']).result()
print("Querying [{}, {}]: Success".format(n*chunk_size, min(len(qvariants), n*chunk_size+chunk_size)))
except Exception as e:
print("Querying [{}, {}]: Failed".format(n*chunk_size, min(len(qvariants), n*chunk_size+chunk_size)))
pass
time.sleep(1) # add a delay, to not overload server
In [44]:
gn_dict = {v.id:v for v in gn_results}
In [45]:
def is_sift_high(variant):
return variant in gn_dict and \
len(list(filter(lambda x: x.sift_prediction == 'deleterious', gn_dict[variant].transcript_consequences))) > 0
def is_polyphen_high(variant):
return variant in gn_dict and \
len(list(filter(lambda x: x.polyphen_prediction == 'probably_damaging', gn_dict[variant].transcript_consequences))) > 0
In [51]:
snvs['is_sift_high'] = snvs.hgvs_for_gn.apply(is_sift_high)
snvs['is_polyphen_high'] = snvs.hgvs_for_gn.apply(is_polyphen_high)
In [52]:
from matplotlib_venn import venn2
venn2(subsets=((snvs.is_sift_high & (~snvs.is_polyphen_high)).sum(),
(snvs.is_polyphen_high & (~snvs.is_sift_high)).sum(),
(snvs.is_polyphen_high & snvs.is_sift_high).sum()), set_labels=["SIFT","PolyPhen-2"])
plt.title("Variants as predicted to have a high impact in MSK-IMPACT (2017)")
Out[52]: