In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_unweighted
import seaborn
%pylab inline
pd.set_option('display.max_rows', 150)


Populating the interactive namespace from numpy and matplotlib

In [2]:
vep_cols = """\
Allele|Annotation|Impact|Gene_name|Gene_id|Feature_type|Feature_id|Biotype|EXON\
|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids\
|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|CANONICAL\
|CCDS|HGVS_OFFSET\
"""
vep_cols = vep_cols.split("|")
vep_cols = [term.strip().capitalize() for term in vep_cols]

Import Data

First we'll import the vcf files into Pandas dataframes. We split out the ANN INFO field and parse it into the component fields. Since an annotation is provided for each transcript, we split by transcript (ie splitting on ,). Then for each transcript we break it into the fields specified by the tools documentation (splitting on |). Finally, we join back to the original dataframe, with each row representing a unique POS, REF, ALT, Feature_id combination.


In [3]:
vep_df = pd.read_table('./cftr.grch37.vep.vcf', header=0, skiprows=6, usecols=range(10))
info_df = vep_df["INFO"].str.replace("ANN=", "").str.split(",").apply(pd.Series, 1).stack()
info_df = info_df.str.split("|").apply(pd.Series, 1)
info_df.index = info_df.index.droplevel(-1)
info_df.columns = vep_cols
vep_df = vep_df.join(info_df)
vep_df = vep_df[['POS', 'REF', 'ALT', 'Feature_id', 'Biotype', 'Annotation', 'Impact', 'Hgvsc']]
del info_df
vep_df.head()


Out[3]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc
0 117105737 C A ENST00000546407 processed_transcript upstream_gene_variant MODIFIER
1 117105737 C G ENST00000546407 processed_transcript upstream_gene_variant MODIFIER
2 117105737 C T ENST00000546407 processed_transcript upstream_gene_variant MODIFIER
3 117105737 C CA ENST00000546407 processed_transcript upstream_gene_variant MODIFIER
4 117105737 C CG ENST00000546407 processed_transcript upstream_gene_variant MODIFIER

In [4]:
snpeff_cols = """\
Allele|Annotation|Impact|Gene_name|Gene_ID|Feature_type|Feature_ID|Biotype|Rank|HGVSc|HGVSp\
|cDNA_position|CDS_position|Protein_position|Distance|Errors"""
snpeff_cols = snpeff_cols.split("|")
snpeff_cols = [term.strip().capitalize() for term in snpeff_cols]

In [5]:
snpeff_df = pd.read_table('./cftr.grch37.snpeff.vcf', header=0, skiprows=9, usecols=range(10))
info_df = snpeff_df["INFO"].str.split(";").apply(pd.Series, 1)[0] #snpeff includes two other INFO fields that we don't need
info_df = info_df.str.replace("ANN=", "").str.split(",").apply(pd.Series, 1).stack()
info_df = info_df.str.split("|").apply(pd.Series, 1)
info_df.index = info_df.index.droplevel(-1)
info_df.columns = snpeff_cols
snpeff_df = snpeff_df.join(info_df)
snpeff_df = snpeff_df[['POS', 'REF', 'ALT', 'Feature_id', 'Biotype', 'Annotation', 'Impact', 'Hgvsc']]
del info_df
snpeff_df.head()


Out[5]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc
0 117105737 C A ENST00000546407 processed_transcript upstream_gene_variant MODIFIER n.-101C>A
0 117105737 C A ENSG00000214684-ENSG00000001626 intergenic_region MODIFIER n.117105737C>A
1 117105737 C G ENST00000546407 processed_transcript upstream_gene_variant MODIFIER n.-101C>G
1 117105737 C G ENSG00000214684-ENSG00000001626 intergenic_region MODIFIER n.117105737C>G
2 117105737 C T ENST00000546407 processed_transcript upstream_gene_variant MODIFIER n.-101C>T

Collapsing Annotations

For each transcript-variant pair, the tool may provide multiple annotations. We'll only compare the most damaging as defined by the ranking in the ANN field spec.


In [6]:
#From http://snpeff.sourceforge.net/VCFannotationformat_v1.0.pdf with additions:
#VEP: non_coding_transcript_exon_variant, non_coding_transcript_variant, protein altering variant, 
#incomplete_terminal_codon_variant, NMD_transcript_variant
#Snpeff: conservative_inframe_deletion, conservative_inframe_insertion, structural_interaction_variant, 5_prime_UTR_truncation

ranked_terms = ["chromosome_number_variation","exon_loss_variant","frameshift_variant","stop_gained","stop_lost",
                "start_lost","splice_acceptor_variant","splice_donor_variant","rare_amino_acid_variant","missense_variant",
                "inframe_insertion","conservative_inframe_insertion", "disruptive_inframe_insertion","inframe_deletion","conservative_inframe_deletion", "disruptive_inframe_deletion",
                "5_prime_UTR_truncation+exon_loss_variant","5_prime_UTR_truncation","3_prime_UTR_truncation+exon_loss","splice_branch_variant",
                "splice_region_variant","stop_retained_variant","initiator_codon_variant",
                "synonymous_variant","initiator_codon_variant+non_canonical_start_codon","stop_retained_variant",
                "5_prime_UTR_variant","3_prime_UTR_variant","5_prime_UTR_premature_start_codon_gain_variant",
                "structural_interaction_variant", "protein_altering_variant","upstream_gene_variant","downstream_gene_variant",
                "TF_binding_site_variant","regulatory_region_variant","miRNA","custom","sequence_feature",
                "conserved_intron_variant","intron_variant","intragenic_variant","conserved_intergenic_variant",
                "intergenic_region", "coding_sequence_variant", "non_coding_exon_variant","non_coding_transcript_exon_variant",
                "nc_transcript_variant","non_coding_transcript_variant","NMD_transcript_variant", "incomplete_terminal_codon_variant", "gene_variant","chromosome"]
def term_rank(term):
    return ranked_terms.index(term)

In [7]:
vep_df["Effect"] = vep_df.apply(lambda row: min(row["Annotation"].split("&"), key=term_rank), axis=1)
snpeff_df["Effect"] = snpeff_df.apply(lambda row: min(row["Annotation"].split('&'), key=term_rank), axis=1)

In [8]:
vc_vep = vep_df.groupby(['Effect']).size()
vc_vep.name = "VEP"
vc_snpeff = snpeff_df.groupby(['Effect']).size()
vc_snpeff.name = "SnpEff"
vc_df = pd.DataFrame([vc_vep, vc_snpeff])
print("Annotations\n")
print(vc_df.transpose().fillna(0))
impact_vep = vep_df.groupby(['Impact']).size()
impact_vep.name = "VEP"
impact_snpeff = snpeff_df.groupby(['Impact']).size()
impact_snpeff.name = "SnpEff"
impact_df = pd.DataFrame([impact_vep, impact_snpeff])
print("\nImpacts")
print(impact_df.transpose())
counts_vep = vep_df.count()
counts_vep.name = 'VEP'
counts_snpeff = snpeff_df.count()
counts_snpeff.name = 'SnpEff'
counts_df = pd.DataFrame([counts_vep, counts_snpeff])
print("\nCounts")
print(counts_df.transpose())


Annotations

                                                     VEP    SnpEff
3_prime_UTR_variant                               6654.0    6680.0
5_prime_UTR_premature_start_codon_gain_variant       0.0     286.0
5_prime_UTR_variant                               9987.0   10259.0
coding_sequence_variant                             28.0       0.0
conservative_inframe_deletion                        0.0    4619.0
conservative_inframe_insertion                       0.0    8650.0
disruptive_inframe_deletion                          0.0    8990.0
disruptive_inframe_insertion                         0.0   17211.0
downstream_gene_variant                          99177.0   99185.0
exon_loss_variant                                    0.0       7.0
frameshift_variant                              113706.0  113693.0
incomplete_terminal_codon_variant                    8.0       0.0
inframe_deletion                                 13604.0       0.0
inframe_insertion                                18961.0       0.0
initiator_codon_variant                              0.0       8.0
intergenic_region                                    0.0    1403.0
intron_variant                                  417918.0  416946.0
missense_variant                                 30988.0   30988.0
non_coding_transcript_exon_variant               29794.0   29567.0
non_coding_transcript_variant                        0.0      42.0
protein_altering_variant                          6834.0       0.0
sequence_feature                                     0.0    5220.0
splice_acceptor_variant                           2208.0    2776.0
splice_donor_variant                              2208.0    2869.0
splice_region_variant                            17865.0   18005.0
start_lost                                          55.0      65.0
stop_gained                                       4877.0    4867.0
stop_lost                                           55.0      63.0
stop_retained_variant                                5.0       5.0
structural_interaction_variant                       0.0   78159.0
synonymous_variant                                9056.0    9052.0
upstream_gene_variant                            83853.0   83809.0

Impacts
             VEP  SnpEff
Impact                  
HIGH      123109  202492
LOW        26698   31676
MODERATE   70645   71365
MODIFIER  647389  647891

Counts
               VEP  SnpEff
POS         867841  953424
REF         867841  953424
ALT         867841  953424
Feature_id  867841  953424
Biotype     867841  953424
Annotation  867841  953424
Impact      867841  953424
Hgvsc       867841  953424
Effect      867841  953424

This example is interesting. Vep is usually a pretty general protein_altering_variant annotation. Snpeff breaks it down to disruptive_inframe_insertion.


In [9]:
vep_df[vep_df['Effect'].str.contains('protein_altering_variant')][:1]


Out[9]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
19511 117120152 C CCGA ENST00000003084 protein_coding protein_altering_variant MODERATE ENST00000003084.6:c.4_5insCGA protein_altering_variant

In [10]:
vep_df.ix[19511]


Out[10]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
19511 117120152 C CCGA ENST00000003084 protein_coding protein_altering_variant MODERATE ENST00000003084.6:c.4_5insCGA protein_altering_variant
19511 117120152 C CCGA ENST00000426809 protein_coding protein_altering_variant MODERATE ENST00000426809.1:c.4_5insCGA protein_altering_variant
19511 117120152 C CCGA ENST00000446805 protein_coding intron_variant MODIFIER ENST00000446805.1:c.-191+404_-191+405insCGA intron_variant
19511 117120152 C CCGA ENST00000454343 protein_coding protein_altering_variant MODERATE ENST00000454343.1:c.4_5insCGA protein_altering_variant
19511 117120152 C CCGA ENST00000546407 processed_transcript intron_variant&non_coding_transcript_variant MODIFIER ENST00000546407.1:n.166+4290_166+4291insCGA intron_variant

In [11]:
snpeff_df.ix[19511]


Out[11]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
19511 117120152 C CCGA ENST00000003084 protein_coding disruptive_inframe_insertion MODERATE c.4_5insCGA disruptive_inframe_insertion
19511 117120152 C CCGA ENST00000454343 protein_coding disruptive_inframe_insertion MODERATE c.4_5insCGA disruptive_inframe_insertion
19511 117120152 C CCGA ENST00000426809 protein_coding disruptive_inframe_insertion MODERATE c.4_5insCGA disruptive_inframe_insertion
19511 117120152 C CCGA ENST00000546407 processed_transcript intron_variant MODIFIER n.166+4290_166+4291insCGA intron_variant
19511 117120152 C CCGA ENST00000446805 protein_coding intron_variant MODIFIER c.-191+404_-191+405insCGA intron_variant

So this example is interesting. It shows two things:

  1. SnpEff is providing these "structural_interaction_variant" annotations, which vep does not provide
  2. SnpEff is a bit more granular annotating inframe variants

In [12]:
snpeff_df[snpeff_df['Effect'].str.contains('structural_interaction_variant')][:1]


Out[12]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
58085 117182112 TTAA T 2PZG:B_388-B_567:ENST00000003084 protein_coding structural_interaction_variant HIGH c.1160_1162delTAA structural_interaction_variant

In [13]:
vep_df.ix[58085]


Out[13]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
58085 117182112 TTAA T ENST00000003084 protein_coding inframe_deletion MODERATE ENST00000003084.6:c.1160_1162delTAA inframe_deletion
58085 117182112 TTAA T ENST00000426809 protein_coding inframe_deletion MODERATE ENST00000426809.1:c.1070_1072delTAA inframe_deletion
58085 117182112 TTAA T ENST00000454343 protein_coding inframe_deletion MODERATE ENST00000454343.1:c.1160_1162delTAA inframe_deletion

In [14]:
snpeff_df.ix[58085]


Out[14]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
58085 117182112 TTAA T 2PZG:B_388-B_567:ENST00000003084 protein_coding structural_interaction_variant HIGH c.1160_1162delTAA structural_interaction_variant
58085 117182112 TTAA T ENST00000003084 protein_coding disruptive_inframe_deletion MODERATE c.1160_1162delTAA disruptive_inframe_deletion
58085 117182112 TTAA T ENST00000454343 protein_coding disruptive_inframe_deletion MODERATE c.1160_1162delTAA disruptive_inframe_deletion
58085 117182112 TTAA T ENST00000426809 protein_coding disruptive_inframe_deletion MODERATE c.1070_1072delTAA disruptive_inframe_deletion

Similarly to the structural interaction variant example, Snpeff provides an additional annotation called sequence_feature


In [15]:
snpeff_df[snpeff_df['Effect'].str.contains('sequence_feature')][3:4]


Out[15]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
19449 117120148 C CA ENST00000003084 protein_coding sequence_feature LOW c.-1_1insA sequence_feature

In [16]:
vep_df.ix[19449]


Out[16]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
19449 117120148 C CA ENST00000003084 protein_coding 5_prime_UTR_variant MODIFIER ENST00000003084.6:c.1dupA 5_prime_UTR_variant
19449 117120148 C CA ENST00000426809 protein_coding upstream_gene_variant MODIFIER upstream_gene_variant
19449 117120148 C CA ENST00000446805 protein_coding intron_variant MODIFIER ENST00000446805.1:c.-191+401dupA intron_variant
19449 117120148 C CA ENST00000454343 protein_coding 5_prime_UTR_variant MODIFIER ENST00000454343.1:c.1dupA 5_prime_UTR_variant
19449 117120148 C CA ENST00000546407 processed_transcript intron_variant&non_coding_transcript_variant MODIFIER ENST00000546407.1:n.166+4287dupA intron_variant

In [17]:
snpeff_df.ix[19449]


Out[17]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
19449 117120148 C CA ENST00000003084 protein_coding frameshift_variant&start_lost HIGH c.1dupA frameshift_variant
19449 117120148 C CA ENST00000454343 protein_coding frameshift_variant&start_lost HIGH c.1dupA frameshift_variant
19449 117120148 C CA ENST00000426809 protein_coding frameshift_variant&start_lost HIGH c.1dupA frameshift_variant
19449 117120148 C CA ENST00000003084 protein_coding sequence_feature LOW c.-1_1insA sequence_feature
19449 117120148 C CA ENST00000546407 processed_transcript intron_variant MODIFIER n.166+4287dupA intron_variant
19449 117120148 C CA ENST00000446805 protein_coding intron_variant MODIFIER c.-191+401dupA intron_variant

Here's a consequential mismatch. Vep annotates this deletion of a T on ENST00000446805 as coding_sequence_variant with a LOW impact, whereas SnpEff annotates it as a frameshift with a HIGH impact.


In [18]:
vep_df[vep_df['Effect'].str.contains('coding_sequence_variant')][:4]


Out[18]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
32771 117171030 CT C ENST00000446805 protein_coding incomplete_terminal_codon_variant&coding_seque... LOW ENST00000446805.1:c.109delT coding_sequence_variant
32774 117171031 T A ENST00000446805 protein_coding incomplete_terminal_codon_variant&coding_seque... LOW ENST00000446805.1:c.109T>A coding_sequence_variant
32775 117171031 T G ENST00000446805 protein_coding incomplete_terminal_codon_variant&coding_seque... LOW ENST00000446805.1:c.109T>G coding_sequence_variant
32776 117171031 T C ENST00000446805 protein_coding incomplete_terminal_codon_variant&coding_seque... LOW ENST00000446805.1:c.109T>C coding_sequence_variant

In [19]:
vep_df.ix[32771]


Out[19]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
32771 117171030 CT C ENST00000003084 protein_coding frameshift_variant HIGH ENST00000003084.6:c.352delT frameshift_variant
32771 117171030 CT C ENST00000426809 protein_coding frameshift_variant HIGH ENST00000426809.1:c.352delT frameshift_variant
32771 117171030 CT C ENST00000446805 protein_coding incomplete_terminal_codon_variant&coding_seque... LOW ENST00000446805.1:c.109delT coding_sequence_variant
32771 117171030 CT C ENST00000454343 protein_coding frameshift_variant HIGH ENST00000454343.1:c.352delT frameshift_variant

In [20]:
snpeff_df.ix[32771]


Out[20]:
POS REF ALT Feature_id Biotype Annotation Impact Hgvsc Effect
32771 117171030 CT C ENST00000003084 protein_coding frameshift_variant HIGH c.352delT frameshift_variant
32771 117171030 CT C ENST00000446805 protein_coding frameshift_variant&splice_region_variant HIGH c.109delT frameshift_variant
32771 117171030 CT C ENST00000454343 protein_coding frameshift_variant HIGH c.352delT frameshift_variant
32771 117171030 CT C ENST00000426809 protein_coding frameshift_variant HIGH c.352delT frameshift_variant
32771 117171030 CT C ENST00000003084 protein_coding sequence_feature LOW c.352delT sequence_feature

So we'll remove some of this inconsistances by dropping rows that are annotated as structural_interaction_variant or sequence_feature by Snpeff and normalizing SnpEff more granular inframe annotations.


In [21]:
snpeff_df = snpeff_df[~snpeff_df['Effect'].str.contains('structural_interaction_variant|sequence_feature')]

In [22]:
collapse_map = {
'3_prime_UTR_variant': '3_prime_UTR_variant', 
'5_prime_UTR_premature_start_codon_gain_variant': '5_prime_UTR_premature_start_codon_gain_variant',
'5_prime_UTR_variant': '5_prime_UTR_variant',
'coding_sequence_variant': 'coding_sequence_variant',
'conservative_inframe_deletion': 'inframe_deletion',
'conservative_inframe_insertion': 'inframe_insertion',
'disruptive_inframe_deletion': 'inframe_deletion',
'disruptive_inframe_insertion': 'inframe_insertion',
'downstream_gene_variant': 'downstream_gene_variant',
'exon_loss_variant': 'exon_loss_variant',
'frameshift_variant': 'frameshift_variant',
'incomplete_terminal_codon_variant': 'incomplete_terminal_codon_variant',
'inframe_deletion': 'inframe_deletion',
'inframe_insertion': 'inframe_insertion', 
'initiator_codon_variant': 'initiator_codon_variant',
'intergenic_region': 'intergenic_region',
'intron_variant': 'intron_variant',
'missense_variant': 'missense_variant',
'non_coding_transcript_exon_variant': 'non_coding_transcript_exon_variant',
'non_coding_transcript_variant': 'non_coding_transcript_variant',
'protein_altering_variant': 'inframe_insertion',
'splice_acceptor_variant': 'splice_acceptor_variant',
'splice_donor_variant': 'splice_donor_variant',
'splice_region_variant': 'splice_region_variant',
'start_lost': 'start_lost',
'stop_gained': 'stop_gained',
'stop_lost': 'stop_lost', 
'stop_retained_variant': 'stop_retained_variant',
'synonymous_variant': 'synonymous_variant',
'upstream_gene_variant': 'upstream_gene_variant'}

In [23]:
vep_df['Normalized_effect'] = vep_df['Effect'].apply(lambda eff: collapse_map[eff])
snpeff_df['Normalized_effect'] = snpeff_df['Effect'].apply(lambda eff: collapse_map[eff])

Effect and Impact Condorance

First, let's look at the effects numbers again, but this time normalized.


In [24]:
vc_vep = vep_df.groupby(['Normalized_effect']).size()
vc_vep.name = "VEP"
vc_snpeff = snpeff_df.groupby(['Normalized_effect']).size()
vc_snpeff.name = "SnpEff"
vc_df = pd.DataFrame([vc_vep, vc_snpeff])
print("Annotations\n")
print(vc_df.transpose().fillna(0))
vc_df.transpose().plot(kind="barh", fontsize=13, figsize=(16,8))


Annotations

                                                     VEP    SnpEff
3_prime_UTR_variant                               6654.0    6680.0
5_prime_UTR_premature_start_codon_gain_variant       0.0     286.0
5_prime_UTR_variant                               9987.0   10259.0
coding_sequence_variant                             28.0       0.0
downstream_gene_variant                          99177.0   99185.0
exon_loss_variant                                    0.0       7.0
frameshift_variant                              113706.0  113693.0
incomplete_terminal_codon_variant                    8.0       0.0
inframe_deletion                                 13604.0   13609.0
inframe_insertion                                25795.0   25861.0
initiator_codon_variant                              0.0       8.0
intergenic_region                                    0.0    1403.0
intron_variant                                  417918.0  416946.0
missense_variant                                 30988.0   30988.0
non_coding_transcript_exon_variant               29794.0   29567.0
non_coding_transcript_variant                        0.0      42.0
splice_acceptor_variant                           2208.0    2776.0
splice_donor_variant                              2208.0    2869.0
splice_region_variant                            17865.0   18005.0
start_lost                                          55.0      65.0
stop_gained                                       4877.0    4867.0
stop_lost                                           55.0      63.0
stop_retained_variant                                5.0       5.0
synonymous_variant                                9056.0    9052.0
upstream_gene_variant                            83853.0   83809.0
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd01af1bb38>

Now we join the VEP and SnpEff dataframes together and count number of matches


In [25]:
effect_df = pd.merge(vep_df, snpeff_df, on=['POS', 'REF', 'ALT', "Feature_id" ], how='outer', suffixes=('_vep','_snpeff'))

In [26]:
effect_df['Impact_match'] = effect_df.apply(lambda row: row['Impact_vep'] == row['Impact_snpeff'], axis=1)

In [27]:
effect_df['Effect_match'] = effect_df.apply(lambda row: row['Effect_vep'] == row['Effect_snpeff'], axis=1)

In [28]:
effect_df['Normalized_Effect_match'] = effect_df.apply(lambda row: row['Normalized_effect_vep'] == row['Normalized_effect_snpeff'], axis=1)

In [29]:
round(effect_df['Impact_match'].value_counts()/effect_df['Impact_match'].size*100, 2)


Out[29]:
True     99.29
False     0.71
Name: Impact_match, dtype: float64

We find a 99% match between calculated impacts between tools. And below we examine the 4x4 grid of categorization matches. There are a suprising number (almost 1600) of HIGH-LOW disagreements. These deserve to be examined.


In [30]:
effect_df.groupby(['Impact_vep', 'Impact_snpeff'])['Impact_match'].count()


Out[30]:
Impact_vep  Impact_snpeff
HIGH        HIGH             122728
            LOW                 307
            MODERATE             64
            MODIFIER             55
LOW         HIGH               1286
            LOW               24976
            MODERATE             19
            MODIFIER            633
MODERATE    HIGH                243
            LOW                  19
            MODERATE          70377
            MODIFIER              3
MODIFIER    HIGH                 76
            LOW                2054
            MODERATE              5
            MODIFIER         645789
Name: Impact_match, dtype: int64

In [31]:
figure, axes = plt.subplots(2, 2)
figure.set_size_inches(10,10)
figure.suptitle("VEP-SnpEff Impact Concordance", fontsize=14, fontweight='bold')
for idx, level in enumerate(effect_df['Impact_snpeff'].dropna().unique()):
    pos = (int(idx/2), idx%2)
    vep_level = effect_df[effect_df['Impact_vep'] == level]
    snpeff_level = effect_df[effect_df['Impact_snpeff'] == level]
    axes[pos].set_title(level, fontsize=12, fontweight='bold')
    venn2_unweighted([set(vep_level.index.values), set(snpeff_level.index.values)], set_labels=('VEP', 'SnpEff'), set_colors=('g', 'b'),
                     ax=axes[pos[0]][pos[1]])
plt.tight_layout()
plt.subplots_adjust(top=.95)
plt.show()



In [32]:
round(effect_df['Effect_match'].value_counts()/effect_df['Effect_match'].size*100, 2)


Out[32]:
True     94.57
False     5.43
Name: Effect_match, dtype: float64

Looking at the unnormalized effects, there is 95% rate of condodance.


In [33]:
pd.DataFrame(effect_df.groupby(['Effect_vep', 'Effect_snpeff'])['Effect_match'].count())


Out[33]:
Effect_match
Effect_vep Effect_snpeff
3_prime_UTR_variant 3_prime_UTR_variant 6648
downstream_gene_variant 12
frameshift_variant 6
non_coding_transcript_variant 6
splice_region_variant 74
stop_lost 8
5_prime_UTR_variant 5_prime_UTR_premature_start_codon_gain_variant 279
5_prime_UTR_variant 9944
conservative_inframe_insertion 1
exon_loss_variant 3
frameshift_variant 34
non_coding_transcript_variant 14
splice_region_variant 34
start_lost 15
upstream_gene_variant 18
coding_sequence_variant 5_prime_UTR_variant 12
frameshift_variant 4
splice_region_variant 12
downstream_gene_variant 3_prime_UTR_variant 16
downstream_gene_variant 99161
splice_region_variant 16
frameshift_variant 3_prime_UTR_variant 14
5_prime_UTR_variant 14
exon_loss_variant 2
frameshift_variant 112937
splice_donor_variant 725
splice_region_variant 17
incomplete_terminal_codon_variant frameshift_variant 6
inframe_deletion conservative_inframe_deletion 4609
disruptive_inframe_deletion 8964
exon_loss_variant 2
frameshift_variant 31
inframe_insertion 3_prime_UTR_variant 2
5_prime_UTR_variant 1
conservative_inframe_insertion 8624
disruptive_inframe_insertion 10130
frameshift_variant 14
splice_donor_variant 166
splice_region_variant 19
intron_variant intron_variant 416576
splice_region_variant 1342
missense_variant missense_variant 30988
non_coding_transcript_exon_variant downstream_gene_variant 12
non_coding_transcript_exon_variant 29472
splice_donor_variant 2
splice_region_variant 309
upstream_gene_variant 18
protein_altering_variant disruptive_inframe_insertion 6832
frameshift_variant 2
splice_acceptor_variant 5_prime_UTR_variant 10
conservative_inframe_deletion 10
conservative_inframe_insertion 16
disruptive_inframe_deletion 26
disruptive_inframe_insertion 10
frameshift_variant 414
non_coding_transcript_exon_variant 1
non_coding_transcript_variant 7
splice_acceptor_variant 1692
splice_region_variant 42
stop_gained 4
splice_donor_variant frameshift_variant 217
non_coding_transcript_variant 9
splice_donor_variant 1760
splice_region_variant 240
splice_region_variant 5_prime_UTR_premature_start_codon_gain_variant 7
5_prime_UTR_variant 214
conservative_inframe_insertion 8
disruptive_inframe_insertion 239
frameshift_variant 16
intron_variant 370
non_coding_transcript_exon_variant 41
non_coding_transcript_variant 2
splice_acceptor_variant 1084
splice_donor_variant 198
splice_region_variant 15896
stop_gained 4
start_lost initiator_codon_variant 8
start_lost 47
stop_gained splice_donor_variant 18
stop_gained 4859
stop_lost stop_lost 55
stop_retained_variant stop_retained_variant 5
synonymous_variant splice_region_variant 4
synonymous_variant 9052
upstream_gene_variant 5_prime_UTR_variant 64
conservative_inframe_insertion 1
frameshift_variant 12
non_coding_transcript_exon_variant 53
non_coding_transcript_variant 4
start_lost 3
upstream_gene_variant 83765

In [34]:
round(effect_df['Normalized_Effect_match'].value_counts()/effect_df['Normalized_Effect_match'].size*100, 2)


Out[34]:
True     99.08
False     0.92
Name: Normalized_Effect_match, dtype: float64

And once normalized (ie inframe annotations are generalized). The condordance rate jumps to 99%.


In [35]:
pd.DataFrame(effect_df.groupby(['Normalized_effect_vep', 'Normalized_effect_snpeff'])['Effect_match'].count())


Out[35]:
Effect_match
Normalized_effect_vep Normalized_effect_snpeff
3_prime_UTR_variant 3_prime_UTR_variant 6648
downstream_gene_variant 12
frameshift_variant 6
non_coding_transcript_variant 6
splice_region_variant 74
stop_lost 8
5_prime_UTR_variant 5_prime_UTR_premature_start_codon_gain_variant 279
5_prime_UTR_variant 9944
exon_loss_variant 3
frameshift_variant 34
inframe_insertion 1
non_coding_transcript_variant 14
splice_region_variant 34
start_lost 15
upstream_gene_variant 18
coding_sequence_variant 5_prime_UTR_variant 12
frameshift_variant 4
splice_region_variant 12
downstream_gene_variant 3_prime_UTR_variant 16
downstream_gene_variant 99161
splice_region_variant 16
frameshift_variant 3_prime_UTR_variant 14
5_prime_UTR_variant 14
exon_loss_variant 2
frameshift_variant 112937
splice_donor_variant 725
splice_region_variant 17
incomplete_terminal_codon_variant frameshift_variant 6
inframe_deletion exon_loss_variant 2
frameshift_variant 31
inframe_deletion 13573
inframe_insertion 3_prime_UTR_variant 2
5_prime_UTR_variant 1
frameshift_variant 16
inframe_insertion 25586
splice_donor_variant 166
splice_region_variant 19
intron_variant intron_variant 416576
splice_region_variant 1342
missense_variant missense_variant 30988
non_coding_transcript_exon_variant downstream_gene_variant 12
non_coding_transcript_exon_variant 29472
splice_donor_variant 2
splice_region_variant 309
upstream_gene_variant 18
splice_acceptor_variant 5_prime_UTR_variant 10
frameshift_variant 414
inframe_deletion 36
inframe_insertion 26
non_coding_transcript_exon_variant 1
non_coding_transcript_variant 7
splice_acceptor_variant 1692
splice_region_variant 42
stop_gained 4
splice_donor_variant frameshift_variant 217
non_coding_transcript_variant 9
splice_donor_variant 1760
splice_region_variant 240
splice_region_variant 5_prime_UTR_premature_start_codon_gain_variant 7
5_prime_UTR_variant 214
frameshift_variant 16
inframe_insertion 247
intron_variant 370
non_coding_transcript_exon_variant 41
non_coding_transcript_variant 2
splice_acceptor_variant 1084
splice_donor_variant 198
splice_region_variant 15896
stop_gained 4
start_lost initiator_codon_variant 8
start_lost 47
stop_gained splice_donor_variant 18
stop_gained 4859
stop_lost stop_lost 55
stop_retained_variant stop_retained_variant 5
synonymous_variant splice_region_variant 4
synonymous_variant 9052
upstream_gene_variant 5_prime_UTR_variant 64
frameshift_variant 12
inframe_insertion 1
non_coding_transcript_exon_variant 53
non_coding_transcript_variant 4
start_lost 3
upstream_gene_variant 83765

In [36]:
effect_list = sorted(set(effect_df['Normalized_effect_vep'].dropna().unique()) | set(effect_df['Normalized_effect_snpeff'].dropna().unique()), 
key=term_rank)

figure, axes = plt.subplots(13, 2)
figure.set_size_inches(10,45)
figure.suptitle("VEP-SnpEff Effect Concordance", fontsize=14, fontweight='bold')

for idx, effect in enumerate(effect_list):
    pos = (int(idx/2), idx%2)
    vep_effect = effect_df[effect_df['Normalized_effect_vep'] == effect]
    snpeff_effect = effect_df[effect_df['Normalized_effect_snpeff'] == effect]
    axes[pos].set_title(effect, fontsize=12, fontweight='bold')
    venn2_unweighted([set(vep_effect.index.values), set(snpeff_effect.index.values)], set_labels=('VEP', 'SnpEff'), set_colors=('g', 'b'),
                     ax=axes[pos[0]][pos[1]])
axes[12, 1].axis('off')

plt.subplots_adjust(top=.95)
plt.show()


HGVS cDot Comparison


In [37]:
effect_df['Hgvsc_vep'] = effect_df['Hgvsc_vep'].str.replace(r'^.*:', '').str.strip()
effect_df['Hgvsc_snpeff'] = effect_df['Hgvsc_snpeff'].str.replace(r'^.*:', '').str.strip()

In [38]:
effect_df['Hgvsc_match'] = effect_df.apply(lambda row: row['Hgvsc_vep'] == row['Hgvsc_snpeff'], axis=1)

First, let's just see how many HGVS cDot notations are the same.


In [39]:
round(effect_df['Hgvsc_match'].value_counts()/effect_df['Hgvsc_match'].size*100, 2)


Out[39]:
True     69.09
False    30.91
Name: Hgvsc_match, dtype: float64

We see there is about a 69% match between generated HGVS identifiers.

This isn's really a fair comparison since the tools don't always generate coding idenfiers for variants that are intergenic. So we'll check the concordance of variants where both tools generated a cDot.


In [40]:
has_hgvsc = (effect_df['Hgvsc_vep'] != '') & (effect_df['Hgvsc_snpeff'] != '')

In [41]:
filtered = effect_df[has_hgvsc]
round(filtered['Hgvsc_match'].value_counts()/filtered['Hgvsc_match'].size*100, 2)


Out[41]:
True     87.53
False    12.47
Name: Hgvsc_match, dtype: float64

Now we see an almost 88% condordance between identifiers.

But, let's filter down to coding transcripts since these are the transcripts that contain an open reading frame


In [42]:
is_coding = (effect_df['Biotype_vep'] == 'protein_coding') & (effect_df['Biotype_snpeff'] == 'protein_coding')

In [43]:
filtered_2 = effect_df[has_hgvsc & is_coding]

In [44]:
round(filtered_2['Hgvsc_match'].value_counts()/filtered_2['Hgvsc_match'].size*100, 2)


Out[44]:
True     86.33
False    13.67
Name: Hgvsc_match, dtype: float64

Here the condordance drops slighty to 86%.

Now let's just look at HGVS notation on the canonical transcript ENST00000003084


In [51]:
on_canonical = (effect_df['Feature_id'] == "ENST00000003084")

filtered_3 = effect_df[has_hgvsc & on_canonical]

round(filtered_3['Hgvsc_match'].value_counts()/filtered_3['Hgvsc_match'].size*100, 2)


Out[51]:
True    100.0
Name: Hgvsc_match, dtype: float64

Wow! So on the canonical transcript, all the HGVS annotations match. Since this transcript is well chariacterized it suggests that the mismatches are being driven by differences in transcription representation (and/or parsing of the GFF file).

Let's just look at a few random mismatches


In [45]:
mismatch_df = filtered_2[filtered_2['Hgvsc_match'] == False]
mismatch_df[['POS', 'REF', 'ALT', 'Feature_id', 'Hgvsc_vep', 'Hgvsc_snpeff', 'Effect_vep', 'Effect_snpeff']]


Out[45]:
POS REF ALT Feature_id Hgvsc_vep Hgvsc_snpeff Effect_vep Effect_snpeff
39739 117119455 TTTG T ENST00000446805 c.-423-58_-424+57delTTG c.-424+57_-423-58delTTG intron_variant intron_variant
39779 117119456 T TTG ENST00000446805 c.-423-58_-424+58dupTG c.-424+58_-423-58dupTG intron_variant intron_variant
39804 117119456 TTG T ENST00000446805 c.-423-58_-424+58delTG c.-424+58_-423-58delTG intron_variant intron_variant
39829 117119457 T TA ENST00000446805 c.-423-58_-424+58insA c.-424+58_-423-58insA intron_variant intron_variant
39839 117119457 T TC ENST00000446805 c.-423-58_-424+58insC c.-424+58_-423-58insC intron_variant intron_variant
39854 117119457 T TTA ENST00000446805 c.-423-58_-424+58insTA c.-424+58_-423-58insTA intron_variant intron_variant
39859 117119457 T TTGA ENST00000446805 c.-423-58_-424+58insTGA c.-424+58_-423-58insTGA intron_variant intron_variant
39864 117119457 T TCGA ENST00000446805 c.-423-58_-424+58insCGA c.-424+58_-423-58insCGA intron_variant intron_variant
39924 117119458 G GTG ENST00000446805 c.-423-58_-424+58dupTG c.-424+58_-423-58dupTG intron_variant intron_variant
148045 117171029 GCT G ENST00000446805 c.108_109delCT c.110_*1delCT frameshift_variant frameshift_variant
148089 117171030 C CTAC ENST00000446805 c.108_109insTAC c.109_110insACT inframe_insertion disruptive_inframe_insertion
506572 117251670 TT T ENST00000468795 c.1delT c.-2delT frameshift_variant 5_prime_UTR_variant
506577 117251670 TTA T ENST00000468795 c.1_2delTA c.-2_-1delTA frameshift_variant exon_loss_variant
506582 117251670 TTAA T ENST00000468795 c.1_3delTAA c.-2_1delTAA inframe_deletion exon_loss_variant
506583 117251670 TTAA T ENST00000468795 c.1_3delTAA c.-2_1delTAA inframe_deletion conservative_inframe_deletion
506588 117251671 T A ENST00000468795 c.1T>A c.-2T>A coding_sequence_variant 5_prime_UTR_variant
506593 117251671 T G ENST00000468795 c.1T>G c.-2T>G coding_sequence_variant 5_prime_UTR_variant
506598 117251671 T C ENST00000468795 c.1T>C c.-2T>C coding_sequence_variant 5_prime_UTR_variant
506603 117251671 T TA ENST00000468795 c.5dupA c.3dupA frameshift_variant frameshift_variant
506608 117251671 T TG ENST00000468795 c.1_2insG c.-2_-1insG frameshift_variant 5_prime_UTR_variant
506613 117251671 T TC ENST00000468795 c.1_2insC c.-2_-1insC frameshift_variant 5_prime_UTR_variant
506618 117251671 T TT ENST00000468795 c.1dupT c.-2dupT frameshift_variant 5_prime_UTR_variant
506623 117251671 T TGC ENST00000468795 c.1_2insGC c.-2_-1insGC frameshift_variant 5_prime_UTR_variant
506628 117251671 T TGA ENST00000468795 c.1_2insGA c.-2_-1insGA frameshift_variant 5_prime_UTR_variant
506633 117251671 T TATC ENST00000468795 c.2_3insTCA c.-1_1insTCA inframe_insertion conservative_inframe_insertion
506638 117251671 T TTGC ENST00000468795 c.1_2insTGC c.-2_-1insTGC inframe_insertion 5_prime_UTR_variant
506643 117251671 TA T ENST00000468795 c.5delA c.3delA frameshift_variant frameshift_variant
506648 117251671 TAA T ENST00000468795 c.4_5delAA c.2_3delAA frameshift_variant frameshift_variant
506653 117251671 TAAA T ENST00000468795 c.3_5delAAA c.1_3delAAA inframe_deletion conservative_inframe_deletion
506658 117251672 A G ENST00000468795 c.2A>G c.-1A>G coding_sequence_variant 5_prime_UTR_variant
506663 117251672 A C ENST00000468795 c.2A>C c.-1A>C coding_sequence_variant 5_prime_UTR_variant
506668 117251672 A T ENST00000468795 c.2A>T c.-1A>T coding_sequence_variant 5_prime_UTR_variant
506673 117251672 A AA ENST00000468795 c.5dupA c.3dupA frameshift_variant frameshift_variant
506678 117251672 A AG ENST00000468795 c.2_3insG c.-1_1insG frameshift_variant frameshift_variant
506683 117251672 A AC ENST00000468795 c.2_3insC c.-1_1insC frameshift_variant frameshift_variant
506688 117251672 A AT ENST00000468795 c.2_3insT c.-1_1insT frameshift_variant frameshift_variant
506693 117251672 A AGA ENST00000468795 c.2_3insGA c.-1_1insGA frameshift_variant frameshift_variant
506698 117251672 A AGC ENST00000468795 c.2_3insGC c.-1_1insGC frameshift_variant frameshift_variant
506703 117251672 A ACGT ENST00000468795 c.2_3insCGT c.-1_1insCGT inframe_insertion conservative_inframe_insertion
506708 117251672 A ACAG ENST00000468795 c.2_3insCAG c.-1_1insCAG inframe_insertion conservative_inframe_insertion
506713 117251672 AA A ENST00000468795 c.5delA c.3delA frameshift_variant frameshift_variant
506718 117251672 AAA A ENST00000468795 c.4_5delAA c.2_3delAA frameshift_variant frameshift_variant
506723 117251672 AAAA A ENST00000468795 c.3_5delAAA c.1_3delAAA inframe_deletion conservative_inframe_deletion
506728 117251673 A G ENST00000468795 c.3A>G c.1A>G missense_variant missense_variant
506733 117251673 A C ENST00000468795 c.3A>C c.1A>C missense_variant missense_variant
506738 117251673 A T ENST00000468795 c.3A>T c.1A>T stop_gained stop_gained
506743 117251673 A AA ENST00000468795 c.5dupA c.3dupA frameshift_variant frameshift_variant
506748 117251673 A AG ENST00000468795 c.3_4insG c.1_2insG frameshift_variant frameshift_variant
506753 117251673 A AC ENST00000468795 c.3_4insC c.1_2insC frameshift_variant frameshift_variant
506758 117251673 A AT ENST00000468795 c.3_4insT c.1_2insT frameshift_variant frameshift_variant
506763 117251673 A ACA ENST00000468795 c.3_4insCA c.1_2insCA frameshift_variant frameshift_variant
506768 117251673 A AGC ENST00000468795 c.3_4insGC c.1_2insGC frameshift_variant frameshift_variant
506773 117251673 A ATAC ENST00000468795 c.3_4insTAC c.1_2insTAC protein_altering_variant disruptive_inframe_insertion
506778 117251673 A ATGA ENST00000468795 c.3_4insTGA c.1_2insTGA inframe_insertion disruptive_inframe_insertion
506783 117251673 AA A ENST00000468795 c.5delA c.3delA frameshift_variant frameshift_variant
506788 117251673 AAA A ENST00000468795 c.4_5delAA c.2_3delAA frameshift_variant frameshift_variant
506793 117251673 AAAG A ENST00000468795 c.4_6delAAG c.2_4delAAG inframe_deletion disruptive_inframe_deletion
506798 117251674 A G ENST00000468795 c.4A>G c.2A>G missense_variant missense_variant
506803 117251674 A C ENST00000468795 c.4A>C c.2A>C missense_variant missense_variant
506808 117251674 A T ENST00000468795 c.4A>T c.2A>T missense_variant missense_variant
506813 117251674 A AA ENST00000468795 c.5dupA c.3dupA frameshift_variant frameshift_variant
506818 117251674 A AG ENST00000468795 c.4_5insG c.2_3insG frameshift_variant frameshift_variant
506823 117251674 A AC ENST00000468795 c.4_5insC c.2_3insC frameshift_variant frameshift_variant
506828 117251674 A AT ENST00000468795 c.4_5insT c.2_3insT frameshift_variant frameshift_variant
506833 117251674 A ATA ENST00000468795 c.4_5insTA c.2_3insTA frameshift_variant frameshift_variant
506838 117251674 A ACT ENST00000468795 c.4_5insCT c.2_3insCT frameshift_variant frameshift_variant
506843 117251674 A ACTG ENST00000468795 c.4_5insCTG c.2_3insCTG stop_gained stop_gained
506848 117251674 A AGAT ENST00000468795 c.4_5insGAT c.2_3insGAT inframe_insertion disruptive_inframe_insertion
506853 117251674 AA A ENST00000468795 c.5delA c.3delA frameshift_variant frameshift_variant
506858 117251674 AAG A ENST00000468795 c.5_6delAG c.3_4delAG frameshift_variant frameshift_variant
506863 117251674 AAGG A ENST00000468795 c.6_8delGGA c.4_6delGGA inframe_deletion conservative_inframe_deletion
506868 117251675 A G ENST00000468795 c.5A>G c.3A>G synonymous_variant synonymous_variant
506873 117251675 A C ENST00000468795 c.5A>C c.3A>C missense_variant missense_variant
506878 117251675 A T ENST00000468795 c.5A>T c.3A>T missense_variant missense_variant
506883 117251675 A AA ENST00000468795 c.5dupA c.3dupA frameshift_variant frameshift_variant
... ... ... ... ... ... ... ... ...
868183 117355976 GG G ENST00000446636 c.3208+2096delC c.3207+2095delC intron_variant intron_variant
868191 117355976 GGA G ENST00000446636 c.3208+2094_3208+2095delTC c.3207+2094_3207+2095delTC intron_variant intron_variant
868195 117355976 GGAG G ENST00000160373 c.4746+2095_4746+2097delCCT c.4746+2093_4746+2095delCTC intron_variant intron_variant
868199 117355976 GGAG G ENST00000446636 c.3208+2095_3208+2097delCCT c.3207+2093_3207+2095delCTC intron_variant intron_variant
868207 117355977 G A ENST00000446636 c.3208+2095C>T c.3207+2095C>T intron_variant intron_variant
868215 117355977 G C ENST00000446636 c.3208+2095C>G c.3207+2095C>G intron_variant intron_variant
868223 117355977 G T ENST00000446636 c.3208+2095C>A c.3207+2095C>A intron_variant intron_variant
868231 117355977 G GA ENST00000446636 c.3208+2094dupT c.3207+2094dupT intron_variant intron_variant
868235 117355977 G GG ENST00000160373 c.4746+2096dupC c.4746+2094_4746+2095insC intron_variant intron_variant
868239 117355977 G GG ENST00000446636 c.3208+2096dupC c.3207+2094_3207+2095insC intron_variant intron_variant
868247 117355977 G GC ENST00000446636 c.3208+2094_3208+2095insG c.3207+2094_3207+2095insG intron_variant intron_variant
868255 117355977 G GT ENST00000446636 c.3208+2094_3208+2095insA c.3207+2094_3207+2095insA intron_variant intron_variant
868263 117355977 G GAC ENST00000446636 c.3208+2094_3208+2095insGT c.3207+2094_3207+2095insGT intron_variant intron_variant
868271 117355977 G GCA ENST00000446636 c.3208+2094_3208+2095insTG c.3207+2094_3207+2095insTG intron_variant intron_variant
868279 117355977 G GGTA ENST00000446636 c.3208+2094_3208+2095insTAC c.3207+2094_3207+2095insTAC intron_variant intron_variant
868283 117355977 G GTCG ENST00000160373 c.4746+2095_4746+2096insGAC c.4746+2094_4746+2095insCGA intron_variant intron_variant
868287 117355977 G GTCG ENST00000446636 c.3208+2095_3208+2096insGAC c.3207+2094_3207+2095insCGA intron_variant intron_variant
868295 117355977 GA G ENST00000446636 c.3208+2094delT c.3207+2094delT intron_variant intron_variant
868299 117355977 GAG G ENST00000160373 c.4746+2094_4746+2095delTC c.4746+2093_4746+2094delCT intron_variant intron_variant
868303 117355977 GAG G ENST00000446636 c.3208+2094_3208+2095delTC c.3207+2093_3207+2094delCT intron_variant intron_variant
868311 117355977 GAGA G ENST00000446636 c.3208+2092_3208+2094delTCT c.3207+2092_3207+2094delTCT intron_variant intron_variant
868319 117355978 A G ENST00000446636 c.3208+2094T>C c.3207+2094T>C intron_variant intron_variant
868327 117355978 A C ENST00000446636 c.3208+2094T>G c.3207+2094T>G intron_variant intron_variant
868335 117355978 A T ENST00000446636 c.3208+2094T>A c.3207+2094T>A intron_variant intron_variant
868339 117355978 A AA ENST00000160373 c.4746+2094dupT c.4746+2093_4746+2094insT intron_variant intron_variant
868343 117355978 A AA ENST00000446636 c.3208+2094dupT c.3207+2093_3207+2094insT intron_variant intron_variant
868351 117355978 A AG ENST00000446636 c.3208+2093dupC c.3207+2093dupC intron_variant intron_variant
868359 117355978 A AC ENST00000446636 c.3208+2093_3208+2094insG c.3207+2093_3207+2094insG intron_variant intron_variant
868367 117355978 A AT ENST00000446636 c.3208+2093_3208+2094insA c.3207+2093_3207+2094insA intron_variant intron_variant
868375 117355978 A AGC ENST00000446636 c.3208+2093_3208+2094insGC c.3207+2093_3207+2094insGC intron_variant intron_variant
868383 117355978 A AAT ENST00000446636 c.3208+2093_3208+2094insAT c.3207+2093_3207+2094insAT intron_variant intron_variant
868387 117355978 A AGCA ENST00000160373 c.4746+2094_4746+2095insGCT c.4746+2093_4746+2094insTGC intron_variant intron_variant
868391 117355978 A AGCA ENST00000446636 c.3208+2094_3208+2095insGCT c.3207+2093_3207+2094insTGC intron_variant intron_variant
868399 117355978 A ATGC ENST00000446636 c.3208+2093_3208+2094insGCA c.3207+2093_3207+2094insGCA intron_variant intron_variant
868407 117355978 AG A ENST00000446636 c.3208+2093delC c.3207+2093delC intron_variant intron_variant
868411 117355978 AGA A ENST00000160373 c.4746+2094_4746+2095delTC c.4746+2092_4746+2093delTC intron_variant intron_variant
868415 117355978 AGA A ENST00000446636 c.3208+2094_3208+2095delTC c.3207+2092_3207+2093delTC intron_variant intron_variant
868419 117355978 AGAA A ENST00000160373 c.4746+2092_4746+2094delTCT c.4746+2091_4746+2093delTTC intron_variant intron_variant
868423 117355978 AGAA A ENST00000446636 c.3208+2092_3208+2094delTCT c.3207+2091_3207+2093delTTC intron_variant intron_variant
868431 117355979 G A ENST00000446636 c.3208+2093C>T c.3207+2093C>T intron_variant intron_variant
868439 117355979 G C ENST00000446636 c.3208+2093C>G c.3207+2093C>G intron_variant intron_variant
868447 117355979 G T ENST00000446636 c.3208+2093C>A c.3207+2093C>A intron_variant intron_variant
868455 117355979 G GA ENST00000446636 c.3208+2092dupT c.3207+2092dupT intron_variant intron_variant
868459 117355979 G GG ENST00000160373 c.4746+2093dupC c.4746+2092_4746+2093insC intron_variant intron_variant
868463 117355979 G GG ENST00000446636 c.3208+2093dupC c.3207+2092_3207+2093insC intron_variant intron_variant
868471 117355979 G GC ENST00000446636 c.3208+2092_3208+2093insG c.3207+2092_3207+2093insG intron_variant intron_variant
868479 117355979 G GT ENST00000446636 c.3208+2092_3208+2093insA c.3207+2092_3207+2093insA intron_variant intron_variant
868483 117355979 G GTG ENST00000160373 c.4746+2093_4746+2094insAC c.4746+2092_4746+2093insCA intron_variant intron_variant
868487 117355979 G GTG ENST00000446636 c.3208+2093_3208+2094insAC c.3207+2092_3207+2093insCA intron_variant intron_variant
868495 117355979 G GGT ENST00000446636 c.3208+2092_3208+2093insAC c.3207+2092_3207+2093insAC intron_variant intron_variant
868503 117355979 G GCGT ENST00000446636 c.3208+2092_3208+2093insACG c.3207+2092_3207+2093insACG intron_variant intron_variant
868507 117355979 G GTAG ENST00000160373 c.4746+2094_4746+2095insACT c.4746+2092_4746+2093insCTA intron_variant intron_variant
868511 117355979 G GTAG ENST00000446636 c.3208+2094_3208+2095insACT c.3207+2092_3207+2093insCTA intron_variant intron_variant
868519 117355979 GA G ENST00000446636 c.3208+2092delT c.3207+2092delT intron_variant intron_variant
868527 117355979 GAA G ENST00000446636 c.3208+2091_3208+2092delTT c.3207+2091_3207+2092delTT intron_variant intron_variant
868531 117355979 GAAG G ENST00000160373 c.4746+2092_4746+2094delTCT c.4746+2090_4746+2092delCTT intron_variant intron_variant
868535 117355979 GAAG G ENST00000446636 c.3208+2092_3208+2094delTCT c.3207+2090_3207+2092delCTT intron_variant intron_variant
868543 117355980 A G ENST00000446636 c.3208+2092T>C c.3207+2092T>C intron_variant intron_variant
868551 117355980 A C ENST00000446636 c.3208+2092T>G c.3207+2092T>G intron_variant intron_variant
868559 117355980 A T ENST00000446636 c.3208+2092T>A c.3207+2092T>A intron_variant intron_variant
868563 117355980 A AA ENST00000160373 c.4746+2092dupT c.4746+2091dupT intron_variant intron_variant
868567 117355980 A AA ENST00000446636 c.3208+2092dupT c.3207+2091dupT intron_variant intron_variant
868575 117355980 A AG ENST00000446636 c.3208+2091_3208+2092insC c.3207+2091_3207+2092insC intron_variant intron_variant
868583 117355980 A AC ENST00000446636 c.3208+2091_3208+2092insG c.3207+2091_3207+2092insG intron_variant intron_variant
868591 117355980 A AT ENST00000446636 c.3208+2091_3208+2092insA c.3207+2091_3207+2092insA intron_variant intron_variant
868599 117355980 A ACG ENST00000446636 c.3208+2091_3208+2092insCG c.3207+2091_3207+2092insCG intron_variant intron_variant
868603 117355980 A ACA ENST00000160373 c.4746+2092_4746+2093insGT c.4746+2091_4746+2092insTG intron_variant intron_variant
868607 117355980 A ACA ENST00000446636 c.3208+2092_3208+2093insGT c.3207+2091_3207+2092insTG intron_variant intron_variant
868615 117355980 A ACTG ENST00000446636 c.3208+2091_3208+2092insCAG c.3207+2091_3207+2092insCAG intron_variant intron_variant
868623 117355980 A ATAG ENST00000446636 c.3208+2091_3208+2092insCTA c.3207+2091_3207+2092insCTA intron_variant intron_variant
868627 117355980 AA A ENST00000160373 c.4746+2092delT c.4746+2091delT intron_variant intron_variant
868631 117355980 AA A ENST00000446636 c.3208+2092delT c.3207+2091delT intron_variant intron_variant
868639 117355980 AAG A ENST00000446636 c.3208+2090_3208+2091delCT c.3207+2090_3207+2091delCT intron_variant intron_variant
868643 117355980 AAGA A ENST00000160373 c.4746+2092_4746+2094delTCT c.4746+2089_4746+2091delTCT intron_variant intron_variant
868647 117355980 AAGA A ENST00000446636 c.3208+2092_3208+2094delTCT c.3207+2089_3207+2091delTCT intron_variant intron_variant

73008 rows × 8 columns


In [46]:
round(mismatch_df['Effect_match'].value_counts()/mismatch_df['Effect_match'].size*100, 2)


Out[46]:
True     95.58
False     4.42
Name: Effect_match, dtype: float64

So we see a slightly higher percentage of effect mismatched among the variants that have differing cDot notations.


In [47]:
vc_vep = mismatch_df.groupby(['Effect_vep']).size()
vc_vep.name = "VEP"
vc_snpeff = mismatch_df.groupby(['Effect_snpeff']).size()
vc_snpeff.name = "SnpEff"
vc_df = pd.DataFrame([vc_vep, vc_snpeff])
print("Annotations\n")
print(vc_df.transpose().fillna(0))


Annotations

                                    VEP   SnpEff
3_prime_UTR_variant               619.0    616.0
5_prime_UTR_variant                 0.0     27.0
coding_sequence_variant            12.0      0.0
conservative_inframe_deletion       0.0    361.0
conservative_inframe_insertion      0.0    645.0
disruptive_inframe_deletion         0.0    639.0
disruptive_inframe_insertion        0.0   1282.0
exon_loss_variant                   0.0      4.0
frameshift_variant               8348.0   8323.0
inframe_deletion                 1000.0      0.0
inframe_insertion                1403.0      0.0
intron_variant                  56723.0  56668.0
missense_variant                 2288.0   2288.0
protein_altering_variant          519.0      0.0
splice_acceptor_variant           115.0    141.0
splice_donor_variant              115.0    145.0
splice_region_variant             878.0    878.0
stop_gained                       312.0    311.0
stop_lost                          25.0     29.0
stop_retained_variant               3.0      3.0
synonymous_variant                648.0    648.0

Finally, we see that the vast majority of Hgvs mismatches occur for intronic variants.


In [48]:
def create_vcf_header():
    metadata_lines = [
        '##fileformat=VCFv4.0',
        '##INFO=<ID=TX,Number=1,Type=String,Description="Corresponding Transcript"',
        '##INFO=<ID=VE,Number=1,Type=String,Description="VEP Effect"',
        '##INFO=<ID=VI,Number=1,Type=String,Description="Vep Impact"',
        '##INFO=<ID=VC,Number=1,Type=String,Description="Vep coding HGVS"',
        '##INFO=<ID=SE,Number=1,Type=String,Description="SnpEff Effect"',
        '##INFO=<ID=SI,Number=1,Type=String,Description="SnpEff Impact"',
        '##INFO=<ID=SC,Number=1,Type=String,Description="SnpEff coding HGVS"']
    header_fields = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
    metadata_lines.append('\t'.join(header_fields))
    vcf_header = '\n'.join(metadata_lines)
    return vcf_header

def create_vcf_line(row):
    line = []
    line.append('7')
    line.append(row['POS'])
    line.append('.')
    line.extend(row[['REF', 'ALT']])
    line.append('.')
    line.append('PASS')
    info = 'TX={};VE={};VI={};VC={};SE={};SI={};SC={}'.format(*row[['Feature_id', 'Effect_vep', 'Impact_vep', 'Hgvsc_vep',
                                                                    'Effect_snpeff', 'Impact_snpeff', 'Hgvsc_snpeff']])
    line.append(info)
    line.append('TX:VE:VI:VC:SE:SI:SC')
    return '\t'.join(map(str, line))

def create_vcf(df, idxs, filename):
    with open(filename, "w") as vcf_out:
        vcf_out.write(create_vcf_header() + '\n')
        for i in idxs:
            vcf_out.write(create_vcf_line(df.ix[i]) + '\n')

In [49]:
hgvs_mismatch = (effect_df['Hgvsc_vep'] != '') & (effect_df['Hgvsc_snpeff'] != effect_df['Hgvsc_vep'])
hgvs_mismatch_idxs = effect_df[hgvs_mismatch].sort_values(by=['POS']).index.tolist()

impact_mismatch_idxs = effect_df[~effect_df['Impact_match']].sort_values(by=['POS']).index.tolist()
effect_mismatch_idxs = effect_df[~effect_df['Effect_match']].sort_values(by=['POS']).index.tolist()

In [50]:
create_vcf(effect_df, effect_mismatch_idxs, 'effect_mismatch.vcf')
create_vcf(effect_df, impact_mismatch_idxs, 'impact_mismatch.vcf')
create_vcf(effect_df, hgvs_mismatch_idxs, 'hgvs_mismatch.vcf')