In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the four raw inputs from the raw_data/ directory.

# UniProt ID-mapping export (tab-separated, with a header row).
uniprot_mapping = pd.read_csv(
    'raw_data/uniprot-yourlist_M201910086746803381A1F0E0DB47453E0216320D0044DCI+active_yes.tab',
    sep='\t',
)

# HIPPIE interaction list; column names are supplied here via `names`.
hippie = pd.read_csv(
    'raw_data/hippie_current.txt',
    sep='\t',
    names=['A', 'Aa', 'B', 'Bb', 'score', 'evidence'],
)

# Subcellular-location spreadsheet; sheet index 1 is the second sheet of the workbook.
location_data = pd.read_excel(
    'raw_data/41467_2015_BFncomms8866_MOESM610_ESM.xlsx',
    sheet_name=1,
)

# APID interaction table; the first column of the file becomes the index.
apid = pd.read_csv(
    'raw_data/9606_noISI_Q1.txt',
    sep='\t',
    index_col=0,
)

In [3]:
# pre-processing

# HIPPIE interactions must score strictly above this value to be kept.
# The earlier literal `00.1` evaluated to 0.1 (ten times the documented 0.01 cutoff),
# which looks like a digit transposition; the documented value is used here.
HIPPIE_MIN_SCORE = 0.01

# remove HIPPIE PPIs with score not above 0.01; these are usually just inferred from kegg with no evidence
hippie = hippie[hippie['score'] > HIPPIE_MIN_SCORE]

# merge hippie and apid: a left join keeps every HIPPIE row and attaches the APID
# columns where the (A, B) entry-name pair matches APID's (UniprotName_A, UniprotName_B)
# NOTE(review): the match is order-sensitive, so a pair that APID lists as (B, A)
# would not be matched here -- confirm whether that is intended.
hippie_apid = pd.merge(
    hippie,
    apid,
    how='left',
    left_on=['A', 'B'],
    right_on=['UniprotName_A', 'UniprotName_B'],
)

# assign a binary == True label for all interactions in hippie that have an apid annotation
# (rows without an APID match have NaN in every APID column, including UniprotID_A)
hippie_apid['binary'] = hippie_apid['UniprotID_A'].notnull()

In [4]:
# Preview the first three rows of each working table.
display(
    uniprot_mapping.head(3),
    location_data.head(3),
    hippie_apid.head(3),
)


Entry Entry name Protein names Gene names
0 P04217 A1BG_HUMAN Alpha-1B-glycoprotein (Alpha-1-B glycoprotein) A1BG
1 Q9NQ94 A1CF_HUMAN APOBEC1 complementation factor (APOBEC1-stimul... A1CF ACF ASP
2 P01023 A2MG_HUMAN Alpha-2-macroglobulin (Alpha-2-M) (C3 and PZP-... A2M CPAMD5 FWP007
ApprovedSymbol HGNCID ApprovedName UniProtAC UniProt_SL HPRD_SL HPRD_SL.PMID LocTree3_Localization LocTree3_Score TMHsPredicted_PolyPhobius Consensus_SL 6class_consensus_SL (Fig. 1) Taxon.Wagner Taxon.Dollo Taxon.Tautz2013
0 A1BG HGNC:5 alpha-1-B glycoprotein P04217 Secreted. Extracellular (GO:0005576) 3610142 secreted 100.0 0 secreted secreted Eutheria Eutheria Euteleostomi
1 A1CF HGNC:24086 APOBEC1 complementation factor Q9NQ94 Nucleus. Endoplasmic reticulum (By similarity)... Nucleus (GO:0005634); Cytoplasm (GO:0005737) 10833526, 12881431, 12896982 nucleus 51.0 0 cytoplasm. nucleus multiple Deuterostomia Eukaryota Cell.Organisms
2 A2M HGNC:7 alpha-2-macroglobulin P01023 Secreted. Extracellular (GO:0005576); Cytosol (GO:0005829) 9831625, 11823454, 15561729, 16188874, 16199891 secreted 100.0 0 cytoplasm. secreted multiple Euteleostomi Deuterostomia Cell.Organisms
A Aa B Bb score evidence UniprotID_A UniprotName_A GeneName_A UniprotID_B UniprotName_B GeneName_B ExpEvidences Methods Publications 3DStructures CurationEvents binary
0 AL1A1_HUMAN 216 AL1A1_HUMAN 216 0.76 experiments:in vivo,Two-hybrid;pmids:12081471,... P00352 AL1A1_HUMAN ALDH1A1 P00352 AL1A1_HUMAN ALDH1A1 2.0 4.0 3.0 0.0 5.0 True
1 ITA7_HUMAN 3679 ACHA_HUMAN 1134 0.73 experiments:in vivo,Affinity Capture-Western,a... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False
2 NEB1_HUMAN 55607 ACTG_HUMAN 71 0.65 experiments:in vitro,in vivo;pmids:9362513,120... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False

In [5]:
# merge mapping and location data: create one row for each protein
# (left join on the UniProt accession, so every mapping row is kept even when
# it has no location annotation)
# NOTE(review): this yields exactly one row per protein only if UniProtAC is
# unique in location_data -- confirm.
protein_columns = ['Entry', 'Entry name', 'Protein names', 'Gene names', 'Consensus_SL']
merged = pd.merge(uniprot_mapping, location_data, how='left', left_on='Entry', right_on='UniProtAC')

# Take an explicit copy of the column subset so the assignment below does not
# raise SettingWithCopyWarning.
merged = merged[protein_columns].copy()

# Split the consensus localisation string on ". " into a list of locations.
# The pattern is a raw string: '\.' and '\s' are not valid escapes in a normal
# string literal and trigger a warning on recent Python versions.
merged['Consensus_SL'] = merged['Consensus_SL'].str.split(r'\.\s')

In [6]:
# Show the first three proteins of the merged table.
merged.head(3)


Out[6]:
Entry Entry name Protein names Gene names Consensus_SL
0 P04217 A1BG_HUMAN Alpha-1B-glycoprotein (Alpha-1-B glycoprotein) A1BG [secreted]
1 Q9NQ94 A1CF_HUMAN APOBEC1 complementation factor (APOBEC1-stimul... A1CF ACF ASP [cytoplasm, nucleus]
2 P01023 A2MG_HUMAN Alpha-2-macroglobulin (Alpha-2-M) (C3 and PZP-... A2M CPAMD5 FWP007 [cytoplasm, secreted]

In [7]:
# Attach UniProt accessions to both partners of every interaction.
# hippie_apid identifies proteins by entry name (columns A and B); the mapping
# table translates an entry name ('Entry name') into its accession ('Entry').
name_to_accession = uniprot_mapping[['Entry name', 'Entry']]

# First lookup: partner A.
hippie_merged = hippie_apid.merge(
    name_to_accession,
    how='left',
    left_on='A',
    right_on='Entry name',
)

# Second lookup: partner B. Both lookups contribute 'Entry name' and 'Entry',
# so the suffixes turn them into *_A (from the first merge) and *_B (from this one).
hippie_merged = hippie_merged.merge(
    name_to_accession,
    how='left',
    left_on='B',
    right_on='Entry name',
    suffixes=('_A', '_B'),
)

# Keep only interactions where both partners were mapped, reduced to the
# columns used downstream.
both_mapped = hippie_merged.dropna(subset=['Entry_A', 'Entry_B'])
hippie_merged = both_mapped[['Entry_A', 'Entry_B', 'score', 'binary']]

In [8]:
# hippie is *not* symmetric (aka a->b but it may be that there is no b->a);
# this creates problems in the UI, so to amend this:
#  1. create a copy of hippie_merged with Entry_A and Entry_B swapped
#  2. concatenate hippie_merged and hippie_merged_reverse
#  3. get rid of duplicates

# .values strips the index so the swap is purely positional; both right-hand
# sides are read from the untouched hippie_merged.
hippie_merged_reverse = hippie_merged.assign(
    Entry_A=hippie_merged['Entry_B'].values,
    Entry_B=hippie_merged['Entry_A'].values,
)

# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4 and
# removed in pandas 2.0. The original rows come first, so keep='first' prefers
# the original direction's score/binary whenever a pair exists in both directions.
mega_hippie_merged = pd.concat([hippie_merged, hippie_merged_reverse]).drop_duplicates(
    subset=['Entry_A', 'Entry_B'],
    keep='first',
)

In [12]:
def find_interactions(mapping_row):
    """Collect the interaction partners of one protein.

    Looks the protein up in the module-level ``mega_hippie_merged`` table
    (columns ``Entry_A``, ``Entry_B``, ``score``, ``binary``) on either side of
    an interaction.

    Parameters
    ----------
    mapping_row : mapping or pandas.Series
        Must provide the protein's accession under the key ``'Entry'``.

    Returns
    -------
    list of dict
        One ``{'interactor', 'score', 'binary'}`` dict per distinct partner.
        Rows where the protein is ``Entry_A`` are visited first, then rows where
        it is ``Entry_B``; the first occurrence of each partner wins.
    """
    ac = mapping_row['Entry']
    edges = mega_hippie_merged

    # Select by column name rather than by position so the result does not
    # depend on the column order of the edge table.
    outgoing = edges.loc[edges['Entry_A'] == ac, ['Entry_B', 'score', 'binary']]
    incoming = edges.loc[edges['Entry_B'] == ac, ['Entry_A', 'score', 'binary']]

    # When the edge table holds both a->b and b->a (and for self-interactions),
    # the same partner is found once per direction; report each partner only once.
    partners = []
    seen = set()
    for hits in (outgoing, incoming):
        for interactor, score, binary in hits.values:
            if interactor in seen:
                continue
            seen.add(interactor)
            partners.append({'interactor': interactor, 'score': score, 'binary': binary})
    return partners

In [13]:
# Import pandarallel, which supplies the DataFrame.parallel_apply used to build
# the per-protein partner lists
from pandarallel import pandarallel

# Initialization: must run before the first parallel_apply call; the worker
# count and transfer mechanism it picked are reported in the log output
pandarallel.initialize()


INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

In [14]:
# For every protein row, look up its interaction partners; axis=1 passes each
# row to find_interactions, which reads the global mega_hippie_merged table.
# The result is a new 'partners' column holding one list of dicts per protein.
merged['partners'] = merged.parallel_apply(find_interactions, axis=1)

In [15]:
# Write the per-protein table to data.csv without the row index.
# NOTE(review): 'Consensus_SL' and 'partners' hold Python lists, which are
# presumably written as their string form in the CSV -- confirm consumers expect that.
merged.to_csv('data.csv', index=False)

In [16]:
# Write the same table to data.json; orient='records' emits a JSON array with
# one object per protein row.
merged.to_json('data.json', orient='records')

In [ ]: