In [1]:
import pandas as pd
import numpy as np
In [2]:
uniprot_mapping = pd.read_csv('raw_data/uniprot-yourlist_M201910086746803381A1F0E0DB47453E0216320D0044DCI+active_yes.tab', sep='\t')
hippie = pd.read_csv('raw_data/hippie_current.txt', sep='\t', names=['A', 'Aa', 'B', 'Bb', 'score', 'evidence'])
location_data = pd.read_excel('raw_data/41467_2015_BFncomms8866_MOESM610_ESM.xlsx', sheet_name=1)
apid = pd.read_csv('raw_data/9606_noISI_Q1.txt', sep='\t', index_col=0)
In [3]:
# pre-processing
# remove HIPPIE PPIs with score below 0.01; these are usually just inferred from KEGG with no evidence
hippie = hippie[hippie['score'] > 0.01]
# merge hippie and apid
hippie_apid = pd.merge(hippie, apid, how='left', left_on=['A','B'], right_on = ['UniprotName_A','UniprotName_B'])
# assign binary = True to every interaction in hippie that also has an apid annotation
hippie_apid['binary'] = hippie_apid['UniprotID_A'].notnull()
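In [ ]:
# Hypothetical sanity check (not part of the original pipeline): confirm the
# score filter took effect and see how many interactions gained an APID
# (binary) annotation.
print(hippie['score'].min())                 # should be > 0.01 after filtering
print(hippie_apid['binary'].value_counts())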
In [4]:
display(uniprot_mapping[:3])
display(location_data[:3])
display(hippie_apid[:3])
In [5]:
# merge mapping and location data: create one row for each protein
merged = pd.merge(uniprot_mapping, location_data, how='left', left_on='Entry', right_on='UniProtAC')
merged = merged[['Entry', 'Entry name', 'Protein names', 'Gene names', 'Consensus_SL']]
merged['Consensus_SL'] = merged['Consensus_SL'].str.split(r'\.\s')
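In [ ]:
# Illustrative only: Consensus_SL strings are assumed to look like
# 'Cytoplasm. Nucleus', i.e. locations separated by a period plus whitespace,
# which the r'\.\s' split above turns into a list.
pd.Series(['Cytoplasm. Nucleus']).str.split(r'\.\s')[0]
# -> ['Cytoplasm', 'Nucleus']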
In [6]:
merged[:3]
In [7]:
# merge hippie_apid and mapping
hippie_merged = pd.merge(hippie_apid, uniprot_mapping[['Entry name', 'Entry']], how='left', left_on='A', right_on='Entry name')
hippie_merged = pd.merge(hippie_merged, uniprot_mapping[['Entry name', 'Entry']], how='left', left_on='B', right_on='Entry name', suffixes=('_A','_B'))
hippie_merged = hippie_merged.dropna(subset=['Entry_A', 'Entry_B'])[['Entry_A', 'Entry_B', 'score', 'binary']]
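In [ ]:
# Hypothetical check (not in the original): assuming 'Entry name' is unique in
# the mapping table, the row-count difference is the number of interactions
# dropped because a protein name could not be mapped to a UniProt accession.
print(len(hippie_apid) - len(hippie_merged))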
In [8]:
# hippie is *not* symmetric (i.e. a->b may be present without b->a);
# this creates problems in the UI, so to amend this:
# 1. create a copy of hippie_merged
# 2. swap Entry_A and Entry_B
# 3. concatenate hippie_merged and hippie_merged_reverse
# 4. drop duplicates
hippie_merged_reverse = hippie_merged.copy()
hippie_merged_reverse['Entry_A'] = hippie_merged_reverse['Entry_B'].values
hippie_merged_reverse['Entry_B'] = hippie_merged['Entry_A'].values
mega_hippie_merged = pd.concat([hippie_merged, hippie_merged_reverse])
mega_hippie_merged.drop_duplicates(subset=['Entry_A', 'Entry_B'], keep='first', inplace=True)
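In [ ]:
# Hypothetical symmetry check: after the concat + dedup above, every (A, B)
# pair should also be present as (B, A).
pairs = set(map(tuple, mega_hippie_merged[['Entry_A', 'Entry_B']].values))
assert all((b, a) in pairs for a, b in pairs)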
In [12]:
def find_interactions(mapping_row):
    # mega_hippie_merged is symmetric (see above), so every partner of this
    # protein already appears in the Entry_A column; a single lookup suffices
    # and avoids returning each partner twice
    ac = mapping_row['Entry']
    partners = mega_hippie_merged[mega_hippie_merged['Entry_A'] == ac].values
    return [{'interactor': x[1], 'score': x[2], 'binary': x[3]} for x in partners]
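In [ ]:
# Illustrative usage (hypothetical): look up the partners of the first mapped
# protein and inspect the first few records.
find_interactions(merged.iloc[0])[:3]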
In [13]:
# pandarallel parallelizes DataFrame.apply across all available CPU cores
from pandarallel import pandarallel
pandarallel.initialize()
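In [ ]:
# Optional fallback (hypothetical, not in the original): if pandarallel is
# unavailable, alias parallel_apply to the plain serial apply so the next
# cell still runs.
if not hasattr(pd.DataFrame, 'parallel_apply'):
    pd.DataFrame.parallel_apply = pd.DataFrame.apply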
In [14]:
merged['partners'] = merged.parallel_apply(find_interactions, axis=1)
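In [ ]:
# Hypothetical peek at the result: each protein row now carries a list of
# partner dicts with 'interactor', 'score' and 'binary' keys.
merged[['Entry', 'partners']].head(3)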
In [15]:
# note: list-valued columns ('Consensus_SL', 'partners') are written as their string repr in CSV
merged.to_csv('data.csv', index=False)
In [16]:
merged.to_json('data.json', orient='records')
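In [ ]:
# Hypothetical spot check (not in the original): the exported JSON should be a
# list of per-protein records with nested 'partners' arrays.
import json
with open('data.json') as f:
    records = json.load(f)
print(records[0]['Entry'], len(records[0]['partners']))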