In [1]:
from pandas import DataFrame, merge

In [2]:
SIDER = DataFrame.from_csv('./temp/sider/meddra_all_se.tsv', sep='\t', index_col=None, header=None)
SIDER = SIDER.rename(columns={0: 'stitch_flat', 1: 'stitch_stereo',
                              2: 'umls_label', 3: 'meddra',
                              4: 'umls_meddra', 5: 'umls_def'})

In [3]:
SIDER_freq = DataFrame.from_csv('./temp/sider/meddra_freq.tsv', sep='\t', index_col=None, header=None)
SIDER_freq = SIDER_freq.rename(columns={0: 'stitch_flat', 1: 'stitch_stereo',
                                        2: 'umls_label', 3: 'placebo',
                                        4: 'average', 5: 'lower',
                                        6: 'higher', 7: 'meddra',
                                        8: 'umls_meddra', 9: 'umls_def'})

In [4]:
SIDER_complete = merge(SIDER, SIDER_freq, on=['stitch_flat', 'stitch_stereo',
                                              'umls_label', 'meddra',
                                              'umls_meddra', 'umls_def'], how='outer')

In [5]:
print(len(SIDER), len(SIDER_freq), len(SIDER_complete))


309849 291632 479075

In [6]:
SIDER_freq[:2]


Out[6]:
stitch_flat stitch_stereo umls_label placebo average lower higher meddra umls_meddra umls_def
0 CID100000085 CID000010917 C0000737 NaN 21% 0.21 0.21 LLT C0000737 Abdominal pain
1 CID100000085 CID000010917 C0000737 NaN 21% 0.21 0.21 PT C0000737 Abdominal pain

In [7]:
SIDER_complete[:2]


Out[7]:
stitch_flat stitch_stereo umls_label meddra umls_meddra umls_def placebo average lower higher
0 CID100000085 CID000010917 C0000729 LLT C0000729 Abdominal cramps NaN NaN NaN NaN
1 CID100000085 CID000010917 C0000729 PT C0000737 Abdominal pain NaN NaN NaN NaN

In [8]:
SIDER_complete['stitch_flat'] = SIDER_complete['stitch_flat'].map(lambda x:  x[:3] + 'm' + x[4:])

In [9]:
SIDER_label = SIDER_complete[['stitch_flat', 'umls_label', 'lower', 'higher']].drop_duplicates()
SIDER_label = SIDER_label.rename(columns={'stitch_flat': 'stitch_id', 'umls_label': 'umls_id'})

SIDER_meddra = SIDER_complete[['stitch_flat', 'umls_meddra', 'lower', 'higher']].drop_duplicates()
SIDER_meddra = SIDER_meddra.rename(columns={'stitch_flat': 'stitch_id', 'umls_meddra': 'umls_id'})
    

SIDER_union = merge(SIDER_meddra, SIDER_label, on=['stitch_id', 'umls_id', 'lower', 'higher'], how='outer')
print(len(SIDER_union))
SIDER_union = SIDER_union.drop_duplicates(subset=['stitch_id', 'umls_id'])
print(len(SIDER_union))
SIDER_union = SIDER_union[SIDER_union.stitch_id.notnull()]
SIDER_union = SIDER_union[SIDER_union.umls_id.notnull()]
print(len(SIDER_union))


257048
163711
163224

In [10]:
from sqlalchemy import create_engine

engine = create_engine('postgresql://postgres:postgres@localhost:32768/efktr')
SIDER_union.to_sql('stitch_to_umls', engine, index=False, if_exists='append')

In [ ]: