In [1]:
from pandas import DataFrame, merge
In [2]:
SIDER = DataFrame.from_csv('./temp/sider/meddra_all_se.tsv', sep='\t', index_col=None, header=None)
SIDER = SIDER.rename(columns={0: 'stitch_flat', 1: 'stitch_stereo',
2: 'umls_label', 3: 'meddra',
4: 'umls_meddra', 5: 'umls_def'})
In [3]:
SIDER_freq = DataFrame.from_csv('./temp/sider/meddra_freq.tsv', sep='\t', index_col=None, header=None)
SIDER_freq = SIDER_freq.rename(columns={0: 'stitch_flat', 1: 'stitch_stereo',
2: 'umls_label', 3: 'placebo',
4: 'average', 5: 'lower',
6: 'higher', 7: 'meddra',
8: 'umls_meddra', 9: 'umls_def'})
In [4]:
SIDER_complete = merge(SIDER, SIDER_freq, on=['stitch_flat', 'stitch_stereo',
'umls_label', 'meddra',
'umls_meddra', 'umls_def'], how='outer')
In [5]:
print(len(SIDER), len(SIDER_freq), len(SIDER_complete))
In [6]:
SIDER_freq[:2]
Out[6]:
In [7]:
SIDER_complete[:2]
Out[7]:
In [8]:
SIDER_complete['stitch_flat'] = SIDER_complete['stitch_flat'].map(lambda x: x[:3] + 'm' + x[4:])
In [9]:
SIDER_label = SIDER_complete[['stitch_flat', 'umls_label', 'lower', 'higher']].drop_duplicates()
SIDER_label = SIDER_label.rename(columns={'stitch_flat': 'stitch_id', 'umls_label': 'umls_id'})
SIDER_meddra = SIDER_complete[['stitch_flat', 'umls_meddra', 'lower', 'higher']].drop_duplicates()
SIDER_meddra = SIDER_meddra.rename(columns={'stitch_flat': 'stitch_id', 'umls_meddra': 'umls_id'})
SIDER_union = merge(SIDER_meddra, SIDER_label, on=['stitch_id', 'umls_id', 'lower', 'higher'], how='outer')
print(len(SIDER_union))
SIDER_union = SIDER_union.drop_duplicates(subset=['stitch_id', 'umls_id'])
print(len(SIDER_union))
SIDER_union = SIDER_union[SIDER_union.stitch_id.notnull()]
SIDER_union = SIDER_union[SIDER_union.umls_id.notnull()]
print(len(SIDER_union))
In [10]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres:postgres@localhost:32768/efktr')
SIDER_union.to_sql('stitch_to_umls', engine, index=False, if_exists='append')
In [ ]: