In [2]:
# Read the two BioScope files (abstracts first, then papers) and keep every
# raw line, line ending included, in a single list.
bioscope = []
for corpus_path in (
    '../../../data/BioScope/bioscope_abstracts_scope_pubtator.txt',
    '../../../data/BioScope/bioscope_papers_scope_pubtator.txt',
):
    with open(corpus_path, 'r') as corpus_file:
        bioscope.extend(corpus_file.readlines())
In [3]:
# Collect the distinct document IDs found in the BioScope lines.
#
# A blank line ('\n' or '\r\n') is treated as the end of a record, and the ID is
# read from the last non-blank line before it: the first tab-separated field
# when that line contains a tab, otherwise the first '|'-separated field. Only
# the part of the ID before the first '_' is kept.
bioscope_ids = set()
last_record_line = None  # last non-blank line of the record currently being scanned
# The appended '\n' acts as a sentinel so a final record that is not followed by
# a blank line is still flushed instead of being silently dropped.
for raw_line in bioscope + ['\n']:
    content = raw_line.rstrip('\r\n')
    if content:
        last_record_line = content
        continue
    if last_record_line is None:
        # Leading or repeated blank line: there is no record to close. Without
        # this guard a leading blank would read bioscope[-1] (the end of the
        # corpus) and a repeated blank would register '\n' as an ID.
        continue
    separator = '\t' if '\t' in last_record_line else '|'
    bioscope_ids.add(last_record_line.split(separator)[0].split('_')[0])
    last_record_line = None
# Sorted so the resulting list is reproducible from run to run.
bioscope_pmids = sorted(bioscope_ids)
In [5]:
# Load the CDR corpus file as a list of raw lines (line endings kept).
cdr = []
with open('../../../data/CDR/Corpus.txt', 'r') as corpus_file:
    cdr.extend(corpus_file.readlines())
In [6]:
# Collect the distinct document IDs found in the CDR lines.
#
# A blank line ('\n' or '\r\n') is treated as the end of a record, and the ID is
# read from the last non-blank line before it: the first tab-separated field
# when that line contains a tab, otherwise the first '|'-separated field.
cdr_ids = set()
last_record_line = None  # last non-blank line of the record currently being scanned
# The appended '\n' acts as a sentinel so a final record that is not followed by
# a blank line is still flushed instead of being silently dropped.
for raw_line in cdr + ['\n']:
    content = raw_line.rstrip('\r\n')
    if content:
        last_record_line = content
        continue
    if last_record_line is None:
        # Leading or repeated blank line: there is no record to close. Without
        # this guard a leading blank would read cdr[-1] (the end of the corpus)
        # and a repeated blank would register '\n' as an ID.
        continue
    separator = '\t' if '\t' in last_record_line else '|'
    cdr_ids.add(last_record_line.split(separator)[0])
    last_record_line = None
# Sorted so the resulting list is reproducible from run to run.
cdr_pmids = sorted(cdr_ids)
In [7]:
# Load the NCBI disease corpus file as a list of raw lines (line endings kept).
ncbi = []
with open('../../../data/NCBI_disease/Corpus.txt', 'r') as corpus_file:
    ncbi.extend(corpus_file.readlines())
In [8]:
# Collect the distinct document IDs found in the NCBI disease lines.
#
# A blank line ('\n' or '\r\n') is treated as the end of a record, and the ID is
# read from the last non-blank line before it: the first tab-separated field
# when that line contains a tab, otherwise the first '|'-separated field.
ncbi_ids = set()
last_record_line = None  # last non-blank line of the record currently being scanned
# The appended '\n' acts as a sentinel so a final record that is not followed by
# a blank line is still flushed instead of being silently dropped.
for raw_line in ncbi + ['\n']:
    content = raw_line.rstrip('\r\n')
    if content:
        last_record_line = content
        continue
    if last_record_line is None:
        # Leading or repeated blank line: there is no record to close. Without
        # this guard a leading blank would read ncbi[-1] (the end of the corpus)
        # and a repeated blank would register '\n' as an ID.
        continue
    separator = '\t' if '\t' in last_record_line else '|'
    ncbi_ids.add(last_record_line.split(separator)[0])
    last_record_line = None
# Sorted so the resulting list is reproducible from run to run.
ncbi_pmids = sorted(ncbi_ids)
In [43]:
# Read the scraped PMID list, one entry per line, dropping newline characters.
scrape_pmids = []
with open('../../../data/PubMed_scrape_diseaseCancer/PubMed_diseaseCancer_PMIDs.txt', 'r') as pmid_file:
    scrape_pmids.extend(entry.strip('\n') for entry in pmid_file.readlines())
In [45]:
# Scraped PMIDs that also appear in the CDR, NCBI disease or BioScope ID lists.
#
# The three lists are merged into one set up front so every lookup is a hash
# probe rather than a linear scan of three lists per scraped PMID. The order
# and any duplicates of scrape_pmids are preserved in the result.
corpus_pmids = set().union(cdr_pmids, ncbi_pmids, bioscope_pmids)
trimmed = [pmid for pmid in scrape_pmids if pmid in corpus_pmids]
In [54]:
# Write the overlapping PMIDs to the exclusion file, one per line.
with open('../../../data/PubMed_scrape_diseaseCancer/exclude_PMIDs.txt', 'w') as exclude_file:
    exclude_file.writelines(pmid + '\n' for pmid in trimmed)
In [ ]: