In [2]:
# Raw lines (line terminators kept) of the two BioScope PubTator files,
# abstracts first and full papers second, concatenated into a single list.
bioscope = []

for source_path in (
    '../../../data/BioScope/bioscope_abstracts_scope_pubtator.txt',
    '../../../data/BioScope/bioscope_papers_scope_pubtator.txt',
):
    with open(source_path, 'r') as handle:
        bioscope.extend(handle.readlines())

In [3]:
# Collect the distinct document ids found in the BioScope lines.
#
# A blank line ends a document. The line right before it is either a
# tab-separated row or a pipe-separated row; in both cases the first field is
# the document id, and everything from the first underscore on is dropped.
bioscope_seen = set()
previous = None  # nothing precedes the first line (avoids bioscope[-1] wrap-around)

# The extra '\n' is a sentinel so that a final document which is not followed
# by a blank line is still picked up.
for current in bioscope + ['\n']:
    if current in ('\n', '\r\n'):
        # Ignore a leading blank line and runs of blank lines: the previous
        # line must hold real content to contribute an id.
        if previous is not None and previous.strip():
            separator = '\t' if '\t' in previous else '|'
            doc_id = previous.split(separator)[0].split('_')[0].strip()
            if doc_id:
                bioscope_seen.add(doc_id)
    previous = current

# Sorted so the resulting list is deterministic across runs.
bioscope_pmids = sorted(bioscope_seen)

In [5]:
# Raw lines (line terminators kept) of the CDR corpus file.
cdr = []

with open('../../../data/CDR/Corpus.txt', 'r') as corpus_file:
    cdr.extend(corpus_file)

In [6]:
# Collect the distinct document ids found in the CDR lines.
#
# A blank line ends a document. The line right before it is either a
# tab-separated row or a pipe-separated row; in both cases the first field is
# taken as the document id.
cdr_seen = set()
previous = None  # nothing precedes the first line (avoids cdr[-1] wrap-around)

# The extra '\n' is a sentinel so that a final document which is not followed
# by a blank line is still picked up.
for current in cdr + ['\n']:
    # '\r\n' is accepted as well, matching the BioScope extraction cell.
    if current in ('\n', '\r\n'):
        # Ignore a leading blank line and runs of blank lines: the previous
        # line must hold real content to contribute an id.
        if previous is not None and previous.strip():
            separator = '\t' if '\t' in previous else '|'
            doc_id = previous.split(separator)[0].strip()
            if doc_id:
                cdr_seen.add(doc_id)
    previous = current

# Sorted so the resulting list is deterministic across runs.
cdr_pmids = sorted(cdr_seen)

In [7]:
# Raw lines (line terminators kept) of the NCBI disease corpus file.
ncbi = []

with open('../../../data/NCBI_disease/Corpus.txt', 'r') as corpus_file:
    ncbi.extend(corpus_file)

In [8]:
# Collect the distinct document ids found in the NCBI disease lines.
#
# A blank line ends a document. The line right before it is either a
# tab-separated row or a pipe-separated row; in both cases the first field is
# taken as the document id.
ncbi_seen = set()
previous = None  # nothing precedes the first line (avoids ncbi[-1] wrap-around)

# The extra '\n' is a sentinel so that a final document which is not followed
# by a blank line is still picked up.
for current in ncbi + ['\n']:
    # '\r\n' is accepted as well, matching the BioScope extraction cell.
    if current in ('\n', '\r\n'):
        # Ignore a leading blank line and runs of blank lines: the previous
        # line must hold real content to contribute an id.
        if previous is not None and previous.strip():
            separator = '\t' if '\t' in previous else '|'
            doc_id = previous.split(separator)[0].strip()
            if doc_id:
                ncbi_seen.add(doc_id)
    previous = current

# Sorted so the resulting list is deterministic across runs.
ncbi_pmids = sorted(ncbi_seen)

In [43]:
# One entry per line of the scraped PMID list, with newline characters
# stripped from both ends of each line.
scrape_pmids = []

with open('../../../data/PubMed_scrape_diseaseCancer/PubMed_diseaseCancer_PMIDs.txt', 'r') as pmid_file:
    scrape_pmids.extend(raw.strip('\n') for raw in pmid_file)

In [45]:
# Scraped PMIDs that also occur in at least one of the CDR, NCBI disease or
# BioScope id lists. Despite the name, `trimmed` holds the overlapping ids
# (the ones written out as exclusions below), in their scrape order.
#
# A single set gives constant-time membership checks instead of scanning three
# lists for every scraped id.
corpus_pmids = set(cdr_pmids) | set(ncbi_pmids) | set(bioscope_pmids)

trimmed = [pmid for pmid in scrape_pmids if pmid in corpus_pmids]

In [54]:
# Write the overlapping PMIDs to the exclusion file, one id per line.
with open('../../../data/PubMed_scrape_diseaseCancer/exclude_PMIDs.txt', 'w') as out_file:
    out_file.writelines(pmid + '\n' for pmid in trimmed)

In [ ]: