In [1]:
import os
import subprocess
from glob import glob
from os.path import join, basename, splitext
from random import shuffle, seed
from shutil import copy, move

from pyzotero import zotero

from lib.secrets import CORRECTED_PAPERS_DATASET, USER_KEY
In [2]:
# Root directory that receives the PDFs downloaded from Zotero.
output_dir = os.path.join('data', 'pdf')
This is the function that does the actual download of the PDFs using Zotero's API.
First, we need to get all of the collections in the Zotero Library. Collections are like sub-folders in the library. We will be looking for a collection with the given name.
Next, we will get all of the items in a collection with a given tag. We have been tagging items with a "Rel-Yes" or "Rel-No" when we determine if the item is relevant to the study or not.
Finally, we can get the PDF attachment associated with the item. An item may have more than one attachment (PDF, HTML, etc.) underneath it. However, for our current purpose we are only concerned with the PDF.
In [3]:
def get_pdfs(output_dir, collection_name, tag):
    """Download the PDF attachment of every tagged item in a Zotero collection.

    Connects to the group library identified by CORRECTED_PAPERS_DATASET,
    looks up the collection called ``collection_name`` and, for each item in
    that collection carrying ``tag``, saves the item's PDF attachment as
    ``<output_dir>/<collection_name>/<tag>/<attachment key>.pdf``.

    Items with no attachments, with no PDF attachment, or with more than one
    PDF attachment are reported on stdout and skipped.

    Args:
        output_dir: Root directory for the downloaded files.
        collection_name: Name of the Zotero collection to read.
        tag: Tag an item must carry to be downloaded (e.g. 'Rel-Yes').

    Raises:
        KeyError: If the library has no collection named ``collection_name``.
    """
    # Create the output directory
    path = join(output_dir, collection_name, tag)
    os.makedirs(path, exist_ok=True)

    # Connect to Zotero
    zot = zotero.Zotero(CORRECTED_PAPERS_DATASET, 'group', USER_KEY)

    # Get the collection of interest and its key. everything() follows the
    # API's pagination, so the lookup also succeeds when the library holds
    # more collections than fit into a single response page.
    collections = {c['data']['name']: c
                   for c in zot.everything(zot.collections())}
    key = collections[collection_name]['key']

    # Now get the items in the collection that have the given tag
    items = zot.everything(zot.collection_items(key, tag=tag))

    # Get the PDF attachment for each item and save it to the category directory
    for item in items:
        # An item's attachments
        children = list(zot.children(item['key']))

        # Just get the PDFs
        pdfs = [c for c in children
                if c['data'].get('contentType') == 'application/pdf']

        # Handle when there are no attachments
        if not children:
            print('\nMISSING DOCUMENTS {}\n'.format(item['key']))

        # Handle when there are no PDF attachments
        elif not pdfs:
            print('\nNO PDFs {}\n'.format(item['key']))

        # Handle when there is more than one PDF attachment
        elif len(pdfs) != 1:
            print('\nTOO MANY PDFs {}\n'.format(item['key']))

        # Save the PDF to the category directory, named after the attachment
        # key so that file names are unique and stable across runs
        else:
            doc = pdfs[0]
            print(doc['data']['filename'])
            zot.dump(doc['key'], '{}.pdf'.format(doc['key']), path)
In [16]:
# Download both relevance labels for each review set.
for collection_name in ('RSet_N1', 'RSet_N2'):
    for tag in ('Rel-Yes', 'Rel-No'):
        get_pdfs(output_dir, collection_name, tag)
It turns out that some files have both labels (Rel-Yes and Rel-No). We need to remove these files from the data set.
Get all PDF file names for a particular category.
In [5]:
def file_names(root, category):
    """Return the base names of the PDFs stored for one category.

    Searches ``data/pdf/<root>/<category>`` (relative to the current working
    directory) for ``*.pdf`` files.

    Args:
        root: Collection directory name under ``data/pdf``.
        category: Label sub-directory inside ``root`` (e.g. 'Rel-Yes').

    Returns:
        A list of file names without any directory part, in the order that
        ``glob`` yields them.
    """
    category_dir = join('data', 'pdf', root, category)
    matches = glob(join(category_dir, '*.pdf'))
    return list(map(basename, matches))
We move one copy of the file out of the way and delete the extra copy.
In [6]:
def move_duplicates(root):
    """Take PDFs that carry both labels out of a collection's data set.

    A file name found in both ``data/pdf/<root>/Rel-Yes`` and
    ``data/pdf/<root>/Rel-No`` is labelled inconsistently. Its Rel-Yes copy
    is moved to ``data/pdf/duplicates`` and its Rel-No copy is deleted. Each
    affected file name is printed.

    Args:
        root: Collection directory name under ``data/pdf`` (e.g. 'RSet_N1').
    """
    rel_yes = set(file_names(root, 'Rel-Yes'))
    rel_no = set(file_names(root, 'Rel-No'))
    duplicates = rel_yes & rel_no

    dup_root = join('data', 'pdf', 'duplicates')
    os.makedirs(dup_root, exist_ok=True)

    # Sorted so the printed log is the same from run to run.
    for duplicate in sorted(duplicates):
        print(duplicate)

        # Keep one copy for later inspection ...
        src = join('data', 'pdf', root, 'Rel-Yes', duplicate)
        dst = join(dup_root, duplicate)
        move(src, dst)

        # ... and drop the other one.
        src = join('data', 'pdf', root, 'Rel-No', duplicate)
        os.remove(src)
In [7]:
# Clean up the doubly-labelled files in each review set.
for root in ('RSet_N1', 'RSet_N2'):
    move_duplicates(root)
Convert the PDF files to text. They will be placed into the given output directory. This utility depends on the external program "xpdf", specifically its "pdftotext" command.
Extract the text from the PDF and write it to a file.
In [8]:
def pdf_to_text(output_dir, pdf_path):
    """Extract the text of one PDF into ``output_dir`` using ``pdftotext``.

    The text file gets the PDF's base name with a ``.txt`` extension.
    Conversion is best-effort: a failure is reported on stdout and the
    function returns normally, so a batch run carries on with the next file.

    Args:
        output_dir: Directory that receives the text file.
        pdf_path: Path of the PDF to convert.
    """
    txt_name = splitext(basename(pdf_path))[0] + '.txt'
    txt_path = join(output_dir, txt_name)

    # Pass an argument list and skip the shell: paths containing quotes,
    # spaces or other shell metacharacters then reach pdftotext unchanged
    # instead of breaking (or being interpreted by) a shell command line.
    cmd = ['pdftotext', pdf_path, txt_path]
    try:
        subprocess.check_call(cmd)
    except (subprocess.CalledProcessError, OSError) as err:
        # CalledProcessError: pdftotext exited with a non-zero status.
        # OSError: the pdftotext executable could not be started.
        print('FAILED to convert {}: {}'.format(pdf_path, err))
Loop through all of the PDFs and convert them
In [11]:
def convert_pdfs(input_dir, output_dir):
    """Convert every ``*.pdf`` directly inside ``input_dir`` to text.

    ``output_dir`` is created when missing. Each PDF path is printed and
    then handed to ``pdf_to_text``, which writes the text file into
    ``output_dir``.

    Args:
        input_dir: Directory searched (non-recursively) for PDF files.
        output_dir: Directory that receives the text files.
    """
    os.makedirs(output_dir, exist_ok=True)

    for pdf_path in glob(join(input_dir, '*.pdf')):
        print('Converting:', pdf_path)
        pdf_to_text(output_dir, pdf_path)
In [12]:
# Text from both review sets is pooled into one directory per label.
for root in ('RSet_N1', 'RSet_N2'):
    for category in ('Rel-Yes', 'Rel-No'):
        convert_pdfs('data/pdf/{}/{}'.format(root, category),
                     'data/{}'.format(category))
In [ ]: