Get files from Zotero


In [1]:
import os
import subprocess
from glob import glob
from os.path import join, basename, splitext
from random import shuffle, seed
from shutil import copy, move

from pyzotero import zotero

from lib.secrets import CORRECTED_PAPERS_DATASET, USER_KEY

In [2]:
# Root directory that receives the PDFs downloaded from Zotero.
output_dir = os.path.join('data', 'pdf')

This is the function that does the actual download of the PDFs using Zotero's API.

First, we need to get all of the collections in the Zotero Library. Collections are like sub-folders in the library. We will be looking for a collection with the given name.

Next, we will get all of the items in a collection with a given tag. We have been tagging items with a "Rel-Yes" or "Rel-No" when we determine if the item is relevant to the study or not.

Finally, we can get the PDF attachment associated with the item. An item may have more than one attachment (PDF, HTML, etc.) underneath it. However, for our current purpose we are only concerned with the PDF.


In [3]:
def get_pdfs(output_dir, collection_name, tag):
    """Download the PDF attachment of every tagged item in a collection.

    PDFs are written to ``<output_dir>/<collection_name>/<tag>/`` and are
    named after the attachment's Zotero key (``<key>.pdf``).  Items that do
    not have exactly one PDF attachment are reported on stdout and skipped.

    Args:
        output_dir: Root directory for the downloaded PDFs.
        collection_name: Name of the Zotero collection to read.
        tag: Only items carrying this tag are downloaded (e.g. "Rel-Yes").

    Raises:
        KeyError: If the library has no collection named *collection_name*.
    """
    # Create the output directory
    path = join(output_dir, collection_name, tag)
    os.makedirs(path, exist_ok=True)

    # Connect to Zotero
    zot = zotero.Zotero(CORRECTED_PAPERS_DATASET, 'group', USER_KEY)

    # Get the collection of interest and its key.  The collections listing
    # is paginated, so follow every page instead of reading only the first.
    collections = {c['data']['name']: c
                   for c in zot.everything(zot.collections())}
    try:
        key = collections[collection_name]['key']
    except KeyError:
        raise KeyError(
            'No Zotero collection named {!r}; available: {}'.format(
                collection_name, sorted(collections))) from None

    # Now get the items in the collection that have the given tag
    items = zot.everything(zot.collection_items(key, tag=tag))

    # Get the PDF attachment for each item and save it to the category directory
    for item in items:
        # An item's attachments
        children = zot.children(item['key'])

        # Just get the PDFs
        pdfs = [c for c in children
                if c['data'].get('contentType') == 'application/pdf']

        # Handle when there are no attachments
        if not children:
            print('\nMISSING DOCUMENTS {}\n'.format(item['key']))
        # Handle when there are no PDF attachments
        elif not pdfs:
            print('\nNO PDFs {}\n'.format(item['key']))
        # Handle when there is more than one PDF attachment
        elif len(pdfs) != 1:
            print('\nTOO MANY PDFs {}\n'.format(item['key']))
        # Save the PDF to the category directory
        else:
            doc = pdfs[0]
            print(doc['data']['filename'])
            zot.dump(doc['key'], '{}.pdf'.format(doc['key']), path)

In [16]:
# Download both relevance categories for each review set.
for collection in ('RSet_N1', 'RSet_N2'):
    for tag in ('Rel-Yes', 'Rel-No'):
        get_pdfs(output_dir, collection, tag)


T7HXEB68

Handle duplicate files

It turns out that some files have both labels (Rel-Yes and Rel-No). We need to remove these files from the data set.

Get all PDF file names for a particular category.


In [5]:
def file_names(root, category):
    """Return the base names of the PDFs in ``data/pdf/<root>/<category>``.

    Only entries matching ``*.pdf`` directly inside that directory are
    listed; the directory part of each path is stripped.
    """
    category_dir = join('data', 'pdf', root, category)
    matches = glob(join(category_dir, '*.pdf'))
    return list(map(basename, matches))

We move one copy of the file out of the way and delete the extra copy.


In [6]:
def move_duplicates(root):
    """Take PDFs labeled both Rel-Yes and Rel-No out of a collection's data.

    For every file name present in both ``data/pdf/<root>/Rel-Yes`` and
    ``data/pdf/<root>/Rel-No``, the Rel-Yes copy is moved to
    ``data/pdf/duplicates`` and the Rel-No copy is deleted.  Each duplicate
    name is printed as it is handled.

    Args:
        root: Name of the collection directory under ``data/pdf``.
    """
    rel_yes = set(file_names(root, 'Rel-Yes'))
    rel_no = set(file_names(root, 'Rel-No'))
    duplicates = rel_yes & rel_no

    dup_root = join('data', 'pdf', 'duplicates')
    os.makedirs(dup_root, exist_ok=True)

    for duplicate in duplicates:
        print(duplicate)
        # Keep one copy for inspection ...
        kept_copy = join('data', 'pdf', root, 'Rel-Yes', duplicate)
        move(kept_copy, join(dup_root, duplicate))
        # ... and drop the other so neither category contains the file.
        os.remove(join('data', 'pdf', root, 'Rel-No', duplicate))

In [7]:
# Remove doubly-labeled files from each review set.
for collection in ('RSet_N1', 'RSet_N2'):
    move_duplicates(collection)

Convert PDF files to text

Convert the PDF files to text. They will be placed into the given output directory. This utility depends on the external "xpdf" package, specifically its "pdftotext" program.

Extract the text from the PDF and write it to a file.


In [8]:
def pdf_to_text(output_dir, pdf_path):
    """Extract the text of one PDF into ``<output_dir>/<pdf stem>.txt``.

    Runs the external ``pdftotext`` program.  Conversion is best-effort: a
    failure is reported on stdout and otherwise ignored, so one bad PDF does
    not stop a batch.

    Args:
        output_dir: Directory that receives the text file (not created here).
        pdf_path: Path of the PDF to convert.
    """
    txt_name = splitext(basename(pdf_path))[0] + '.txt'
    txt_path = join(output_dir, txt_name)

    # Pass an argument list without a shell so that quotes, spaces and other
    # shell metacharacters in a path are taken literally.
    cmd = ['pdftotext', pdf_path, txt_path]
    try:
        subprocess.check_call(cmd)
    except (subprocess.CalledProcessError, OSError) as err:
        # CalledProcessError: pdftotext exited non-zero.
        # OSError: pdftotext could not be started (e.g. it is not installed).
        print('FAILED to convert {}: {}'.format(pdf_path, err))

Loop through all of the PDFs and convert them


In [11]:
def convert_pdfs(input_dir, output_dir):
    """Convert every ``*.pdf`` directly inside *input_dir* to text.

    The output directory is created when missing.  Each PDF is announced on
    stdout and then handed to ``pdf_to_text`` together with *output_dir*.
    """
    os.makedirs(output_dir, exist_ok=True)

    for source_pdf in glob(join(input_dir, '*.pdf')):
        print('Converting:', source_pdf)
        pdf_to_text(output_dir, source_pdf)

In [12]:
# Convert both review sets; each category is merged into one text directory.
for collection in ('RSet_N1', 'RSet_N2'):
    for tag in ('Rel-Yes', 'Rel-No'):
        convert_pdfs('data/pdf/{}/{}'.format(collection, tag),
                     'data/{}'.format(tag))


Converting: data/pdf/RSet_N1/Rel-Yes/NFBFRJE3.pdf
Converting: data/pdf/RSet_N1/Rel-Yes/7H3FB5AR.pdf
Converting: data/pdf/RSet_N1/Rel-Yes/9FWZX3P8.pdf
Converting: data/pdf/RSet_N1/Rel-No/HUK6N8SE.pdf
Converting: data/pdf/RSet_N1/Rel-No/TDIW72GZ.pdf
Converting: data/pdf/RSet_N1/Rel-No/PQ8MRSVV.pdf
Converting: data/pdf/RSet_N2/Rel-Yes/U2BPDHGA.pdf
Converting: data/pdf/RSet_N2/Rel-Yes/DT5FH8G5.pdf
Converting: data/pdf/RSet_N2/Rel-Yes/DHW5ACU8.pdf
Converting: data/pdf/RSet_N2/Rel-No/VR7BXTHD.pdf
Converting: data/pdf/RSet_N2/Rel-No/IFJXWSER.pdf

In [ ]: