In [1]:
!pip install PyPDF2
import urllib
import PyPDF2


Requirement already satisfied: PyPDF2 in /home/staeiou/anaconda3/lib/python3.6/site-packages

In [41]:
# Sanity check on a single paper: open one PDF and display its page count.
pr = PyPDF2.PdfFileReader("/home/staeiou/github/cscw-pages/data/pdfs/2013/p203-thayer.pdf")
# A bare assignment displays nothing; end the cell with the page count so the
# cell actually shows a result (same expression the per-year loop uses).
len(pr.pages)


PdfReadWarning: Xref table not zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]
Out[41]:
11

In [28]:
import re
import os
import glob

# Matches "/Type", optional whitespace, "/Page", then either a character other
# than "s" or end of line -- so "/Type /Pages" is not counted.
# The pattern is bytes (rb"...") because count_pages reads the file in binary
# mode; Python 3's re module refuses to apply a str pattern to bytes data.
rxcountpages = re.compile(rb"/Type\s*/Page([^s]|$)", re.MULTILINE|re.DOTALL)

def count_pages(filename):
    """Return the number of matches of ``rxcountpages`` in a file's raw bytes.

    The file is read whole in binary mode and scanned with the regex; no PDF
    parsing is performed, so the result is only as good as the pattern.

    Parameters
    ----------
    filename : path of the file to scan.

    Returns
    -------
    int : number of pattern matches found (0 for an empty file).

    Raises
    ------
    OSError : if the file cannot be opened or read.
    """
    with open(filename, "rb") as f:
        data = f.read()
    return len(rxcountpages.findall(data))

In [ ]:


In [42]:
# Root directory that holds one sub-directory of PDFs per year.
# Kept in a single constant so the location only has to be changed in one place.
PDF_ROOT = "/home/staeiou/github/cscw-pages/data/pdfs"
years = [2008,2010,2011,2012,2013,2014,2015,2016,2017]

# cscw_len maps each year to the list of page counts of that year's PDFs.
cscw_len = {}
for year in years:
    pattern = os.path.join(PDF_ROOT, str(year), "*.pdf")
    # glob returns files in file-system-dependent order; sort so the per-year
    # lists come out in the same order on every run and every machine.
    cscw_len[year] = [
        len(PyPDF2.PdfFileReader(filename).pages)
        for filename in sorted(glob.glob(pattern))
    ]


PdfReadWarning: Xref table not zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]

In [48]:
import pandas as pd
# Summary statistics (count, mean, std, quartiles, ...) of the 2013 page counts.
pages_2013 = pd.DataFrame(cscw_len[2013])
pages_2013.describe()


Out[48]:
0
count 141.000000
mean 10.687943
std 2.384877
min 1.000000
25% 10.000000
50% 11.000000
75% 12.000000
max 17.000000

In [ ]: