notebook.community

Edit and run



In [1]:

    
!pip install PyPDF2
import urllib
import PyPDF2









    



Requirement already satisfied: PyPDF2 in /home/staeiou/anaconda3/lib/python3.6/site-packages



In [41]:

    
# Open a single 2013 paper with PyPDF2 as a spot check before processing every year.
# NOTE(review): the recorded output for this cell is 11, which a bare assignment
# cannot produce -- presumably a trailing page-count expression (e.g. len(pr.pages))
# was lost from this listing; confirm against the original notebook.
pr = PyPDF2.PdfFileReader("/home/staeiou/github/cscw-pages/data/pdfs/2013/p203-thayer.pdf")









    



PdfReadWarning: Xref table not zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]






    Out[41]:





11



In [28]:

    
import re
import os
import glob

# Bytes pattern on purpose: count_pages() reads the file in binary mode, and
# Python 3's `re` refuses to apply a str pattern to bytes data (TypeError).
# Matches "/Type /Page" followed by any character other than "s" (or by end of
# line/input), so "/Type /Pages" page-tree entries are not counted.
rxcountpages = re.compile(rb"/Type\s*/Page([^s]|$)", re.MULTILINE|re.DOTALL)

def count_pages(filename):
    """Estimate the number of pages in a PDF by scanning its raw bytes.

    The whole file is read into memory and every occurrence of a
    ``/Type /Page`` marker (but not ``/Type /Pages``) is counted.

    Args:
        filename: Path of the PDF file to inspect.

    Returns:
        int: Number of ``/Type /Page`` markers found in the file.

    Raises:
        OSError: If the file cannot be opened or read.

    Note:
        This is a plain byte scan, not a PDF parser: it only sees markers
        that appear literally in the file's bytes.
    """
    with open(filename, "rb") as f:
        data = f.read()
    return len(rxcountpages.findall(data))



In [ ]:



In [42]:

    
# Proceedings years to scan (2009 is not in this list).
years = [2008,2010,2011,2012,2013,2014,2015,2016,2017]

# Maps each year to a list holding the page count of every PDF in that year's folder.
cscw_len = {}
for year in years:
    year_glob = "/home/staeiou/github/cscw-pages/data/pdfs/" + str(year) + "/*.pdf"
    # One PdfFileReader per file; the length of its .pages is the page count.
    cscw_len[year] = [
        len(PyPDF2.PdfFileReader(filename).pages)
        for filename in glob.glob(year_glob)
    ]









    



PdfReadWarning: Xref table not zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]



In [48]:

    
import pandas as pd
# Summary statistics for the page counts collected for 2013.
pages_2013 = pd.DataFrame(cscw_len[2013])
pages_2013.describe()



In [ ]:

	0
count	141.000000
mean	10.687943
std	2.384877
min	1.000000
25%	10.000000
50%	11.000000
75%	12.000000
max	17.000000