In [1]:
!pip install PyPDF2
import urllib
import PyPDF2
In [41]:
# Open one sample PDF with PyPDF2; the resulting reader object is bound to `pr`.
# Note: the per-year loop in a later cell rebinds `pr` for every file it reads.
pr = PyPDF2.PdfFileReader("/home/staeiou/github/cscw-pages/data/pdfs/2013/p203-thayer.pdf")
Out[41]:
In [28]:
import re
import os
import glob
# Matches a "/Type /Page" marker that is NOT followed by "s", so "/Type /Pages"
# entries are excluded. The pattern is a bytes literal because count_pages()
# searches the raw bytes of the file; a str pattern cannot be applied to bytes
# on Python 3.
rxcountpages = re.compile(br"/Type\s*/Page([^s]|$)", re.MULTILINE|re.DOTALL)


def count_pages(filename):
    """Count the "/Type /Page" markers found in the raw bytes of a file.

    The file is read fully into memory in binary mode and scanned with
    ``rxcountpages``; no PDF parsing is performed.

    NOTE(review): this is a textual heuristic -- markers that are not stored
    as plain bytes in the file (e.g. inside compressed streams) would
    presumably not be counted; confirm against a real PDF parser if exact
    numbers matter.

    Parameters
    ----------
    filename : str
        Path of the file to scan.

    Returns
    -------
    int
        Number of matches of ``rxcountpages`` in the file contents.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened or read.
    """
    with open(filename, "rb") as f:
        data = f.read()
    return len(rxcountpages.findall(data))
In [ ]:
In [42]:
# Root directory holding one sub-directory of PDFs per year.
# NOTE(review): absolute, machine-specific path -- change it here when running
# the notebook on another machine.
PDF_ROOT = "/home/staeiou/github/cscw-pages/data/pdfs/"

# cscw_len maps each year to a list containing one page count per PDF found
# in that year's directory.
cscw_len = {}
years = [2008,2010,2011,2012,2013,2014,2015,2016,2017]
for year in years:
    cscw_len[year] = []
    # glob.glob returns matches in arbitrary order; sorting keeps the order of
    # the collected page counts reproducible between runs.
    for filename in sorted(glob.glob(os.path.join(PDF_ROOT, str(year), "*.pdf"))):
        # NOTE(review): PdfFileReader is the legacy PyPDF2 class name; newer
        # releases expose PdfReader instead -- confirm the installed version.
        pr = PyPDF2.PdfFileReader(filename)
        pages = len(pr.pages)
        cscw_len[year].append(pages)
In [48]:
import pandas as pd
# Display pandas' describe() summary statistics for the page counts collected
# for 2013 (the list stored under cscw_len[2013]).
pd.DataFrame(cscw_len[2013]).describe()
Out[48]:
In [ ]: