In [31]:
import PyPDF2
import re
import pandas as pd

该方法对可以读取PDF文件,它是按行读取,如果每一行中有缺失掉VALUE,那么如何判断这些VALUE是个大问题。 在parse_items函数中碰到的就是这个问题。


In [33]:
def init_pdf_from_file(path):
    fp=open(path,'rb')
    pdfReader=PyPDF2.PdfFileReader(fp)
    return pdfReader

getPage(n).extractText()获得第n页的text 该文件是表格,每一个表格项就是一行。所以先去掉所有的换行符,因为正则只能作用于每一行,跨行不起作用 findall返回匹配项的list


In [34]:
def extract_info_from_page(pdf,num_of_page,pattern):
    content=pdf.getPage(num_of_page).extractText()
    content=content.replace('\n','*')
    result=pattern.findall(content)
    return result

In [35]:
def extract_info_from_pages_into_list(pdf,pattern):
    TotalPages=pdf.numPages
    result=[]
    #print("There are %d pages"%TotalPages)
    for page in range(TotalPages):
        #print("%d of %d pages"%(page,TotalPages))
        each_page_info=extract_info_from_page(pdf,page,pattern)
        #print("There are %d items"%len(each_page_info))
        result=result+each_page_info
    return result

In [ ]:
parse_items解析由extract_info_from_pages_into_list返回的匹配项的list
因为文档中存在缺失项这里非常难判断哪一项应该对应哪一项
iter(item_array)可以将list(list是container,不是generator,所以list不能直接当iterator来用)变成iterator后面才可以使用next

In [36]:
def parse_items(list_info,df):
    for item in list_info:
        print(item)
        item_array=item.split('*')
        item_iterator=iter(item_array)
        item_dict={}
        item_dict['DATE']=next(item_iterator)
        item_dict['BLOCK TYPE']=next(item_iterator)
        next_content=next(item_iterator)
        item_dict['NUMBER']=int(next_content) if next_content.isdigit() else ''
        item_dict['SUBURB']=next(item_iterator) if next_content.isdigit() else next_content 
        item_dict['BLOCK']=
        item_dict['SECTION']=item_array[5]
        item_dict['AREA']=item_array[6]
        item_dict['MAX DRELLING']=item_array[7]
        item_dict['PRICE']=item_array[8]
        item_dict['PURCHASER']=item_array[9]
        df=df.append(item_dict,ignore_index=True)

In [37]:
def main():
    path='canberra_land_sales_result.pdf'
    pdfReader=init_pdf_from_file(path)
    pattern=re.compile(r'(\d{2}\/\d{2}\/\d{4}.+?)(?:(?=\d{2}\/\d{2}\/\d{4})|$)')
    info_list=extract_info_from_pages_into_list(pdfReader,pattern)
    df=pd.DataFrame()
    parse_items(info_list,df)
    print(df)

In [38]:
if __name__ == '__main__':
    main()


03/02/2017*Residential*3*Throsby*1 to 20*43*8,408*20*$7,899,900*Delta Design and Construction*
03/02/2017*Residential*5*Throsby*2-4; 6-10; *12;15;17 & *19*44*4,667*13*$4,810,000*Roman Development Holdings*
03/02/2017*Residential*6*Throsby*2 to 25*45*9,865*24*$9,393,600*Delta Design and Construction*
03/02/2017*Residential*9*Throsby*3;6;7;9 & 11*35*2,025*5*$1,854,000*MSL Projects*
03/02/2017*Residential*11*Throsby*1;3;4;5;6;7;2*4*33*2,954*7*$2,698,600*MSL Projects*
03/02/2017*Residential*15*Throsby*1-14; 1-15; 1*4; 2; 3*18,101*45*$14,631,150*Wellington Property Group*
22/12/2016*Industrial*Hume*9*21*22,671*$2,800,000*National Formwork Pty Ltd (ACN 119 745 461) atf *Hume Property Trust.*
30/11/2016*Community*Charnwood*22*97*3,600*$2,145,000*Childcare Investments Aus Pty Ltd*
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-38-2436fc2ab63a> in <module>()
      1 if __name__ == '__main__':
----> 2     main()

<ipython-input-37-c8c0739d9b81> in main()
      5     info_list=extract_info_from_pages_into_list(pdfReader,pattern)
      6     df=pd.DataFrame()
----> 7     parse_items(info_list,df)
      8     print(df)

<ipython-input-36-ea815c2e7c3a> in parse_items(list_info, df)
     13         item_dict['MAX DRELLING']=item_array[7]
     14         item_dict['PRICE']=item_array[8]
---> 15         item_dict['PURCHASER']=item_array[9]
     16         df=df.append(item_dict,ignore_index=True)
     17 

IndexError: list index out of range