In [31]:
import PyPDF2
import re
import pandas as pd
该方法可以读取PDF文件。它是按行读取的,如果某一行中缺失了某个VALUE,那么如何判断缺失的是哪一个VALUE就是个大问题。在parse_items函数中碰到的就是这个问题。
In [33]:
def init_pdf_from_file(path):
    """Load the PDF at ``path`` and return a ``PyPDF2.PdfFileReader`` for it.

    The file content is read fully into an in-memory buffer so the operating
    system file handle is closed before this function returns. Handing an
    open file object to the reader and never closing it would leak the handle
    for the lifetime of the process.

    Parameters
    ----------
    path : str
        Location of the PDF file on disk.

    Returns
    -------
    PyPDF2.PdfFileReader
        Reader backed by an in-memory copy of the file.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    import io  # local import keeps this cell self-contained

    with open(path, 'rb') as fp:
        # The reader only needs a seekable binary stream; BytesIO provides one
        # that stays valid after the real file has been closed.
        pdf_bytes = io.BytesIO(fp.read())
    return PyPDF2.PdfFileReader(pdf_bytes)
getPage(n).extractText()获得第n页的text。该文件是表格,每一个表格项就是一行。所以先把所有的换行符替换成'*'(同时作为字段之间的分隔符),因为正则中的`.`默认不匹配换行符,跨行不起作用。findall返回匹配项的list。
In [34]:
def extract_info_from_page(pdf, num_of_page, pattern):
    """Return every match of ``pattern`` in the text of a single PDF page.

    Parameters
    ----------
    pdf : object
        Reader exposing ``getPage(n)``, whose result exposes ``extractText()``.
    num_of_page : int
        Index of the page handed to ``getPage``.
    pattern : compiled regular expression
        Its ``findall`` is run over the flattened page text.

    Returns
    -------
    list
        Whatever ``pattern.findall`` yields for the page.
    """
    page_text = pdf.getPage(num_of_page).extractText()
    # Collapse the page onto one line, using '*' in place of each newline so
    # the original line boundaries survive as field separators.
    single_line = '*'.join(page_text.split('\n'))
    return pattern.findall(single_line)
In [35]:
def extract_info_from_pages_into_list(pdf, pattern):
    """Collect the matches of ``pattern`` from every page of ``pdf``.

    Parameters
    ----------
    pdf : object
        Reader exposing ``numPages`` plus whatever ``extract_info_from_page``
        needs from it.
    pattern : compiled regular expression
        Passed through to ``extract_info_from_page`` for each page.

    Returns
    -------
    list
        Matches from all pages, in page order, as one flat list.
    """
    matches = []
    for page_index in range(pdf.numPages):
        # extend() grows the list in place; ``result = result + page_matches``
        # would copy everything gathered so far on every page.
        matches.extend(extract_info_from_page(pdf, page_index, pattern))
    return matches
In [ ]:
parse_items解析由extract_info_from_pages_into_list返回的匹配项的list。
因为文档中存在缺失项,这里非常难判断每一个值应该对应哪一列。
iter(item_array)可以将list变成iterator(list是iterable,但不是iterator,所以不能直接对list使用next),之后才可以使用next逐个取值。
In [36]:
# Output columns, in the order the fields appear in a fully populated row.
# 'MAX DRELLING' is kept exactly as originally spelled (presumably meant
# 'MAX DWELLING' -- TODO confirm before renaming, it is a column name).
_ITEM_COLUMNS = [
    'DATE', 'BLOCK TYPE', 'NUMBER', 'SUBURB', 'BLOCK', 'SECTION',
    'AREA', 'MAX DRELLING', 'PRICE', 'PURCHASER',
]


def _parse_item(item):
    """Turn one '*'-separated row string into a ``{column: value}`` dict."""
    fields = iter(item.split('*'))
    record = {}
    record['DATE'] = next(fields, '')
    record['BLOCK TYPE'] = next(fields, '')

    # NUMBER is treated as optional: if the third field is not purely digits
    # it is taken to be the SUBURB and NUMBER is left blank.
    candidate = next(fields, '')
    if candidate.isdigit():
        record['NUMBER'] = int(candidate)
        record['SUBURB'] = next(fields, '')
    else:
        record['NUMBER'] = ''
        record['SUBURB'] = candidate

    # Keep consuming from the iterator (not from fixed list indices) so a
    # missing NUMBER shifts every later column along with it. Rows that run
    # out of fields get '' rather than raising.
    # NOTE(review): only a missing NUMBER can be detected this way; a gap in
    # any other column still misaligns the remaining values.
    for column in _ITEM_COLUMNS[4:]:
        record[column] = next(fields, '')
    return record


def parse_items(list_info, df):
    """Parse matched row strings and return ``df`` with one row added per item.

    Parameters
    ----------
    list_info : iterable of str
        Row strings whose fields are separated by '*'.
    df : pandas.DataFrame
        Frame to extend. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``list_info`` is empty, otherwise a new frame
        holding the rows of ``df`` followed by the parsed rows. The caller
        must use this return value, because rebinding the parameter inside
        the function is invisible outside it.
    """
    records = [_parse_item(item) for item in list_info]
    if not records:
        return df

    # Build the frame once from all records instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    parsed = pd.DataFrame(records, columns=_ITEM_COLUMNS)
    if df.shape == (0, 0):
        # Nothing to merge with; also avoids concatenating an empty frame.
        return parsed
    return pd.concat([df, parsed], ignore_index=True)
In [37]:
def main():
    """Read the land-sales PDF, parse its table rows and print the result.

    Steps: open 'canberra_land_sales_result.pdf', pull every row that starts
    with a date out of all pages, parse the rows into a DataFrame and print
    that frame.
    """
    path = 'canberra_land_sales_result.pdf'
    pdfReader = init_pdf_from_file(path)
    # One match = a dd/dd/dddd date plus everything (lazily) up to just before
    # the next such date, or up to the end of the text.
    pattern = re.compile(r'(\d{2}\/\d{2}\/\d{4}.+?)(?:(?=\d{2}\/\d{2}\/\d{4})|$)')
    info_list = extract_info_from_pages_into_list(pdfReader, pattern)
    df = pd.DataFrame()
    # The frame passed in is not filled in place, so the parsed rows are only
    # available through the return value; discarding it always printed an
    # empty frame. Keep the original frame if nothing is returned.
    parsed = parse_items(info_list, df)
    if parsed is not None:
        df = parsed
    print(df)
In [38]:
# Entry point: run the pipeline only when this code is executed as the
# top-level module, not when it is imported.
if __name__ == '__main__':
    main()