Korean government bidding information crawling - downloading the files


In [1]:
import pandas as pd
import requests
import os
import datetime, time
import string
from time import localtime, strftime
from datetime import timedelta
from tqdm import tqdm
from lxml import html
from requests.utils import unquote

In [2]:
def txt_reader(name):
    with open(name+".txt",'rb') as f:
        line = f.readline()
        return line.decode('utf-8').split('/')

In [3]:
def download_reader(name):
    a=[]
    f = open(name+".txt",'rb').readlines()
    for item in f:
        b = item.decode('utf-8')
        a.append(b[:14])
    return a

In [4]:
download_list = download_reader('download')

In [10]:
class Full_list_to_download(object):

    def get_bidurl(self,bidnum):
        num_split = str(bidnum).split(sep='-')
        bidno = num_split[0]
        if len(bidno) == 11:
            bidseq = num_split[-1]
            bidurl = "http://www.g2b.go.kr:8081/ep/invitation/publish/bidInfoDtl.do?bidno="+bidno+"&bidseq="+bidseq
            return bidurl
        else: 
            return "Check organization website (공고기관) for details"
        bidseq = refnum_split[-1]
        bidurl = "http://www.g2b.go.kr:8081/ep/invitation/publish/bidInfoDtl.do?bidno="+bidno+"&bidseq="+bidseq
        return bidurl

    def get_urlist(self,download_list):
        x=[]
        for item in download_list:
            x.append(self.get_bidurl(item))
        return x
    
    def get_filelist(self,pagelink):
        try:
            r=requests.get(pagelink)
            tree = html.fromstring(r.content)
            file_link = tree.xpath('//*[@class="results"]//table/tbody/tr[*]/td[3]/div/a')
            linklist=[]
            for links in file_link:
                a = links.values()[0]
                b = a[a.find("(")+1:].split(',')[0].replace("'",'')
                link = "http://www.g2b.go.kr:8081/ep/co/fileDownload.do?fileTask=NOTIFY&fileSeq="+b
                name = a[a.find(",")+2:a.find(";")-2].replace("'","")
                linklist.append({'link':link,'name':name})
        except: 
            linklist = None
        return linklist

    def final_list(self,download_list):
        urlist = self.get_urlist(download_list)
        fullist = []
        for item in urlist:
            files = self.get_filelist(item)
            budnum = item[item.find('=')+1:item.find('=')+12]+"-"+item[item.find('&')+8:]
            fullist.append([budnum,files])
        return fullist

In [11]:
lets = Full_list_to_download()

In [12]:
finallist = lets.final_list(download_list)

In [13]:
finallist


Out[13]:
[['Check organ-rganization website (공고기관) for details', None],
 ['Check organ-rganization website (공고기관) for details', None],
 ['20170529625-00',
  [{'link': 'http://www.g2b.go.kr:8081/ep/co/fileDownload.do?fileTask=NOTIFY&fileSeq=20170529625::00::1::1',
    'name': '20170529625-00_1495697603061_2017년 공군, 해병 출타용가방 구매 공고서.hwp'},
   {'link': 'http://www.g2b.go.kr:8081/ep/co/fileDownload.do?fileTask=NOTIFY&fileSeq=20170529625::00::2::2',
    'name': '20170529625-00_1495697603065_구매요구서(출타용 가방)(공군).hwp'},
   {'link': 'http://www.g2b.go.kr:8081/ep/co/fileDownload.do?fileTask=NOTIFY&fileSeq=20170529625::00::2::3',
    'name': '20170529625-00_1495697603066_구매요구서(출타용가방)(해병).hwp'},
   {'link': 'http://www.g2b.go.kr:8081/ep/co/fileDownload.do?fileTask=NOTIFY&fileSeq=20170529625::00::2::4',
    'name': '20170529625-00_1495697603068_납품안내문 및 물품납품 영수증(0).hwp'}]]]

In [14]:
#Function to download all the files. 
#Source https://github.com/cochoa0x1/mailgunbot/blob/master/mailgunbot/utils.py
def download_file(url,filename=None,chunk_size=512, directory=os.getcwd(), total_size=None, do_not_replace=False):

    if url == None:
        # print('No files uploaded for %s'%filename)
        return
    #if no filename is given, try and get it from the url
    if not filename: 
        filename = unquote(url.split('/')[-1]).split('.')[0]
    #extension = ".%s"%unquote(url.split('.')[-1])
        
    full_name = os.path.join(directory,filename)
    
    #make the destination directory, but guard against race condition
    if not os.path.exists(os.path.dirname(full_name)):
        try:
            os.makedirs(os.path.dirname(full_name))
        except OSError as exc: 
            print(os.path.dirname(full_name))
            raise Exception('something failed')

    if os.path.exists(full_name) and do_not_replace:
        return

    r = requests.get(url, stream=True)
    print("Downloading %s..."%filename)

    rh = requests.head(url, stream=True)
    total_size = float(rh.headers.get('Content-Length',total_size))
    n_iter = int(total_size/chunk_size)+1

    with open(full_name, 'wb') as f:
        for chunk in tqdm(r.iter_content(chunk_size=chunk_size), total=n_iter) : 
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    r.close()

In [18]:
for item in finallist:
    idno = item[0]
    try:
        print("Downloading files for id no. %s"%item[0])
        for files in item[1]:
            file_name = files['name']
            directory_for_files = os.path.join('.','attachment_files',"selected_RFPs-"+strftime("%y%m%d"),idno)
            download_file(files['link'],filename=file_name,directory=directory_for_files, do_not_replace=True)
    except:
        print("Please check organization website to access files. ")
        None


Downloading files for id no. Check organ-rganization website (공고기관) for details
Please check organization website to access files. 
Downloading files for id no. Check organ-rganization website (공고기관) for details
Please check organization website to access files. 
Downloading files for id no. 20170529625-00

In [ ]: