In [1]:
import pandas as pd
import requests
import os
import datetime, time
import string
from time import localtime, strftime
from datetime import timedelta
from tqdm import tqdm
from lxml import html
from requests.utils import unquote
In [2]:
def txt_reader(name):
    # Read the first line of <name>.txt and split it on '/'.
    with open(name + ".txt", 'rb') as f:
        line = f.readline()
    return line.decode('utf-8').split('/')
In [3]:
def download_reader(name):
    # Read <name>.txt and keep the first 14 characters of each line (the bid number).
    a = []
    with open(name + ".txt", 'rb') as f:
        for item in f.readlines():
            b = item.decode('utf-8')
            a.append(b[:14])
    return a
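The contents of download.txt are not shown here; a minimal sketch of what download_reader expects, assuming each line begins with a 14-character bid number of the form &lt;11-digit notice number&gt;-&lt;2-digit sequence&gt; (values below are hypothetical):
# download.txt (hypothetical contents):
# 12345678901-00  Some notice title
# 12345678902-01  Another notice title
#
# download_reader('download') would then return
# ['12345678901-00', '12345678902-01']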
In [4]:
download_list = download_reader('download')
In [10]:
class Full_list_to_download(object):
    def get_bidurl(self, bidnum):
        # Build the G2B bid-detail URL from a bid number of the form "<bidno>-<bidseq>".
        num_split = str(bidnum).split(sep='-')
        bidno = num_split[0]
        if len(bidno) == 11:
            bidseq = num_split[-1]
            bidurl = "http://www.g2b.go.kr:8081/ep/invitation/publish/bidInfoDtl.do?bidno=" + bidno + "&bidseq=" + bidseq
            return bidurl
        else:
            return "Check organization website (공고기관) for details"
    def get_urlist(self, download_list):
        # Map every bid number in download_list to its detail-page URL.
        x = []
        for item in download_list:
            x.append(self.get_bidurl(item))
        return x
    def get_filelist(self, pagelink):
        # Scrape the attachment links from a bid-detail page.
        try:
            r = requests.get(pagelink)
            tree = html.fromstring(r.content)
            file_link = tree.xpath('//*[@class="results"]//table/tbody/tr[*]/td[3]/div/a')
            linklist = []
            for links in file_link:
                # The anchor's first attribute holds the file sequence and file name
                # as quoted arguments; pull out both pieces.
                a = links.values()[0]
                b = a[a.find("(") + 1:].split(',')[0].replace("'", '')
                link = "http://www.g2b.go.kr:8081/ep/co/fileDownload.do?fileTask=NOTIFY&fileSeq=" + b
                name = a[a.find(",") + 2:a.find(";") - 2].replace("'", "")
                linklist.append({'link': link, 'name': name})
        except Exception:
            linklist = None
        return linklist
    def final_list(self, download_list):
        # Pair each bid number with the attachment links found on its detail page.
        urlist = self.get_urlist(download_list)
        fullist = []
        for item in urlist:
            files = self.get_filelist(item)
            budnum = item[item.find('=') + 1:item.find('=') + 12] + "-" + item[item.find('&') + 8:]
            fullist.append([budnum, files])
        return fullist
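A quick sketch of the URL get_bidurl builds, using a hypothetical bid number:
# Full_list_to_download().get_bidurl("12345678901-00")
# -> "http://www.g2b.go.kr:8081/ep/invitation/publish/bidInfoDtl.do?bidno=12345678901&bidseq=00"
# Bid numbers whose first part is not 11 digits fall through to the
# "Check organization website (공고기관) for details" message instead.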
In [11]:
lets = Full_list_to_download()
In [12]:
finallist = lets.final_list(download_list)
In [13]:
finallist
Out[13]:
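The output is omitted above; based on final_list, each entry should look roughly like this (values hypothetical):
# ['12345678901-00',
#  [{'link': 'http://www.g2b.go.kr:8081/ep/co/fileDownload.do?fileTask=NOTIFY&fileSeq=...',
#    'name': 'RFP_attachment.hwp'}]]
# Entries whose detail page could not be parsed carry None in place of the file list.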
In [14]:
# Function to download all the files.
# Source: https://github.com/cochoa0x1/mailgunbot/blob/master/mailgunbot/utils.py
def download_file(url, filename=None, chunk_size=512, directory=os.getcwd(), total_size=None, do_not_replace=False):
    if url is None:
        # print('No files uploaded for %s' % filename)
        return
    # If no filename is given, try to derive it from the URL.
    if not filename:
        filename = unquote(url.split('/')[-1]).split('.')[0]
        # extension = ".%s" % unquote(url.split('.')[-1])
    full_name = os.path.join(directory, filename)
    # Make the destination directory, but guard against a race condition.
    if not os.path.exists(os.path.dirname(full_name)):
        try:
            os.makedirs(os.path.dirname(full_name))
        except OSError:
            print(os.path.dirname(full_name))
            raise Exception('Could not create directory')
    if os.path.exists(full_name) and do_not_replace:
        return
    r = requests.get(url, stream=True)
    print("Downloading %s..." % filename)
    rh = requests.head(url)
    total_size = float(rh.headers.get('Content-Length', total_size or 0))
    n_iter = int(total_size / chunk_size) + 1
    with open(full_name, 'wb') as f:
        for chunk in tqdm(r.iter_content(chunk_size=chunk_size), total=n_iter):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
                f.flush()
    r.close()
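A minimal single-file usage sketch (the URL, fileSeq, and filename are hypothetical; real values come from get_filelist):
# download_file(
#     "http://www.g2b.go.kr:8081/ep/co/fileDownload.do?fileTask=NOTIFY&fileSeq=...",
#     filename="example.hwp",
#     directory=os.path.join('.', 'attachment_files', 'test'),
#     do_not_replace=True,
# )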
In [18]:
for item in finallist:
    idno = item[0]
    try:
        print("Downloading files for id no. %s" % idno)
        for files in item[1]:
            file_name = files['name']
            directory_for_files = os.path.join('.', 'attachment_files', "selected_RFPs-" + strftime("%y%m%d"), idno)
            download_file(files['link'], filename=file_name, directory=directory_for_files, do_not_replace=True)
    except Exception:
        print("Please check the organization website to access the files.")
In [ ]: