In [ ]:
# !pip install -U grequests requests
In [23]:
# %%time
# import grequests
# with open('./proxies.lst', 'r') as f:
#     src_proxy_names = list({tuple(line.strip().split('\t')) for line in f if line.strip()})
# print('Source proxies', len(src_proxy_names))
#
# def make_check_req(proxy):
#     # arxiv.org is served over https, so register the proxy for both schemes
#     res = grequests.get('https://arxiv.org',
#                         proxies={'http': 'http://{}:{}'.format(*proxy),
#                                  'https': 'http://{}:{}'.format(*proxy)},
#                         headers={'x-my-proxy': ' '.join(proxy)},
#                         timeout=40)
#     return res
#
# def get_proxy_if_good(res):
#     # the proxy is echoed back in the x-my-proxy request header, so a 200 tells us which one worked
#     if res and res.status_code == 200:
#         return tuple(res.request.headers['x-my-proxy'].split(' '))
#     return None
#
# proxy_check_requests = map(make_check_req, src_proxy_names)
# proxy_check_results = grequests.map(proxy_check_requests)
# good_proxies = list(set(filter(None, map(get_proxy_if_good, proxy_check_results))))
# print('Number of good proxies', len(good_proxies))
#
# with open('./good_proxies.lst', 'w') as f:
#     f.write('\n'.join('\t'.join(t) for t in good_proxies))
In [1]:
import grequests, re, json, random, joblib, os, datetime
In [2]:
with open('./good_proxies.lst', 'r') as f:
    good_proxies = [line.strip().split('\t') for line in f if line.strip()]
print('Good proxies:', len(good_proxies))
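Judging from the split('\t') above and the 'http://{}:{}'.format(*proxy) calls below, each line of good_proxies.lst holds a host and a port separated by a tab. A minimal sketch with a made-up entry (the address is from the documentation range, not a real proxy):

In [ ]:
example_proxy = ('203.0.113.7', '8080')       # hypothetical "<host>\t<port>" entry, already split
print('http://{}:{}'.format(*example_proxy))  # -> http://203.0.113.7:8080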
In [3]:
ID_RE = re.compile(r'(\d{3,}\.\d{3,})')  # extracts numeric arXiv ids like 1508.01991
with open('./data/2030_urls', 'r') as f:
    ids = [ID_RE.search(line).group(1) for line in f if ID_RE.search(line)]
print('Paper ids:', len(ids))
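A quick check of what ID_RE pulls out of a URL line (a sketch: it assumes ./data/2030_urls lists arXiv URLs such as abstract links; the id is the example one mentioned in the next cell):

In [ ]:
# the regex keeps only the numeric arXiv id from the line
print(ID_RE.search('https://arxiv.org/abs/1508.01991').group(1))  # -> 1508.01991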
In [4]:
# example paper id: 1508.01991
def get_sources_req(paper_id):
    # request the paper's source tarball through a randomly chosen good proxy
    proxy = random.choice(good_proxies)
    proxy_url = 'http://{}:{}'.format(*proxy)
    return grequests.get('https://arxiv.org/e-print/' + paper_id,
                         proxies={'http': proxy_url, 'https': proxy_url},
                         headers={'x-paper-id': paper_id},
                         timeout=40)

def parse_get_sources(res):
    # save the downloaded sources plus a small JSON sidecar with the content type
    if not (res and res.status_code == 200):
        return False
    paper_id = res.request.headers['x-paper-id']
    with open('./data/arxiv/sources/{}.tar.gz'.format(paper_id), 'wb') as f:
        f.write(res.content)
    with open('./data/arxiv/sources/{}.js'.format(paper_id), 'w') as f:
        json.dump(dict(content_type=res.headers['content-type']), f, indent=2)
    return True
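Before launching the full download loop, a single-paper smoke test can catch a stale proxy list or a missing output directory early. A minimal sketch using the example id from the comment above (it assumes ./data/arxiv/sources/ already exists):

In [ ]:
# fetch one paper's sources through a random proxy and try to save it
test_resp = grequests.map([get_sources_req('1508.01991')])[0]
print('ok' if parse_get_sources(test_resp) else 'download failed (stale proxies or missing source?)')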
In [ ]:
already_downloaded_papers = {ID_RE.search(fname).group(1)
                             for fname in os.listdir('./data/arxiv/sources/')
                             if ID_RE.search(fname)}
print('downloaded:', len(already_downloaded_papers))
ids_to_download = list({paper_id for paper_id in ids if paper_id not in already_downloaded_papers})
print('left to download:', len(ids_to_download))

BATCH_SIZE = 100
for batch_start in range(0, len(ids_to_download), BATCH_SIZE):
    print(datetime.datetime.now(), batch_start)
    batch_ids = ids_to_download[batch_start:batch_start + BATCH_SIZE]
    reqs = map(get_sources_req, batch_ids)
    success = 0
    for resp in grequests.map(reqs, size=BATCH_SIZE):
        if parse_get_sources(resp):
            success += 1
    print('successful downloads in batch:', success)
    if success == 0:
        # every request in the batch failed, so the proxy list is probably stale
        print('Update proxies!!!')
        break
In [ ]: