In [ ]:
# !pip install -U grequests requests

In [23]:
# %%time

# import grequests

# with open('./proxies.lst', 'r') as f:
#     src_proxy_names = list({tuple(line.strip().split('\t')) for line in f if line.strip()})
# print('Source proxies', len(src_proxy_names))

# def make_check_req(proxy):
#     res = grequests.get('https://arxiv.org',
#                         proxies={ 'http': 'http://{}:{}'.format(*proxy) },
#                         headers={'x-my-proxy' : ' '.join(proxy) },
#                         timeout=40)
#     return res

# def get_proxy_if_good(res):
#     if res and res.status_code == 200:
#         return tuple(res.request.headers['x-my-proxy'].split(' '))
#     return None

# proxy_check_requests = map(make_check_req, src_proxy_names)
# proxy_check_results = grequests.map(proxy_check_requests)
# good_proxies = list(set(filter(None, map(get_proxy_if_good, proxy_check_results))))
# print('Number of good proxies', len(good_proxies))
# with open('./good_proxies.lst', 'w') as f:
#     f.write('\n'.join('\t'.join(t) for t in good_proxies))


Source proxies 961
Number of good proxies 499
CPU times: user 7.63 s, sys: 684 ms, total: 8.32 s
Wall time: 47.1 s

In [1]:
import grequests, re, json, random, joblib, os, datetime

In [2]:
# Load the proxies that passed the health check (tab-separated host/port lines).
with open('./good_proxies.lst', 'r') as proxy_file:
    stripped = (raw.strip() for raw in proxy_file)
    good_proxies = [entry.split('\t') for entry in stripped if entry]
print(len(good_proxies))


499

In [3]:
# Extract arXiv paper ids (e.g. "1508.01991") from the URL list.
# Raw string avoids the invalid-escape DeprecationWarning for '\d'.
ID_RE = re.compile(r'(\d{3,}\.\d{3,})')
with open('./data/2030_urls', 'r') as f:
    # Run the regex once per line instead of twice (original searched each line two times).
    ids = [m.group(1) for m in (ID_RE.search(line) for line in f) if m]
print(len(ids))


602587

In [4]:
# 1508.01991

def get_sources_req(paper_id):
    """Build an (unsent) async GET for the e-print tarball of `paper_id`,
    routed through a proxy picked at random from the global `good_proxies`.

    The paper id is echoed back in an 'x-paper-id' header so the response
    can be matched to its paper without extra bookkeeping.
    """
    chosen = random.choice(good_proxies)
    proxy_url = 'http://{}:{}'.format(*chosen)
    return grequests.get(
        'https://arxiv.org/e-print/' + paper_id,
        proxies={'http': proxy_url, 'https': proxy_url},
        headers={'x-paper-id': paper_id},
        timeout=40,
    )

def parse_get_sources(res):
    """Persist a successful e-print response to disk.

    Writes the raw payload to ./data/arxiv/sources/<id>.tar.gz plus a small
    JSON sidecar (<id>.js) recording the response content-type.

    Returns True when the response was saved, False for a failed or None
    response (grequests.map yields None for requests that errored out).
    """
    if not (res and res.status_code == 200):
        return False
    paper_id = res.request.headers['x-paper-id']
    with open('./data/arxiv/sources/{}.tar.gz'.format(paper_id), 'wb') as f:
        f.write(res.content)
    with open('./data/arxiv/sources/{}.js'.format(paper_id), 'w') as f:
        # .get() guards against a missing content-type header: stores null
        # instead of raising KeyError after the tarball was already written.
        json.dump(dict(content_type=res.headers.get('content-type')),
                  f,
                  indent=2)
    return True

In [ ]:
already_downloaded_papers = { ID_RE.search(fname).group(1) for fname in os.listdir('./data/arxiv/sources/') }
print('downloaded:', len(already_downloaded_papers))
ids_to_download = list({ paper_id for paper_id in ids if not paper_id in already_downloaded_papers })
print('left to download:', len(ids_to_download))

BATCH_SIZE = 100

for batch_start in range(0, len(ids_to_download), BATCH_SIZE):
    print(datetime.datetime.now(), batch_start)
    batch_ids = ids_to_download[batch_start:batch_start+BATCH_SIZE]
    reqs = map(get_sources_req, batch_ids)

    success = 0
    for resp in grequests.map(reqs, size=BATCH_SIZE):
        if parse_get_sources(resp):
            success += 1
    print('success rate:', success)

    if success == 0:
        print('Update proxies!!!')
        break


downloaded: 109902
left to download: 491589
2017-10-04 16:04:12.329892 0
success rate: 37
2017-10-04 16:11:11.347987 100
success rate: 41
2017-10-04 16:14:46.210113 200
success rate: 48
2017-10-04 16:17:58.465199 300
success rate: 52
2017-10-04 16:24:40.848049 400
success rate: 52
2017-10-04 16:27:56.354472 500
success rate: 48
2017-10-04 16:31:55.292397 600
success rate: 44
2017-10-04 16:34:58.703527 700
success rate: 46
2017-10-04 16:38:05.587787 800
success rate: 46
2017-10-04 16:40:00.914776 900
success rate: 44
2017-10-04 16:48:18.919269 1000
success rate: 45
2017-10-04 16:57:15.031314 1100
success rate: 53
2017-10-04 17:04:10.455402 1200
success rate: 35
2017-10-04 17:08:41.647366 1300
success rate: 41
2017-10-04 17:12:53.107643 1400
success rate: 47
2017-10-04 17:15:13.858885 1500
success rate: 48
2017-10-04 17:22:09.307548 1600
success rate: 46
2017-10-04 17:30:23.690110 1700
success rate: 41
2017-10-04 17:32:55.783883 1800
success rate: 42
2017-10-04 17:41:35.822333 1900
success rate: 44
2017-10-04 17:47:13.033655 2000
success rate: 46
2017-10-04 17:57:30.883434 2100
success rate: 40
2017-10-04 18:05:05.602678 2200
success rate: 44
2017-10-04 18:11:59.203543 2300
success rate: 50
2017-10-04 18:18:31.635584 2400
success rate: 42
2017-10-04 18:21:00.086066 2500
success rate: 38
2017-10-04 18:24:13.430831 2600
success rate: 44
2017-10-04 18:30:14.821125 2700
success rate: 44
2017-10-04 18:34:17.933136 2800
success rate: 39
2017-10-04 18:39:05.901390 2900
success rate: 47
2017-10-04 18:52:17.354662 3000
success rate: 47
2017-10-04 18:54:43.328145 3100
success rate: 42
2017-10-04 19:00:48.372916 3200
success rate: 51
2017-10-04 19:06:42.489564 3300
success rate: 43
2017-10-04 19:08:53.937654 3400
success rate: 45
2017-10-04 19:21:59.040224 3500
success rate: 47
2017-10-04 19:25:23.270729 3600
success rate: 43
2017-10-04 19:28:37.489670 3700
success rate: 44
2017-10-04 19:31:32.039592 3800
success rate: 54
2017-10-04 19:36:01.931129 3900
success rate: 40
2017-10-04 19:38:38.031454 4000
success rate: 44
2017-10-04 19:40:47.219366 4100
success rate: 48
2017-10-04 19:47:20.494181 4200
success rate: 51
2017-10-04 19:54:53.786671 4300
success rate: 48
2017-10-04 19:59:49.142576 4400
success rate: 34
2017-10-04 20:08:17.567424 4500
success rate: 42
2017-10-04 20:10:27.636315 4600
success rate: 51
2017-10-04 20:14:42.542407 4700
success rate: 34
2017-10-04 20:16:07.547846 4800

In [ ]: