In [1]:
# %pip install mysqltsv  # use %pip (not !pip) so the install targets the kernel's environment
In [64]:
import requests, csv, bz2, logging
# Figshare download URLs for bz2-compressed TSV article-quality datasets,
# keyed by Wikipedia language edition.
# NOTE(review): the figshare file ids presumably point at per-revision
# article-quality prediction dumps (see the sample row in
# decompress_read_rows) — verify against the figshare records.
AQ_URLS = {
    'en': "https://ndownloader.figshare.com/files/6059502",
    'fr': "https://ndownloader.figshare.com/files/6217656",
    'ru': "https://ndownloader.figshare.com/files/6437268"
}
def decompress_read_rows(f):
    """Yield typed row dicts from a bz2-compressed, tab-separated stream.

    Parameters
    ----------
    f : binary file-like object
        A stream of bz2-compressed TSV data whose first line is a header.

    Yields
    ------
    dict
        One dict per data row, e.g.
        {'weighted_sum': 0.1041..., 'title': 'Lost Moon',
         'timestamp': '20051201000000', 'prediction': 'Stub',
         'page_id': 3204489, 'rev_id': 28847753}
        with 'weighted_sum' coerced to float and 'page_id'/'rev_id' to int;
        all other fields remain strings.
    """
    # Decompress on the fly as text; QUOTE_NONE because the TSV is unquoted.
    text_stream = bz2.open(f, mode="rt")
    reader = csv.DictReader(text_stream, delimiter="\t", quoting=csv.QUOTE_NONE)
    for record in reader:
        record['weighted_sum'] = float(record['weighted_sum'])
        record['page_id'] = int(record['page_id'])
        record['rev_id'] = int(record['rev_id'])
        yield record
def read_aq(lang):
    """Stream article-quality rows for a language edition.

    Parameters
    ----------
    lang : str
        Language code; must be a key of AQ_URLS.

    Returns
    -------
    generator of dict
        Typed row dicts produced by decompress_read_rows.

    Raises
    ------
    RuntimeError
        If no dataset URL is registered for `lang`.
    """
    # Guard clause instead of if/else: fail fast on unknown languages.
    if lang not in AQ_URLS:
        raise RuntimeError("Data for {0} not available, try {1}"
                           .format(lang, tuple(AQ_URLS.keys())))
    # stream=True + .raw hands the undecoded HTTP body straight to bz2.
    response = requests.get(AQ_URLS[lang], stream=True)
    return decompress_read_rows(response.raw)

# Backwards-compatible alias kept for callers using the older name.
read_rows = read_aq
In [65]:
import json
# Quarry (quarry.wmflabs.org) JSON-lines export URLs, keyed by a
# human-readable page-group name.
# NOTE(review): run 125781 presumably selects pages about women
# scientists — verify against the saved Quarry query before reuse.
PAGE_ID_URLS = {
    'women scientists': "https://quarry.wmflabs.org/run/125781/output/0/json-lines?download=true"
}
def read_page_ids(group):
    """Stream page-id records for a named page group.

    Parameters
    ----------
    group : str
        Group name; must be a key of PAGE_ID_URLS.

    Yields
    ------
    dict
        One parsed JSON object per line of the Quarry JSON-lines export.

    Raises
    ------
    RuntimeError
        If no URL is registered for `group` (raised on first iteration,
        since this is a generator function).
    """
    # Guard clause instead of if/else: fail fast on unknown groups.
    if group not in PAGE_ID_URLS:
        raise RuntimeError("Data for {0} not available, try {1}"
                           .format(group, tuple(PAGE_ID_URLS.keys())))
    response = requests.get(PAGE_ID_URLS[group], stream=True)
    # iter_lines yields bytes; decode each line before JSON parsing.
    for raw_line in response.iter_lines():
        yield json.loads(raw_line.decode('utf-8'))
In [68]:
# print(next(read_aq('en')))
In [ ]:
In [ ]: