Import modules.
In [1]:
import os
from modules.DataArxiv import get_date
from modules.DataArxiv import execute_query
from modules.Translate import Translate
Set credentials.
You need to prepare the credentials file from the GCP console.
In [2]:
# File name of the service-account key, looked up in the current working directory.
CREDENTIALS_JSON = "credentials.json"
# Absolute, normalised path to that file (abspath joins with os.getcwd() and normalises).
CREDENTIALS_PATH = os.path.abspath(CREDENTIALS_JSON)
# Publish the key file location through the GOOGLE_APPLICATION_CREDENTIALS variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CREDENTIALS_PATH
Set the dates.
With no arguments, today is used as the reference date.
You can also set the reference date (e.g. '20150417') and the number of days to go back.
In [3]:
# Default call with no arguments; get_date returns the (reference, previous) date pair.
(REF_DATE, PREV_DATE) = get_date()
# Alternative: pass an explicit base day and look-back length instead.
# (REF_DATE, PREV_DATE) = get_date(baseday='20170420', beforeNdays=7)
Category list.
Set the category key you would like to check.
You can add categories by following https://arxiv.org/help/api/user-manual#subject_classifications.
In [4]:
# Maps a short key to the list of arXiv search terms for that subject area.
# Every term carries the `cat:` field prefix so that all entries are consistent
# and the resulting query is scoped to the arXiv subject-category field
# (the 'ph' and 'th' entries previously omitted the prefix).
CATEGORY_LIST = {
    'ml': [
        "cat:stat.ML",
        "cat:cs.AI",
        "cat:cs.CC",
        "cat:cs.CE",
        "cat:cs.CG",
        "cat:cs.CV",
        "cat:cs.DC",
        "cat:cs.IR",
        "cat:cs.IT",
        "cat:cs.NE",
    ],
    'ph': ["cat:hep-ph"],
    'th': ["cat:hep-th"],
}
# Key of the CATEGORY_LIST entry whose categories will be queried.
CATEGORY_KEY = 'ml'
Set the query.
How to build a query: https://arxiv.org/help/api/index#about
In [5]:
# OR together every search term of the selected category key.
CATEGORY = str.join("+OR+", CATEGORY_LIST[CATEGORY_KEY])
# Restrict the category clause to submissions between PREV_DATE and REF_DATE;
# "0000" is appended to each date inside the submittedDate range.
QUERY = "({})+AND+submittedDate:[{}0000+TO+{}0000]".format(CATEGORY, PREV_DATE, REF_DATE)
Get bulk data from arXiv.
In [6]:
# Run the query built above, requesting up to 200 results starting at offset 0.
BULK = execute_query(
    QUERY,
    prune=True,
    start=0,
    max_results=200,
)
Set target language and create the instance.
You can select {'ja','de','es','fr','ko','pt','tr','zh-CN'} as of 20/3/2017.
In [7]:
# Language code handed to the Translate wrapper.
TARGET_LANG = "ja"
# Translation client configured for TARGET_LANG.
TRANSLATE_CLIENT = Translate(TARGET_LANG)
In [8]:
# Pass the fetched arXiv entries to the translation client with the nmt flag enabled.
TRANSLATE_CLIENT.check_arxiv(
    BULK,
    nmt=True,
)
In [ ]: