In [51]:
from pql import *
from IPython.display import HTML
import requests
import urllib
import os
from metrique import pyclient
from metrique.utils import batch_gen, dt2ts
from metrique.jsonconf import JSONConf
In [ ]:
# Baseline connection settings; these are expanded into keyword
# arguments for pyclient() in the next cell.
config = dict(
    host='127.0.0.1',          # loopback address
    ssl=False,
    ssl_verify=False,
    password='YOUR_PASSWORD',  # placeholder -- replace with a real password
    debug=True,
)
In [ ]:
# load the pyclient interface (load cubes, extract data, query, etc);
# every key of `config` is passed through as a keyword argument
m = pyclient(**config)
In [ ]:
# register the user: uncomment the line below and run it once
#m.user_register(password='YOUR_PASSWORD') # run once
In [ ]:
# ping the server with auth=True -- presumably this also confirms the
# credentials in `config` are accepted (TODO confirm against metrique docs)
m.ping(auth=True)
In [ ]:
# show the cubes reported by the server (presumably a list of cube names;
# the result is displayed as the cell output)
m.cube_list_all()
In [ ]:
# Fetch the sample CSV once and keep a local copy so re-running this cell
# does not download it again.
tmp = '/tmp'
uri = 'https://commondatastorage.googleapis.com/ckannet-storage/2012-03-03T021709/environment.csv'
# local path = <tmp>/<file name portion of the uri>
saved_uri = os.path.join(tmp, os.path.basename(uri))
# cache the file locally (not absolutely necessary); the download is
# skipped when a copy from an earlier run already exists
if not os.path.exists(saved_uri):
    urllib.urlretrieve(uri, saved_uri)
# get the 'csvdata_rows' cube, named 'environmental_data'; this handle is
# used by the cells below for extract/index/query calls
# NOTE(review): batch_size=1000 is presumably the number of rows handled
# per batch -- confirm against the metrique documentation
csv = m.get_cube('csvdata_rows', name='environmental_data', batch_size=1000)
In [ ]:
#csv.cube_register() # uncomment and run once
m.cube_list_all() # should show the cube created above, e.g. ['USER__environmental_data']
In [ ]:
# each object needs a unique 'object id'; we'll use country + year
_oid = lambda o: '_'.join((o['country_name'], o['year']))
# this is historical data, so the implicit timestamps are overridden with
# one derived from each row's year
def _start(o):
    """Return ``dt2ts('<year>-01-01')`` for the ``'year'`` value of row ``o``."""
    first_of_year = '%s-01-01' % o['year']
    return dt2ts(first_of_year)
# extract the data from the cached file, using the callables above to
# supply each object's id and start timestamp; the return value is a list
# of the ids that were extracted ok / failed
saved = csv.extract(uri=saved_uri, _oid=_oid, _start=_start)
# note: extracting the same data more than once will only save one version of every object (row)
In [ ]:
# request an index on the 'country_name' field of the cube
# (presumably to speed up the country_name lookups below -- TODO confirm)
result = csv.cube_index('country_name')
In [ ]:
# %time is an IPython magic: it runs the statement and reports how long it took.
# query_sample presumably returns a sample of objects limited to 'country_name' (TODO confirm)
%time df = csv.query_sample(fields='country_name')
In [ ]:
# time the call that collects field names from the cube
# (presumably derived from a sample of its objects -- TODO confirm)
%time fields = csv.cube_sample_fields()
In [ ]:
# time fetching the distinct values stored under 'country_name'
# (assumed from the method name -- TODO confirm)
%time countries = csv.distinct('country_name')
In [ ]:
# pull the three fields needed for the plots; the adjacent literals are
# joined by the parser into one comma-separated field string
z = csv.find(
    fields='country_name, '
           'population_in_largest_city, '
           'population_density_(people_per_sq_km_of_land_area)'
)
# plot the largest-city population against _start for the United States rows
z[z.country_name == 'United States'].plot(
    x='_start',
    y='population_in_largest_city'
)
In [ ]:
# same United States filter as the previous plot, this time with the
# population density column on the y axis
z[z.country_name == 'United States'].plot(x='_start', y='population_density_(people_per_sq_km_of_land_area)')
In [ ]: