NYC Taxi Dataset on Atlas/Hive

Environment


In [1]:
try:
    import verta
except ImportError:
    !pip3 install verta

try:
    from pyhive import hive
except ImportError:
    !pip3 install pyhive
    !pip3 install thrift
    !pip3 install sasl
    !pip3 install thrift_sasl
    from pyhive import hive

from __future__ import print_function

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd

In [2]:
# Host passed to verta's Client() when the client is instantiated.
HOST = "app.verta.ai"

# Project name passed to client.set_project().
PROJECT_NAME = "NYC Taxi Demand Prediction"

In [3]:
# Template for supplying Verta credentials through environment variables.
# Presumably the Verta client reads these when it is created; uncomment and
# fill in if they are not already set in the environment (TODO confirm).
# import os
# os.environ['VERTA_EMAIL'] = ''
# os.environ['VERTA_DEV_KEY'] = ''

Read Connection Information for Atlas/Hive from Environment


In [4]:
import os

# Read the Atlas/Hive connection settings from the process environment.
# os.environ.get returns None for a missing variable, so the status lines
# below can actually report "NOT set" instead of the cell aborting on the
# first absent variable (which is what the `%env NAME` magic does).
atlas_url = os.environ.get("ATLAS_URL")
atlas_user_name = os.environ.get("ATLAS_USER_NAME")
atlas_password = os.environ.get("ATLAS_PASSWORD")
hive_url = os.environ.get("HIVE_URL")
hive_password = os.environ.get("HIVE_PASSWORD")

# Report only whether each credential is present; the values themselves are
# never printed.
print("Atlas username {}set".format('' if atlas_user_name else "NOT "))
print("Atlas password {}set".format('' if atlas_password else "NOT "))
print("Hive password {}set".format('' if hive_password else "NOT "))

# Display the two endpoints as the cell output.
[atlas_url, hive_url]

Instantiate Client


In [5]:
from verta import Client
# NOTE(review): ModelAPI is not used by any cell visible here; confirm whether
# a later cell needs it.
from verta.utils import ModelAPI

# Create the Verta client for HOST and select the project this notebook
# works under.
client = Client(HOST)
proj = client.set_project(PROJECT_NAME)

Create Dataset and Dataset Version


In [6]:
# Select the dataset by name, with its type given as "atlas hive".
# Presumably set_dataset creates the dataset when it does not exist yet
# (TODO confirm against the verta client documentation).
dataset = client.set_dataset("NYC Taxi Dataset on Atlas and Hive", type="atlas hive")

In [7]:
# NOTE(review): atlas_entity_endpoint is not referenced by any cell visible
# here; confirm whether it is still needed.
atlas_entity_endpoint = "/api/atlas/v2/entity/bulk"
# GUID handed to dataset.create_version() as its first argument.
atlas_guid = "d2fdde40-706f-44af-afde-155177b8d2e4"

In [8]:
# Create a dataset version from the Atlas entity GUID, using the Atlas URL
# and credentials read from the environment.
version = dataset.create_version(
    atlas_guid,
    atlas_url,
    atlas_user_name,
    atlas_password,
)

Fetch Data from Hive


In [9]:
def _attr_string_value(attrs, key):
    """Return the string value of the first attribute in ``attrs`` keyed ``key``.

    Each attribute is expected to expose ``.key`` and ``.value.string_value``.

    Raises:
        KeyError: if no attribute has the requested key, so a missing piece of
            metadata is reported by name rather than as a bare IndexError.
    """
    for attr in attrs:
        if attr.key == key:
            return attr.value.string_value
    raise KeyError("dataset version has no attribute {!r}".format(key))


# The table and database names are taken from the dataset version's attributes.
table_name = _attr_string_value(version.attrs, "table_name")
database_name = _attr_string_value(version.attrs, "database_name")

# NOTE(review): the identifiers are interpolated into the query unquoted;
# confirm they can only come from trusted metadata.
query = "select * from {}.{}".format(database_name, table_name)

In [10]:
# NOTE(review): hive_password is read from the environment in an earlier cell
# but is not passed to hive.connect here; confirm whether authentication is
# intended.
connection = hive.connect(hive_url)
try:
    cursor = connection.cursor()
    try:
        cursor.execute(query)

        # Pull every row and the column metadata before the cursor is closed.
        data = cursor.fetchall()

        # The first field of each description entry is the column name.
        col_names = [x[0] for x in cursor.description]
    finally:
        cursor.close()
finally:
    # Always release the Hive connection, even if the query fails.
    connection.close()

data_frame = pd.DataFrame(data, columns=col_names)
data_frame.head()