In [1]:
from __future__ import print_function

# Install Verta and PyHive (plus its Thrift/SASL dependencies) if they are not already available
try:
    import verta
except ImportError:
    !pip3 install verta

try:
    from pyhive import hive
except ImportError:
    !pip3 install pyhive
    !pip3 install thrift
    !pip3 install sasl
    !pip3 install thrift_sasl
    from pyhive import hive

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
In [2]:
HOST = "app.verta.ai"
PROJECT_NAME = "NYC Taxi Demand Prediction"
In [3]:
# Uncomment and fill in to supply Verta credentials if they are not already set in the environment
# import os
# os.environ['VERTA_EMAIL'] = ''
# os.environ['VERTA_DEV_KEY'] = ''
In [4]:
# Read Atlas and Hive connection details from environment variables
atlas_url = %env ATLAS_URL
atlas_user_name = %env ATLAS_USER_NAME
atlas_password = %env ATLAS_PASSWORD
hive_url = %env HIVE_URL
hive_password = %env HIVE_PASSWORD

# Confirm that credentials are present without printing their values
print("Atlas username {}set".format('' if atlas_user_name else "NOT "))
print("Atlas password {}set".format('' if atlas_password else "NOT "))
print("Hive password {}set".format('' if hive_password else "NOT "))
[atlas_url, hive_url]
In [5]:
from verta import Client
from verta.utils import ModelAPI

# Connect to Verta and create (or retrieve) the project
client = Client(HOST)
proj = client.set_project(PROJECT_NAME)
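With the project in place, individual modeling runs can be tracked against it. The lines below are a minimal sketch of that next step, assuming the standard verta Client experiment-run API; the experiment name, run name, and logged value are illustrative and not part of this notebook.

# Group related runs under an experiment, then create a run to log work against
expt = client.set_experiment("taxi-demand-baseline")   # illustrative name
run = client.set_experiment_run("first-run")           # illustrative name

# Anything computed later in the notebook can be attached to the run, e.g.:
run.log_hyperparameter("query_limit", 1000)            # illustrative value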
In [6]:
# Register a Verta dataset whose versions are backed by Atlas metadata and Hive tables
dataset = client.set_dataset("NYC Taxi Dataset on Atlas and Hive", type="atlas hive")
In [7]:
# Atlas REST endpoint for bulk entity lookups, and the GUID of the Hive table entity to version
atlas_entity_endpoint = "/api/atlas/v2/entity/bulk"
atlas_guid = "d2fdde40-706f-44af-afde-155177b8d2e4"
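Before creating a dataset version, the GUID can be sanity-checked by querying Atlas directly. This is a sketch that assumes the V2 bulk-entity endpoint defined above accepts a guid query parameter and HTTP basic auth with the credentials read earlier; the exact response shape varies across Atlas versions.

import requests

# Fetch the raw Atlas metadata for the table entity (illustrative check only)
response = requests.get(
    atlas_url + atlas_entity_endpoint,
    params={"guid": atlas_guid},
    auth=(atlas_user_name, atlas_password),
)
response.raise_for_status()
print(response.json().get("entities", []))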
In [8]:
# Snapshot the Atlas entity (table schema, location, and attributes) as a new dataset version
version = dataset.create_version(atlas_guid,
                                 atlas_url, atlas_user_name,
                                 atlas_password)
In [9]:
# The version's attributes carry the Hive database and table captured from Atlas;
# use them to build the query that pulls the data itself
table_name = list(filter(lambda x: x.key=="table_name", version.attrs))[0].value.string_value
database_name = list(filter(lambda x: x.key=="database_name", version.attrs))[0].value.string_value
query = "select * from {}.{}".format(database_name, table_name)
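The two attribute lookups above repeat the same filter; a small helper keeps the intent readable if more attributes are needed. This is a sketch assuming version.attrs is the list of key/value pairs used above; get_attr is a local helper, not part of the Verta API.

def get_attr(version, key):
    """Return the string value of the first attribute on `version` whose key matches."""
    return next(attr for attr in version.attrs if attr.key == key).value.string_value

# Equivalent to the two lookups above
table_name = get_attr(version, "table_name")
database_name = get_attr(version, "database_name")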
In [10]:
# Run the query against Hive and load the results into a DataFrame
cursor = hive.connect(hive_url).cursor()
cursor.execute(query)
data = cursor.fetchall()

# Column names come from the cursor's result-set description
col_names = [x[0] for x in cursor.description]
data_frame = pd.DataFrame(data, columns=col_names)
data_frame.head()
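Since PyHive exposes a DBAPI-style connection, the cursor/fetchall/DataFrame steps can also be collapsed into a single pandas call. A minimal sketch under the same hive_url and query; pandas only formally supports SQLAlchemy connectables, so recent versions emit a warning when handed a raw DBAPI connection, but the result is the same DataFrame.

import pandas as pd
from pyhive import hive

# Let pandas manage the cursor and column names
conn = hive.connect(hive_url)
data_frame = pd.read_sql(query, conn)
conn.close()
data_frame.head()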