In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import pandas_td as td
# Initialize connection
con = td.connect(apikey=os.environ['TD_API_KEY'], endpoint='https://api.treasuredata.com/')
Using a connection, you can use some convenient methods like databases
and tables
:
In [2]:
# Get a list of databases
con.databases()
Out[2]:
In [3]:
# Get a list of tables
con.tables('sample_datasets')
Out[3]:
You can choose a query engine and use it to run a queby by read_td
:
In [4]:
# Select a query engine
presto = con.query_engine(database='sample_datasets', type='presto')
# Run a query
td.read_td('select * from nasdaq limit 3', presto)
Out[4]:
Alternatively, you can read a table into a DataFrame by read_td_table
, optionally with a specific time range, sample rate, and/or a limit:
In [5]:
# Read from a table with 5-percent sampling
df = td.read_td_table('nasdaq', presto, time_range=('2000-01-01', '2010-01-01'), sample=0.05, limit=10000)
Pandas-TD also supports importing a DataFrame into a table by to_td
:
In [6]:
# Create a DataFrame with random values
df = pd.DataFrame(np.random.rand(3, 3), columns=['x', 'y', 'z'])
# Import it into 'tutorial.tut1'
td.to_td(df, 'tutorial.tut1', con, if_exists='replace', index=False)
In [7]:
# Check the result
td.read_td('select * from tutorial.tut1', presto)
Out[7]:
In [8]:
help(td.read_td_query)
In [9]:
# Convert "time" to a time series index
df = td.read_td_query("select time, close from nasdaq where symbol='AAPL'",
presto,
index_col='time',
parse_dates={'time': 's'})
df.plot()
Out[9]:
In [10]:
help(td.read_td_table)
In [11]:
# Read all records (up to 10,000 rows by default)
df = td.read_td_table("www_access", presto)
df.head(3)
Out[11]:
In [12]:
# From 1990 to 2010, 1-percent sample, no limit
df = td.read_td_table("nasdaq",
presto,
index_col='time',
parse_dates={'time': 's'},
columns=['time', 'symbol', 'close'],
time_range=('1990', '2010'),
sample=0.01,
limit=None)
len(df)
Out[12]:
In [13]:
df[df.symbol=='AAPL'].plot()
Out[13]:
In [14]:
help(td.to_td)
In [15]:
# Create a DataFrame with random values
df = pd.DataFrame(np.random.rand(3, 3), columns=['x', 'y', 'z'])
to_td
will fail if table already exists:
In [16]:
td.to_td(df, 'tutorial.tut1', con)
In [17]:
# Set "if_exists" to 'replace' or 'append'
td.to_td(df, 'tutorial.tut1', con, if_exists='replace')
Use index=False if you don't need to insert DataFrame index:
In [18]:
td.to_td(df, 'tutorial.tut1', con, if_exists='replace', index=False)
to_td
inserts the current time as "time" column. You can pass "time" column explicitly by "time_col":
In [19]:
import datetime
df = pd.DataFrame(np.random.rand(3, 3), columns=['x', 'y', 'z'])
# Set "time" column explicitly
df['time'] = datetime.datetime.now()
# Use "time" as the time column in Treasure Data
td.to_td(df, 'tutorial.tut1', con, if_exists='replace', index=False, time_col='time')
If you are using a time series index, set "time_index" to 0 (or the right level of indexes):
In [20]:
df = pd.DataFrame(np.random.rand(3, 3), columns=['x', 'y', 'z'])
# Set time series index
df.index = pd.date_range('2001-01-01', periods=3)
# Use index as the time column in Treasure Data
td.to_td(df, 'tutorial.tut1', con, if_exists='replace', index=False, time_index=0)
In [ ]: