This demonstrates how plotly (https://plot.ly/) works with Snowflake, based on the scenarios in https://plot.ly/python/amazon-redshift/.
Set the environment variables SF_ACCOUNT, SF_USER and SF_PASSWORD for the Snowflake connection, plus SF_S3_BUCKET, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY to load the data from the given S3 location. Then start the notebook with `jupyter notebook`.
In [1]:
# Snowflake credentials come from the environment -- never hardcode secrets.
import os

account = os.environ.get('SF_ACCOUNT')
user = os.environ.get('SF_USER')
password = os.environ.get('SF_PASSWORD')
In [2]:
# SQLAlchemy engine for Snowflake -- used by the pd.read_sql_query calls below.
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL

sf_url = URL(
    account=account,
    user=user,
    password=password,
    database='testdb',
    schema='public',
)
engine = create_engine(sf_url)
In [3]:
# Raw Snowflake connection + cursor, used for the DDL and COPY statements.
from snowflake.connector import connect

connection_params = dict(
    account=account,
    user=user,
    password=password,
    database='testdb',
    schema='public',
)
con = connect(**connection_params)
cur = con.cursor()
In [4]:
# Sanity check: confirm which database/schema this session is bound to.
check_cursor = cur.execute("select current_database(), current_schema()")
check_cursor.fetchone()
Out[4]:
In [6]:
# S3 source credentials and bucket, also taken from the environment.
aws_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret = os.environ.get("AWS_SECRET_ACCESS_KEY")
sf_s3_bucket = os.environ.get("SF_S3_BUCKET")
In [10]:
# TICKIT sample schema (mirrors the Amazon Redshift tutorial data set).
# The DDL is collected in one list and executed in a single loop instead of
# seven back-to-back cur.execute() calls; the SQL text is unchanged.
ddl_statements = [
    """
create or replace table users(
userid integer not null,
username char(8),
firstname varchar(30),
lastname varchar(30),
city varchar(30),
state char(2),
email varchar(100),
phone char(14),
likesports boolean,
liketheatre boolean,
likeconcerts boolean,
likejazz boolean,
likeclassical boolean,
likeopera boolean,
likerock boolean,
likevegas boolean,
likebroadway boolean,
likemusicals boolean)""",
    """
create or replace table venue(
venueid smallint not null,
venuename varchar(100),
venuecity varchar(30),
venuestate char(2),
venueseats integer)""",
    """
create or replace table category(
catid smallint not null,
catgroup varchar(10),
catname varchar(10),
catdesc varchar(50))
""",
    """
create or replace table date(
dateid smallint not null,
caldate date not null,
day character(3) not null,
week smallint not null,
month character(5) not null,
qtr character(5) not null,
year smallint not null,
holiday boolean default('N'))""",
    """
create or replace table event(
eventid integer not null,
venueid smallint not null,
catid smallint not null,
dateid smallint not null,
eventname varchar(200),
starttime timestamp)""",
    """
create or replace table listing(
listid integer not null,
sellerid integer not null,
eventid integer not null,
dateid smallint not null,
numtickets smallint not null,
priceperticket decimal(8,2),
totalprice decimal(8,2),
listtime timestamp)""",
    """
create or replace table sales(
salesid integer not null,
listid integer not null,
sellerid integer not null,
buyerid integer not null,
eventid integer not null,
dateid smallint not null,
qtysold smallint not null,
pricepaid decimal(8,2),
commission decimal(8,2),
saletime timestamp)""",
]
for ddl in ddl_statements:
    cur.execute(ddl)
Out[10]:
In [11]:
# Bulk-load the pipe-delimited sample files from S3 into their tables.
# NOTE(review): the AWS credentials are interpolated into the SQL text
# (Snowflake COPY takes them inline); make sure these statements are
# never echoed or logged.
tables = ["users", "venue", "category", "date", "event", "listing"]
data_files = ["allusers_pipe", "venue_pipe", "category_pipe", "date2008_pipe", "allevents_pipe", "listings_pipe"]
copy_sql = """
copy into {table} from s3://{s3_bucket}/aws_sampled_bus_west2/{src_file}
credentials=(
aws_key_id='{aws_key_id}'
aws_secret_key='{aws_secret_key}'
)
file_format= (
field_delimiter='|'
error_on_column_count_mismatch=false
)
"""
for table, src_file in zip(tables, data_files):
    cur.execute(copy_sql.format(
        s3_bucket=sf_s3_bucket,
        src_file=src_file,
        table=table,
        aws_key_id=aws_key,
        aws_secret_key=aws_secret))
In [12]:
# Load the tab-delimited sales file (note: '\t' inside this non-raw string is
# a literal tab character, which Snowflake accepts as the field delimiter).
# Fix: the original also passed src_file=src_file and table=table to .format()
# even though this template uses neither placeholder -- those names only
# existed because they leaked out of the previous cell's loop, so the cell
# would raise NameError if run on its own. The unused arguments are removed.
cur.execute("""
copy into sales from s3://{s3_bucket}/aws_sampled_bus_west2/sales_tab.txt
credentials=(
aws_key_id='{aws_key_id}'
aws_secret_key='{aws_secret_key}'
)
file_format= (
field_delimiter='\t'
error_on_column_count_mismatch=false
)
""".format(
    s3_bucket=sf_s3_bucket,
    aws_key_id=aws_key,
    aws_secret_key=aws_secret))
Out[12]:
Wait for the previous queries to finish
In [13]:
# Commit so the COPY loads above are finalized before querying the tables.
con.commit()
In [14]:
from __future__ import print_function  # Python 2/3 print compatibility
import plotly.plotly as py
# Explicit names instead of `from plotly.graph_objs import *`: a wildcard
# import pollutes the namespace and hides where Figure/Layout/etc. come from.
# These eight are exactly the graph objects used in the cells below.
from plotly.graph_objs import Bar, Box, Data, Figure, Heatmap, Layout, Scatter, YAxis
import plotly.tools as tls
import pandas as pd
import os
import requests
In [15]:
# Count declared "likes" per entertainment category, grouped by US state.
music_pref_sql = """
SELECT
sum(case likesports when true then 1 else 0 end) as sports,
sum(case liketheatre when true then 1 else 0 end) as theatre,
sum(case likeconcerts when true then 1 else 0 end) as concerts,
sum(case likejazz when true then 1 else 0 end) as jazz,
sum(case likeclassical when true then 1 else 0 end) as classical,
sum(case likeopera when true then 1 else 0 end) as opera,
sum(case likerock when true then 1 else 0 end) as rock,
sum(case likevegas when true then 1 else 0 end) as vegas,
sum(case likebroadway when true then 1 else 0 end) as broadway,
sum(case likemusicals when true then 1 else 0 end) as musical,
state
FROM users
GROUP BY state
ORDER BY state asc
"""
df = pd.read_sql_query(music_pref_sql, engine)
In [16]:
# Rich display of the per-state preference counts (bare last expression).
df
Out[16]:
In [17]:
# Heatmap: one row per state, one column per preference category.
pref_counts = df.drop('state', axis=1)  # hoisted: used for both z and x
data = Data([
    Heatmap(
        z=pref_counts.values,
        x=pref_counts.columns,
        y=df.state,
        colorscale='Hot'
    )
])
layout = Layout(title="State and Music Tastes", yaxis=YAxis(autotick=False, dtick=1))
py.iplot(Figure(data=data, layout=layout), filename='snowflake/state and music taste heatmap', height=1000)
Out[17]:
In [18]:
# One box plot per preference column, to compare spread across states.
box_layout = Layout(title="Declared User Preference Box Plots",
                    yaxis=YAxis())
box_traces = [Box(y=df[pref], name=pref)
              for pref in df.drop('state', axis=1).columns]
py.iplot(Figure(data=box_traces, layout=box_layout), filename='snowflake/user preference box plots')
Out[18]:
In [19]:
# Sum of category ids per category name (implicit join on catid).
event_category_sql = """
SELECT
sum(event.catid) as category_sum,
catname as category_name
FROM event, category
where event.catid = category.catid
GROUP BY category.catname
"""
df = pd.read_sql_query(event_category_sql, engine)
In [20]:
# Bar chart of the per-category sums computed above.
bar_layout = Layout(title="Event Categories Sum", yaxis=YAxis(title="Sum"))
bar_traces = [Bar(x=df.category_name, y=df.category_sum)]
py.iplot(Figure(data=bar_traces, layout=bar_layout))
Out[20]:
In [21]:
# Total ticket quantity sold per calendar day.
daily_sales_sql = """
SELECT
sum(sales.qtysold) as quantity_sold,
date.caldate
FROM sales, date
WHERE sales.dateid = date.dateid
GROUP BY date.caldate
ORDER BY date.caldate asc;
"""
df = pd.read_sql_query(daily_sales_sql, engine)
In [22]:
# Time series of daily sales quantity.
daily_layout = Layout(
    title="Event Sales Per Day",
    yaxis=YAxis(title="Sales Quantity"))
daily_trace = Scatter(x=df.caldate, y=df.quantity_sold)
py.iplot(Figure(data=[daily_trace], layout=daily_layout))
Out[22]:
In [23]:
# Daily sales quantity broken down by event category.
sales_by_category_sql = """
SELECT
sum(sales.qtysold) as quantity_sold,
date.caldate, category.catname as category_name
FROM sales, date, event, category
WHERE sales.dateid = date.dateid
AND sales.eventid = event.eventid
AND event.catid = category.catid
GROUP BY date.caldate, category_name
ORDER BY date.caldate asc
"""
df = pd.read_sql_query(sales_by_category_sql, engine)
In [24]:
# Small multiples: one subplot per category on a 2x2 grid.
# Subplot axes are named x1/y1, x2/y2, ... so the index starts at 1.
traces = []
for idx, (cat_name, grp) in enumerate(df.groupby("category_name"), start=1):
    traces.append(Scatter(
        name=cat_name,
        x=grp.caldate,
        y=grp.quantity_sold,
        xaxis='x%d' % idx,
        yaxis='y%d' % idx
    ))
fig = tls.make_subplots(rows=2, cols=2)
fig['layout'].update(title="Event Sales Per Day By Category")
fig['data'] += traces
py.iplot(fig)
Out[24]:
In [25]:
# Same per-category series as above, overlaid on a single figure.
data = []
for name, g in df.groupby("category_name"):
    data.append(Scatter(
        name=name,
        x=g.caldate,
        y=g.quantity_sold
    ))
fig = Figure()
fig['layout'].update(title="Event Sales Per Day By Category")
fig['data'] += data
# Fix: the filename previously began with 'redshift/' -- a leftover from the
# Redshift tutorial this notebook was ported from. Every other plot in this
# notebook is stored under the 'snowflake/' prefix, so match that here.
py.iplot(fig, filename='snowflake/Event Sales Per Day by Category')
Out[25]:
In [ ]: