Plotly Integration Demo

This demonstrates how Plotly (https://plot.ly/) works with Snowflake, based on the scenarios in https://plot.ly/python/amazon-redshift/

Setup

  • Install Anaconda from https://www.continuum.io/downloads
  • Set the account name, user name and password in the environment variables SF_ACCOUNT, SF_USER and SF_PASSWORD
  • Set the S3 bucket, AWS access key id and AWS secret access key in SF_S3_BUCKET, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY so the sample data can be loaded from the given location.
  • Walk through https://plot.ly/python/getting-started/ to make sure the plotly package is installed and the API key is configured in your environment
  • Start Jupyter by running jupyter notebook

In [1]:
import os

# Snowflake credentials come from the environment -- never hardcode them.
account = os.environ.get('SF_ACCOUNT')
user = os.environ.get('SF_USER')
password = os.environ.get('SF_PASSWORD')

In [2]:
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL

# SQLAlchemy engine used by the pd.read_sql_query calls later on.
connect_params = dict(
    account=account,
    user=user,
    password=password,
    database='testdb',
    schema='public',
)
engine = create_engine(URL(**connect_params))

In [3]:
from snowflake.connector import connect

# Direct connector session used for the DDL and COPY statements below;
# the SQLAlchemy engine above is reserved for pandas queries.
con = connect(account=account,
              user=user,
              password=password,
              database='testdb',
              schema='public')
cur = con.cursor()

In [4]:
# Sanity check: confirm the session is pointed at TESTDB.PUBLIC before
# creating and loading the sample tables.
cur.execute("select current_database(), current_schema()").fetchone()


Out[4]:
('TESTDB', 'PUBLIC')

In [6]:
# AWS credentials and bucket used by the COPY commands further down.
aws_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret = os.environ.get("AWS_SECRET_ACCESS_KEY")
sf_s3_bucket = os.environ.get("SF_S3_BUCKET")

In [10]:
# Create the seven tables of the sample ticket-sales dataset (users, venue,
# category, date, event, listing, sales).  `create or replace` makes this
# cell safe to re-run from a fresh kernel.
cur.execute("""
create or replace table users(
    userid integer not null,
    username char(8),
    firstname varchar(30),
    lastname varchar(30),
    city varchar(30),
    state char(2),
    email varchar(100),
    phone char(14),
    likesports boolean,
    liketheatre boolean,
    likeconcerts boolean,
    likejazz boolean,
    likeclassical boolean,
    likeopera boolean,
    likerock boolean,
    likevegas boolean,
    likebroadway boolean,
    likemusicals boolean)""")
# Venues where events take place.
cur.execute("""
create or replace table venue(
    venueid smallint not null,
    venuename varchar(100),
    venuecity varchar(30),
    venuestate char(2),
    venueseats integer)""")
# Event categories (e.g. concerts vs. shows).
cur.execute("""
create or replace table category(
    catid smallint not null,
    catgroup varchar(10),
    catname varchar(10),
    catdesc varchar(50))
""")
# Calendar dimension table keyed by dateid.
cur.execute("""
create or replace table date(
    dateid smallint not null,
    caldate date not null,
    day character(3) not null,
    week smallint not null,
    month character(5) not null,
    qtr character(5) not null,
    year smallint not null,
    holiday boolean default('N'))""")
# Events, joining a venue, a category and a date.
cur.execute("""
create or replace table event(
    eventid integer not null,
    venueid smallint not null,
    catid smallint not null,
    dateid smallint not null,
    eventname varchar(200),
    starttime timestamp)""")
# Ticket listings posted by sellers.
cur.execute("""
create or replace table listing(
    listid integer not null,
    sellerid integer not null,
    eventid integer not null,
    dateid smallint not null,
    numtickets smallint not null,
    priceperticket decimal(8,2),
    totalprice decimal(8,2),
    listtime timestamp)""")
# Completed sales against listings.
cur.execute("""
create or replace table sales(
    salesid integer not null,
    listid integer not null,
    sellerid integer not null,
    buyerid integer not null,
    eventid integer not null,
    dateid smallint not null,
    qtysold smallint not null,
    pricepaid decimal(8,2),
    commission decimal(8,2),
    saletime timestamp)""")


Out[10]:
<snowflake.connector.cursor.SnowflakeCursor at 0x7fcda7e71470>

In [11]:
# Map each target table to its pipe-delimited source file in the S3 bucket.
# (The `sales` table uses a tab-delimited file and is loaded in the next cell.)
tables = ["users", "venue", "category", "date", "event", "listing"]
data_files = ["allusers_pipe", "venue_pipe", "category_pipe", "date2008_pipe", "allevents_pipe", "listings_pipe"]

# Bulk-load each file with COPY INTO.
# NOTE(review): the AWS credentials are interpolated directly into the SQL
# text via str.format -- make sure these statements are never echoed or
# logged anywhere.
for table, src_file in zip(tables, data_files):
    cur.execute("""
copy into {table} from s3://{s3_bucket}/aws_sampled_bus_west2/{src_file}
    credentials=(
        aws_key_id='{aws_key_id}' 
        aws_secret_key='{aws_secret_key}'
    )
    file_format= (
        field_delimiter='|'
        error_on_column_count_mismatch=false
    )
""".format(
        s3_bucket = sf_s3_bucket,
        src_file=src_file,
        table=table,
        aws_key_id=aws_key,
        aws_secret_key=aws_secret))

In [12]:
# The sales file is tab-delimited (unlike the pipe-delimited files above),
# so it gets its own COPY statement.
# NOTE: inside this regular (non-raw) triple-quoted string Python converts
# the two characters \t into a literal TAB, so the delimiter Snowflake sees
# is a single tab character.
# NOTE(review): credentials are interpolated into the SQL text -- make sure
# this statement is never echoed or logged anywhere.
# Fix: dropped the leftover `src_file=src_file, table=table` format kwargs,
# which referenced loop variables leaked from the previous cell and made
# this cell fail if run on its own.
cur.execute("""
copy into sales from s3://{s3_bucket}/aws_sampled_bus_west2/sales_tab.txt
    credentials=(
        aws_key_id='{aws_key_id}' 
        aws_secret_key='{aws_secret_key}'
    )
    file_format= (
        field_delimiter='\t'
        error_on_column_count_mismatch=false
    )
""".format(
        s3_bucket=sf_s3_bucket,
        aws_key_id=aws_key,
        aws_secret_key=aws_secret))


Out[12]:
<snowflake.connector.cursor.SnowflakeCursor at 0x7fcda7e71470>

Wait for the previous queries to finish


In [13]:
# Commit so the loaded rows are visible to the separate SQLAlchemy session
# used by the pandas queries below.
con.commit()

In [14]:
from __future__ import print_function  # python 3 support

import plotly.plotly as py
# Import only the graph objects actually used below instead of the previous
# `from plotly.graph_objs import *`, which pollutes the namespace and hides
# where names come from.
from plotly.graph_objs import (
    Bar, Box, Data, Figure, Heatmap, Layout, Scatter, YAxis)
import plotly.tools as tls
import pandas as pd
import os
import requests

In [15]:
# Per-state counts of users who declared each interest flag.
likes_by_state_sql = """
SELECT 
    sum(case likesports when true then 1 else 0 end) as sports,
    sum(case liketheatre when true then 1 else 0 end) as theatre,  
    sum(case likeconcerts when true then 1 else 0 end) as concerts,
    sum(case likejazz when true then 1 else 0 end) as jazz, 
    sum(case likeclassical when true then 1 else 0 end) as classical,
    sum(case likeopera when true then 1 else 0 end) as opera,  
    sum(case likerock when true then 1 else 0 end) as rock,
    sum(case likevegas when true then 1 else 0 end) as vegas,  
    sum(case likebroadway when true then 1 else 0 end) as broadway,
    sum(case likemusicals when true then 1 else 0 end) as musical, 
    state
FROM users 
GROUP BY state
ORDER BY state asc
"""
df = pd.read_sql_query(likes_by_state_sql, engine)

In [16]:
# Display the per-state preference counts (rich DataFrame repr).
df


Out[16]:
sports theatre concerts jazz classical opera rock vegas broadway musical state
0 332 449 456 480 455 463 453 463 493 437 AB
1 185 255 246 249 252 246 228 259 262 262 AK
2 97 128 127 133 129 137 127 135 127 132 AL
3 82 145 120 113 140 150 142 122 128 121 AZ
4 341 494 511 495 503 465 507 461 513 508 BC
5 92 107 135 135 127 129 116 108 119 135 CA
6 99 138 139 113 132 131 129 117 137 124 CO
7 76 120 121 139 113 131 128 118 134 130 CT
8 87 117 115 128 125 123 110 113 106 124 DC
9 92 119 120 122 131 130 109 114 113 143 DE
10 99 136 117 130 124 142 120 119 119 122 FL
11 80 122 112 131 103 145 120 111 118 126 GA
12 86 97 110 123 104 103 123 106 117 105 HI
13 89 127 133 122 129 109 113 118 119 118 IA
14 93 130 130 117 130 111 116 129 110 128 ID
15 85 118 137 110 149 144 123 138 151 127 IL
16 95 126 124 134 129 120 116 118 124 119 IN
17 84 128 109 119 132 120 128 123 116 120 KS
18 88 141 138 137 123 144 146 114 125 117 KY
19 99 111 116 111 148 123 108 131 127 127 LA
20 95 113 115 122 125 110 121 106 113 122 MA
21 326 465 474 464 503 514 495 491 505 457 MB
22 75 122 99 120 108 107 111 110 116 114 MD
23 71 127 115 115 100 99 131 102 125 113 ME
24 98 114 136 121 133 128 114 121 124 115 MI
25 73 132 127 134 130 149 111 123 110 129 MN
26 84 130 133 114 149 121 130 154 134 133 MO
27 90 105 89 135 130 105 114 137 106 107 MS
28 91 117 126 122 128 135 128 146 120 121 MT
29 307 484 517 487 474 449 485 492 512 498 NB
... ... ... ... ... ... ... ... ... ... ... ...
33 80 139 111 113 117 140 119 140 133 152 NH
34 92 117 123 106 110 118 112 110 124 132 NJ
35 341 455 486 487 480 466 532 461 484 486 NL
36 76 112 133 131 119 124 118 101 117 126 NM
37 325 466 478 473 491 480 462 500 446 486 NS
38 351 519 496 492 520 527 471 483 487 525 NT
39 314 487 430 448 452 469 435 475 433 485 NU
40 93 123 118 131 151 110 147 133 115 120 NV
41 101 149 128 128 148 149 127 128 125 145 NY
42 84 114 131 115 118 110 120 106 110 119 OH
43 84 116 107 135 102 124 124 96 111 117 OK
44 323 500 483 438 456 450 479 449 450 454 ON
45 70 102 109 123 110 107 100 119 115 101 OR
46 81 139 134 144 111 121 128 129 125 109 PA
47 333 470 477 466 441 472 479 499 422 498 PE
48 338 482 461 506 518 470 498 525 458 479 QC
49 93 126 130 122 108 121 140 136 108 136 RI
50 82 123 120 124 114 137 126 131 117 113 SC
51 76 129 125 123 131 111 138 111 129 114 SD
52 331 479 487 493 464 457 488 449 473 478 SK
53 88 130 132 119 136 137 136 123 127 144 TN
54 72 122 132 110 113 100 90 118 111 101 TX
55 83 125 140 115 126 122 110 128 133 139 UT
56 81 124 109 112 111 123 108 118 114 120 VA
57 90 158 142 139 126 124 133 150 150 159 VT
58 88 131 111 118 118 97 112 123 122 119 WA
59 80 120 131 124 118 128 114 117 122 134 WI
60 92 137 126 111 112 118 109 124 115 140 WV
61 65 106 132 113 109 140 111 109 127 121 WY
62 333 488 482 477 472 467 485 483 492 476 YT

63 rows × 11 columns


In [17]:
# Heatmap of declared tastes per state; hoist the repeated drop('state').
prefs = df.drop('state', axis=1)
data = Data([Heatmap(z=prefs.values,
                     x=prefs.columns,
                     y=df.state,
                     colorscale='Hot')])
layout = Layout(title="State and Music Tastes", yaxis=YAxis(autotick=False, dtick=1))
py.iplot(Figure(data=data, layout=layout), filename='snowflake/state and music taste heatmap', height=1000)


Out[17]:

In [18]:
layout = Layout(title="Declared User Preference Box Plots",
                yaxis=YAxis())

# One box plot per preference column (every column except `state`).
data = [Box(y=df[pref], name=pref)
        for pref in df.drop('state', axis=1).columns]

py.iplot(Figure(data=data, layout=layout), filename='snowflake/user preference box plots')


Out[18]:

In [19]:
# Sum of event category ids, grouped by category name.
category_sum_sql = """
SELECT 
    sum(event.catid) as category_sum,
    catname as category_name
FROM event, category
where event.catid = category.catid
GROUP BY category.catname
"""
df = pd.read_sql_query(category_sum_sql, engine)

In [20]:
# Bar chart of the category sums computed above.
layout = Layout(title="Event Categories Sum", yaxis=YAxis(title="Sum"))
data = [Bar(x=df.category_name, y=df.category_sum)]
fig = Figure(data=data, layout=layout)
py.iplot(fig)


Out[20]:

In [21]:
# Total tickets sold per calendar day.
daily_sales_sql = """
SELECT 
    sum(sales.qtysold) as quantity_sold,
    date.caldate  
FROM sales, date
WHERE sales.dateid = date.dateid 
GROUP BY date.caldate 
ORDER BY date.caldate asc;
"""
df = pd.read_sql_query(daily_sales_sql, engine)

In [22]:
# Time series of tickets sold per day.
layout = Layout(
    title="Event Sales Per Day",
    yaxis=YAxis(title="Sales Quantity"))
data = [Scatter(x=df.caldate, y=df.quantity_sold)]
fig = Figure(data=data, layout=layout)
py.iplot(fig)


Out[22]:

In [23]:
# Tickets sold per day, broken out by event category.
sales_by_category_sql = """
SELECT 
    sum(sales.qtysold) as quantity_sold,
    date.caldate, category.catname as category_name
FROM sales, date, event, category
WHERE sales.dateid = date.dateid 
AND sales.eventid = event.eventid
AND event.catid = category.catid
GROUP BY date.caldate, category_name
ORDER BY date.caldate asc
"""
df = pd.read_sql_query(sales_by_category_sql, engine)

In [24]:
# One scatter trace per category, each bound to its own subplot axes
# (x1/y1 .. x4/y4 in the 2x2 grid created below).
data = [
    Scatter(name=name,
            x=group.caldate,
            y=group.quantity_sold,
            xaxis='x%d' % (idx + 1),
            yaxis='y%d' % (idx + 1))
    for idx, (name, group) in enumerate(df.groupby("category_name"))
]

fig = tls.make_subplots(rows=2,cols=2)
fig['layout'].update(title="Event Sales Per Day By Category")
fig['data'] += data
py.iplot(fig)


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]

Out[24]:

In [25]:
# Same per-category series as above, but overlaid on a single shared axis.
data = []
for name, g in df.groupby("category_name"):
    data.append(Scatter(
            name=name,
            x=g.caldate,
            y=g.quantity_sold
        ))

fig = Figure()
fig['layout'].update(title="Event Sales Per Day By Category")
fig['data'] += data
# Fix: save under the snowflake/ folder like every other plot in this
# notebook -- the original 'redshift/' prefix was left over from the
# Redshift demo this notebook was ported from.
py.iplot(fig, filename='snowflake/Event Sales Per Day by Category')


Out[25]:

In [ ]: