This demo is based on an example in the book: https://www.amazon.com/dp/1449319793/
Python for Data Analysis by William Wesley McKinney (O'Reilly). Copyright 2012 William McKinney, 978-1-449-31979-3
The data are available at: https://github.com/usagov/1.USA.gov-Data
Set the environment variables SF_ACCOUNT, SF_USER and SF_PASSWORD to connect to Snowflake, and SF_S3_BUCKET, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY to load the data from the given location, then start jupyter notebook.
In [1]:
import os
account=os.getenv('SF_ACCOUNT')
user=os.getenv('SF_USER')
password=os.getenv('SF_PASSWORD')
In [2]:
from snowflake.connector import connect
con = connect(
account=account,
user=user,
password=password,
database='testdb',
schema='public',
)
cur = con.cursor()
In [3]:
cur.execute("select current_database(), current_schema()").fetchone()
Out[3]:
In [4]:
cur.execute("create or replace table usagov_bitly(data variant)")
Out[4]:
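A single VARIANT column is enough here: Snowflake stores each JSON record whole, and individual fields are pulled out later with the data:field path syntax.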
In [5]:
aws_key=os.getenv("AWS_ACCESS_KEY_ID")
aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY")
sf_s3_bucket=os.getenv("SF_S3_BUCKET")
Update the source data location to your S3 bucket.
In [6]:
cur.execute("""
copy into usagov_bitly from s3://{s3_bucket}/usagov/data/
credentials=(
aws_key_id='{aws_key_id}'
aws_secret_key='{aws_secret_key}'
)
file_format= (
field_delimiter='none'
type='json'
)
""".format(
s3_bucket=sf_s3_bucket,
aws_key_id=aws_key,
aws_secret_key=aws_secret))
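To confirm the load before querying, a quick row count can be run (a minimal sketch; the count depends on what is in your bucket):
cur.execute("select count(*) from usagov_bitly").fetchone()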
Out[6]:
In [7]:
from snowflake.connector.cursor import DictCursor
cur = con.cursor(DictCursor)
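DictCursor returns each row as a dict keyed by column name. Because unquoted identifiers fold to upper case in Snowflake, the keys come back upper-cased, which is why the aliases below are read as rec['TZ'] rather than rec['tz'].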
In [8]:
cur.execute("""
select
data:a::string as a,
data:al as al,
data:c as c,
data:cy as cy,
data:g as g,
data:gr as gr,
data:h as h,
data:hc::float as hc,
data:hh as hh,
data:l as l,
data:ll as ll,
data:nk::float as nk,
data:r as r,
data:t::float as t,
data:tz::string as tz,
data:u as u
from usagov_bitly as v""")
Out[8]:
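The data:tz::string expression extracts the tz field from the VARIANT column and casts it to a string; fields left uncast (such as data:al) come back as variants, JSON-encoded.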
In [9]:
time_zones = [rec['TZ'] for rec in cur.fetchall() if rec['TZ'] is not None]
In [10]:
len(time_zones)
Out[10]:
In [11]:
time_zones[:10]
Out[11]:
In [12]:
from collections import defaultdict
def get_counts(sequence):
counts = defaultdict(int)
for x in sequence:
counts[x] += 1
return counts
In [13]:
counts = get_counts(time_zones)
In [14]:
counts['America/New_York']
Out[14]:
In [15]:
def top_counts(count_dict, n=10):
value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
value_key_pairs.sort()
return value_key_pairs[-n:]
In [16]:
top_counts(counts)
Out[16]:
In [17]:
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)
Out[17]:
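collections.Counter does in one step what get_counts and top_counts do by hand; most_common(10) yields the same ten zones, just in descending rather than ascending order.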
In [18]:
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL
engine = create_engine(URL(
account=account,
user=user,
password=password,
database='testdb',
schema='public',
))
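Unlike the raw connector connection, the SQLAlchemy engine can be handed straight to pandas.read_sql_query below, which returns the result set as a DataFrame.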
In [19]:
from pandas import DataFrame, Series
import pandas as pd
In [20]:
%matplotlib notebook
In [21]:
df = pd.read_sql_query("""
select
data:a::string as a,
data:al as al,
data:c as c,
data:cy as cy,
data:g as g,
data:gr as gr,
data:h as h,
data:hc::float as hc,
data:hh as hh,
data:l as l,
data:ll as ll,
data:nk::float as nk,
data:r as r,
data:t::float as t,
data:tz::string as tz,
data:u as u
from usagov_bitly as v""", engine)
In [22]:
df.info()
In [23]:
df['tz'][:10]
Out[23]:
In [24]:
tz_counts = df['tz'].value_counts()
In [25]:
tz_counts[:10]
Out[25]:
In [26]:
clean_tz = df['tz'].fillna('Missing')
In [27]:
clean_tz[clean_tz == ''] = 'Unknown'
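The two fixups are distinct: fillna replaces records where tz is missing entirely, while the boolean assignment relabels records that carry a tz field that is empty.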
In [28]:
tz_counts = clean_tz.value_counts()
In [29]:
tz_counts[:10]
Out[29]:
In [30]:
tz_counts[:10].plot(kind='barh', rot=0)
Out[30]:
In [31]:
df['a'][1]
Out[31]:
In [32]:
df['a'][50]
Out[32]:
In [33]:
results = Series([x.split()[0] for x in df.a.dropna()])
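x.split()[0] keeps only the first token of each user-agent string, which is enough to identify the browser or client.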
In [34]:
results[:5]
Out[34]:
In [35]:
results.value_counts()[:8]
Out[35]:
In [36]:
cframe = df[df.a.notnull()]
In [37]:
import numpy as np
In [38]:
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
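np.where acts as a vectorized if/else: rows whose agent string contains 'Windows' are labeled 'Windows', everything else 'Not Windows'.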
In [39]:
operating_system[:5]
Out[39]:
In [40]:
by_tz_os = cframe.groupby(['tz', operating_system])
In [41]:
agg_counts = by_tz_os.size().unstack().fillna(0)
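size() counts rows per (time zone, OS) pair, unstack pivots the OS labels into columns, and fillna(0) fills in the combinations that never occur.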
In [42]:
agg_counts[:10]
Out[42]:
In [43]:
indexer = agg_counts.sum(axis=1).argsort()
In [44]:
indexer[:10]
Out[44]:
In [45]:
count_subset = agg_counts.take(indexer)[-10:]
In [46]:
count_subset
Out[46]:
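argsort returns the positions that would sort the row totals in ascending order, so take(indexer)[-10:] keeps the ten busiest time zones. In recent pandas the same rows can be selected more directly (a sketch, not from the original notebook; note that nlargest orders descending rather than ascending):
count_subset = agg_counts.loc[agg_counts.sum(axis=1).nlargest(10).index]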
In [47]:
count_subset.plot(kind='barh', stacked=True)
Out[47]:
In [49]:
normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
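Dividing each row by its total (axis=0 aligns the divisor Series with the row labels) rescales the counts to shares, so the Windows and Not Windows columns sum to 1 in every row. A quick check:
normed_subset.sum(axis=1)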
In [50]:
normed_subset.plot(kind='barh', stacked=True)
Out[50]: