Peter Torosian
Data Bootcamp MBA
5/12/16
Author: Peter Torosian
I was interested in investigating the increase of craft beer production and brewery openings over the past decade. In particular, I wanted to visualize clusters of breweries and where the growth of craft brewing has been over the past few years.
The final graph shows surprising growth in states like Pennsylvania, Wisconsin and Florida. It is also interesting to see the states that produce the most craft beer per resident.
Some open source brewery data is found on the https://openbeerdb.com/ website.
The dataframes that I have uploaded for my project include: production data for all microbreweries in the US, provided by Brewers Association and an open source directory of microbreweries that include their location data.
In [295]:
# import packages
import pandas as pd # data management
import matplotlib.pyplot as plt # graphics
import numpy as np # foundation for Pandas
import seaborn.apionly as sns # fancy matplotlib graphics (no styling)
import requests, io # internet and input tools
import zipfile as zf # zip file tools
from geopy.geocoders import Nominatim #import geopy for getting geocodes
geolocator = Nominatim() #first must install geopy using "pip install geopy"
# plotly imports
from plotly.offline import iplot, iplot_mpl # plotting functions
import plotly.graph_objs as go # plotting functions
import plotly # just to print version and init notebook
import cufflinks as cf # gives us df.iplot that feels like df.plot
cf.set_config_file(offline=True, offline_show_link=False)
# these lines make our graphics show up in the notebook
%matplotlib inline
plotly.offline.init_notebook_mode()
# check Python version
import datetime as dt
import sys
print('Today is', dt.date.today())
print('What version of Python are we running? \n', sys.version, sep='')
print('Plotly version: ', plotly.__version__)
print('Pandas version: ', pd.__version__)
print('Peter Torosian')
In [296]:
# get "response" from url
url = 'http://openbeerdb.com/files/openbeerdb_csv.zip'
r = requests.get(url)
# describe response
print('Response status code:', r.status_code)
print('Response type:', type(r))
print('Response .content:', type(r.content))
print('Response headers:\n', r.headers, sep='')
In [297]:
# convert bytes to zip file
br = zf.ZipFile(io.BytesIO(r.content))
print('Type of zipfile object:', type(br))
In [298]:
# what's in the zip file?
br.namelist()
Out[298]:
In [299]:
# set csv files imported to dataframes
breweries = pd.read_csv(br.open(br.namelist()[1])) #this file is a list of breweries and addresses, etc
geocode = pd.read_csv(br.open(br.namelist()[2])) #this file is only brewery IDs and Lat/Long positions
In [300]:
# rename id column to brewery id so it matches with geocode file
breweries = breweries.rename(columns={'id': 'brewery_id'})
In [301]:
# merge the two dataframes
geobrew = pd.merge(breweries, geocode,
how = 'left',
on='brewery_id')
In [302]:
#reduce dataframe to only breweries in the United States
USA = geobrew['country'] == 'United States'
geobrewUSA = geobrew[USA]
#clean up bad data: bad data found during cleanup later in the notebook
geobrewUSA = geobrewUSA[geobrewUSA.brewery_id != 353]
geobrewUSA = geobrewUSA[geobrewUSA.brewery_id != 1184]
In [303]:
#find entries with missing latitude/longitude information
missing = geobrewUSA['latitude'].isnull()
missingGEO = geobrewUSA[missing]
missingGEO
Out[303]:
In [304]:
#the Lat/Long lookup algorithm incorrectly handles Aspen, Colorado. Colorado was replaced by CO
missingGEO['state'] = missingGEO['state'].replace(['Colorado'], 'CO')
In [305]:
#combine city and state for find lat/long for the missing rows later on
missingGEO['city_state'] = missingGEO['city'] + ' ' + missingGEO['state']
missingGEO.head(2)
Out[305]:
In [306]:
# make an array of cities/states that need lat/long
spot = []
spot = missingGEO.iloc[:,17]
spot
Out[306]:
In [307]:
# keep track of index values for corresponding cities for recombining later on
join = []
join = missingGEO.index.values
idIndex = pd.DataFrame(join, columns =['index'])
In [308]:
# test code to see if geolocator algorithm works for a particular city/state
location = geolocator.geocode('belmont CA')
print(location.address)
print((location.latitude, location.longitude))
In [309]:
# loop through the list to get both latitude and longitude
locations = []
for row in spot:
location = geolocator.geocode([row])
locations.append({'latitude' : location.latitude,'longitude' : location.longitude})
coord = pd.DataFrame(locations, columns=['latitude', 'longitude'])
coord = pd.concat([coord, idIndex], axis = 1)
coord.head(3)
Out[309]:
In [310]:
# set index to match the previous index from the original USA brewery list
coord = coord.set_index('index')
In [311]:
# drop the columns of missing data so that new lat/long data can be merged in
missingGEO = missingGEO.drop(missingGEO.columns[[14,15,16,17]], axis =1)
missingGEO.head(2)
Out[311]:
In [312]:
# concat in new lat/long data
FixedmissingGEO = pd.concat([missingGEO, coord], axis = 1)
In [313]:
# get rid of any remaining breweries with NaN for lat/long. these were determined to not be relevant
notmissing = geobrewUSA['latitude'].notnull()
geobrewUSAfixed = geobrewUSA[notmissing]
In [314]:
# make a new geobrewUSA dataframe that is complete with every brewery having latitude and longitude
geobrewUSA = pd.concat([geobrewUSAfixed, FixedmissingGEO], axis = 0)
In [315]:
#drop the last redundant row
geobrewUSA = geobrewUSA[:-1]
In [316]:
#Find stray data point in Europe. This was corrected by changing "Colorado" to "CO"
euro = geobrewUSA['longitude'] > 0
europoint = geobrewUSA[euro]
europoint
Out[316]:
In [317]:
#Find stray data point in USVI, should not be included
USVI = geobrewUSA['latitude'] < 20
straypoint = geobrewUSA[USVI]
straypoint
Out[317]:
In [318]:
# remove data that is not needed from dataframe
geobrewUSA = geobrewUSA.drop(geobrewUSA.columns[[0,1,2,4,5,6,7,8,9,10,14,15,16]], axis =1)
In [319]:
# create marker for map
marker = {"color": "crimson",
"size": 5,
"colorscale": "Red"}
In [335]:
# create layout for map
layout = dict(title = '2008 US Breweries<br>(hover over point for brewery name)', #title
showlegend = False, #remove legend
geo = dict(
scope='usa',
projection=dict( type='albers usa' ), #use built in USA map
showland = True,
landcolor = 'rgb(217, 217, 217)', #set map/border colors annd sizes
subunitwidth=1,
countrywidth=1,
subunitcolor="rgb(255, 255, 255)",
countrycolor="rgb(255, 255, 255)"),
width=1050, height=750)
In [336]:
# using lat/lon mode for map
trace = dict(type="scattergeo", # trace type
mode="markers", # draw dots
lat=geobrewUSA["latitude"], # latitude coordinate
lon=geobrewUSA["longitude"], # longitude coordinate
text = geobrewUSA['name'], # what shows up with hovering over data point
marker=marker # marker settings (color, size...)
)
In [338]:
# plot all the breweries in the US from data
iplot(go.Figure(data=[trace], layout=layout), link_text="")
In [323]:
# import full list of brewery production data: file name = Brewers_Association_Data_2014.xls
localURL = 'http://localhost:8888/files/Documents/NYU%20Stern/Spring%202016/Data%20Boot%20Camp/Brewers_Association_Data_2014.xls'
BAdata = pd.read_excel(localURL)
BAdata = BAdata.rename(columns={'Craft Brewer Name': 'name'}) #rename 'name' column to match other dataframes
BAdata.head(2)
Out[323]:
In [324]:
# import data on craft beer production by state: file name = Brewers_Association_Data_2014_States.xls
localURL = 'http://localhost:8888/files/Documents/NYU%20Stern/Spring%202016/Data%20Boot%20Camp/Brewers_Association_Data_2014_States.xls'
BAdata_states = pd.read_excel(localURL)
In [325]:
# clean up data by state for plotting: sort by production, drop rows without data, set index for labeling
BAdata_states.drop(BAdata_states.index[52:57], inplace=True)
BAdata_states = BAdata_states.sort_values(by=['Gallons Produced /21+ Adult'], ascending=[True])
BAdata_states = BAdata_states[BAdata_states.State != 'U.S. Territories']
BAdata_states = BAdata_states.set_index('State')
BAdata_states.head(2)
Out[325]:
In [326]:
# plot beer produced with labels and titles
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots()
BAdata_states['Gallons Produced /21+ Adult'].plot(ax=ax, kind='barh', alpha=0.5, figsize=(6,13))
ax.set_title('Beer Produced per Capita', loc='left', fontsize=16)
ax.set_xlabel('Beer Gallons per 21+ Adult')
ax.set_ylabel('')
Out[326]:
In [339]:
# rearrange production data for plotting total production information
BAdata_states = BAdata_states.sort_values(by=['2014 Craft Barrels'], ascending=[False])
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots()
BAdata_states['2014 Craft Barrels'].plot(ax=ax, kind='bar', alpha=0.5, figsize=(13,5),color='m') #2014 data
BAdata_states['2013 Craft Barrels'].plot(ax=ax, kind='bar', alpha=1, figsize=(13,5)) #2013 data
ax.set_title('Total Craft Beer Produced by State', loc='center', fontsize=20) #set title
ax.set_xlabel('') #x-axis label
ax.set_ylabel('Beer Gallons per Year') #y-axis label
ax.legend(['2014 Craft Barrels', '2013 Craft Barrels']) #legend
Out[339]:
In [ ]: