In [1]:
# Load the libraries
import pandas as pd
import numpy as np
In [2]:
# Load the price data, order it by state and date, then forward-fill the gaps.
# parse_dates=[-1] asks pandas to parse the last column as dates
# (presumably the 'date' column used below - verify against the CSV header).
df = pd.read_csv("data/Weed_Price.csv", parse_dates=[-1])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df = df.sort_values(by=["State", "date"])
# NOTE(review): the fill runs over the whole sorted frame, so a gap at the start
# of one state's rows is filled from the last row of the state before it.
df = df.ffill()
In [3]:
# Read the state-level demographic table
df_demo = pd.read_csv(filepath_or_buffer="data/Demographics_State.csv")
Let's load the libraries required for visual exploration.
In [4]:
# Load the visualisation libraries - Matplotlib and Seaborn
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [5]:
# Notebook-wide plot defaults: ggplot style and 15 x 10 figures
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 10)})
In [6]:
# Derive the calendar year from each date, then keep only California's rows
df["year"] = pd.DatetimeIndex(df["date"]).year
df_cal = df.loc[df["State"] == "California"]
df_cal.head()
Out[6]:
In [7]:
# Line chart of California's HighQ price against the date column
df_cal.plot(y="HighQ", x="date")
Out[7]:
In [8]:
# Use the dates as the row index so the plots pick up their labels automatically
df_cal.index = df_cal["date"]
df_cal.head()
Out[8]:
In [9]:
# Let's plot the HighQ prices
df_cal["HighQ"].plot()
Out[9]:
In [10]:
# Histogram of HighQ to see which prices occur most often
df_cal["HighQ"].plot(kind="hist")
Out[10]:
In [11]:
# Same histogram with 40 bins for a finer-grained view
df_cal["HighQ"].plot(bins=40, kind="hist")
Out[11]:
In [12]:
# Let's plot all three price columns for California on one chart
df_cal.loc[:, ["HighQ", "MedQ", "LowQ"]].plot()
Out[12]:
In [13]:
# Overlaid, half-transparent histograms of the three price columns (50 bins)
df_cal.loc[:, ["HighQ", "MedQ", "LowQ"]].plot(
    kind="hist",
    alpha=0.5,
    bins=50,
)
Out[13]:
In [ ]:
Plot the HighQ, MedQ and LowQ prices for Alaska in 2014
In [ ]:
Plot the histogram of HighQ, MedQ and LowQ prices for Alaska in 2014
In [ ]:
In [14]:
# Summary statistics for the California data, shown before the box plots
# drawn in the following cells
df_cal.describe()
Out[14]:
In [15]:
# Box plot of the three California price columns
df_cal.loc[:, ["HighQ", "MedQ", "LowQ"]].plot(kind="box")
Out[15]:
In [16]:
# Box plot of the matching sample-size columns
df_cal.loc[:, ["HighQN", "MedQN", "LowQN"]].plot(kind="box")
Out[16]:
What if we want to show the price in all the states in the year 2014?
In [17]:
# Keep only the rows whose year is 2014
df_2014 = df.loc[df["year"] == 2014]
df_2014.head()
Out[17]:
In [18]:
# Reshape to one row per date and one column per state, holding the HighQ value
df_states = pd.pivot_table(
    data=df_2014,
    index="date",
    columns="State",
    values="HighQ",
)
df_states.head()
Out[18]:
In [19]:
# Let's plot all of these lines: one line per state column of df_states
df_states.plot()
Out[19]:
In [20]:
# Plot only the columns at positions 1 through 9 to keep the chart readable
df_states.iloc[:,1:10].plot()
Out[20]:
In [21]:
# What if we group by State and plot
# df_2014.groupby("State").plot(x = "date", y = "HighQ")
In [22]:
# Small multiples: one HighQ-over-date panel per state, seven panels per row
grid = sns.FacetGrid(data=df_2014, col="State", col_wrap=7)
grid.map(plt.plot, "date", "HighQ")
Out[22]:
In [23]:
# Preview the first rows of the demographic data
df_demo.head()
Out[23]:
In [24]:
# Index the demographic data by region so plots are labelled with region names
df_demo.index = df_demo["region"]
df_demo.head()
Out[24]:
In [25]:
# Counter-example: DO NOT make pie charts, especially with more than 6 categories
df_demo["total_population"].plot(kind="pie")
Out[25]:
In [ ]:
In [26]:
# The same numbers as a simple bar chart
df_demo["total_population"].plot(kind="bar")
Out[26]:
In [27]:
# Order the rows by total population (ascending) so the next bar chart is ranked.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df_demo = df_demo.sort_values(by="total_population", ascending=True)
df_demo.head()
Out[27]:
In [28]:
# Plot again, this time as horizontal bars
df_demo["total_population"].plot(kind="barh")
Out[28]:
In [29]:
# Let's select the percent-population columns by position (columns 2 to 5).
# NOTE(review): presumably these are the percent_* columns - confirm the
# column order of the demographics CSV.
df_demo.iloc[:,2:6].head()
Out[29]:
In [30]:
# Let's plot the percentage-population columns (positions 2 to 5) as
# horizontal bars, one group of bars per row of df_demo
df_demo.iloc[:,2:6].plot(kind = "barh")
Out[30]:
In [31]:
# Let's stack the same percentage columns into a single bar per row
df_demo.iloc[:,2:6].plot(kind = "barh", stacked = True)
Out[31]:
In [32]:
# Let's plot the per-capita income, ranked from lowest to highest.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df_demo.sort_values(by="per_capita_income", ascending=True).per_capita_income.plot(kind="barh")
Out[32]:
In [33]:
# Scatter plot: per_capita_income on x, percent_white on y, marker size 100
df_demo.plot(
    x="per_capita_income",
    y="percent_white",
    kind="scatter",
    s=100,
)
Out[33]:
In [ ]:
Is there a relationship between median_age and the percentage of Asians?
In [ ]:
In [34]:
# Read the latitude and longitude recorded for each state
df_geo = pd.read_csv(filepath_or_buffer="data/State_Location.csv")
df_geo.head()
Out[34]:
In [35]:
# Draw every state as a point at its longitude / latitude
df_geo.plot(x="longitude", y="latitude", kind="scatter")
Out[35]:
In [36]:
# Same points, drawn in red with a larger marker
df_geo.plot(
    x="longitude",
    y="latitude",
    kind="scatter",
    s=100,
    c="red",
)
Out[36]:
In [37]:
# Restrict both axes to zoom in on the mainland USA
df_geo.plot(
    x="longitude",
    y="latitude",
    kind="scatter",
    s=100,
    c="red",
    ylim=(20, 60),
    xlim=(-140, -60),
)
Out[37]:
In [38]:
# Join the demographics with the state coordinates so data can be drawn on the map.
# df_demo carries 'region' both as a column and as its index name (set when it
# was indexed by region), and current pandas rejects such a merge key as
# ambiguous. The left index is not kept by the merge, so drop it first.
df_demo_geo = pd.merge(df_demo.reset_index(drop=True), df_geo, on="region")
df_demo_geo.index = df_demo_geo.region
df_demo_geo.head()
Out[38]:
In [39]:
# Index by region, then preview the columns at positions 10 and 11
df_demo_geo.index = df_demo_geo["region"]
df_demo_geo.iloc[:, 10:12].head()
Out[39]:
In [40]:
# Scatter plot of the states, coloured by per_capita_income on the Oranges colormap
df_demo_geo.plot(
    x="longitude",
    y="latitude",
    kind="scatter",
    s=400,
    c="per_capita_income",
    colormap="Oranges",
    ylim=(20, 60),
    xlim=(-140, -60),
)
Out[40]:
In [ ]:
In [41]:
from IPython.display import HTML
import folium
# Temporary code to embed in Jupyter Notebook - will be fixed in version 0.16
def display(m, height=500):
    """Take a folium map instance and embed its HTML in an iframe.

    Parameters
    ----------
    m : folium map
        Object providing ``_build_map()`` and, after that call, an ``HTML``
        attribute holding the page markup as a string.
    height : int, optional
        Height of the iframe in pixels (default 500).

    Returns
    -------
    IPython.display.HTML
        An iframe whose ``srcdoc`` attribute carries the map page.
    """
    m._build_map()
    # The page is placed inside a double-quoted srcdoc attribute, so it must be
    # entity-escaped. Ampersands go first so the escapes added for the quotes
    # are not escaped a second time. Without this the first '"' in the page
    # would end the attribute.
    srcdoc = m.HTML.replace('&', '&amp;').replace('"', '&quot;')
    embed = HTML('<iframe srcdoc="{0}" '
                 'style="width: 100%; height: {1}px; '
                 'border: none"></iframe>'.format(srcdoc, height))
    return embed
In [42]:
# Base map at location [48, -102]. The variable is not called `map`, because
# that would shadow Python's builtin map() for the rest of the kernel session.
base_map = folium.Map(location=[48, -102], zoom_start=4)
display(base_map)
Out[42]:
In [43]:
# Path of the GeoJSON file that the map cells below pass to folium
state_geo = "data/us-states.json"
In [82]:
# Plot the GeoJSON as an overlay on the map. The variable is not called `map`,
# because that would shadow Python's builtin map().
states_map = folium.Map(location=[48, -102], zoom_start=3)
states_map.geo_json(geo_path=state_geo)
display(states_map)
Out[82]:
In [83]:
# Let's bind some data to this: order the rows by region first.
# 'region' is both a column and the name of the index here, and current pandas
# rejects such a sort key as ambiguous, so clear the index name before sorting.
df_demo_geo.index.name = None
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df_demo_geo = df_demo_geo.sort_values(by="region")
df_demo_geo.head()
Out[83]:
In [89]:
# Choropleth: shade the state shapes using the total_population column
# NOTE(review): assumes df_demo_geo has a 'state' column whose values match
# 'feature.id' in the GeoJSON - confirm against both files.
mapa = folium.Map(zoom_start=3, location=[48, -102])
mapa.geo_json(
    geo_path=state_geo,
    data=df_demo_geo,
    columns=['state', 'total_population'],
    key_on='feature.id',
    fill_color='BuPu',
    reset=True,
)
# We will not use the inline version due to a bug in 0.15.0
mapa.create_map()
# Go to http://localhost:8888/files/map.html
In [ ]: