In [1]:
# Load the libraries
import pandas as pd
import numpy as np
In [2]:
# Load the price data, order it by state and date, and forward-fill the gaps.
# `DataFrame.sort(columns=...)` was removed in pandas 0.20 and
# `fillna(method="ffill")` is deprecated since pandas 2.1, so this uses
# `sort_values` / `ffill`. Chaining instead of `inplace=True` keeps the cell
# idempotent when it is re-run.
# The date column is addressed by name rather than by position -1
# (assumes 'date' is the last column of the CSV, as the original implied - TODO confirm).
# NOTE(review): the forward fill runs over the whole sorted frame, so a gap in
# the first rows of one state is filled from the last row of the previous state.
df = (
    pd.read_csv("data/Weed_Price.csv", parse_dates=["date"])
    .sort_values(by=["State", "date"])
    .ffill()
)
In [3]:
# Read the state-level demographic table into its own frame
df_demo = pd.read_csv(filepath_or_buffer="data/Demographics_State.csv")
Let's load the libraries required for visual exploration.
In [4]:
# Load the visualisation libraries - Matplotlib and Seaborn
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [5]:
# Plot defaults for the rest of the notebook: ggplot theme, 15 x 10 figures
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 10)})
In [6]:
# Add the calendar year of each observation, then keep only the California rows
df['year'] = pd.to_datetime(df['date']).dt.year
df_cal = df.loc[df["State"] == "California"]
df_cal.head()
Out[6]:
In [7]:
# Line chart of the California HighQ price against the date column
df_cal.plot.line(x="date", y="HighQ")
Out[7]:
In [8]:
# Index the California frame by date (keeping the column as well) so that
# later plots pick up the date labels on the x axis automatically
df_cal = df_cal.set_index("date", drop=False)
df_cal.head()
Out[8]:
In [9]:
# HighQ price over time; the x axis comes from the frame's index
df_cal["HighQ"].plot.line()
Out[9]:
In [10]:
# Histogram of HighQ to show which price levels occur most often
df_cal["HighQ"].plot.hist()
Out[10]:
In [11]:
# Same histogram with 40 bins for a finer-grained view
df_cal["HighQ"].plot.hist(bins=40)
Out[11]:
In [12]:
# All three California price series on one set of axes
df_cal.loc[:, ["HighQ", "MedQ", "LowQ"]].plot.line()
Out[12]:
In [13]:
# Overlaid histograms of the three price series; alpha keeps the overlap visible
df_cal.loc[:, ["HighQ", "MedQ", "LowQ"]].plot.hist(bins=50, alpha=0.5)
Out[13]:
In [ ]:
Plot the HighQ, MedQ and LowQ prices for Alaska in 2014
In [ ]:
Plot the histogram of HighQ, MedQ and LowQ prices for Alaska in 2014
In [ ]:
In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the California
# frame; the quartiles listed are the ones the box plots in the next cells draw
df_cal.describe(percentiles=[0.25, 0.5, 0.75])
Out[14]:
In [15]:
# Box plot of the three California price series
df_cal.loc[:, ["HighQ", "MedQ", "LowQ"]].plot.box()
Out[15]:
In [16]:
# Box plot of the sample-size columns that accompany each price series
df_cal.loc[:, ["HighQN", "MedQN", "LowQN"]].plot.box()
Out[16]:
What if we want to show the price in all the states in the year 2014?
In [17]:
# Keep only the observations whose year is 2014 (all states)
df_2014 = df.loc[df["year"].eq(2014)]
df_2014.head()
Out[17]:
In [18]:
# Reshape to wide form: one row per date, one column per state, HighQ as the value
df_states = df_2014.pivot_table(values="HighQ", index="date", columns="State")
df_states.head()
Out[18]:
In [19]:
# One line per state column, all on a single set of axes
df_states.plot.line()
Out[19]:
In [20]:
# Plot only the state columns at positions 1 through 9 to reduce the clutter
df_states.iloc[:, slice(1, 10)].plot.line()
Out[20]:
In [21]:
# What if we group by State and plot
# df_2014.groupby("State").plot(x = "date", y = "HighQ")
In [22]:
# Small multiples: one panel per state, wrapping to a new row after 7 panels
grid = sns.FacetGrid(data=df_2014, col="State", col_wrap=7)
grid.map(plt.plot, "date", "HighQ")
Out[22]:
In [23]:
# Peek at the first five rows of the demographic table
df_demo.head(n=5)
Out[23]:
In [24]:
# Index the demographic table by region (keeping the column as well) so that
# plots are labelled with region names
df_demo = df_demo.set_index("region", drop=False)
df_demo.head()
Out[24]:
In [25]:
# Counter-example: pie charts are a poor choice, especially once there are
# more than 6 categories - shown here only to illustrate why
df_demo["total_population"].plot.pie()
Out[25]: