In [1]:
    
library(dplyr)
library(tidyr)
    
    
In [2]:
    
# Let's start by loading the price data and inspecting its structure
df_price <- read.csv(file = "data/Weed_Price.csv")
str(object = df_price)
    
    
In [3]:
    
# Convert the `date` column to Date class, then re-inspect the structure
df_price[["date"]] <- as.Date(df_price[["date"]])
str(object = df_price)
    
    
In [4]:
    
# Preview the first rows of the price data
head(df_price, n = 6L)
    
    Out[4]:
In [5]:
    
# Number of rows and columns in the price data
c(nrow(df_price), ncol(df_price))
    
    Out[5]:
In [6]:
    
# Check for missing values: keep the rows whose LowQ entry is NA
df_missing <- df_price[is.na(df_price$LowQ), , drop = FALSE]
str(df_missing)
    
    
In [7]:
    
# Summarise by state: mean HighQ value for each State
df_price_high <- summarise(
  group_by(df_price, State),
  HighQ_Mean = mean(HighQ)
)
str(df_price_high)
    
    
In [8]:
    
# Summarise by state: mean of every non-grouping column for each State,
# ignoring missing values.
# across() replaces the deprecated summarise_each()/funs() pair.
df_price_mean <- df_price %>%
  group_by(State) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))
str(df_price_mean)
    
    
In [9]:
    
# Preview the first rows of the per-state means
head(df_price_mean, n = 6L)
    
    Out[9]:
In [10]:
    
# Load the demographic data and inspect its structure
df_demo <- read.csv(file = "data/Demographics_State.csv")
str(object = df_demo)
    
    
In [11]:
    
# Calculate population counts by group: each percentage share of the total
# population, rounded up to a whole number
df_demo <- df_demo %>%
  mutate(
    pop_white = ceiling(percent_white / 100 * total_population),
    pop_black = ceiling(percent_black / 100 * total_population),
    pop_asian = ceiling(percent_asian / 100 * total_population),
    pop_hispanic = ceiling(percent_hispanic / 100 * total_population)
  )
    
In [12]:
    
# Calculate the "other" group as the remainder: the percentage left after the
# four listed groups, and the head count left after their rounded counts
df_demo <- df_demo %>%
  mutate(
    percent_other = 100 - percent_white - percent_black - percent_asian -
      percent_hispanic,
    pop_other = total_population - pop_white - pop_black - pop_asian -
      pop_hispanic
  )
    
In [13]:
    
# Load the state location data and inspect its structure
df_state <- read.csv(file = "data/State_Location.csv")
str(object = df_state)
    
    
In [14]:
    
# Rename the first column of the per-state means to "region" and lower-case
# its values, then join the means with the demographic and state location
# tables on "region"
names(df_price_mean)[1] <- "region"
df_price_mean[["region"]] <- tolower(df_price_mean[["region"]])
df_mid <- merge(x = df_price_mean, y = df_demo, by = "region")
df <- merge(x = df_mid, y = df_state, by = "region")
str(object = df)
    
    
In [15]:
    
# Load the visualisation library
library(ggplot2)
library(scales)
library(ggmap)
    
In [16]:
    
# Preview the first rows of the merged data frame
head(df, n = 6L)
    
    Out[16]:
In [17]:
    
# Distribution - bar chart of total population by state, ordered by population
ggplot(
  df,
  aes(x = reorder(region, total_population), weight = total_population)
) +
  geom_bar(fill = "orange") +
  xlab("region") +
  scale_y_continuous(labels = comma) +
  coord_flip()
    
    
In [18]:
    
# Reshape from wide to long: one row per region and population group, with
# the group name in `pop_frac` and the head count in `value`.
# Columns are picked by name instead of by position (previously 1, 17:20, 22
# and 2:6), so the reshape keeps working if the merged column order changes.
# NOTE(review): assumes positions 17:20 and 22 were the pop_* columns named
# below - confirm against str(df).
df_wide <- df %>%
  select(region, pop_white, pop_black, pop_asian, pop_hispanic, pop_other) %>%
  gather("pop_frac", "value", -region)
str(df_wide)
head(df_wide)
    
    
    Out[18]:
In [19]:
    
# Distribution - stacked bar chart of population by state and population group
ggplot(
  df_wide,
  aes(x = reorder(region, value), weight = value, fill = pop_frac)
) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous(labels = comma)
    
    
In [20]:
    
# Distribution - population by state and population group, with each bar
# scaled to the full width (position = "fill")
ggplot(
  df_wide,
  aes(x = reorder(region, value), weight = value, fill = pop_frac)
) +
  geom_bar(position = "fill") +
  xlab("region") +
  ylab("population") +
  coord_flip()
    
    
In [21]:
    
# Distribution - bar chart of per capita income by state, ordered by income
ggplot(
  df,
  aes(x = reorder(region, per_capita_income), weight = per_capita_income)
) +
  geom_bar(fill = "orange") +
  xlab("region") +
  ylab("per capita income") +
  scale_y_continuous(labels = comma) +
  coord_flip()
    
    
In [22]:
    
# Distribution - bar chart of HighQN by state, ordered by HighQN
ggplot(df, aes(x = reorder(region, HighQN), weight = HighQN)) +
  geom_bar(fill = "orange") +
  xlab("region") +
  ylab("HighQN") +
  scale_y_continuous(labels = comma) +
  coord_flip()
    
    
In [23]:
    
# Bubble map - one point per state at its longitude/latitude, sized by total
# population; axis limits restrict the view to the given lon/lat window
ggplot(df, aes(x = longitude, y = latitude, size = total_population)) +
  geom_point(color = "orange") +
  coord_map() +
  xlim(-130, -60) +
  ylim(20, 50)
    
    
    
In [24]:
    
# Bubble map - one point per state at its longitude/latitude, sized by
# percent_black
ggplot(df, aes(x = longitude, y = latitude, size = percent_black)) +
  geom_point(color = "orange") +
  coord_map() +
  xlim(-130, -60) +
  ylim(20, 50)
    
    
    
In [25]:
    
# Bubble map - one point per state at its longitude/latitude, sized by HighQN
ggplot(df, aes(x = longitude, y = latitude, size = HighQN)) +
  geom_point(color = "orange") +
  coord_map() +
  xlim(-130, -60) +
  ylim(20, 50)
    
    
    
In [26]:
    
# Bubble map - one point per state at its longitude/latitude, sized by
# percent_hispanic
ggplot(df, aes(x = longitude, y = latitude, size = percent_hispanic)) +
  geom_point(color = "orange") +
  coord_map() +
  xlim(-130, -60) +
  ylim(20, 50)
    
    
    
In [27]:
    
# Distribution - draw the "state" map database from the maps package as a
# base geographic outline
library(maps)
map(database = "state")
    
    
In [28]:
    
# Create the dataset for the choropleth maps: attach the merged per-state
# data to the state polygon outlines, joined on "region"
states <- map_data("state")
chloro <- merge(states, df, sort = FALSE, by = "region")
# merge() does not guarantee row order even with sort = FALSE; restore the
# polygon vertex sequence so geom_polygon() connects outline points in order
chloro <- chloro[order(chloro$order), ]
    
In [29]:
    
# Choropleth for total population
ggplot(
  chloro,
  aes(x = long, y = lat, group = group, fill = total_population)
) +
  geom_polygon() +
  coord_map()
    
    
In [30]:
    
# Choropleth for the White population percentage
ggplot(chloro, aes(x = long, y = lat, group = group, fill = percent_white)) +
  geom_polygon() +
  coord_map()
    
    
In [31]:
    
# Choropleth for the Hispanic population percentage
ggplot(
  chloro,
  aes(x = long, y = lat, group = group, fill = percent_hispanic)
) +
  geom_polygon() +
  coord_map()
    
    
In [32]:
    
# Quantity of Weed - High: choropleth filled by HighQN
ggplot(chloro, aes(x = long, y = lat, group = group, fill = HighQN)) +
  geom_polygon() +
  coord_map()
    
    
In [33]:
    
# Price of Weed - High: choropleth filled by HighQ
ggplot(chloro, aes(x = long, y = lat, group = group, fill = HighQ)) +
  geom_polygon() +
  coord_map()
    
    
In [49]:
    
# Subset the price data to the rows where State is "California"
df_california <- filter(df_price, State == "California")
str(object = df_california)
    
    
In [58]:
    
# California over time: HighQ, MedQ and LowQ each plotted as points against
# date on a shared pair of axes
ggplot(df_california) +
  geom_point(mapping = aes(x = date, y = HighQ)) +
  geom_point(mapping = aes(x = date, y = MedQ)) +
  geom_point(mapping = aes(x = date, y = LowQ))
    
    
    
In [43]:
    
# Re-inspect the structure of the full price data
str(object = df_price)
    
    
In [59]:
    
# Scatter - per capita income vs total population.
# `labels` is spelled out in full; the previous `label =` only worked through
# R's partial argument matching.
ggplot(df, aes(x = total_population, y = per_capita_income)) +
  geom_point() +
  scale_x_continuous(labels = comma)
    
    
In [60]:
    
# Bubble - per capita income vs Hispanic population percentage,
# point size = total population.
# NOTE(review): the earlier comment said "White_Population" while the code
# plots percent_hispanic; the comment now follows the code - confirm which
# variable was intended.
# `labels` is spelled out in full; the previous `label =` only worked through
# R's partial argument matching.
ggplot(
  df,
  aes(x = percent_hispanic, y = per_capita_income, size = total_population)
) +
  geom_point() +
  scale_x_continuous(labels = comma)
    
    
In [61]:
    
# Scatter - per capita income vs HighQN.
# `labels` is spelled out in full; the previous `label =` only worked through
# R's partial argument matching.
ggplot(df, aes(x = HighQN, y = per_capita_income)) +
  geom_point() +
  scale_x_continuous(labels = comma)
    
    
In [ ]: