In [1]:
library(dplyr)
library(tidyr)
In [2]:
# Let's start by loading the price data and inspecting its structure
df_price <- read.csv("data/Weed_Price.csv")
str(df_price)
In [3]:
# Convert the date column to the Date class, then re-inspect the structure
df_price[["date"]] <- as.Date(df_price[["date"]])
str(object = df_price)
In [4]:
# Preview the first six rows of the price data
head(df_price, n = 6L)
Out[4]:
In [5]:
# Number of rows and columns in the price data
c(nrow(df_price), ncol(df_price))
Out[5]:
In [6]:
# Collect the rows whose LowQ value is missing and inspect them
df_missing <- df_price[is.na(df_price$LowQ), , drop = FALSE]
str(df_missing)
In [7]:
# Summarise the price data per state: mean of HighQ for each State
df_price_high <- summarise(
  group_by(df_price, State),
  HighQ_Mean = mean(HighQ)
)
str(df_price_high)
In [8]:
# Summarise the price data per state: mean of every non-grouping column
# (including date), ignoring missing values.
# across() replaces the deprecated summarise_each(funs(...)) idiom and keeps
# the same output columns in the same order.
df_price_mean <- df_price %>%
  group_by(State) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))
str(df_price_mean)
In [9]:
# Preview the first six rows of the per-state means
head(df_price_mean, n = 6L)
Out[9]:
In [10]:
# Read the state demographics file and inspect its structure
df_demo <- read.csv(file = "data/Demographics_State.csv")
str(object = df_demo)
In [11]:
# Derive head counts per population group from each group's percentage of
# the total population, rounding up to whole persons
df_demo$pop_white <- with(
  df_demo, ceiling(percent_white / 100 * total_population)
)
df_demo$pop_black <- with(
  df_demo, ceiling(percent_black / 100 * total_population)
)
df_demo$pop_asian <- with(
  df_demo, ceiling(percent_asian / 100 * total_population)
)
df_demo$pop_hispanic <- with(
  df_demo, ceiling(percent_hispanic / 100 * total_population)
)
In [12]:
# "Other" is whatever remains after the four listed groups are subtracted:
# once as a percentage of 100, once as a head count of the total population
df_demo$percent_other <- with(
  df_demo,
  100 - percent_white - percent_black - percent_asian - percent_hispanic
)
df_demo$pop_other <- with(
  df_demo,
  total_population - pop_white - pop_black - pop_asian - pop_hispanic
)
In [13]:
# Read the state location file and inspect its structure
df_state <- read.csv(file = "data/State_Location.csv")
str(object = df_state)
In [14]:
# Rename the first column of the per-state means to "region" and lower-case
# its values so it can be used as the merge key, then merge the price means
# with the demographics and the state locations (rows matched on region)
names(df_price_mean)[1] <- "region"
df_price_mean[["region"]] <- tolower(df_price_mean[["region"]])
df_mid <- merge(x = df_price_mean, y = df_demo, by = "region")
df <- merge(x = df_mid, y = df_state, by = "region")
str(df)
In [15]:
# Load the visualisation library
library(ggplot2)
library(scales)
library(ggmap)
In [16]:
# Preview the first six rows of the merged data frame
head(df, n = 6L)
Out[16]:
In [17]:
# Distribution - horizontal bar chart of total population per region,
# with regions ordered by total population
ggplot(
  df,
  aes(x = reorder(region, total_population), weight = total_population)
) +
  geom_bar(fill = "orange") +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  xlab("region")
In [18]:
# Reshape the per-group population counts from wide to long format: one row
# per (region, population group) with the group name in pop_frac and the
# head count in value.
# Columns are picked by name instead of by position so the reshape does not
# silently grab the wrong columns if the merged data frame changes shape.
# The result keeps the name df_wide because later cells refer to it.
df_wide <- df %>%
  select(region, pop_white, pop_black, pop_asian, pop_hispanic, pop_other) %>%
  gather("pop_frac", "value", -region)
str(df_wide)
head(df_wide)
Out[18]:
In [19]:
# Distribution - stacked horizontal bars of population per region,
# coloured by population group
ggplot(
  df_wide,
  aes(x = reorder(region, value), weight = value, fill = pop_frac)
) +
  geom_bar() +
  scale_y_continuous(labels = comma) +
  coord_flip()
In [20]:
# Distribution - same bars as above but normalised to full width
# (position = "fill"), showing each group's share within a region
ggplot(
  df_wide,
  aes(x = reorder(region, value), weight = value, fill = pop_frac)
) +
  geom_bar(position = "fill") +
  coord_flip() +
  xlab("region") +
  ylab("population")
In [21]:
# Distribution - horizontal bar chart of per capita income per region,
# with regions ordered by income
ggplot(
  df,
  aes(x = reorder(region, per_capita_income), weight = per_capita_income)
) +
  geom_bar(fill = "orange") +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  xlab("region") +
  ylab("per capita income")
In [22]:
# Distribution - horizontal bar chart of HighQN per region,
# with regions ordered by HighQN
ggplot(df, aes(x = reorder(region, HighQN), weight = HighQN)) +
  geom_bar(fill = "orange") +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  xlab("region") +
  ylab("HighQN")
In [23]:
# Bubble map: one point per row of df at its longitude/latitude, sized by
# total population; axes limited to longitude -130..-60, latitude 20..50
ggplot(df, aes(x = longitude, y = latitude, size = total_population)) +
  geom_point(colour = "orange") +
  xlim(-130, -60) +
  ylim(20, 50) +
  coord_map()
In [24]:
# Bubble map: one point per row of df at its longitude/latitude, sized by
# percent_black; axes limited to longitude -130..-60, latitude 20..50
ggplot(df, aes(x = longitude, y = latitude, size = percent_black)) +
  geom_point(colour = "orange") +
  xlim(-130, -60) +
  ylim(20, 50) +
  coord_map()
In [25]:
# Bubble map: one point per row of df at its longitude/latitude, sized by
# HighQN; axes limited to longitude -130..-60, latitude 20..50
ggplot(df, aes(x = longitude, y = latitude, size = HighQN)) +
  geom_point(colour = "orange") +
  xlim(-130, -60) +
  ylim(20, 50) +
  coord_map()
In [26]:
# Bubble map: one point per row of df at its longitude/latitude, sized by
# percent_hispanic; axes limited to longitude -130..-60, latitude 20..50
ggplot(df, aes(x = longitude, y = latitude, size = percent_hispanic)) +
  geom_point(colour = "orange") +
  xlim(-130, -60) +
  ylim(20, 50) +
  coord_map()
In [27]:
# Distribution - draw the "state" outline map from the maps package
library(maps)
map(database = "state")
In [28]:
# Create the dataset for the choropleth maps: state polygon outlines joined
# with the per-region data on the shared "region" column
states <- map_data("state")
chloro <- merge(states, df, sort = FALSE, by = "region")
# merge() does not guarantee that rows keep their original sequence, and
# geom_polygon() connects vertices in row order, so restore the vertex order
# recorded in the `order` column to avoid scrambled polygons
chloro <- chloro[order(chloro$order), ]
In [29]:
# Choropleth - state polygons shaded by total population
ggplot(
  chloro,
  aes(x = long, y = lat, group = group, fill = total_population)
) +
  coord_map() +
  geom_polygon()
In [30]:
# Choropleth - state polygons shaded by percent_white
ggplot(
  chloro,
  aes(x = long, y = lat, group = group, fill = percent_white)
) +
  coord_map() +
  geom_polygon()
In [31]:
# Choropleth - state polygons shaded by percent_hispanic
ggplot(
  chloro,
  aes(x = long, y = lat, group = group, fill = percent_hispanic)
) +
  coord_map() +
  geom_polygon()
In [32]:
# Choropleth - state polygons shaded by HighQN (high-quality quantity column)
ggplot(chloro, aes(x = long, y = lat, group = group, fill = HighQN)) +
  coord_map() +
  geom_polygon()
In [33]:
# Choropleth - state polygons shaded by HighQ (high-quality price column)
ggplot(chloro, aes(x = long, y = lat, group = group, fill = HighQ)) +
  coord_map() +
  geom_polygon()
In [49]:
# Keep only the rows of the price data where State is "California"
df_california <- filter(df_price, State == "California")
str(object = df_california)
In [58]:
# California over time: HighQ, MedQ and LowQ drawn as three point layers
# against date
ggplot(data = df_california) +
  geom_point(mapping = aes(x = date, y = HighQ)) +
  geom_point(mapping = aes(x = date, y = MedQ)) +
  geom_point(mapping = aes(x = date, y = LowQ))
In [43]:
# Re-inspect the structure of the price data
str(object = df_price)
In [59]:
# Scatter - per capita income vs total population.
# The argument is spelled out as `labels`; the abbreviated `label` only works
# through partial argument matching.
ggplot(df) + aes(total_population, per_capita_income) +
  geom_point() + scale_x_continuous(labels = comma)
In [60]:
# Bubble - per capita income vs Hispanic percentage, size = total population.
# The argument is spelled out as `labels`; the abbreviated `label` only works
# through partial argument matching.
ggplot(df) + aes(percent_hispanic, per_capita_income, size = total_population) +
  geom_point() + scale_x_continuous(labels = comma)
In [61]:
# Scatter - per capita income vs HighQN.
# The argument is spelled out as `labels`; the abbreviated `label` only works
# through partial argument matching.
ggplot(df) + aes(HighQN, per_capita_income) +
  geom_point() + scale_x_continuous(labels = comma)
In [ ]: