Note: This notebook is using the R Kernel (and not the Python Kernel)

Load the Data


In [1]:
library(dplyr)
library(tidyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


In [2]:
# Lets start by loading in the data
df_price = read.csv("data/Weed_Price.csv")
str(df_price)


'data.frame':	22899 obs. of  8 variables:
 $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ HighQ : num  339 289 303 362 249 ...
 $ HighQN: int  1042 252 1941 576 12096 2161 1294 347 433 6506 ...
 $ MedQ  : num  199 261 209 186 194 ...
 $ MedQN : int  933 297 1625 544 12812 1728 1316 273 349 5237 ...
 $ LowQ  : num  149 389 189 126 193 ...
 $ LowQN : int  123 26 222 112 778 128 91 34 39 514 ...
 $ date  : Factor w/ 449 levels "2013-12-27","2013-12-28",..: 6 6 6 6 6 6 6 6 6 6 ...

In [3]:
# Fix the data format in the data
df_price$date <- as.Date(df_price$date)
str(df_price)


'data.frame':	22899 obs. of  8 variables:
 $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ HighQ : num  339 289 303 362 249 ...
 $ HighQN: int  1042 252 1941 576 12096 2161 1294 347 433 6506 ...
 $ MedQ  : num  199 261 209 186 194 ...
 $ MedQN : int  933 297 1625 544 12812 1728 1316 273 349 5237 ...
 $ LowQ  : num  149 389 189 126 193 ...
 $ LowQN : int  123 26 222 112 778 128 91 34 39 514 ...
 $ date  : Date, format: "2014-01-01" "2014-01-01" ...

In [4]:
head(df_price)


Out[4]:
StateHighQHighQNMedQMedQNLowQLowQNdate
1Alabama339.061042198.64933149.491232014-01-01
2Alaska288.75252260.6297388.58262014-01-01
3Arizona303.311941209.351625189.452222014-01-01
4Arkansas361.85576185.62544125.871122014-01-01
5California248.7812096193.5612812192.927782014-01-01
6Colorado236.312161195.291728213.51282014-01-01

In [5]:
dim(df_price)


Out[5]:
  1. 22899
  2. 8

In [6]:
# Check for missing value
df_missing <- subset(df_price, is.na(df_price$LowQ))
str(df_missing)


'data.frame':	10557 obs. of  8 variables:
 $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ HighQ : num  338 303 299 342 244 ...
 $ HighQN: int  1539 350 2638 846 16512 3148 1835 490 657 9369 ...
 $ MedQ  : num  208 270 209 207 189 ...
 $ MedQN : int  1463 475 2426 836 19151 2877 2069 431 578 8166 ...
 $ LowQ  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ LowQN : int  182 37 306 145 1096 178 126 41 49 695 ...
 $ date  : Date, format: "2015-01-01" "2015-01-01" ...

In [7]:
# Create the new dataframe for summary by years
df_price_high <- df_price %>%
                group_by(State) %>%
                summarise(HighQ_Mean = mean(HighQ))
str(df_price_high)


Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	51 obs. of  2 variables:
 $ State     : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ HighQ_Mean: num  340 291 301 348 245 ...
 - attr(*, "drop")= logi TRUE

In [8]:
# Create the new dataframe for summary by years
df_price_mean <- df_price %>%
                group_by(State) %>%
                summarise_each(funs(mean(., na.rm = TRUE)))
str(df_price_mean)


Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	51 obs. of  8 variables:
 $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ HighQ : num  340 291 301 348 245 ...
 $ HighQN: num  1379 321 2392 752 14947 ...
 $ MedQ  : num  205 262 209 190 191 ...
 $ MedQN : num  1270 408 2137 725 16770 ...
 $ LowQ  : num  147 387 191 127 191 ...
 $ LowQN : num  161.1 32.3 279 135.9 976.3 ...
 $ date  : Date, format: "2014-08-14" "2014-08-14" ...
 - attr(*, "drop")= logi TRUE

In [9]:
head(df_price_mean)


Out[9]:
StateHighQHighQNMedQMedQNLowQLowQNdate
1Alabama339.56181379.414204.60621270.352146.8326161.14922014-08-14
2Alaska291.482321.245262.0464407.9176387.232732.334082014-08-14
3Arizona300.66752392.465209.36532137.414190.8269279.00672014-08-14
4Arkansas348.0561751.9889190.4147724.6837127.3455135.9022014-08-14
5California245.376114947.07191.268916769.82190.796976.29842014-08-14
6Colorado238.91872816.218196.53252457.512226.7906165.34972014-08-14

In [10]:
# Load the demographic data
df_demo <- read.csv("data/Demographics_State.csv")
str(df_demo)


'data.frame':	51 obs. of  9 variables:
 $ region           : Factor w/ 51 levels "alabama","alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ total_population : int  4799277 720316 6479703 2933369 37659181 5119329 3583561 908446 619371 19091156 ...
 $ percent_white    : int  67 63 57 74 40 70 70 65 35 57 ...
 $ percent_black    : int  26 3 4 15 6 4 9 21 49 15 ...
 $ percent_asian    : int  1 5 3 1 13 3 4 3 3 2 ...
 $ percent_hispanic : int  4 6 30 7 38 21 14 8 10 23 ...
 $ per_capita_income: int  23680 32651 25358 22170 29527 31109 37892 29819 45290 26236 ...
 $ median_rent      : int  501 978 747 480 1119 825 880 828 1154 838 ...
 $ median_age       : num  38.1 33.6 36.3 37.5 35.4 36.1 40.2 38.9 33.8 41 ...

In [11]:
# Calculate population by type
df_demo$pop_white <- ceiling(df_demo$percent_white / 100 * df_demo$total_population)
df_demo$pop_black <- ceiling(df_demo$percent_black / 100 * df_demo$total_population)
df_demo$pop_asian <- ceiling(df_demo$percent_asian / 100 * df_demo$total_population)
df_demo$pop_hispanic <- ceiling(df_demo$percent_hispanic / 100 * df_demo$total_population)

In [12]:
# Calculate others percentage in the population type
df_demo$percent_other <- 100 - df_demo$percent_white - df_demo$percent_black - df_demo$percent_asian - df_demo$percent_hispanic
df_demo$pop_other <- df_demo$total_population - df_demo$pop_white - df_demo$pop_black - df_demo$pop_asian - df_demo$pop_hispanic

In [13]:
df_state <- read.csv("data/State_Location.csv")
str(df_state)


'data.frame':	51 obs. of  4 variables:
 $ region   : Factor w/ 51 levels "alabama","alaska",..: 2 1 4 3 5 6 7 9 8 10 ...
 $ state    : Factor w/ 51 levels "AK","AL","AR",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ latitude : num  61.4 32.8 35 33.8 36.2 ...
 $ longitude: num  -152.3 -86.8 -92.4 -111.4 -119.7 ...

In [14]:
colnames(df_price_mean)[1] <- "region"
df_price_mean$region <- tolower(df_price_mean$region)
df_mid <- merge(df_price_mean, df_demo, by = "region")
df <- merge(df_mid, df_state, by = "region")
str(df)


'data.frame':	51 obs. of  25 variables:
 $ region           : chr  "alabama" "alaska" "arizona" "arkansas" ...
 $ HighQ            : num  340 291 301 348 245 ...
 $ HighQN           : num  1379 321 2392 752 14947 ...
 $ MedQ             : num  205 262 209 190 191 ...
 $ MedQN            : num  1270 408 2137 725 16770 ...
 $ LowQ             : num  147 387 191 127 191 ...
 $ LowQN            : num  161.1 32.3 279 135.9 976.3 ...
 $ date             : Date, format: "2014-08-14" "2014-08-14" ...
 $ total_population : int  4799277 720316 6479703 2933369 37659181 5119329 3583561 908446 619371 19091156 ...
 $ percent_white    : int  67 63 57 74 40 70 70 65 35 57 ...
 $ percent_black    : int  26 3 4 15 6 4 9 21 49 15 ...
 $ percent_asian    : int  1 5 3 1 13 3 4 3 3 2 ...
 $ percent_hispanic : int  4 6 30 7 38 21 14 8 10 23 ...
 $ per_capita_income: int  23680 32651 25358 22170 29527 31109 37892 29819 45290 26236 ...
 $ median_rent      : int  501 978 747 480 1119 825 880 828 1154 838 ...
 $ median_age       : num  38.1 33.6 36.3 37.5 35.4 36.1 40.2 38.9 33.8 41 ...
 $ pop_white        : num  3215516 453800 3693431 2170694 15063673 ...
 $ pop_black        : num  1247813 21610 259189 440006 2259551 ...
 $ pop_asian        : num  47993 36016 194392 29334 4895694 ...
 $ pop_hispanic     : num  191972 43219 1943911 205336 14310489 ...
 $ percent_other    : num  2 23 6 3 3 2 3 3 3 3 ...
 $ pop_other        : num  95983 165671 388780 87999 1129774 ...
 $ state            : Factor w/ 51 levels "AK","AL","AR",..: 2 1 4 3 5 6 7 9 8 10 ...
 $ latitude         : num  32.8 61.4 33.8 35 36.2 ...
 $ longitude        : num  -86.8 -152.3 -111.4 -92.4 -119.7 ...

Single Variable Visualisation


In [15]:
# Load the visualisation library
library(ggplot2)
library(scales)
library(ggmap)

In [16]:
# Create the new dataframe for summary by years
head(df)


Out[16]:
regionHighQHighQNMedQMedQNLowQLowQNdatetotal_populationpercent_whitepercent_blackpercent_asianpercent_hispanicper_capita_incomemedian_rentmedian_agepop_whitepop_blackpop_asianpop_hispanicpercent_otherpop_otherstatelatitudelongitude
1alabama339.56181379.414204.60621270.352146.8326161.14922014-08-1447992776726142368050138.13215516124781347993191972295983AL32.799-86.8073
2alaska291.482321.245262.0464407.9176387.232732.334082014-08-14720316633563265197833.645380021610360164321923165671AK61.385-152.2683
3arizona300.66752392.465209.36532137.414190.8269279.00672014-08-1464797035743302535874736.3369343125918919439219439116388780AZ33.7712-111.3877
4arkansas348.0561751.9889190.4147724.6837127.3455135.9022014-08-1429333697415172217048037.5217069444000629334205336387999AR34.9513-92.3809
5california245.376114947.07191.268916769.82190.796976.29842014-08-1437659181406133829527111935.415063673225955148956941431048931129774CA36.17-119.7462
6colorado238.91872816.218196.53252457.512226.7906165.34972014-08-1451193297043213110982536.1358353120477415358010750602102384CO39.0646-105.3272

In [17]:
# Distribution - Barchart for the population by states
ggplot(df) + aes(reorder(region, total_population),weight = total_population) + 
  geom_bar(fill = "orange") + xlab("region") + scale_y_continuous(labels = comma) + coord_flip()



In [18]:
# Change from tall to wide dataset
df_wide <- df %>%
                select(1, 17:20,22 ) %>%
                gather("pop_frac", "value", 2:6)
str(df_wide)
head(df_wide)


'data.frame':	255 obs. of  3 variables:
 $ region  : chr  "alabama" "alaska" "arizona" "arkansas" ...
 $ pop_frac: Factor w/ 5 levels "pop_white","pop_black",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ value   : num  3215516 453800 3693431 2170694 15063673 ...
Out[18]:
regionpop_fracvalue
1alabamapop_white3215516
2alaskapop_white453800
3arizonapop_white3693431
4arkansaspop_white2170694
5californiapop_white15063673
6coloradopop_white3583531

In [19]:
# Distribution -  Population by states by Type
ggplot(df_wide) + aes(reorder(region,value), weight = value, fill=pop_frac) +
  geom_bar() + coord_flip() + scale_y_continuous(labels = comma)



In [20]:
# Distribution -  Population by states by Type (Fill)
ggplot(df_wide) + aes(reorder(region,value), weight = value, fill=pop_frac) +
  geom_bar(position = "fill") + xlab("region") + ylab("population") + coord_flip()



In [21]:
# Distribution - Per capita Income for each state
ggplot(df) + aes(reorder(region, per_capita_income), weight = per_capita_income) + 
  geom_bar(fill = "orange") + xlab("region") + ylab("per capita income") +
  scale_y_continuous(labels = comma) + coord_flip()



In [22]:
# Distribution - High Quality Number for each state
ggplot(df) + aes(reorder(region, HighQN), weight = HighQN) + 
  geom_bar(fill = "orange") + xlab("region") + ylab("HighQN") +
  scale_y_continuous(labels = comma) + coord_flip()


Geographic Mapping


In [23]:
ggplot(df) + aes(longitude, latitude, size = total_population) + 
   geom_point(color = "orange") + coord_map() + xlim(-130, -60) + ylim(20,50)


Warning message:
: Removed 2 rows containing missing values (geom_point).

In [24]:
ggplot(df) + aes(longitude, latitude, size = percent_black) + 
   geom_point(color = "orange") + coord_map() + xlim(-130, -60) + ylim(20,50)


Warning message:
: Removed 2 rows containing missing values (geom_point).

In [25]:
ggplot(df) + aes(longitude, latitude, size = HighQN) + 
   geom_point(color = "orange") + coord_map() + xlim(-130, -60) + ylim(20,50)


Warning message:
: Removed 2 rows containing missing values (geom_point).

In [26]:
ggplot(df) + aes(longitude, latitude, size = percent_hispanic) + 
   geom_point(color = "orange") + coord_map() + xlim(-130, -60) + ylim(20,50)


Warning message:
: Removed 2 rows containing missing values (geom_point).

In [27]:
# Distribution - Mapping on geographic projection
library(maps)
map("state")



In [28]:
# Create the dataset for choloropleth map
states <- map_data("state")
chloro <- merge(states, df, sort = FALSE, by = "region")

In [29]:
# Choloropleth for Total Population
ggplot(chloro) + aes(long, lat, group = group, fill = total_population) +
  geom_polygon() + coord_map()



In [30]:
# Choloropleth for White Population
ggplot(chloro) + aes(long, lat, group = group, fill = percent_white) +
  geom_polygon() + coord_map()



In [31]:
# Choloropleth for Hispanic Population
ggplot(chloro) + aes(long, lat, group = group, fill = percent_hispanic) +
  geom_polygon() + coord_map()



In [32]:
# Quantity of Weed - High
ggplot(chloro) + aes(long, lat, group = group, fill = HighQN) +
  geom_polygon() + coord_map()



In [33]:
# Price of Weed - High
ggplot(chloro) + aes(long, lat, group = group, fill = HighQ) +
  geom_polygon() + coord_map()



In [49]:
# Price of Weed - HighQ
df_california <- df_price %>%
                 filter(State == "California")
str(df_california)


'data.frame':	449 obs. of  8 variables:
 $ State : Factor w/ 51 levels "Alabama","Alaska",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ HighQ : num  249 244 248 243 248 ...
 $ HighQN: int  12096 16512 12571 16904 12988 13396 13787 14178 18468 14539 ...
 $ MedQ  : num  194 189 193 189 193 ...
 $ MedQN : int  12812 19151 13406 19764 13906 14527 15047 15588 22012 16092 ...
 $ LowQ  : num  193 NA 192 NA 191 ...
 $ LowQN : int  778 1096 804 1123 839 863 891 930 1231 949 ...
 $ date  : Date, format: "2014-01-01" "2015-01-01" ...

In [58]:
ggplot(df_california) + geom_point(aes(date, HighQ)) +
    geom_point(aes(date,MedQ)) +
    geom_point(aes(date,LowQ))


Warning message:
: Removed 207 rows containing missing values (geom_point).

In [43]:
str(df_price)


'data.frame':	22899 obs. of  8 variables:
 $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ HighQ : num  339 289 303 362 249 ...
 $ HighQN: int  1042 252 1941 576 12096 2161 1294 347 433 6506 ...
 $ MedQ  : num  199 261 209 186 194 ...
 $ MedQN : int  933 297 1625 544 12812 1728 1316 273 349 5237 ...
 $ LowQ  : num  149 389 189 126 193 ...
 $ LowQN : int  123 26 222 112 778 128 91 34 39 514 ...
 $ date  : Date, format: "2014-01-01" "2014-01-01" ...

Two Variable Visualisation


In [59]:
# Scatter - Per capita Income vs Total population
ggplot(df) + aes(total_population, per_capita_income) +
    geom_point() + scale_x_continuous(label = comma)



In [60]:
# Bubble - Per capita Income vs White_Population, Size = Total population
ggplot(df) + aes(percent_hispanic, per_capita_income, size = total_population) +
    geom_point() + scale_x_continuous(label = comma)



In [61]:
# Scatter - Per capita Income vs HighQN
ggplot(df) + aes(HighQN, per_capita_income) +
    geom_point() + scale_x_continuous(label = comma)



In [ ]: