In [ ]:
knitr::opts_chunk$set(warning=FALSE, message=FALSE, fig.align = 'center')

Course Logistics

Day One

R U Ready?

  • Overview of The R Project for Statistical Computing
  • The Microsoft R Family
  • R's capabilities and its limitations
  • What types of problems R might be useful for
  • How to manage data with the exceptionally popular open source package dplyr
  • How to develop models and write functions in R

Day Two

Scalable Data Analysis with Microsoft R

  • Moving the compute to your data
  • WODA - Write Once, Deploy Anywhere
  • High Performance Analytics
  • High Performance Computing
  • Machine Learning with Microsoft R

Day Three

Distributed Computing on Spark Clusters with R

  • Overview of the Apache Spark Project
  • Taming the Hadoop Zoo with HDInsight
  • Provisioning and Managing HDInsight Clusters
  • Spark DataFrames, SparkR, and the sparklyr package
  • Developing Machine Learning Pipelines with Spark and Microsoft R

Prerequisites

Computing Environments

Development Environments

Where to Write R Code

  • The most popular integrated development environment for R is RStudio
  • The RStudio IDE is entirely html/javascript based, so completely cross-platform
  • RStudio Server provides a full IDE in your browser: perfect for cloud instances
  • For Windows machines, R Tools for Visual Studio (RTVS) reached general availability in 2016
  • RTVS supports remote connections to Azure and SQL Server

What is R?

Why should I care?

  • R is the successor to the S language, which originated at AT&T's Bell Labs
  • Its interpreter design is based on Scheme
  • Originally designed by two University of Auckland professors (Ross Ihaka and Robert Gentleman) for their introductory statistics course

R's Philosophy

What R Thou?

R follows the Unix philosophy

  • Write programs that do one thing and do it well (modularity)
  • Write programs that work together (cohesiveness)
  • R is extensible with more than 10,000 packages available on CRAN (http://crantastic.org/packages)

The aRt of Being Lazy

Lazy Evaluation in R

  • R, like its inspiration, Scheme, is a functional programming language
  • R evaluates lazily, delaying evaluation until a value is needed, which can make it very flexible
  • R is an interpreted, dynamically typed language, allowing you to mutate variables and analyze datasets quickly, but it is significantly slower than compiled, statically typed languages like C or Java
  • R has a high memory footprint, which can easily lead to crashes if you aren't careful

R's Programming Paradigm

Keys to R

Everything that exists in R is an *object*
Everything that happens in R is a *function call*
R was born to *interface*

_—John Chambers_

Strengths of R

Where R Succeeds

  • Expressive
  • Open source
  • Extendable -- more than 10,000 packages with functions to use, and that list continues to grow
  • Focused on statistics and machine learning -- cutting-edge algorithms and powerful data manipulation packages
  • Advanced data structures and graphical capabilities
  • Large user community, both within academia and industry
  • It is designed by statisticians

Weaknesses of R

Where R Falls Short

  • It is designed by statisticians
  • Inefficient at element-by-element computations
  • May make large demands on system resources, namely memory
  • Data capacity limited by memory
  • Single-threaded

Distributions of R

Some Essential Open Source Packages

  • There are over 10,000 R packages to choose from, so where should you start?
  • Data Management: dplyr, tidyr, data.table
  • Visualization: ggplot2, ggvis, htmlwidgets, shiny
  • Data Importing: haven, RODBC, readr, foreign
  • Other favorites: magrittr, rmarkdown, caret

R Foundations

Command line prompts

Symbol   Meaning
<-       assignment operator
>        ready for a new command
+        awaiting the completion of an existing command
?        get help for the following function

You can change options either permanently at startup (see ?Startup) or per session with the options function, for example options(repos = "https://cran.r-project.org") to set your CRAN mirror.

Check your CRAN mirror with getOption("repos").

I'm Lost!

Getting Help for R

Quick Tour of Things You Need to Know

Data Structures

"Bad programmers worry about the code. Good programmers worry about data structures and their relationships."

  • Linus Torvalds
  • R's data structures can be described by their dimensionality, and their type.
Homogeneous Heterogeneous
1d Atomic vector List
2d Matrix Data frame
nd Array

Quick Tour of Things You Need to Know

Data Types

  • Atomic vectors come in one of four common types:
    • logical (boolean). Values: TRUE | FALSE
    • integer
    • double (often called numeric)
    • character
  • Rare types:
    • complex
    • raw
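
A quick check of each type with typeof() (a minimal sketch):

In [ ]:
typeof(TRUE)    # "logical"
typeof(1L)      # "integer"
typeof(3.14)    # "double"
typeof("hi")    # "character"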

Manipulating Data Structures

Subsetting Operators

  • To create a vector, use c: c(1, 4, 1, 3)
  • To create a list, use list: list(1, 'hi', data.frame(1:10, letters[1:10]))
  • To subset a vector or list, use the square brackets [ ]
    • inside the brackets, supply:
      • positive integer vectors for the indices you want to keep
      • negative integer vectors for the indices you want to drop
      • logical vectors for the indices you want to keep/drop (TRUE/FALSE)
      • character vectors to select elements by name (for named vectors)
      • subsetting a list with a single square bracket always returns a list
  • To subset a list and get back just that element, use [[ ]]
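
A minimal sketch of these rules on a toy vector and list:

In [ ]:
x <- c(a = 1, b = 4, c = 1, d = 3)
x[c(1, 3)]       # positive indices: keep elements 1 and 3
x[-2]            # negative index: drop element 2
x[x > 2]         # logical: keep values greater than 2
x[c("a", "d")]   # character: keep the named elements a and d
l <- list(1, 'hi', letters[1:3])
l[2]             # single bracket: a list of length one
l[[2]]           # double bracket: the element itself, "hi"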

Object Representation

  • To find the type of an object, use class (higher-level representation)
  • To find how the object is stored in memory, use typeof (lower-level representation)
  • After the quick sketch below, it's a good time to do Lab 1!
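
A quick sketch of the difference (class describes the object, typeof its storage):

In [ ]:
class(3.14)    # "numeric" -- the higher-level representation
typeof(3.14)   # "double"  -- how it is stored in memory
class(iris)    # "data.frame"
typeof(iris)   # "list" -- a data.frame is stored as a list of columns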

Data Manipulation with the dplyr Package

Overview

Rather than describing the nitty-gritty details of writing R code, I'd like you to start writing R code immediately.

As most of you are data scientists or data enthusiasts, I will showcase one of the most useful data manipulation packages in R, dplyr. By the end of this session, you will have learned:

  • How to manipulate data quickly with dplyr using a very intuitive "grammar"
  • How to use dplyr to perform common data manipulation procedures in exploratory analysis
  • How to apply your own custom functions to grouped manipulations in dplyr with mutate(), summarise(), and do()
  • How to connect to remote databases to work with larger-than-memory datasets

Why use dplyr?

The Grammar of Data Manipulation

  • dplyr is currently the most downloaded package from CRAN
  • dplyr makes data manipulation easier by providing a few functions for the most common tasks and procedures
  • dplyr achieves remarkable speed-up gains by using a C++ backend
  • dplyr has multiple backends for working with data stored in various sources: SQLite, MySQL, bigquery, SQL Server, and many more
  • dplyr was designed to give data manipulation a simple, cohesive grammar (similar in philosophy to ggplot2's grammar of graphics)
  • dplyr has inspired many new packages, which adopt its easy-to-understand syntax
  • The more recent packages dplyrXdf and SparkR/sparklyr bring much of the same functionality to XDF data and Spark DataFrames

Tidy Data and Happier Coding

Premature Optimization

  • For a data scientist, the most important parameter to optimize in the data science development cycle is YOUR time
  • It is therefore important to be able to write efficient code, quickly
  • Goals: write fast code that is portable, platform-invariant, easy to understand, and easy to debug
    • Be serious about CReUse!

Manipulation verbs

filter

: select rows based on matching criteria

slice

: select rows by number

select

: select columns by column names

arrange

: reorder rows by column values

mutate

: add new variables based on transformations of existing variables

transmute

: transform and drop other variables

Aggregation verbs

group_by

: identify grouping variables for calculating groupwise summary statistics

count

: count the number of records per group

summarise | summarize

: calculate one or more summary functions per group, returning one row of results per group (or one for the entire dataset)

NYC Taxi Data

Data for Class

  • The data we will examine in this module is derived from the NYC Taxi and Limousine Commission
  • Data contains taxi trips in NYC, and includes spatial features (pickup and dropoff neighborhoods), temporal features, and monetary features (fare and tip amounts)
  • The dataset for this module is saved as an rds file in a public-facing Azure storage blob
  • An rds file is a compressed, serialized R object
  • Save an object to rds with the saveRDS function; read an rds file back with the readRDS function
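
A minimal sketch (using a hypothetical file name):

In [ ]:
saveRDS(mtcars, "mtcars.rds")      # serialize and compress an R object to disk
my_cars <- readRDS("mtcars.rds")   # read it back into memory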

Viewing Data

tibble

  • dplyr includes a wrapper called tbl_df that adds a class attribute onto data.frames to provide better data manipulation aesthetics (there's now a dedicated package, tibble, for this wrapper and its class)
  • The most noticeable difference between tbl_dfs and data.frames is the console output: tbl_dfs will only print what the current R console window can display
  • You can change the default number of displayed columns with the options function: options(dplyr.width = Inf)

In [ ]:
library(dplyr)
library(stringr)
taxi_url <- "http://alizaidi.blob.core.windows.net/training/trainingData/manhattan_df.rds"
taxi_df  <- readRDS(gzcon(url(taxi_url)))
(taxi_df <- tbl_df(taxi_df))

Filtering and Reordering Data

Subsetting Data

  • dplyr makes subsetting by rows very easy
  • The filter verb selects the rows that match the conditions you supply
  • Every dplyr function takes a data.frame/tbl as its first argument
  • Additional conditions are passed as separate arguments (no need to build one insanely complicated expression; split them up!)

Filter


In [ ]:
filter(taxi_df,
       dropoff_dow %in% c("Fri", "Sat", "Sun"),
       tip_amount > 1)

Exercise

Your turn:

  • How many observations started in Harlem?
    • pick both sides of Harlem, including East Harlem
    • hint: it might be useful to use the str_detect function from stringr
  • How many observations that started in Harlem ended in the Financial District?

Solution


In [ ]:
library(stringr)
table(taxi_df$pickup_nhood)
harlem_pickups <- filter(taxi_df, str_detect(pickup_nhood, "Harlem"))
harlem_pickups
# uncomment the line below (ctrl+shift+c) and filter harlem_pickups on Financial District
# how many rows?
# fidi <- filter(harlem_pickups, ...)

Select a set of columns

  • You can use the select() verb to specify which columns of a dataset you want
  • This is similar to the keep option in SAS's data step.
  • Use a colon : to select all the columns between two variables (inclusive)
  • Use contains to take any columns containing a certain word/phrase/character

Select Example


In [ ]:
select(taxi_df, pickup_nhood, dropoff_nhood,
       fare_amount, dropoff_hour, trip_distance)

Select: Other Options

starts_with(x, ignore.case = FALSE)

: name starts with x

ends_with(x, ignore.case = FALSE)

: name ends with x

matches(x, ignore.case = FALSE)

: selects all variables whose name matches the regular expression x

num_range("V", 1:5, width = 1)

: selects all variables (numerically) from V1 to V5.

  • You can also use a - to drop variables.
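
A few sketches of these helpers on the taxi data (column names as used elsewhere in this module):

In [ ]:
select(taxi_df, contains("amount"))      # e.g. fare_amount, tip_amount
select(taxi_df, starts_with("pickup"))   # all pickup_* columns
select(taxi_df, -trip_distance)          # drop a single column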

Reordering Data

  • You can reorder your dataset by column values using the arrange() verb
  • Use the desc function to sort in descending rather than ascending order (the default)

Arrange


In [ ]:
select(arrange(taxi_df, desc(fare_amount), pickup_nhood),
       fare_amount, pickup_nhood)

head(select(arrange(taxi_df, desc(fare_amount), pickup_nhood),
       fare_amount, pickup_nhood), 10)

Exercise

Use arrange() to sort on the basis of tip_amount, dropoff_nhood, and pickup_dow, with descending order for tip amount
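
One possible solution (a sketch; pickup_dow is assumed to be the pickup day-of-week column):

In [ ]:
arrange(taxi_df, desc(tip_amount), dropoff_nhood, pickup_dow)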

Summary

filter

: Extract subsets of rows. See also slice()

select

: Extract subsets of columns. See also rename()

arrange

: Sort your data

Data Aggregations and Transformations

Transformations

  • The mutate() verb can be used to make new columns

In [ ]:
taxi_df <- mutate(taxi_df, tip_pct = tip_amount/fare_amount)
select(taxi_df, tip_pct, fare_amount, tip_amount)
transmute(taxi_df, tip_pct = tip_amount/fare_amount)

Summarise Data by Groups

  • The group_by verb creates a grouping by a categorical variable
  • Functions can be placed inside summarise to create summary functions

In [ ]:
grouped_taxi <- group_by(taxi_df, dropoff_nhood)
class(grouped_taxi)
grouped_taxi

In [ ]:
summarize(group_by(taxi_df, dropoff_nhood),
          Num = n(), ave_tip_pct = mean(tip_pct))

Group By Neighborhoods Example


In [ ]:
summarise(group_by(taxi_df, pickup_nhood, dropoff_nhood),
          Num = n(), ave_tip_pct = mean(tip_pct))

Chaining/Piping

  • A dplyr installation includes the magrittr package as a dependency
  • The magrittr package provides a pipe operator that lets you pass the current dataset to another function
  • This makes a nested sequence of operations much easier to read

Standard Code

  • Code is executed inside-out.
  • Let's take the neighborhood summary from above, arrange it in descending order of average tip percentage, and keep only the pickup/dropoff pairs with at least 10 trips.

In [ ]:
filter(arrange(summarise(group_by(taxi_df, pickup_nhood, dropoff_nhood), Num = n(), ave_tip_pct = mean(tip_pct)), desc(ave_tip_pct)), Num >= 10)

Reformatted


In [ ]:
filter(
  arrange(
    summarise(
      group_by(taxi_df,
               pickup_nhood, dropoff_nhood),
      Num = n(),
      ave_tip_pct = mean(tip_pct)),
    desc(ave_tip_pct)),
  Num >= 10)

Magrittr

  • Inspired by the Unix pipe | and F#'s forward pipe |>, magrittr introduces the funny-looking %>% (the "then" operator)
  • %>% pipes the object on the left-hand side into the first argument of the function on the right-hand side
  • Every function in dplyr reserves a slot for a data.frame/tbl as its first argument, so this works beautifully!

Put that Function in Your Pipe and...


In [ ]:
taxi_df %>%
  group_by(pickup_nhood, dropoff_nhood) %>%
  summarize(Num = n(),
            ave_tip_pct = mean(tip_pct)) %>%
  arrange(desc(ave_tip_pct)) %>%
  filter(Num >= 10)

Pipe + group_by()

  • The pipe operator is very helpful for group-by summaries
  • Let's calculate the average tip percentage and average trip distance for each pickup and dropoff neighborhood pair
  • First, filter to the neighborhoods in the vector manhattan_hoods


In [ ]:
mht_url <- "http://alizaidi.blob.core.windows.net/training/manhattan.rds"
manhattan_hoods <- readRDS(gzcon(url(mht_url)))
taxi_df %>%
  filter(pickup_nhood %in% manhattan_hoods,
         dropoff_nhood %in% manhattan_hoods) %>%
  group_by(dropoff_nhood, pickup_nhood) %>%
  summarize(ave_tip = mean(tip_pct),
            ave_dist = mean(trip_distance)) %>%
  filter(ave_dist > 3, ave_tip > 0.05)

Pipe and Plot

Piping is not limited to dplyr functions; it can be used everywhere!


In [ ]:
library(ggplot2)
taxi_df %>%
  filter(pickup_nhood %in% manhattan_hoods,
         dropoff_nhood %in% manhattan_hoods) %>%
  group_by(dropoff_nhood, pickup_nhood) %>%
  summarize(ave_tip = mean(tip_pct),
            ave_dist = mean(trip_distance)) %>%
  filter(ave_dist > 3, ave_tip > 0.05) %>%
  ggplot(aes(x = pickup_nhood, y = dropoff_nhood)) +
    geom_tile(aes(fill = ave_tip), colour = "white") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = 'bottom') +
    scale_fill_gradient(low = "white", high = "steelblue")


Piping to other arguments

  • Although dplyr takes great care to be amenable to piping, other functions may not reserve their first argument for the object you are passing in
  • You can use the special . placeholder to specify where the piped object should enter

In [ ]:
taxi_df %>%
  filter(pickup_nhood %in% manhattan_hoods,
         dropoff_nhood %in% manhattan_hoods) %>%
  group_by(dropoff_nhood, pickup_nhood) %>%
  summarize(ave_tip = mean(tip_pct),
            ave_dist = mean(trip_distance)) %>%
  lm(ave_tip ~ ave_dist, data = .) -> taxi_model
summary(taxi_model)

Exercise

Your turn:

  • Use the pipe operator to group by day of week and dropoff neighborhood
  • Filter to Manhattan neighborhoods
  • Make a tile plot with average fare amount (in dollars) as the fill; one possible solution follows
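
One possible solution (a sketch following the pattern above; dropoff_dow and fare_amount are columns used earlier in this module):

In [ ]:
taxi_df %>%
  filter(pickup_nhood %in% manhattan_hoods,
         dropoff_nhood %in% manhattan_hoods) %>%
  group_by(dropoff_dow, dropoff_nhood) %>%
  summarise(ave_fare = mean(fare_amount)) %>%
  ggplot(aes(x = dropoff_dow, y = dropoff_nhood)) +
    geom_tile(aes(fill = ave_fare), colour = "white") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = 'bottom') +
    scale_fill_gradient(low = "white", high = "steelblue")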

Functional Programming

Creating Functional Pipelines

Too Many Pipes?


Reusable code

  • The examples above create a rather messy pipeline operation
  • Such pipelines can be very hard to debug
  • The operation is pretty readable, but lacks reusability
  • Since R is a functional language, we benefit from splitting these operations into functions and calling them separately
  • This allows reusability; don't write the same code twice!

Functional Pipelines

Summarization

  • Let's create a function that takes an argument for the data, and applies the summarization by neighborhood to calculate average tip and trip distance


In [ ]:
taxi_hood_sum <- function(taxi_data = taxi_df) {

  mht_url <- "http://alizaidi.blob.core.windows.net/training/manhattan.rds"

  manhattan_hoods <- readRDS(gzcon(url(mht_url)))
  taxi_data %>%
    filter(pickup_nhood %in% manhattan_hoods,
           dropoff_nhood %in% manhattan_hoods) %>%
    group_by(dropoff_nhood, pickup_nhood) %>%
    summarize(ave_tip = mean(tip_pct),
              ave_dist = mean(trip_distance)) %>%
    filter(ave_dist > 3, ave_tip > 0.05) -> sum_df

  return(sum_df)

}

Functional Pipelines

Plotting Function

  • We can create a second function for the plot

In [ ]:
tile_plot_hood <- function(df = taxi_hood_sum()) {

  library(ggplot2)

  ggplot(data = df, aes(x = pickup_nhood, y = dropoff_nhood)) +
    geom_tile(aes(fill = ave_tip), colour = "white") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = 'bottom') +
    scale_fill_gradient(low = "white", high = "steelblue") -> gplot

  return(gplot)
}

Calling Our Pipeline

  • Now we can create our plot by simply calling our two functions, and make that baby interactive with plotly::ggplotly

In [ ]:
library(plotly)
taxi_hood_sum(taxi_df) %>% tile_plot_hood %>% ggplotly


Creating Complex Pipelines with do

  • The summarise function is handy, and can compute many numeric/scalar summaries
  • But what if you want multiple values/rows back, not just a scalar summary?
  • Meet the do verb -- arbitrary tbl operations


In [ ]:
taxi_df %>% group_by(dropoff_dow) %>%
  filter(!is.na(dropoff_nhood), !is.na(pickup_nhood)) %>%
  arrange(desc(tip_pct)) %>%
  do(slice(., 1:2)) %>%
  select(dropoff_dow, tip_amount, tip_pct,
         fare_amount, dropoff_nhood, pickup_nhood)

Estimating Multiple Models with do

  • A common use of do is to calculate many different models by a grouping variable

In [ ]:
dow_lms <- taxi_df %>% sample_n(10^4) %>%
  group_by(dropoff_dow) %>%
  do(lm_tip = lm(tip_pct ~ pickup_nhood + passenger_count + pickup_hour,
     data = .))


In [ ]:
dow_lms

Where are our results?

Cleaning Output


In [ ]:
summary(dow_lms$lm_tip[[1]])
library(broom)
dow_lms %>% tidy(lm_tip)

  • By design, every function in dplyr returns a data.frame
  • In the example above, we get back a spooky data.frame with a column of S3 lm objects
  • You can still modify each element as you would normally, or pass it to mutate to extract intercepts or other statistics
  • But there's also a very handy package, broom, for cleaning up such objects into data.frames

Brooming Up the Mess

Model Metrics


In [ ]:
library(broom)
taxi_df %>% sample_n(10^5) %>%
  group_by(dropoff_dow) %>%
  do(glance(lm(tip_pct ~ pickup_nhood + passenger_count + pickup_hour,
     data = .)))

Model Coefficients

The most commonly used function in the broom package is tidy. It expands our data.frame and gives us the model coefficients.


In [ ]:
taxi_df %>% sample_n(10^5) %>%
  group_by(dropoff_dow) %>%
  do(tidy(lm(tip_pct ~ pickup_nhood + passenger_count + pickup_hour,
     data = .)))

Spatial Visualizations with ggplot2 and purrr

Visualizing Pickups by Time

  • Let's try another example
  • We will visualize pickups and index them by time

In [ ]:
# min and max coordinates:
min_lat <- 40.5774
max_lat <- 40.9176
min_long <- -74.15
max_long <- -73.7004

pickups <- taxi_df %>%
  # keep trips whose pickup and dropoff coordinates fall inside the bounding box
  filter(pickup_longitude > min_long, pickup_longitude < max_long,
         pickup_latitude > min_lat, pickup_latitude < max_lat,
         dropoff_longitude > min_long, dropoff_longitude < max_long,
         dropoff_latitude > min_lat, dropoff_latitude < max_lat) %>%
  group_by(pickup_hour,
           pickup_longitude,
           pickup_latitude) %>%
  summarise(num_pickups = n())

Load Additional Libraries


In [ ]:
library(purrr)
library(lubridate)
library(RColorBrewer)
library(magick)

Visualize Pickups

ggplot2 Theme

  • ggplot will give very aesthetically appealing plots by default
  • However, it really shines in its ability to be customized
  • See the ggthemes package for some template themes
  • We'll use the theme below, inspired by Max Woolf's Tweet on this dataset

In [ ]:
theme_map_dark <- function(palate_color = "Greys") {

  palate <- brewer.pal(palate_color, n=9)
  color.background = "black"
  color.grid.minor = "black"
  color.grid.major = "black"
  color.axis.text = palate[1]
  color.axis.title = palate[1]
  color.title = palate[1]

  font.title <- "Source Sans Pro"
  font.axis <- "Open Sans Condensed Bold"

  theme_bw(base_size=5) +
    theme(panel.background=element_rect(fill=color.background, color=color.background)) +
    theme(plot.background=element_rect(fill=color.background, color=color.background)) +
    theme(panel.border=element_rect(color=color.background)) +
    theme(panel.grid.major=element_blank()) +
    theme(panel.grid.minor=element_blank()) +
    theme(axis.ticks=element_blank()) +
    theme(legend.background = element_rect(fill=color.background)) +
    theme(legend.text = element_text(size=3,colour=color.axis.title,family=font.axis)) +
    theme(legend.title = element_blank(), legend.position="top", legend.direction="horizontal") +
    theme(legend.key.width=unit(1, "cm"), legend.key.height=unit(0.25, "cm"), legend.margin=unit(-0.5,"cm")) +
    theme(plot.title=element_text(colour=color.title,family=font.title, size=14)) +
    theme(plot.subtitle = element_text(colour=color.title,family=font.title, size=12)) +
    theme(axis.text.x=element_blank()) +
    theme(axis.text.y=element_blank()) +
    theme(axis.title.y=element_blank()) +
    theme(axis.title.x=element_blank()) +
    theme(strip.background = element_rect(fill=color.background,
                                          color=color.background),
          strip.text=element_text(size=7,colour=color.axis.title,family=font.title))

}

Plot Function

Complete the Function Below


In [ ]:
# x axis should be longitude
# y axis should be latitude
map_nyc <- function(df, pickup_hr) {

  gplot <- ggplot(df,
                  aes(x=...,
                      y=...)) +
    geom_point(color="white", size=0.06) +
    scale_x_continuous(limits=c(min_long, max_long)) +
    scale_y_continuous(limits=c(min_lat, max_lat)) +
    theme_map_dark() +
    labs(title = "Map of NYC Taxi Pickups",
         subtitle = paste0("Pickups between ", pickup_hr))

  return(gplot)

}
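
One possible completion of the exercise, following the axis comments above (a sketch; try it yourself before peeking):

In [ ]:
# per the comments: x is longitude, y is latitude
map_nyc <- function(df, pickup_hr) {

  gplot <- ggplot(df,
                  aes(x = pickup_longitude,
                      y = pickup_latitude)) +
    geom_point(color = "white", size = 0.06) +
    scale_x_continuous(limits = c(min_long, max_long)) +
    scale_y_continuous(limits = c(min_lat, max_lat)) +
    theme_map_dark() +
    labs(title = "Map of NYC Taxi Pickups",
         subtitle = paste0("Pickups between ", pickup_hr))

  return(gplot)

}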

Iterate and Plot!

Now we can Iterate and Plot


In [ ]:
hour_plots <- ungroup(pickups) %>%
  filter(num_pickups > 1) %>%
  split(.$pickup_hour) %>%
  map(~ map_nyc(.x, pickup_hr = .x$pickup_hour[1]))

hour_plots

Summary

mutate

: Create transformations

summarise

: Aggregate

group_by

: Group your dataset by levels

do

: Evaluate complex operations on a tbl

Chaining with the %>% operator can result in more readable code.

What We Didn't Cover

  • There are many additional topics that fit well into the dplyr and functional programming landscape
  • There are too many to cover in one session. Fortunately, most are well documented. The most notable omissions:
    1. Connecting to remote databases, see vignette('databases', package = 'dplyr')
    2. Merging and Joins, see vignette('two-table', package = 'dplyr')
    3. Programming with dplyr, see vignette('nse', package = 'dplyr')
    4. summarize_each and mutate_each

Thanks for Attending!

  • Any questions?