by Max Woolf
This notebook is the complement to my blog post Quantifying and Visualizing the Reddit Hivemind.
Note: Code is more hacky than my typical R Jupyter notebooks.
In [1]:
options(warn=-1)
# IMPORTANT: This assumes that all packages in "Rstart.R" are installed,
# and the fonts "Source Sans Pro" and "Open Sans Condensed Bold" are installed
# via extrafont. If ggplot2 charts fail to render, you may need to change/remove the theme call.
source("Rstart.R")
options(repr.plot.mimetypes = 'image/png', repr.plot.width=4, repr.plot.height=3, repr.plot.res=300)
In [2]:
df_subreddits <- read_csv("subreddit-data.csv")
df_means <- read_csv("subreddit-words-mean.csv")
df_medians <- read_csv("subreddit-words-median.csv")
Prepare Top 15 subreddits for Top 15 subreddit image.
In [3]:
invalid_subreddits <- c("reddit.com", "Fireteams", "POLITIC")
n <- 15
subreddits <- df_subreddits$subreddit[1:(n+length(invalid_subreddits))]
subreddits <- subreddits[-(which(subreddits %in% invalid_subreddits))]
print(subreddits)
Filter subreddits into a subset with the 15 subreddits and only 10 out of the 20 words.
In [4]:
df_subset <- df_means %>% filter(subreddit %in% subreddits, score_rank <= 10)
df_subset$subreddit <- factor(df_subset$subreddit, levels=subreddits)
subreddit_test <- "AskReddit"
df_subset %>% filter(subreddit==subreddit_test)
sprintf("nrows (expected: 15*10): %s", nrow(df_subset))
Out[4]:
Out[4]:
Unfortunately, ggplot2 does not allow free scales for faceting on bar charts, which makes it impossible to use ggplot2 alone. The alternate solution is to plot rhe 15 charts seperately and stich them together.
The multiplot function from Cookbook for R does the trick.
In [5]:
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
Test multiplot
to make sure it actually works!
In [6]:
df_subset_test <- df_subset %>% filter(subreddit==subreddit_test)
left_labels <- ifelse(df_subset_test$avg_score > max(df_subset_test$avg_score) * 0.50,
df_subset_test$avg_score, '')
right_labels <- ifelse(df_subset_test$avg_score < max(df_subset_test$avg_score) * 0.50,
df_subset_test$avg_score, '')
plot <- ggplot(df_subset_test, aes(x=word, y=avg_score)) +
geom_bar(stat="identity", fill="#3498db") +
geom_hline(yintercept = (df_subreddits %>% filter(subreddit==subreddit_test))$avg_score, linetype="dashed", alpha=0.8) +
geom_text(label=left_labels, size=2, hjust=1.25, color="white", family="Open Sans Condensed Bold") +
geom_text(label=right_labels, size=2, hjust=-0.25, color="#3498db", family="Open Sans Condensed Bold") +
fte_theme() +
coord_flip() +
scale_y_continuous(labels=comma, breaks=pretty_breaks(6)) +
theme(axis.title.y=element_blank())
#max_save(plot, "AskReddit-test", "Reddit")
png(paste("AskReddit-test","png",sep="."),res=300,units="in",width=8,height=3)
multiplot(plot, plot, cols=2)
dev.off()
Out[6]:
Store all 15 plots into a list, then pass the list to multiplot
to plot in a 3x5 matrix, then save.
In [7]:
plots <- list()
for (i in 1:length(subreddits)) {
subreddit_plot <- subreddits[i]
df_subset_test <- df_subset %>% filter(subreddit==subreddit_plot)
df_subset_test$word <- factor(df_subset_test$word, levels=rev(df_subset_test$word))
left_labels <- ifelse(df_subset_test$avg_score > max(df_subset_test$avg_score) * 0.50,
sprintf("%0.1f", df_subset_test$avg_score), '')
right_labels <- ifelse(df_subset_test$avg_score < max(df_subset_test$avg_score) * 0.50,
sprintf("%0.1f", df_subset_test$avg_score), '')
plots[[i]] <- ggplot(df_subset_test, aes(x=word, y=avg_score)) +
geom_bar(stat="identity", fill="#3498db", alpha=0.85) +
geom_hline(yintercept = (df_subreddits %>% filter(subreddit==subreddit_plot))$avg_score, size=0.25, alpha=0.8) +
geom_text(label=left_labels, size=1.5, hjust=1.25, color="white", family="Open Sans Condensed Bold") +
geom_text(label=right_labels, size=1.5, hjust=-0.25, color="#3498db", family="Open Sans Condensed Bold") +
fte_theme() +
coord_flip() +
scale_y_continuous(labels=comma, breaks=pretty_breaks(6)) +
theme(axis.title.y=element_blank(), axis.title.x=element_blank(), axis.text.y=element_text(size=5), axis.text.x=element_text(size=4), plot.margin = unit(c(0.3,0,0,0), "cm")) +
labs(title=subreddit_plot)
}
#max_save(plot, "AskReddit-test", "Reddit")
png(paste("subreddit-means","png",sep="."),res=300,units="in",width=8,height=6)
multiplot(plotlist=plots, cols=5)
dev.off()
Out[7]:
Same code, except for the medians.
In [8]:
df_subset <- df_medians %>% filter(subreddit %in% subreddits, score_rank <= 10)
df_subset$subreddit <- factor(df_subset$subreddit, levels=subreddits)
plots <- list()
for (i in 1:length(subreddits)) {
subreddit_plot <- subreddits[i]
df_subset_test <- df_subset %>% filter(subreddit==subreddit_plot)
df_subset_test$word <- factor(df_subset_test$word, levels=rev(df_subset_test$word))
left_labels <- ifelse(df_subset_test$median > max(df_subset_test$median) * 0,
sprintf("%0.0f", df_subset_test$median), '')
#right_labels <- ifelse(df_subset_test$median < max(df_subset_test$median) * 0,
# sprintf("%0.0f", df_subset_test$median), '')
plots[[i]] <- ggplot(df_subset_test, aes(x=word, y=median)) +
geom_bar(stat="identity", fill="#e74c3c", alpha=0.85) +
geom_hline(yintercept = (df_subreddits %>% filter(subreddit==subreddit_plot))$median, size=0.25, alpha=0.8) +
geom_text(label=left_labels, size=1.5, hjust=1.75, color="white", family="Open Sans Condensed Bold") +
#geom_text(label=right_labels, size=1.5, hjust=-0.25, color="#e74c3c", family="Open Sans Condensed Bold") +
fte_theme() +
coord_flip() +
scale_y_continuous(labels=comma, breaks=pretty_breaks(6)) +
theme(axis.title.y=element_blank(), axis.title.x=element_blank(), axis.text.y=element_text(size=5), axis.text.x=element_text(size=4), plot.margin = unit(c(0.3,0,0,0), "cm")) +
labs(title=subreddit_plot)
}
#max_save(plot, "AskReddit-test", "Reddit")
png(paste("subreddit-medians","png",sep="."),res=300,units="in",width=8,height=6)
multiplot(plotlist=plots, cols=5)
dev.off()
Out[8]:
Create function for generating charts for a given subreddit. Use different formatting for Means and Medians.
In [9]:
system("mkdir -p subreddit-mean")
system("mkdir -p subreddit-median")
df_subreddits$rank <- 1:500
subreddit_word_chart <- function(subreddit_filter, type) {
truth <- df_subreddits %>% filter(subreddit==subreddit_filter)
if (type=="mean") {
df <- df_means %>% filter(subreddit==subreddit_filter)
df$var <- df$avg_score
true_var <- truth$avg_score
fill_color <- "#3498db"
caps <- "Mean"
#print(df)
}
else {
df <- df_medians %>% filter(subreddit==subreddit_filter)
df$var <- df$median
true_var <- truth$median
fill_color <- "#e74c3c"
caps <- "Median"
}
df$word <- factor(df$word, levels=rev(df$word))
left_labels <- ifelse(df$var > max(df$var) * 0.30,
sprintf("%0.1f", df$var), '')
right_labels <- ifelse(df$var < max(df$var) * 0.30,
sprintf("%0.1f", df$var), '')
#print(left_labels)
plot <- ggplot(df, aes(x=word, y=var)) +
geom_bar(stat="identity", fill=fill_color, alpha=0.85) +
geom_hline(yintercept = true_var, size=0.25, alpha=0.8) +
geom_text(label=left_labels, size=1.5, hjust=1.25, color="white", family="Open Sans Condensed Bold") +
geom_text(label=right_labels, size=1.5, hjust=-0.25, color=fill_color, family="Open Sans Condensed Bold") +
fte_theme() +
coord_flip() +
scale_y_continuous(breaks=pretty_breaks(6)) +
theme(axis.title.y=element_blank(), plot.title=element_text(size=7, hjust=0.5)) +
labs(title=sprintf("%s Score for Reddit Submissions in /r/%s Containing Keyword", caps, subreddit_filter), y=sprintf("%s Score for Submissions Containing Word", caps))
max_save(plot, sprintf("subreddit-%s/%s-%03d-%s", type, type, truth$rank, subreddit_filter), "Reddit")
}
subreddit_word_chart("AskReddit", "mean")
subreddit_word_chart("AskReddit", "median")
In [10]:
system.time(lapply(df_subreddits$subreddit,subreddit_word_chart, "mean"))
Out[10]:
In [11]:
system.time(lapply(df_subreddits$subreddit,subreddit_word_chart, "median"))
Out[11]: