In [1]:
library('tidyverse')
library('grid')
library('gridExtra')
In [2]:
# Load the five NYT data sets (nyt2.csv ... nyt6.csv) into data frames.
# The directory is named once so the notebook can be pointed at another copy
# of the data by editing a single line; file.path() joins directory and file
# name with "/" and so yields the same paths as before.
nyt_data_dir <- "C:/Users/kausha2/Documents/Data Analytics/Assignments/dds_ch2_nyt"
nyt2 <- read.csv(file.path(nyt_data_dir, "nyt2.csv"))
nyt3 <- read.csv(file.path(nyt_data_dir, "nyt3.csv"))
nyt4 <- read.csv(file.path(nyt_data_dir, "nyt4.csv"))
nyt5 <- read.csv(file.path(nyt_data_dir, "nyt5.csv"))
nyt6 <- read.csv(file.path(nyt_data_dir, "nyt6.csv"))
In [3]:
# Preview the first six rows of nyt2 to check the columns that were read in.
head(nyt2, n = 6L)
In [4]:
# Box plots of Age by Gender, one panel per data set (nyt2 ... nyt6).
# Gender is wrapped in factor() so it is used as a discrete x axis.
p1 <- ggplot(nyt2, aes(factor(Gender), Age)) + geom_boxplot(fill = "blue")
p2 <- ggplot(nyt3, aes(factor(Gender), Age)) + geom_boxplot(fill = "red")
p3 <- ggplot(nyt4, aes(factor(Gender), Age)) + geom_boxplot(fill = "yellow")
p4 <- ggplot(nyt5, aes(factor(Gender), Age)) + geom_boxplot(fill = "green")
p5 <- ggplot(nyt6, aes(factor(Gender), Age)) + geom_boxplot(fill = "orange")
# Lay out all five panels in a 3 x 2 grid. Each of p1..p5 appears exactly
# once, so the nyt6 panel (p5) is drawn rather than a second copy of p4.
grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
In [5]:
# Box plots of Impressions by Gender, one panel per data set (nyt2 ... nyt6).
# Each panel reads its own data frame; p3, p4 and p5 use nyt4, nyt5 and nyt6
# so that the grid compares five different data sets instead of repeating
# nyt2.
p1 <- ggplot(nyt2, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "blue")
p2 <- ggplot(nyt3, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "red")
p3 <- ggplot(nyt4, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "green")
p4 <- ggplot(nyt5, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "yellow")
p5 <- ggplot(nyt6, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "orange")
# Show the five panels in a 3 x 2 grid.
grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
In [6]:
# Age histograms with 5-unit bins, one panel per data set, in a 3 x 2 grid.

# Build one Age histogram for `df`, with bars filled in `bar_fill`.
age_histogram <- function(df, bar_fill) {
  ggplot(df, aes(Age)) +
    geom_histogram(fill = bar_fill, binwidth = 5)
}

p1 <- age_histogram(nyt2, "blue")
p2 <- age_histogram(nyt3, "red")
p3 <- age_histogram(nyt4, "yellow")
p4 <- age_histogram(nyt5, "green")
p5 <- age_histogram(nyt6, "orange")

grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
In [7]:
# Age histograms with 5-unit bins, with the x axis restricted to 5..100,
# one panel per data set, in a 3 x 2 grid.

# Build one x-limited Age histogram for `df`, with bars filled in `bar_fill`.
age_histogram_limited <- function(df, bar_fill) {
  ggplot(df, aes(Age)) +
    geom_histogram(fill = bar_fill, binwidth = 5) +
    xlim(5, 100)
}

p1 <- age_histogram_limited(nyt2, "blue")
p2 <- age_histogram_limited(nyt3, "red")
p3 <- age_histogram_limited(nyt4, "yellow")
p4 <- age_histogram_limited(nyt5, "green")
p5 <- age_histogram_limited(nyt6, "orange")

grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
In [8]:
# Impressions histograms with unit-wide bins and the x axis restricted to
# -1..16, one panel per data set, in a 3 x 2 grid.

# Build one Impressions histogram for `df`; `bar_fill` colours the bars and
# `bar_outline` colours their borders.
impressions_histogram <- function(df, bar_fill, bar_outline) {
  ggplot(df, aes(Impressions)) +
    geom_histogram(fill = bar_fill, color = bar_outline, binwidth = 1) +
    xlim(-1, 16)
}

p1 <- impressions_histogram(nyt2, "blue", "red")
p2 <- impressions_histogram(nyt3, "red", "blue")
p3 <- impressions_histogram(nyt4, "yellow", "green")
p4 <- impressions_histogram(nyt5, "orange", "black")
p5 <- impressions_histogram(nyt6, "green", "yellow")

grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
In [49]:
# Empirical cumulative distribution of Age, drawn as a step curve, one panel
# per data set, in a 3 x 2 grid.

# Build one Age ECDF plot for `df`, with the curve drawn in `line_colour`.
age_ecdf <- function(df, line_colour) {
  ggplot(df, aes(Age)) +
    stat_ecdf(geom = "step", color = line_colour)
}

p1 <- age_ecdf(nyt2, "blue")
p2 <- age_ecdf(nyt3, "red")
p3 <- age_ecdf(nyt4, "darkgreen")
p4 <- age_ecdf(nyt5, "darkblue")
p5 <- age_ecdf(nyt6, "orange")

grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
In [41]:
# Empirical cumulative distribution of Impressions, drawn as a step curve,
# one panel per data set, in a 3 x 2 grid.

# Build one Impressions ECDF plot for `df`, with the curve in `line_colour`.
impressions_ecdf <- function(df, line_colour) {
  ggplot(df, aes(Impressions)) +
    stat_ecdf(geom = "step", color = line_colour)
}

p1 <- impressions_ecdf(nyt2, "blue")
p2 <- impressions_ecdf(nyt3, "red")
p3 <- impressions_ecdf(nyt4, "darkgreen")
p4 <- impressions_ecdf(nyt5, "darkblue")
p5 <- impressions_ecdf(nyt6, "orange")

grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
In [11]:
# Q-Q plots of Age (stat_qq with its default settings), one panel per data
# set, in a 3 x 2 grid.

# Build one Age Q-Q plot for `df`, with points drawn in `point_colour`.
age_qq <- function(df, point_colour) {
  ggplot(df, aes(sample = Age)) +
    stat_qq(colour = point_colour)
}

p1 <- age_qq(nyt2, "blue")
p2 <- age_qq(nyt3, "red")
p3 <- age_qq(nyt4, "yellow")
p4 <- age_qq(nyt5, "orange")
p5 <- age_qq(nyt6, "orange")

grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
In [12]:
# Q-Q plots of Impressions (stat_qq with its default settings), one panel per
# data set, in a 3 x 2 grid.

# Build one Impressions Q-Q plot for `df`, with points in `point_colour`.
impressions_qq <- function(df, point_colour) {
  ggplot(df, aes(sample = Impressions)) +
    stat_qq(colour = point_colour)
}

p1 <- impressions_qq(nyt2, "blue")
p2 <- impressions_qq(nyt3, "red")
p3 <- impressions_qq(nyt4, "yellow")
p4 <- impressions_qq(nyt5, "orange")
p5 <- impressions_qq(nyt6, "orange")

grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)
Null hypothesis (H0): the variable being tested (Age or Impressions) follows a normal distribution.
Alternative hypothesis (H1): the variable being tested does not follow a normal distribution.
In [30]:
# We first check for normality.
# nortest supplies ad.test(), the Anderson-Darling test for normality; a small
# p-value is evidence against the null hypothesis that the data are normal.
library(nortest)
# Test the Age and Impressions columns of nyt2 separately.
ad.test(nyt2$Age)
ad.test(nyt2$Impressions)
In [33]:
# Anderson-Darling normality tests for the Age and Impressions columns of nyt3.
ad.test(nyt3$Age)
ad.test(nyt3$Impressions)
In [34]:
# Anderson-Darling normality tests for the Age and Impressions columns of nyt4.
ad.test(nyt4$Age)
ad.test(nyt4$Impressions)
In [35]:
# Anderson-Darling normality tests for the Age and Impressions columns of nyt5.
ad.test(nyt5$Age)
ad.test(nyt5$Impressions)
In [36]:
# Anderson-Darling normality tests for the Age and Impressions columns of nyt6.
ad.test(nyt6$Age)
ad.test(nyt6$Impressions)
In [15]:
# Scatter plot of Impressions against Age for nyt2, coloured by Gender.
# Gender is mapped to `colour` rather than `fill`: geom_point()'s default
# point shape is drawn with `colour` only, so a `fill` mapping would leave
# every point the same colour.
ggplot(nyt2, aes(x = Age, y = Impressions, colour = factor(Gender))) +
  geom_point()
In [16]:
# Restrict nyt2 and nyt3 to rows where both Impressions and Age are strictly
# positive. which() turns the logical mask into row positions, so rows where
# the comparison is NA are dropped as well.
keep_positive <- function(df) {
  has_impressions <- df$Impressions > 0
  has_age <- df$Age > 0
  df[which(has_impressions & has_age), ]
}

nyt2 <- keep_positive(nyt2)
nyt3 <- keep_positive(nyt3)
In [17]:
# Box plots of Age by Gender, split by Clicks (fill colour), for nyt2 (top)
# and nyt3 (bottom).
p1 <- ggplot(data = nyt2, mapping = aes(x = factor(Gender), y = Age)) +
  geom_boxplot(mapping = aes(fill = factor(Clicks)))
p2 <- ggplot(data = nyt3, mapping = aes(x = factor(Gender), y = Age)) +
  geom_boxplot(mapping = aes(fill = factor(Clicks)))

grid.arrange(p1, p2, nrow = 2)
In [18]:
# Box plots of Impressions by Gender, split by Clicks (fill colour), for nyt2
# (top) and nyt3 (bottom).
p1 <- ggplot(data = nyt2, mapping = aes(x = factor(Gender), y = Impressions)) +
  geom_boxplot(mapping = aes(fill = factor(Clicks)))
p2 <- ggplot(data = nyt3, mapping = aes(x = factor(Gender), y = Impressions)) +
  geom_boxplot(mapping = aes(fill = factor(Clicks)))

grid.arrange(p1, p2, nrow = 2)
In [19]:
# Age histograms with 5-unit bins, bars filled by Gender, for nyt2 (top) and
# nyt3 (bottom).
p1 <- ggplot(data = nyt2, mapping = aes(x = Age)) +
  geom_histogram(mapping = aes(fill = factor(Gender)), binwidth = 5)
p2 <- ggplot(data = nyt3, mapping = aes(x = Age)) +
  geom_histogram(mapping = aes(fill = factor(Gender)), binwidth = 5)

grid.arrange(p1, p2, nrow = 2)
In [20]:
# Impressions histograms with unit-wide bins, bars filled by Gender, for nyt2
# (top) and nyt3 (bottom).
p1 <- ggplot(data = nyt2, mapping = aes(x = Impressions)) +
  geom_histogram(mapping = aes(fill = factor(Gender)), binwidth = 1)
p2 <- ggplot(data = nyt3, mapping = aes(x = Impressions)) +
  geom_histogram(mapping = aes(fill = factor(Gender)), binwidth = 1)

grid.arrange(p1, p2, nrow = 2)
In [59]:
# ECDF step curves split by Gender: Age (top row) and Impressions (bottom
# row) for nyt2 (left) and nyt3 (right).
# Columns are referenced by bare name inside aes() so they are looked up in
# the `data` argument (ggplot2 discourages `df$col` there), and Gender is
# wrapped in factor() so it gets a discrete colour scale and one curve per
# level, matching how Gender is handled in the other plots.
p1 <- ggplot(nyt2, aes(Age, colour = factor(Gender))) +
  stat_ecdf(geom = "step")
p2 <- ggplot(nyt3, aes(Age, colour = factor(Gender))) +
  stat_ecdf(geom = "step")
p3 <- ggplot(nyt2, aes(Impressions, colour = factor(Gender))) +
  stat_ecdf(geom = "step")
p4 <- ggplot(nyt3, aes(Impressions, colour = factor(Gender))) +
  stat_ecdf(geom = "step")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
In [56]:
# Q-Q plots split by Gender: Age (top row) and Impressions (bottom row) for
# nyt2 (left) and nyt3 (right).
# Columns are referenced by bare name inside aes() so they are looked up in
# the `data` argument (ggplot2 discourages `df$col` there), and Gender is
# wrapped in factor() so it gets a discrete colour scale and one point series
# per level, matching how Gender is handled in the other plots.
p1 <- ggplot(nyt2, aes(sample = Age, colour = factor(Gender))) +
  stat_qq()
p2 <- ggplot(nyt3, aes(sample = Age, colour = factor(Gender))) +
  stat_qq()
p3 <- ggplot(nyt2, aes(sample = Impressions, colour = factor(Gender))) +
  stat_qq()
p4 <- ggplot(nyt3, aes(sample = Impressions, colour = factor(Gender))) +
  stat_qq()
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
Null hypothesis (H0): the variable being tested (Age or Impressions) follows a normal distribution.
Alternative hypothesis (H1): the variable being tested does not follow a normal distribution.
In [39]:
# Repeat the Anderson-Darling normality tests on nyt2, which by this point
# has been restricted to rows with Impressions > 0 and Age > 0.
ad.test(nyt2$Age)
ad.test(nyt2$Impressions)
In [40]:
# Repeat the Anderson-Darling normality tests on nyt3, which by this point
# has been restricted to rows with Impressions > 0 and Age > 0.
ad.test(nyt3$Age)
ad.test(nyt3$Impressions)
End of Project