In [1]:
#Importing the Gadfly package for use in this notebook
using Gadfly
In [3]:
# Create 100 x-coordinates and 100 y-coordinates. Note that rand(n) draws each value
# uniformly at random from the interval [0, 1), not from the standard normal
# distribution (that would be randn(n))
xvals = rand(100)
yvals = rand(100)
Out[3]:
In [4]:
# Invoking the plot() function from Gadfly
# When we only pass the x and y keyword arguments the result is a scatter plot
# (the geometry defaults to Geom.point)
# The x and y arguments are referred to as aesthetics
plot(x = xvals, y = yvals)
Out[4]:
In [5]:
# Explicitly specifying the geometry as points (which was done by default in the previous example)
# The point geometry takes the input x- and y-aesthetics
plot(x = xvals, y = yvals, Geom.point)
Out[5]:
In [6]:
# Sorting xvals and yvals independently pairs the i-th smallest x with the i-th smallest y,
# which changes the correlation completely: the points now rise monotonically
plot(x = sort(xvals), y = sort(yvals))
Out[6]:
In [7]:
# Adding another geometry, which in this case is a line connecting the points
# The point and line geometries act on the same input data
# A smaller sample of 15 points is created here
xvals2 = rand(15)
yvals2 = rand(15)
plot(x = xvals2, y = yvals2, Geom.point, Geom.line)
Out[7]:
In [8]:
# We can also replace the connecting line with a smoothed curve through the points
# NOTE(review): no method is given, so Gadfly's default smoothing method applies
# (loess according to the Gadfly docs -- confirm for the installed version)
plot(x = xvals2, y = yvals2, Geom.point, Geom.smooth)
Out[8]:
In [9]:
# A different model for the smoothed line: method = :lm requests a linear-model
# (straight-line) fit instead of the default smoother
plot(x = xvals2, y = yvals2, Geom.point, Geom.smooth(method = :lm))
Out[9]:
In [10]:
# Combining several layers in one plot with the layer() function
# Inside a layer the geometry has to be specified
plot(
    layer(x = xvals2, y = yvals2, Geom.point),
    layer(x = sort(xvals2), y = sort(yvals2), Geom.point)
)
Out[10]:
In [11]:
# The uncolored two-layer plot gives no visual hint that it contains two layers,
# so the sorted layer is drawn in orange through its own Theme
plot(
    layer(x = xvals2, y = yvals2, Geom.point),
    layer(
        x = sort(xvals2), y = sort(yvals2),
        Geom.point,
        Theme(default_color = colorant"orange")
    )
)
Out[11]:
In [12]:
# Another way of doing layers: build each layer first, then hand them to plot()
# A manual color key adds the legend (title, labels, and the matching colors)
points1 = layer(
    x = xvals2, y = yvals2,
    Geom.point,
    Theme(default_color = colorant"deepskyblue")
)
points2 = layer(
    x = sort(xvals2), y = sort(yvals2),
    Geom.point,
    Theme(default_color = colorant"orange")
)
plot(
    points1, points2,
    Guide.manual_color_key(
        "Legend for this plot",
        ["Set of points", "Sorted set of points"],
        ["deepskyblue", "orange"]
    )
)
Out[12]:
In [13]:
# Changing the grid color with a plot-level Theme (here: white grid lines)
plot(
    layer(x = xvals2, y = yvals2, Geom.point),
    layer(
        x = sort(xvals2), y = sort(yvals2),
        Geom.point,
        Theme(default_color = colorant"orange")
    ),
    Theme(grid_color = colorant"white")
)
Out[13]:
In [14]:
# Removing the pop-up (mouse-over) grid as well, by setting its color
# (grid_color_focused) to white too
plot(
    layer(x = xvals2, y = yvals2, Geom.point),
    layer(
        x = sort(xvals2), y = sort(yvals2),
        Geom.point,
        Theme(default_color = colorant"orange")
    ),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[14]:
In [15]:
# Changing the size (stroke thickness) of the line with Theme(line_width = ...)
# The line is the linear-model (:lm) fit
plot(x = xvals2, y = yvals2, Geom.point, Geom.smooth(method = :lm), Theme(line_width = 4px))
Out[15]:
In [16]:
# Points and fitted line live in separate layers so each can carry its own Theme:
# larger points in the first layer, a thicker orange line in the second
plot(
    layer(
        x = xvals2, y = yvals2,
        Geom.point,
        Theme(default_point_size = 10px)
    ),
    layer(
        x = xvals2, y = yvals2,
        Geom.smooth(method = :lm),
        Theme(line_width = 4px, default_color = colorant"orange")
    )
)
Out[16]:
In [17]:
# Adding a title and axis labels with Guide elements
plot(
    x = sort(xvals), y = sort(yvals),
    Geom.point,
    Guide.title("My scatter plot"),
    Guide.xlabel("x values"),
    Guide.ylabel("y values")
)
Out[17]:
In [17]:
# Use Cairo to save PNG files
using Cairo
In [18]:
# The Distribution package for random variables
using Distributions
In [18]:
# QQ plot of 100 random draws from Normal() (x) against the Normal() distribution (y),
# saved as an 8 inch by 6 inch PNG file named myqqplot.png
draw(
    PNG("myqqplot.png", 8inch, 6inch),
    plot(
        x = rand(Normal(), 100), y = Normal(),
        Stat.qq, Geom.point,
        Guide.title("QQ plot")
    )
)
In [19]:
using DataFrames
In [20]:
# Importing a .csv file into a data frame and using the trailing semicolon to suppress the output
df = readtable("GadflyTutorialData.csv");
In [21]:
# Looking at the last rows of the data frame
# We note 200 row entries (patients)
# Columns: gender, age, 4 variables and 3 categories
tail(df)
Out[21]:
In [22]:
# Listing the columns of the data frame with the data type of each
showcols(df)
In [23]:
# Describe the Gender column (the other columns are described one cell at a time)
describe(df[:Gender])
In [24]:
# From the Case-Control-Series Project lectures
# Number of rows in each gender group; these counts feed categorical data analysis
by(df, :Gender) do d
    DataFrame(N = size(d, 1))
end
Out[24]:
In [26]:
# Describe the Age column
describe(df[:Age])
In [27]:
# Describe the Variable1 column
describe(df[:Variable1])
In [28]:
# Describe the Variable2 column
describe(df[:Variable2])
In [29]:
# Describe the Variable3 column
describe(df[:Variable3])
In [30]:
# Describe the Variable4 column
describe(df[:Variable4])
In [31]:
# Describe the Category1 column
describe(df[:Category1])
In [32]:
# Number of rows in each category 1 group
# These counts feed categorical data analysis
by(df, :Category1) do d
    DataFrame(N = size(d, 1))
end
Out[32]:
In [33]:
# Describe the Category2 column
describe(df[:Category2])
In [34]:
# Number of rows in each category 2 group
# These counts feed categorical data analysis
by(df, :Category2) do d
    DataFrame(N = size(d, 1))
end
Out[34]:
In [35]:
# Describe the Category3 column
describe(df[:Category3])
In [36]:
# Number of rows in each category 3 group
# These counts feed categorical data analysis
by(df, :Category3) do d
    DataFrame(N = size(d, 1))
end
Out[36]:
In [37]:
# Boxplot of age for each gender, with a title and an x-axis label
plot(
    df, x = "Gender", y = "Age",
    Geom.boxplot,
    Guide.title("Boxplot of ages of male and female patients"),
    Guide.xlabel("Gender")
)
Out[37]:
In [38]:
# Restyling the boxplot through Theme:
#  - changing the size (stroke thickness) of the median line
#  - changing the color of the boxes and whiskers
#  - increasing the spacing between the boxes (making them thinner)
plot(
    df, x = "Gender", y = "Age",
    Geom.boxplot,
    Guide.title("Boxplot of ages of male and female patients"),
    Guide.xlabel("Gender"),
    Theme(
        boxplot_spacing = 100px,
        grid_color = colorant"white",
        default_color = colorant"#AAAAAA",
        middle_width = 5px
    )
)
Out[38]:
In [39]:
# Some more categories (by using Category 3)
# Bigger outlier points (default_point_size)
# The x-axis label previously read "Catgeory 3 groups"; the spelling is corrected here
plot(
    df, x = "Category3", y = "Variable2",
    Geom.boxplot,
    Guide.title("Boxplot of variable 2 values for category 3 groups"),
    Guide.xlabel("Category 3 groups"),
    Theme(
        default_color = colorant"orange",
        grid_color = colorant"white",
        grid_color_focused = colorant"white",
        boxplot_spacing = 80px,
        default_point_size = 7px
    )
)
Out[39]:
In [40]:
# Subplots add another dimension to the plot: the boxplots of Variable2 per
# Category1 group are split into panels by Gender (ygroup)
plot(
    df, ygroup = "Gender", x = "Category1", y = "Variable2",
    Geom.subplot_grid(Geom.boxplot),
    Theme(
        default_color = colorant"orange",
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[40]:
In [41]:
# Density estimate of the ages, drawn with a thicker line on a white grid
plot(
    df, x = "Age",
    Geom.density,
    Guide.title("Age distribution"),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white",
        line_width = 4px
    )
)
Out[41]:
In [42]:
# Age distribution per group: the color argument is mapped to the Gender column
plot(
    df, x = "Age", color = "Gender",
    Geom.density,
    Guide.title("Age distribution by gender"),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[42]:
In [43]:
# Age distribution with the color argument mapped to the Category3 column
plot(
    df, x = "Age", color = "Category3",
    Geom.density,
    Guide.title("Age distribution by category 3"),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[43]:
In [44]:
# Variable 2 distribution with the color argument mapped to the Category3 column
plot(
    df, x = "Variable2", color = "Category3",
    Geom.density,
    Guide.title("Variable 2 distribution by category 3"),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[44]:
In [45]:
# Histogram of variable 1 (no bin count is specified)
plot(
    df, x = "Variable1",
    Geom.histogram,
    Guide.title("Variable 1 histogram"),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[45]:
In [46]:
# Controlling the number of bins with the bincount argument (10 bins here)
plot(
    df, x = "Variable1",
    Geom.histogram(bincount = 10),
    Guide.title("Variable 1 histogram"),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[46]:
In [47]:
# Histogram (11 bins) of variable 2, a variable with a more normal distribution
plot(
    df, x = "Variable2",
    Geom.histogram(bincount = 11),
    Guide.title("Variable 2 histogram"),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[47]:
In [48]:
# One x-axis variable shown for more than one categorical group:
# the color argument is mapped to the Gender column
plot(
    df, x = "Variable2", color = "Gender",
    Geom.histogram(bincount = 21),
    Guide.title("Variable 2 histogram by gender"),
    Theme(
        grid_color = colorant"white",
        grid_color_focused = colorant"white"
    )
)
Out[48]:
In [49]:
# Violin plot of Variable1 for each Category3 group
plot(df, x = :Category3, y = :Variable1, Geom.violin)
Out[49]:
In [50]:
# Density plot of Variable1 colored by Category3
# Note how the shape of each curve is the same as the corresponding violin plot
plot(df, x = :Variable1, color = :Category3, Geom.density)
Out[50]:
In [52]:
using Distributions
In [53]:
# These sample points are drawn from a chi-squared distribution (Chisq(1)), i.e. not from a
# population in which the underlying variable is normally distributed
# (so parametric tests cannot be used; non-parametric tests are needed instead)
plot(x = rand(Chisq(1), 100), y = Normal(), Stat.qq, Geom.point)
Out[53]:
In [54]:
# QQ plot of variable 1 against a normal distribution that has the sample's own mean
# and standard deviation, to check whether the underlying population has that
# variable normally distributed
plot(
    x = sort(df[:Variable1]),
    y = Normal(mean(df[:Variable1]), std(df[:Variable1])),
    Stat.qq, Geom.point
)
Out[54]:
In [55]:
# The same normality check (QQ plot against a matching normal distribution) for variable 2
plot(
    x = sort(df[:Variable2]),
    y = Normal(mean(df[:Variable2]), std(df[:Variable2])),
    Stat.qq, Geom.point
)
Out[55]:
In [56]:
# The same normality check (QQ plot against a matching normal distribution) for variable 3
plot(
    x = sort(df[:Variable3]),
    y = Normal(mean(df[:Variable3]), std(df[:Variable3])),
    Stat.qq, Geom.point
)
Out[56]:
In [57]:
# Scatter plot with linear-model fits, colored by Category3
# Guide.colorkey sets the title of the graph key (legend)
plot(
    df, x = "Variable1", y ="Variable2", color = "Category3",
    Geom.point,
    Geom.smooth(method = :lm),
    Guide.colorkey("Category 3"),
    Theme(line_width = 3px)
)
Out[57]:
In [58]:
# Subplots add another dimension to the plot: the scatter plot with its
# linear-model fit is split into panels by Category3 (ygroup)
plot(
    df, ygroup = "Category3", x = "Variable1", y = "Variable2",
    Geom.subplot_grid(Geom.point, Geom.smooth(method = :lm)),
    Theme(grid_color = colorant"white", line_width = 3px)
)
Out[58]:
In [59]:
# Drawing two horizontal lines (yintercept, Geom.hline) and two vertical lines
# (xintercept, Geom.vline) on top of the scatter plot
# The horizontal lines get a color, the vertical lines a size
plot(
    df, x = "Variable1", y = "Variable2",
    yintercept = [90, 110],
    Geom.point,
    Geom.hline(color = colorant"Red"),
    xintercept = [10, 22],
    Geom.vline(size = 3px)
)
Out[59]:
In [60]:
# Step plot through 30 random points
# direction = :hv -- each step goes horizontal then vertical
# x and y are also used by the following :vh step plot
x = rand(30)
y = rand(30)
plot(x = x, y = y, Geom.point, Geom.step(direction = :hv))
Out[60]:
In [61]:
# Step plot of the same x and y points
# direction = :vh -- each step goes vertical then horizontal
plot(x = x, y = y, Geom.point, Geom.step(direction = :vh))
Out[61]:
In [62]:
# Beeswarm plot of Variable1 for each Category3 group, colored by Gender
plot(df, x = :Category3, y = :Variable1, color = :Gender, Geom.beeswarm)
Out[62]:
In [63]:
# Coloring the points by a third, continuous variable (Variable3)
# The continuous color scale runs from the observed minimum to the observed maximum
# No geometry is passed, so the plot falls back to its default geometry
plot(
    df, x = :Variable1, y = :Variable2, color = :Variable3,
    Scale.color_continuous(
        minvalue = minimum(df[:Variable3]),
        maxvalue = maximum(df[:Variable3])
    ),
    Theme(default_point_size = 7px)
)
Out[63]:
In [62]:
# Choosing the discrete colors for the Category3 groups by hand
# (three colors are supplied to Scale.color_discrete_manual)
plot(
    df, x = :Variable1, y = :Variable2, color = :Category3,
    Geom.point,
    Scale.color_discrete_manual(
        colorant"#FFDC57",
        colorant"#6EFFED",
        colorant"#B2B2B2"
    ),
    Theme(default_point_size = 7px)
)
Out[62]:
In [63]:
# Manually colored scatter plot with the grid line width set to 0px
# NOTE(review): presumably this hides the grid lines altogether -- confirm in the rendered output
plot(
    df, x = :Variable1, y = :Variable2, color = :Category3,
    Geom.point,
    Scale.color_discrete_manual(
        colorant"#FFDC57",
        colorant"#6EFFED",
        colorant"#B2B2B2"
    ),
    Theme(default_point_size = 7px, grid_line_width = 0px)
)
Out[63]:
In [ ]: