Why base R graphics? I certainly have nothing against ggplot2, lattice, or other graphics packages, and used them more in the past. I've found it easier, though (as in the demo code), to specify the values and location for each boxplot instead of generating complex conditioning statements. This may be a bit of the opposite of the logic of the Grammar of Graphics, but oh well.
# R code written by Joset A. Etzel (jetzel@wustl.edu, mvpa.blogspot.com) and released 9 February 2016
# This code may be adapted for personal use, provided the source is cited.
# The code demonstrates producing complex boxplots in base R graphics.
###################################################################################################################################################################
# side-by-side boxplots, at two levels of sorting: subject.type and task.type
rm(list=ls()); # clear R's workspace
sub.types <- c("controls", "patients"); # labels for the subject types; each subject in just one sub.types category
c.ids <- paste0("c", 1:30); # the subject ID codes for the control subjects
p.ids <- paste0("p", 1:30); # subject ID codes for the paitents
task.types <- paste0("task", 1:4); # each person has a measurement for each of the four task types
# make a data.frame of fake data to plot.
set.seed(4002); # to reproduce the random numbers
data.tbl <- data.frame(array(NA, c((length(c.ids)+length(p.ids))*length(task.types), 4))); # blank table we'll fill up with the fake dataset; long-format
colnames(data.tbl) <- c("sub.id", "sub.type", "task.type", "data.value");
ctr <- 1; # row-counter for filling up data.tbl
# a bit lazy to code up looping over each subject type separately, but hopefully clear
for (t.id in 1:length(task.types)) {
for (i in 1:length(c.ids)) { # t.id <- 1; i <- 1;
data.tbl$sub.id[ctr] <- c.ids[i];
data.tbl$sub.type[ctr] <- "controls"; # hard-coded, since hard-coded c.ids on previous line
data.tbl$task.type[ctr] <- task.types[t.id];
data.tbl$data.value[ctr] <- rnorm(n=1, mean=t.id, sd=2); # a number for this task.type and person
ctr <- ctr + 1;
}
for (i in 1:length(p.ids)) { # t.id <- 1; i <- 1;
data.tbl$sub.id[ctr] <- p.ids[i];
data.tbl$sub.type[ctr] <- "patients"; # hard-coded again
data.tbl$task.type[ctr] <- task.types[t.id];
data.tbl$data.value[ctr] <- rnorm(n=1, mean=t.id/2, sd=1); # a number for this task.type and person
ctr <- ctr + 1;
}
}
# put the string columns to factors, not necessary for boxplots, but needed for other functions
data.tbl$sub.id <- factor(data.tbl$sub.id);
data.tbl$sub.type <- factor(data.tbl$sub.type);
data.tbl$task.type <- factor(data.tbl$task.type);
# head(data.tbl)
# sub.id sub.type task.type data.value
#1 c1 control task1 -0.1329034
#2 c2 control task1 1.7035703
#3 c3 control task1 2.0525445
#4 c4 control task1 1.2178570
# now have a fake dataset, so can shift to plotting. The code first sets some parameters used for both plots, then generates each plot separately.
windows(width=7, height=3); # specify plot window size. quartz() is the mac equivalent. the code will run without this, but the plot size (and so needed boxplot
# widths, font sizes, etc will vary.
#layout(matrix(c(1,1,2), c(1,3))); # put two plots side-by-side, first twice as wide as the second
layout(matrix(c(1,2), c(1,2))); # put two plots side-by-side, equal widths
par(mar=c(2, 3, 2.5, 0.75), mgp=c(1.1, 0.2, 0), tcl=-0.2); # I don't like wide margins ... totally personal preference.
# mar: c(bottom, left, top, right), number of lines of margin. default is c(5, 4, 4, 2) + 0.1.
# mgp: margin line (in mex units) for the axis title, axis labels and axis line. mgp[1] is title, mgp[2:3] is axis. default is c(3, 1, 0).
# The length of tick marks as a fraction of the height of a line of text. The default value is -0.5; setting tcl = NA sets tck = -0.01 which is S' default.
# plotting-related variables used to simplify plotting code below. I generally fiddle with the cx, shifts, boxwex variables to make the plots readable.
cx <- 1; # cex for axis label text; default is 1. bigger numbers make the text bigger, smaller, smaller.
x.ttl <- ""; # blank x-axis label
y.ttl <- "y-axis label";
ttl <- "plot title";
y.lim <- c(min(data.tbl$data.value), max(data.tbl$data.value)+1.5); # y-axis limits. +2 at top to give room for legend
# 1st: plot four boxes together: one for each task.type, grouped by sub.type
x.lim <- c(0.5, (length(sub.types)+0.5)); # x-axis limits; one for each of the two sub.types
cols <- c('paleturquoise4', 'paleturquoise3', 'paleturquoise2', 'paleturquoise1'); # color to plot each of the task.types (in task.types order)
shifts <- c(-0.3, -0.1, 0.1, 0.3); # x offset for the bars (see boxplot() call below).
# I find it easier to hard-code where the boxplots will fall (via shifts) rather than calculating the plotting locations on the fly;
plot(x=0, y=0, xlim=x.lim, ylim=y.lim, xaxt='n', col='white', xlab=x.ttl, ylab=y.ttl, main=ttl, cex.main=cx, cex.axis=cx, cex.lab=cx);
axis(side=1, at=1:length(sub.types), labels=sub.types, cex.axis=cx, cex.lab=cx); # put on the x-axis labels
grid(nx=NA, ny=NULL, col='darkgrey');
lines(x=c(-1,100), y=c(0,0), col='darkgrey');
for (t.id in 1:length(task.types)) {
for (i in 1:length(sub.types)) { # t.id <- 1; i <- 1;
inds <- which(data.tbl$sub.type == sub.types[i] & data.tbl$task.type == task.types[t.id]); # rows for this sub.type and task.type; what we'll plot.
if (length(inds) != length(c.ids)) { stop("length(inds) != length(c.ids)"); } # error-checking code; works since length(c.ids) == length(p.ids)
boxplot(data.tbl$data.value[inds], at=i+shifts[t.id], col=cols[t.id], add=TRUE, xaxt='n', yaxt='n', bty='n', boxwex=0.4, cex=1); # draw the boxplot!
# setting xaxt, yaxt, and bty to 'n' is so that R doesn't redraw the axis labels each time a boxplot is added to the plot
# when making lots of boxplots, sometimes R doesn't guess the box width (boxwex) or outlier dot (cex) size properly.
# The default for these is 1; bigger numbers makes wider boxes and bigger dots; smaller, smaller.
}
}
legend(x='top', legend=task.types, fill=cols, horiz=TRUE, cex=0.8, bg='white', bty='n'); # set font size a bit smaller to fit
box(); # redraw the box around the outside to be nice and neat
# 2nd: plot two boxes together: one for each sub.type, sorted by task.type
x.lim <- c(0.5, (length(task.types)+0.5)); # x-axis limits; one for each of the two sub.types
cols <- c('lightgreen', 'cornsilk'); # color to plot each of the sub.types (in sub.types order)
shifts <- c(-0.15, 0.15); # shifts and cols are just two entries long now: just two boxplots (patients and controls) for each sub.type
# shifts[] gives the offset for the center of each boxplot, so will change if boxwex changes.
plot(x=0, y=0, xlim=x.lim, ylim=y.lim, xaxt='n', col='white', xlab=x.ttl, ylab=y.ttl, main=ttl, cex.main=cx, cex.axis=cx, cex.lab=cx);
axis(side=1, at=1:length(task.types), labels=task.types, cex.axis=cx, cex.lab=cx); # put on the x-axis labels
grid(nx=NA, ny=NULL, col='darkgrey');
lines(x=c(-1,100), y=c(0,0), col='darkgrey');
for (t.id in 1:length(task.types)) {
for (i in 1:length(sub.types)) { # t.id <- 1; i <- 1;
inds <- which(data.tbl$sub.type == sub.types[i] & data.tbl$task.type == task.types[t.id]); # rows for this sub.type and task.type; what we'll plot.
if (length(inds) != length(c.ids)) { stop("length(inds) != length(c.ids)"); } # error-checking code; works since length(c.ids) == length(p.ids)
boxplot(data.tbl$data.value[inds], at=t.id+shifts[i], col=cols[i], add=TRUE, xaxt='n', yaxt='n', bty='n', boxwex=0.5, cex=0.9);
}
}
legend(x='top', legend=sub.types, fill=cols, horiz=TRUE, cex=cx, bg='white', bty='n');
box();
No comments:
Post a Comment