Tuesday, February 9, 2016

R demo: specifying side-by-side boxplots in base R

This post has the base R code (pasted after the jump below, and available here) to produce the boxplot graphs shown at right.

Why base R graphics? I certainly have nothing against ggplot2, lattice, or other graphics packages, and  used them more in the past. I've found it easier, though (as in the demo code), to specify the values and location for each boxplot instead of generating complex conditioning statements. This may be a bit of the opposite of the logic of the Grammar of Graphics, but oh well.



 # R code written by Joset A. Etzel (jetzel@wustl.edu, mvpa.blogspot.com) and released 9 February 2016  
 # This code may be adapted for personal use, provided the source is cited.  
 # The code demonstrates producing complex boxplots in base R graphics.  
 ###################################################################################################################################################################  
 # side-by-side boxplots, at two levels of sorting: subject.type and task.type  
   
 rm(list=ls()); # clear R's workspace  
   
 sub.types <- c("controls", "patients"); # labels for the subject types; each subject in just one sub.types category  
 c.ids <- paste0("c", 1:30);  # the subject ID codes for the control subjects  
 p.ids <- paste0("p", 1:30);  # subject ID codes for the paitents  
 task.types <- paste0("task", 1:4);  # each person has a measurement for each of the four task types  
   
   
 # make a data.frame of fake data to plot.  
 set.seed(4002); # to reproduce the random numbers  
 data.tbl <- data.frame(array(NA, c((length(c.ids)+length(p.ids))*length(task.types), 4))); # blank table we'll fill up with the fake dataset; long-format  
 colnames(data.tbl) <- c("sub.id", "sub.type", "task.type", "data.value");  
 ctr <- 1;  # row-counter for filling up data.tbl  
 # a bit lazy to code up looping over each subject type separately, but hopefully clear  
 for (t.id in 1:length(task.types)) {   
  for (i in 1:length(c.ids)) {  # t.id <- 1; i <- 1;   
   data.tbl$sub.id[ctr] <- c.ids[i];  
   data.tbl$sub.type[ctr] <- "controls";  # hard-coded, since hard-coded c.ids on previous line  
   data.tbl$task.type[ctr] <- task.types[t.id];  
   data.tbl$data.value[ctr] <- rnorm(n=1, mean=t.id, sd=2);  # a number for this task.type and person  
   ctr <- ctr + 1;  
  }  
  for (i in 1:length(p.ids)) {  # t.id <- 1; i <- 1;   
   data.tbl$sub.id[ctr] <- p.ids[i];  
   data.tbl$sub.type[ctr] <- "patients";  # hard-coded again  
   data.tbl$task.type[ctr] <- task.types[t.id];  
   data.tbl$data.value[ctr] <- rnorm(n=1, mean=t.id/2, sd=1);  # a number for this task.type and person  
   ctr <- ctr + 1;  
  }  
 }  
 # put the string columns to factors, not necessary for boxplots, but needed for other functions  
 data.tbl$sub.id <- factor(data.tbl$sub.id);  
 data.tbl$sub.type <- factor(data.tbl$sub.type);  
 data.tbl$task.type <- factor(data.tbl$task.type);  
   
 # head(data.tbl)  
 # sub.id sub.type task.type data.value  
 #1   c1 control   task1 -0.1329034  
 #2   c2 control   task1 1.7035703  
 #3   c3 control   task1 2.0525445  
 #4   c4 control   task1 1.2178570  
   
   
   
   
 # now have a fake dataset, so can shift to plotting. The code first sets some parameters used for both plots, then generates each plot separately.  
 windows(width=7, height=3); # specify plot window size. quartz() is the mac equivalent. the code will run without this, but the plot size (and so needed boxplot  
  # widths, font sizes, etc will vary.  
 #layout(matrix(c(1,1,2), c(1,3)));  # put two plots side-by-side, first twice as wide as the second  
 layout(matrix(c(1,2), c(1,2)));  # put two plots side-by-side, equal widths  
 par(mar=c(2, 3, 2.5, 0.75), mgp=c(1.1, 0.2, 0), tcl=-0.2); # I don't like wide margins ... totally personal preference.  
 # mar: c(bottom, left, top, right), number of lines of margin. default is c(5, 4, 4, 2) + 0.1.  
 # mgp: margin line (in mex units) for the axis title, axis labels and axis line. mgp[1] is title, mgp[2:3] is axis. default is c(3, 1, 0).  
 # The length of tick marks as a fraction of the height of a line of text. The default value is -0.5; setting tcl = NA sets tck = -0.01 which is S' default.  
   
   
 # plotting-related variables used to simplify plotting code below. I generally fiddle with the cx, shifts, boxwex variables to make the plots readable.  
 cx <- 1;  # cex for axis label text; default is 1. bigger numbers make the text bigger, smaller, smaller.  
 x.ttl <- ""; # blank x-axis label  
 y.ttl <- "y-axis label";  
 ttl <- "plot title";  
 y.lim <- c(min(data.tbl$data.value), max(data.tbl$data.value)+1.5); # y-axis limits. +2 at top to give room for legend  
   
   
 # 1st: plot four boxes together: one for each task.type, grouped by sub.type  
 x.lim <- c(0.5, (length(sub.types)+0.5));  # x-axis limits; one for each of the two sub.types  
 cols <- c('paleturquoise4', 'paleturquoise3', 'paleturquoise2', 'paleturquoise1');  # color to plot each of the task.types (in task.types order)  
 shifts <- c(-0.3, -0.1, 0.1, 0.3); # x offset for the bars (see boxplot() call below).  
  # I find it easier to hard-code where the boxplots will fall (via shifts) rather than calculating the plotting locations on the fly;   
   
 plot(x=0, y=0, xlim=x.lim, ylim=y.lim, xaxt='n', col='white', xlab=x.ttl, ylab=y.ttl, main=ttl, cex.main=cx, cex.axis=cx, cex.lab=cx);  
 axis(side=1, at=1:length(sub.types), labels=sub.types, cex.axis=cx, cex.lab=cx);  # put on the x-axis labels  
 grid(nx=NA, ny=NULL, col='darkgrey');  
 lines(x=c(-1,100), y=c(0,0), col='darkgrey');  
 for (t.id in 1:length(task.types)) {  
  for (i in 1:length(sub.types)) {  # t.id <- 1; i <- 1;   
   inds <- which(data.tbl$sub.type == sub.types[i] & data.tbl$task.type == task.types[t.id]); # rows for this sub.type and task.type; what we'll plot.  
   if (length(inds) != length(c.ids)) { stop("length(inds) != length(c.ids)"); } # error-checking code; works since length(c.ids) == length(p.ids)  
   boxplot(data.tbl$data.value[inds], at=i+shifts[t.id], col=cols[t.id], add=TRUE, xaxt='n', yaxt='n', bty='n', boxwex=0.4, cex=1); # draw the boxplot!  
    # setting xaxt, yaxt, and bty to 'n' is so that R doesn't redraw the axis labels each time a boxplot is added to the plot  
    # when making lots of boxplots, sometimes R doesn't guess the box width (boxwex) or outlier dot (cex) size properly.  
    # The default for these is 1; bigger numbers makes wider boxes and bigger dots; smaller, smaller.  
  }  
 }  
 legend(x='top', legend=task.types, fill=cols, horiz=TRUE, cex=0.8, bg='white', bty='n'); # set font size a bit smaller to fit  
 box(); # redraw the box around the outside to be nice and neat  
   
    
 # 2nd: plot two boxes together: one for each sub.type, sorted by task.type  
 x.lim <- c(0.5, (length(task.types)+0.5));  # x-axis limits; one for each of the two sub.types  
 cols <- c('lightgreen', 'cornsilk');  # color to plot each of the sub.types (in sub.types order)  
 shifts <- c(-0.15, 0.15);  # shifts and cols are just two entries long now: just two boxplots (patients and controls) for each sub.type  
  # shifts[] gives the offset for the center of each boxplot, so will change if boxwex changes.  
   
 plot(x=0, y=0, xlim=x.lim, ylim=y.lim, xaxt='n', col='white', xlab=x.ttl, ylab=y.ttl, main=ttl, cex.main=cx, cex.axis=cx, cex.lab=cx);  
 axis(side=1, at=1:length(task.types), labels=task.types, cex.axis=cx, cex.lab=cx);  # put on the x-axis labels  
 grid(nx=NA, ny=NULL, col='darkgrey');  
 lines(x=c(-1,100), y=c(0,0), col='darkgrey');  
 for (t.id in 1:length(task.types)) {  
  for (i in 1:length(sub.types)) {  # t.id <- 1; i <- 1;   
   inds <- which(data.tbl$sub.type == sub.types[i] & data.tbl$task.type == task.types[t.id]); # rows for this sub.type and task.type; what we'll plot.  
   if (length(inds) != length(c.ids)) { stop("length(inds) != length(c.ids)"); } # error-checking code; works since length(c.ids) == length(p.ids)  
   boxplot(data.tbl$data.value[inds], at=t.id+shifts[i], col=cols[i], add=TRUE, xaxt='n', yaxt='n', bty='n', boxwex=0.5, cex=0.9);   
  }  
 }  
 legend(x='top', legend=sub.types, fill=cols, horiz=TRUE, cex=cx, bg='white', bty='n');  
 box();  

No comments:

Post a Comment