Appendix C. Boosted tree examples in R.
This appendix shows examples of analyzing data using boosted trees. To run the examples you need to download and install R and the packages gbm and/or gbmplus. gbmplus is an adapted version of gbm, and is available from the Supplement or the author. R is a free software environment for statistical computing and graphics. It compiles and runs on a wide variety of UNIX platforms, Windows, and MacOS. R is `GNU S', a freely available language and environment for statistical computing and graphics which provides a wide variety of statistical and graphical techniques: linear and nonlinear modeling, statistical tests, time series analysis, classification, clustering, etc. CRAN is a network of ftp and web servers around the world that store identical, up-to-date, versions of code and documentation for R.
# Example of environmental data:
# Run R
# Load the package gbmplus
library(gbmplus)
# Load the data set ozodat and examine the data
# The objective is to predict ozone levels from radiation, temperature and wind
# There are just 110 cases
data(ozodat)
summary(ozodat)
nrow(ozodata)
# Set the seed to get repeatable results
set.seed(1)
# Fit a gbm to the data and plot the prediction error with tree size
fit <- gbm(ozone ~ radiation + temperature + wind, data = ozodat)
plot.err(fit)
# Decrease the shrinkage and increase the number of trees
fit1 <- gbm(ozone ~ radiation + temperature + wind, data = ozodat, shrinkage = 0.02, n.tree = 500, int = 3)
plot.err(fit1)
# Plot the influences and the partial plots for each predictor
plot.all(fit1)
# calculate the summaries
all.summary(fit1)
# for 2-way effects plots we can use contour, perspective trellis or line plots
par(mfrow=c(2, 2))
plot(fit1, c(2, 3), ptype="c")
plot(fit1, c(2, 3), ptype="p", theta=210)
plot(fit1, c(2, 3), ptype="t")
plot(fit1, c(2, 3), ptype="l")
# generate a set of presepctive plots from different angles
old.par<-par(mfrow=c(2, 2), mar=rep(0, 4))
for (i in 1:4) plot(fit1, c(2, 3), ptype="p", theta=30+i*90, axes=F)
for (i in 1:4) plot(fit1, c(1, 3), ptype="p", theta=30+i*90, axes=F)
for (i in 1:4) plot(fit1, c(1, 2), ptype="p", theta=30+i*90, axes=F)
par(old.par)
# Will a main effects model predict as well?
# Fit it by setting interaction to 1 and increase the number of trees
# Note : we can abbreviate arguments to functions
fit2 <- gbm(ozone~radiation + temperature + wind, data = ozodat, shr = 0.02, n.tree = 1000, int = 1)
# The error has increased by about 15% -- clearly the interactions are important
# What about a model with just 1st order interactions
fit3 <- gbm(ozone~radiation + temperature + wind, data = ozodat, shr = 0.02, n.tree = 1000, int = 2)
# Yes -- 1st order interactions model is OK: don't need the 3-way interaction
# Is radiation an important predictor?
fit4 <- gbm(ozone~temperature + wind, data = ozodat, shr = 0.02, n.tree = 1000, int = 2)
# The error has increased by 5-8% -- radiation has a weak effect
# Are effect smonotone as we might expect?
fit5 <- gbm(ozone~radiation + temperature + wind, data = ozodat, shr = 0.02, n.tree = 1000, int = 3, var.mono=c(1, 1, -1))
plot.all(fit5)
# So-so -- as long as we include the 2nd order interaction -- increase in error is ~3%.
# Simulation examples for gbmplus
# One: shows some diagnostic and plotting options
simdata <- function(n) {
x1 <- runif(n, -2, 2)
x2 <- runif(n, -2, 2)
x3 <- runif(n, 0, 2)
err <- rnorm(n)
yt <- x1 + 2*x2*x3^2
y <- yt + err
data.frame(y, yt, x1, x2, x3)
}
set.seed(1)
df <- simdata(1000)
z <- gbm(y~x1+x2+x3, data = df, shr = 0.025, n.tree = 1000)
plot.err(z)
plot.all(z)
all.summary(z)
plot(z, 2:3, pty="c")
plot(z, 2:3, pty="t")
plot(z, 2:3, pty="l")
plot(z, 2:3, pty="p")
old.par <- par(mfrow=c(3, 3), mar=rep(0, 4))
for (i in 1:9) plot(z, 2:3, pty="p", theta = i*40 - 10, xlab="", ylab="", zlab='', axes=F)
par(old.par)
#Two: comparing the accuracy of ABTs and BTs
set.seed(2)
nrep <- 10; n <- 500; nt <- 2000; shr <- 0.02
res <- matrix(NA, ncol=2, nrow=nrep)
dimnames(res) <- list(1:nrep, c("Mgbm", "Sgbm"))
for (i in 1:nrep) {
df <- simdata(n)
df.test <- simdata(n)
z0 <- gbm(y~x1+x2+x3, data = df, shr = shr, n.tree = nt)
zp0 <- predict(z0, newdata=df.test)
err0 <- mean((zp0 - df.test$yt)^2)
z1 <- gbm(y~x1+x2+x3, data = df, shr = shr, n.tree = nt, single.tree = TRUE)
zp1 <- predict(z1, newdata=df.test)
err1 <- mean((zp1 - df.test$yt)^2)
res[i, ] <- c(err0, err1)
}
summary(res)
t.test(res[, 1], res[, 2], pair=T)
Additional examples are available in the help files of the R gbm package.