library(ISLR)
library(boot)

Based on: https://class.stanford.edu/courses/HumanitiesScience/StatLearning/Winter2014

plot(mpg ~ horsepower, data=Auto, pch=16)

Leave-one-out cross validation (LOOCV)

m = glm(mpg ~ horsepower, data=Auto)
cv = cv.glm(Auto, glmfit = m)   # K defaults to n, i.e. LOOCV
cv$delta    # CV prediction error: 1) raw estimate,
            # 2) bias-corrected, as the training sets are smaller than the full data
[1] 24.23151 24.23114

The average squared prediction error is

\[ \frac{1}{n} \sum_{i=1}^n{(y_i - \hat y_{-i})^2} \]

where \(\hat y_{-i}\) is the prediction based on the leave-one-out training set, i.e. the sample without observation \(i\). This approach is computationally intensive, as it fits \(n\) separate models. For linear regression, leave-one-out CV can be computed much faster using the following formula, where \(H_{ii}\) is the \(i\)-th diagonal element of the hat matrix. The hat matrix is the projection matrix that projects the vector of observed outcomes onto the column space spanned by the observed predictor vectors.

\[ \frac{1}{n} \sum_{i=1}^n{ \frac{(y_i - \hat y_i)^2}{ (1 - H_{ii} )^2} } \]

This is much faster than the brute-force approach of fitting \(n\) models, as it only requires estimating a single model. Note, though, that it yields the non-bias-corrected version of the error.
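To see where \(H_{ii}\) comes from, the hat matrix can also be computed explicitly from the model matrix; a minimal sketch using the model m fitted above:

X <- model.matrix(m)                    # design matrix: intercept and horsepower
H <- X %*% solve(t(X) %*% X) %*% t(X)   # hat matrix: projects y onto the column space of X
all.equal(unname(diag(H)), unname(influence(m)$hat))   # should be TRUE

In practice the leverages are taken from influence() (or hatvalues()), as in the function below, which avoids forming the full \(n \times n\) matrix.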

loocv <- function(fit) {
  h <- influence(fit)$hat   # leverage values H_ii (diagonal of the hat matrix)
  mean(residuals(fit)^2 / (1-h)^2)
}
loocv(m)
[1] 24.23151
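As a sanity check, the brute-force version can be written as a loop that refits the model \(n\) times; a minimal sketch (slow, but it should reproduce the value above):

n <- nrow(Auto)
err <- rep(NA, n)
for (i in 1:n) {
  fit.i <- glm(mpg ~ horsepower, data = Auto[-i, ])                # fit without observation i
  err[i] <- (Auto$mpg[i] - predict(fit.i, newdata = Auto[i, ]))^2  # squared error on held-out row
}
mean(err)   # matches loocv(m) and cv$delta[1]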

Now we compute the LOOCV prediction error for a series of models with polynomials of increasing degree.

cv.error = rep(NA, 5)   # init result vector
degree = 1:5
for (d in degree) {
  fit <- glm(mpg ~ poly(horsepower, d), data=Auto)  
  cv.error[d] = loocv(fit)
}
plot(degree, cv.error, type="b")

k-fold cross validation

This approach is less computationally intensive than LOOCV, as we fit only \(k\) models (usually \(k = 5\) or \(10\)) instead of \(n\).

plot(degree, cv.error, type="b")   # re-draw the LOOCV errors from above as baseline
k = 10
cv.error = rep(NA, 5)   # init result vector, now for the k-fold errors
degree = 1:5
for (d in degree) {
  fit <- glm(mpg ~ poly(horsepower, d), data=Auto)
  cv.error[d] = cv.glm(Auto, fit, K = k)$delta[1]   # raw (non-corrected) estimate
}
lines(degree, cv.error, type="b", col="red")     # overlay the k-fold CV errors
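Note that cv.glm() assigns the observations to the \(k\) folds at random, so the red curve changes slightly between runs; fixing the seed beforehand makes the result reproducible (the seed value is arbitrary):

set.seed(17)   # arbitrary seed; fixes the random fold assignment in cv.glm()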

Example: weather forecast

data from: http://www.weather.uwaterloo.ca/data.html#archive

# read the Waterloo temperature data files
# files <- dir("data", full.names = TRUE)
# l <- list()
# for (i in seq_along(files)) {
#   x <- read.csv(files[i])
#   x$index <- 1:nrow(x)
#   x$year <- i
#   l[[i]] <- x
# }
# x <- do.call(rbind, l)
# x <- subset(x, index <= 365)  # remove day 366 in leap years

x <- read.csv("data/Daily_summary_2007.csv")
x$index <- 1:nrow(x)
plot(x$index, x$High.temperature, pch=16, cex=.5)

Fitting

degree = 1:12
mm = list()
m.error = rep(NA, length(degree))   # init result vector
for (d in degree) {
  mm[[d]] <- lm(High.temperature ~ poly(index, d), data=x)
  m.error[d] = mean(residuals(mm[[d]])^2)   # training MSE
}
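The training MSE can only decrease as the degree grows, since the polynomial models are nested, so it always favors the most flexible fit; a quick check:

which.min(m.error)   # typically 12, the highest degree

This motivates the out-of-sample comparison further below.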
ps = c(3,12)
plot(x$index, x$High.temperature, pch=16, cex=.5, las=1)
lines(x$index, predict(mm[[ps[1]]]), col="red", lwd=2)
lines(x$index, predict(mm[[ps[2]]]), col="blue", lwd=2)
legend("topleft", legend = paste(c("Polynomial of order"), ps), 
       fill = c("red", "blue"), cex = .7)

x <- read.csv("data/Daily_summary_2007.csv")
x$index <- 1:nrow(x)
x2 <- read.csv("data/Daily_summary_2009.csv")
x2$index <- 1:nrow(x2)
plot(x$index, x$High.temperature, pch=16, cex=.5, las=1)
points(x2$index, x2$High.temperature, pch=16, cex=.5, col="red")

degree = 1:20
k = 5
m.error = rep(NA, length(degree))        # training error
m.pred.error = rep(NA, length(degree))   # prediction error on the 2009 data
cv.error = rep(NA, length(degree))       # k-fold CV error
loocv.error = rep(NA, length(degree))    # LOOCV error
for (d in degree) {
  m <- glm(High.temperature ~ poly(index, d), data=x)
  m.error[d] = mean(residuals(m)^2)
  cv.error[d] = cv.glm(x, m, K = k)$delta[1]
  loocv.error[d] = loocv(m)
  m.pred.error[d] = mean((x2$High.temperature - predict(m, newdata=x2))^2)
}
# error by polynomial degree
plot(degree, m.error, type="l", col="darkgreen", las=1, xlim=c(2, 20))
points(degree, m.error, pch=16, col="darkgreen")
lines(degree, m.pred.error, col="red")
points(degree, m.pred.error, pch=16, col="red")
lines(degree, cv.error, col="blue")
points(degree, cv.error, pch=16, col="blue")
lines(degree, loocv.error, col="brown")
points(degree, loocv.error, pch=16, col="brown")
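A legend helps to tell the four curves apart (matching the colors used above):

legend("topright", legend = c("training error", "prediction error (2009)",
                              "5-fold CV error", "LOOCV error"),
       col = c("darkgreen", "red", "blue", "brown"), lty = 1, pch = 16, cex = .7)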