## -----------------------------------------------------------------------------
#| label: BuildWall
#| echo: false
#| column: margin
#| out-width: 50%

## -----------------------------------------------------------------------------
#| label: fig-overfitting-1
#| echo: false
#| column: margin
#| fig-width: 3
#| fig-height: 3
#| fig-cap: "An example of **overfitting**: two regression lines are fit to data in the $(x, y)$-plane (black points). We can think of such a line as a rule that predicts the $y$-value, given an $x$-value. Both lines are smooth, but the fits differ in what is called their **bandwidth**, which intuitively can be interpreted as their stiffness. The blue line seems overly keen to follow minor wiggles in the data, while the orange line captures the general trend but is less detailed. The effective number of parameters needed to describe the blue line is much higher than for the orange line. Also, if we were to obtain additional data, it is likely that the blue line would do a **worse** job than the orange line in modeling the new data. We'll formalize these concepts (training error and test set error) later in this chapter. Although exemplified here with line fitting, the concept applies more generally to prediction models."
library("tibble")
library("ggplot2")
ov = tibble(
  x = seq(0, 30, by = 1),
  y = 2 + 0.01 * x^2 + 0.1 * x + 2 * rnorm(length(x)))
ggplot(ov, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(span = 0.2, col = "dodgerblue3", se = FALSE) +
  geom_smooth(span = 0.8, col = "darkorange1", se = FALSE)

## -----------------------------------------------------------------------------
#| label: fig-fourtypes
#| echo: false
#| column: margin

## -----------------------------------------------------------------------------
#| label: diabetes
data("diabetes", package = "rrcov")
head(diabetes)

## -----------------------------------------------------------------------------
#| label: fig-ldagroups-1
#| column: margin
#| fig-width: 3.5
#| fig-height: 7.5
#| fig-cap: "We see already from the one-dimensional distributions that some of the individual variables could potentially predict which group a patient is more likely to belong to. Our goal is to combine variables to improve over such one-dimensional prediction models."
library("reshape2")
ggplot(melt(diabetes, id.vars = "group"),
       aes(x = value, col = group)) +
  geom_density() +
  facet_wrap( ~ variable, ncol = 1, scales = "free") +
  theme(legend.position = "bottom")

## -----------------------------------------------------------------------------
#| label: fig-cellshape
#| echo: false
#| out-width: 75%
#| fig-cap-location: margin

## -----------------------------------------------------------------------------
#| label: fig-scatterdiabetes-1
#| column: margin
#| fig-width: 3.5
#| fig-height: 3
#| fig-cap: "Scatterplot of two of the variables in the `diabetes` data. Each point is a sample, and the color indicates the diabetes type as encoded in the `group` variable."
ggdb = ggplot(mapping = aes(x = sspg, y = glucose)) +
  geom_point(aes(colour = group), data = diabetes)
ggdb

## -----------------------------------------------------------------------------
#| label: ldaresults
library("MASS")
diabetes_lda = lda(group ~ sspg + glucose, data = diabetes)
diabetes_lda
ghat = predict(diabetes_lda)$class
table(ghat, diabetes$group)
mean(ghat != diabetes$group)

## -----------------------------------------------------------------------------
#| label: make1Dgrid
make1Dgrid = function(x) {
  rg = grDevices::extendrange(x)
  seq(from = rg[1], to = rg[2], length.out = 100)
}

## -----------------------------------------------------------------------------
#| label: diabetes_grid_1
diabetes_grid = with(diabetes,
  expand.grid(sspg = make1Dgrid(sspg),
              glucose = make1Dgrid(glucose)))

## -----------------------------------------------------------------------------
#| label: diabetes_grid_2
diabetes_grid$ghat = predict(diabetes_lda, newdata = diabetes_grid)$class

## -----------------------------------------------------------------------------
#| label: centers
centers = diabetes_lda$means

## -----------------------------------------------------------------------------
#| label: unitcircle
unitcircle = exp(1i * seq(0, 2*pi, length.out = 360)) |>
  (\(z) cbind(Re(z), Im(z)))()
ellipse = unitcircle %*% solve(diabetes_lda$scaling) |> as_tibble()

## -----------------------------------------------------------------------------
#| label: ellipses
#| cache: false
library("dplyr")
ellipses = lapply(rownames(centers), function(gr) {
  mutate(ellipse,
         sspg    = sspg    + centers[gr, "sspg"],
         glucose = glucose + centers[gr, "glucose"],
         group   = gr)
}) |> bind_rows()

## -----------------------------------------------------------------------------
#| label: fig-modeldiabetes-1
#| column: margin
#| fig-height: 4
#| fig-cap: "As @fig-scatterdiabetes-1, with the classification regions from the LDA model shown. The three ellipses represent the class centers and the covariance matrix of the LDA model; note that there is only one covariance matrix, which is the same for all three classes. Therefore, the sizes and orientations of the ellipses are also the same for the three classes; only their centers differ. They represent contours of equal class membership probability."
ggdb +
  geom_raster(aes(fill = ghat), data = diabetes_grid,
              alpha = 0.25, interpolate = TRUE) +
  geom_point(data = as_tibble(centers), pch = "+", size = 8) +
  geom_path(aes(colour = group), data = ellipses) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))

## -----------------------------------------------------------------------------
#| label: fig-diabetes-lda-uniform-prior-1
#| fig-width: 5
#| fig-height: 4
#| out-width: 50%
#| fig-cap: "As @fig-modeldiabetes-1, but with uniform class priors."
diabetes_up = lda(group ~ sspg + glucose, data = diabetes,
  prior = (\(n) rep(1/n, n)) (nlevels(diabetes$group)))

diabetes_grid$ghat_up = predict(diabetes_up, newdata = diabetes_grid)$class

stopifnot(all.equal(diabetes_up$means, diabetes_lda$means))

ellipse_up  = unitcircle %*% solve(diabetes_up$scaling) |> as_tibble()
ellipses_up = lapply(rownames(centers), function(gr) {
  mutate(ellipse_up,
         sspg    = sspg    + centers[gr, "sspg"],
         glucose = glucose + centers[gr, "glucose"],
         group   = gr)
}) |> bind_rows()

ggdb +
  geom_raster(aes(fill = ghat_up), data = diabetes_grid,
              alpha = 0.4, interpolate = TRUE) +
  geom_point(data = data.frame(centers), pch = "+", size = 8) +
  geom_path(aes(colour = group), data = ellipses_up) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))

## -----------------------------------------------------------------------------
#| label: all5diab
diabetes_lda5 = lda(group ~ rw + fpg + glucose + sspg + insulin, data = diabetes)
diabetes_lda5
ghat5 = predict(diabetes_lda5)$class
table(ghat5, diabetes$group)
mean(ghat5 != diabetes$group)

## -----------------------------------------------------------------------------
#| label: loadHiiragi2
#| cache: false
library("Hiiragi2013")
data("x")
probes = c("1426642_at", "1418765_at", "1418864_at", "1416564_at")
embryoCells = t(Biobase::exprs(x)[probes, ]) |> as_tibble() |>
  mutate(Embryonic.day = x$Embryonic.day) |>
  dplyr::filter(x$genotype == "WT")

## -----------------------------------------------------------------------------
#| label: annoHiiragi
annotation(x)
library("mouse4302.db")
anno = AnnotationDbi::select(mouse4302.db, keys = probes,
                             columns = c("SYMBOL", "GENENAME"))
anno
mt = match(anno$PROBEID, colnames(embryoCells))
colnames(embryoCells)[mt] = anno$SYMBOL

## -----------------------------------------------------------------------------
#| label: assertprobeid
#| echo: false
stopifnot(!any(is.na(mt)))

## -----------------------------------------------------------------------------
#| label: fig-HiiragiFourGenesPairs-1
#| column: margin
#| fig-width: 6
#| fig-height: 6
#| fig-margin: false
#| fig-cap: "Expression values of the discriminating genes, with the prediction target Embryonic.day shown by color."
library("GGally")
ggpairs(embryoCells, mapping = aes(col = Embryonic.day),
        columns = anno$SYMBOL, upper = list(continuous = "points"))

## -----------------------------------------------------------------------------
#| label: ldacells
ec_lda = lda(Embryonic.day ~ Fn1 + Timd2 + Gata4 + Sox7,
             data = embryoCells)
round(ec_lda$scaling, 1)

## -----------------------------------------------------------------------------
#| label: fig-edcontour-1
#| column: margin
#| fig-width: 4.5
#| fig-height: 3.5
#| fig-cap: "LDA classification regions for Embryonic.day."
ec_rot = predict(ec_lda)$x |> as_tibble() |>
  mutate(ed = embryoCells$Embryonic.day)
ec_lda2 = lda(ec_rot[, 1:2], predict(ec_lda)$class)
ec_grid = with(ec_rot, expand.grid(
  LD1 = make1Dgrid(LD1),
  LD2 = make1Dgrid(LD2)))
ec_grid$edhat = predict(ec_lda2, newdata = ec_grid)$class
ggplot() +
  geom_point(aes(x = LD1, y = LD2, colour = ed), data = ec_rot) +
  geom_raster(aes(x = LD1, y = LD2, fill = edhat),
              data = ec_grid, alpha = 0.4, interpolate = TRUE) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_fixed()

## -----------------------------------------------------------------------------
#| label: fig-qdamouse-1
#| fig-width: 9
#| fig-height: 9
#| fig-cap: "QDA for the mouse cell data. Shown are all pairwise plots of the four features. In each plot, the other two features are set to the median."
library("gridExtra")
ec_qda = qda(Embryonic.day ~ Fn1 + Timd2 + Gata4 + Sox7,
             data = embryoCells)
variables = colnames(ec_qda$means)
pairs = combn(variables, 2)
lapply(seq_len(ncol(pairs)), function(i) {
  grid = with(embryoCells,
    expand.grid(x = make1Dgrid(get(pairs[1, i])),
                y = make1Dgrid(get(pairs[2, i])))) |>
    `colnames<-`(pairs[, i])
  # hold the two features not shown in this panel at their medians
  for (v in setdiff(variables, pairs[, i]))
    grid[[v]] = median(embryoCells[[v]])
  grid$edhat = predict(ec_qda, newdata = grid)$class
  x = pairs[1, i]
  y = pairs[2, i]
  ggplot() +
    geom_point(data = embryoCells,
               aes(x = .data[[x]], y = .data[[y]], colour = Embryonic.day)) +
    geom_raster(aes(x = .data[[x]], y = .data[[y]], fill = edhat),
                data = grid, alpha = 0.4, interpolate = TRUE) +
    scale_x_continuous(expand = c(0, 0)) +
    scale_y_continuous(expand = c(0, 0)) +
    coord_fixed() +
    if (i != ncol(pairs)) theme(legend.position = "none")
}) |> (\(g) grid.arrange(grobs = g, ncol = 2))()

## -----------------------------------------------------------------------------
#| label: ladallvariables
#| warning.known: !expr c("variables are collinear")
#| results: hide
#| error: true
lda(t(Biobase::exprs(x))[, 1:1000], x$Embryonic.day)
warnings()
qda(t(Biobase::exprs(x))[, 1:1000], x$Embryonic.day)

## -----------------------------------------------------------------------------
#| label: fig-learnbyheart-1
#| warning.known: !expr c("variables are collinear")
#| column: margin
#| fig-width: 3
#| fig-height: 3
#| fig-cap: "Misclassification rate of LDA applied to random data. While the number of observations `n` is held constant (at 20), we are increasing the number of features `p` starting from 2 up to 21. The misclassification rate becomes almost zero as `p` approaches 20. The LDA model becomes so elaborate and over-parameterized that it manages to learn the random labels \"by heart\". (As `p` becomes even larger, the \"performance\" degrades again somewhat, apparently due to numerical properties of the `lda` implementation used here.)"
p = 2:21
n = 20
mcl = lapply(p, function(pp) {
  replicate(100, {
    xmat = matrix(rnorm(n * pp), nrow = n)
    resp = sample(c("apple", "orange"), n, replace = TRUE)
    fit  = lda(xmat, resp)
    pred = predict(fit)$class
    mean(pred != resp)
  }) |> mean() |> (\(x) tibble(mcl = x, p = pp))()
}) |> bind_rows()
ggplot(mcl, aes(x = p, y = mcl)) +
  geom_line() + geom_point() +
  ylab("Misclassification rate")

## -----------------------------------------------------------------------------
#| label: book-chunk-1
#| echo: false

## -----------------------------------------------------------------------------
#| label: fig-mclcv-1
#| warning.known: !expr c("variables are collinear")
#| column: margin
#| fig-cap: "Cross-validation: the misclassification rate of LDA applied to random data, when evaluated on test data that were not used for learning, hovers around 0.5, independent of `p`. The misclassification rate on the training data is also shown. It behaves similarly to what we already saw in @fig-learnbyheart-1."
estimate_mcl_loocv = function(x, resp) {
  vapply(seq_len(nrow(x)), function(i) {
    # leave-one-out: fit on all samples except i, then predict sample i
    fit  = lda(x[-i, ], resp[-i])
    ptrn = predict(fit, newdata = x[-i, , drop = FALSE])$class
    ptst = predict(fit, newdata = x[ i, , drop = FALSE])$class
    c(train = mean(ptrn != resp[-i]), test = (ptst != resp[i]))
  }, FUN.VALUE = numeric(2)) |> rowMeans() |> t() |> as_tibble()
}

xmat = matrix(rnorm(n * last(p)), nrow = n)
resp = sample(c("apple", "orange"), n, replace = TRUE)
mcl = lapply(p, function(k) {
  estimate_mcl_loocv(xmat[, 1:k], resp)
}) |> bind_rows() |> data.frame(p) |> melt(id.var = "p")
ggplot(mcl, aes(x = p, y = value, col = variable)) +
  geom_line() + geom_point() +
  ylab("Misclassification rate")

## -----------------------------------------------------------------------------
#| label: fig-curseofdim
#| warning.known: !expr c("variables are collinear")
#| column: margin
#| fig-width: 3.5
#| fig-height: 3
#| fig-cap: "As we increase the number of features included in the model, the misclassification rate initially improves; as we start including more and more irrelevant features, it increases again, as we are fitting noise."
p = 2:20
mcl = replicate(100, {
  xmat = matrix(rnorm(n * last(p)), nrow = n)
  resp = sample(c("apple", "orange"), n, replace = TRUE)
  # the first 6 features carry signal, the remaining ones are pure noise
  xmat[, 1:6] = xmat[, 1:6] + as.integer(factor(resp))
  lapply(p, function(k) {
    estimate_mcl_loocv(xmat[, 1:k], resp)
  }) |> bind_rows() |> cbind(p = p) |> melt(id.var = "p")
}, simplify = FALSE) |> bind_rows()
mcl = group_by(mcl, p, variable) |> summarise(value = mean(value))
ggplot(mcl, aes(x = p, y = value, col = variable)) +
  geom_line() + geom_point() +
  ylab("Misclassification rate")

## -----------------------------------------------------------------------------
#| label: fig-BiasVarianceTradeoff
#| echo: false
#| column: margin

## -----------------------------------------------------------------------------
#| label: fig-cursedimans1-1
#| fig-width: 3.5
#| fig-height: 2.5
#| out-width: 50%
#| fig-cap: "Side length of a $p$-dimensional hypercube expected to contain 10 points out of 1 million uniformly distributed ones, as a function of $p$. While for $p=1$ this length is conveniently small, namely $10/10^6=10^{-5}$, for larger $p$ it approaches 1, i.e., it becomes the same as the range of each of the features. This means that a \"local neighborhood\" of 10 points encompasses almost the same data range as the whole dataset."
sideLength = function(p, pointDensity = 1e6, pointsNeeded = 10)
  (pointsNeeded / pointDensity) ^ (1 / p)
ggplot(tibble(p = 1:400, sideLength = sideLength(p)),
       aes(x = p, y = sideLength)) +
  geom_line(col = "red") +
  geom_hline(aes(yintercept = 1), linetype = 2)

## -----------------------------------------------------------------------------
#| label: fig-cursedimans2-1
#| fig-width: 3.5
#| fig-height: 2.5
#| out-width: 50%
#| fig-cap: "Fraction of a unit cube's total volume that is in its \"shell\" (here operationalised as those points that are closer than 0.01 to its surface) as a function of the dimension $p$."
tibble(
  p = 1:400,
  volOuterCube = 1 ^ p,
  volInnerCube = 0.98 ^ p,  # 0.98 = 1 - 2 * 0.01
  `V(shell)` = volOuterCube - volInnerCube) |>
ggplot(aes(x = p, y = `V(shell)`)) +
  geom_line(col = "blue")

## -----------------------------------------------------------------------------
#| label: fig-cursedimans3-1
#| fig-width: 3.5
#| fig-height: 2.5
#| out-width: 50%
#| fig-cap: "Coefficient of variation (CV) of the distance between randomly picked points in the unit hypercube, as a function of the dimension. As the dimension increases, everybody is equally far away from everyone else: there is almost no variation in the distances any more."
n = 1000
df = tibble(
  p = round(10 ^ seq(0, 4, by = 0.25)),
  cv = vapply(p, function(k) {
    x1 = matrix(runif(k * n), nrow = n)
    x2 = matrix(runif(k * n), nrow = n)
    d = sqrt(rowSums((x1 - x2)^2))
    sd(d) / mean(d)
  }, FUN.VALUE = numeric(1)))
ggplot(df, aes(x = log10(p), y = cv)) +
  geom_line(col = "orange") +
  geom_point()

## -----------------------------------------------------------------------------
#| label: confusiontable
#| eval: false
## table(truth, response)

## -----------------------------------------------------------------------------
#| child: devil.qmd
#| fig-cap: "Note that the $\\log\\text{loss}$ will be infinite if a prediction is totally confident ($\\hat{p}_i$ is exactly $0$ or $1$) but wrong."

## -----------------------------------------------------------------------------
#| echo: false

## -----------------------------------------------------------------------------
#| label: fig-supervised-bullseye
#| echo: false
#| column: margin
#| layout-nrow: 1
#| fig-subcap:
#|   - ""
#|   - ""
# presumably included via knitr::include_graphics(); IMGS is the image
# directory defined in the book's setup code
knitr::include_graphics(file.path(
  IMGS, c('TargetBias.png', 'TargetVariance.png')))

## -----------------------------------------------------------------------------
#| child: devil.qmd
#| fig-cap: "Here, $|\\beta|^\\nu=\\sum_i|\\beta_i|^\\nu$ is the $\\nu$-th power of the $L_\\nu$-norm of the vector $\\beta$. Variations are possible: for instance, we could include in this summation only some but not all of the elements of $\\beta$; or we could scale different elements differently, for instance based on some prior belief of their scale and importance."

## -----------------------------------------------------------------------------
#| echo: false

## -----------------------------------------------------------------------------
#| label: colon1
#| cache: false
#| results: hide
library("ExperimentHub")
eh = ExperimentHub()
zeller = eh[["EH361"]]

## -----------------------------------------------------------------------------
#| label: colon1b
table(zeller$disease)

## -----------------------------------------------------------------------------
#| label: colon2
zellerNC = zeller[, zeller$disease %in% c("n", "cancer")]

## -----------------------------------------------------------------------------
#| label: ehzellertest
#| echo: false
stopifnot(is.numeric(Biobase::exprs(zellerNC)),
          !any(is.na(Biobase::exprs(zellerNC))))

## -----------------------------------------------------------------------------
#| label: zellerpData-1
pData(zellerNC)[sample(ncol(zellerNC), 3), ]

## -----------------------------------------------------------------------------
#| child: devil.qmd
#| fig-cap: "We define the helper function `formatfn` to line wrap these long character strings for the available space here."

## -----------------------------------------------------------------------------
#| echo: false

## -----------------------------------------------------------------------------
#| label: zellerrownames
formatfn = function(x)
  gsub("|", "| ", x, fixed = TRUE) |> lapply(strwrap)
rownames(zellerNC)[1:4]
rownames(zellerNC)[nrow(zellerNC) + (-2:0)] |> formatfn()

## -----------------------------------------------------------------------------
#| label: fig-zellerHist-1
#| column: margin
#| fig-width: 3
#| fig-height: 4
#| fig-cap: "Histograms of the distributions for two randomly selected features. The distributions are highly skewed, with many zero values and a thin, long tail of non-zero values."
ggplot(melt(Biobase::exprs(zellerNC)[c(510, 527), ]), aes(x = value)) +
  geom_histogram(bins = 25) +
  facet_wrap( ~ Var1, ncol = 1, scales = "free")

## -----------------------------------------------------------------------------
#| label: glmnet
library("glmnet")
glmfit = glmnet(x = t(Biobase::exprs(zellerNC)),
                y = factor(zellerNC$disease),
                family = "binomial")

## -----------------------------------------------------------------------------
#| label: colonPred
predTrsf = predict(glmfit, newx = t(Biobase::exprs(zellerNC)),
                   type = "class", s = 0.04)
table(predTrsf, zellerNC$disease)

## -----------------------------------------------------------------------------
#| label: fig-plotglmfit-1
#| echo: !expr -c(1)
#| column: margin
#| fig-width: 3.6
#| fig-height: 3.2
#| fig-cap: "Regularization paths for `glmfit`."
par(mai = c(0.5, 0.5, 0.575, 0.05))
plot(glmfit, col = brewer.pal(8, "Dark2"), lwd = sqrt(3), ylab = "")

## -----------------------------------------------------------------------------
#| child: devil.qmd
#| fig-cap: "You'll already realize from the description of this strategy that if we optimize $\\lambda$ in this way, the resulting apparent classification performance will likely be exaggerated. We need a truly independent dataset, or at least another, outer cross-validation loop to get a more realistic impression of the generalizability. We will get back to this question at the end of the chapter."

## -----------------------------------------------------------------------------
#| echo: false

## -----------------------------------------------------------------------------
#| label: fig-colonCV-1
#| column: margin
#| fig-width: 4
#| fig-height: 4
#| fig-cap: "Diagnostic plot for `cv.glmnet`: shown is a measure of cross-validated prediction performance, the deviance, as a function of $\\lambda$. The dashed vertical lines show `lambda.min` and `lambda.1se`."
cvglmfit = cv.glmnet(x = t(Biobase::exprs(zellerNC)),
                     y = factor(zellerNC$disease),
                     family = "binomial")
plot(cvglmfit)

## -----------------------------------------------------------------------------
#| label: lambda.min
cvglmfit$lambda.min

## -----------------------------------------------------------------------------
#| label: lambda.1se
cvglmfit$lambda.1se

## -----------------------------------------------------------------------------
#| label: predictwithlambda1se
s0 = cvglmfit$lambda.1se
predict(glmfit, newx = t(Biobase::exprs(zellerNC)),
        type = "class", s = s0) |> table(zellerNC$disease)

## -----------------------------------------------------------------------------
#| label: zellercoef
coefs = coef(glmfit)[, which.min(abs(glmfit$lambda - s0))]
topthree = order(abs(coefs), decreasing = TRUE)[1:3]
as.vector(coefs[topthree])
formatfn(names(coefs)[topthree])

## -----------------------------------------------------------------------------
#| label: fig-colonCVTrsf-1
#| fig-width: 4
#| fig-height: 4
#| out-width: 50%
#| fig-cap: "Like @fig-colonCV-1, but using an $\\text{asinh}$ transformation of the data."
cv.glmnet(x = t(asinh(Biobase::exprs(zellerNC))),
          y = factor(zellerNC$disease),
          family = "binomial") |> plot()

## -----------------------------------------------------------------------------
#| label: fig-mousecvglmfit-1
#| column: margin
#| fig-width: 4
#| fig-height: 4
#| fig-cap: "Cross-validated misclassification error versus penalty parameter for the mouse cells data."
sx = x[, x$Embryonic.day == "E3.25"]
embryoCellsClassifier = cv.glmnet(t(Biobase::exprs(sx)), sx$genotype,
                                  family = "binomial", type.measure = "class")
plot(embryoCellsClassifier)

## -----------------------------------------------------------------------------
#| label: checkclaimMouseCellsClassifier
#| echo: false
stopifnot(sum((diff(embryoCellsClassifier$cvm) *
               diff(embryoCellsClassifier$lambda)) < 0) <= 2)

## -----------------------------------------------------------------------------
#| label: fig-mousecellsrowttst-1
#| fig-width: 4
#| fig-height: 2.5
#| out-width: 50%
#| fig-cap: "Histogram of p-values for the per-feature $t$-tests between genotypes in the E3.25 cells."
mouse_de = rowttests(sx, "genotype")
ggplot(mouse_de, aes(x = p.value)) +
  geom_histogram(boundary = 0, breaks = seq(0, 1, by = 0.01))

## -----------------------------------------------------------------------------
#| label: mousecellsnn1
dists = as.matrix(dist(scale(t(Biobase::exprs(x)))))
diag(dists) = +Inf

## -----------------------------------------------------------------------------
#| label: mousecellsnn2
nn = sapply(seq_len(ncol(dists)), function(i) which.min(dists[, i]))
table(x$sampleGroup, x$sampleGroup[nn]) |> `colnames<-`(NULL)

## -----------------------------------------------------------------------------
#| label: caret1
library("caret")
caretMethods = names(getModelInfo())
head(caretMethods, 8)
length(caretMethods)

## -----------------------------------------------------------------------------
#| label: caret2
getModelInfo("nnet", regex = FALSE)[[1]]$parameter

## -----------------------------------------------------------------------------
#| label: caret3
#| results: hide
trnCtrl = trainControl(
  method = "repeatedcv",
  repeats = 3,
  classProbs = TRUE)
tuneGrid = expand.grid(
  size = c(2, 4, 8),
  decay = c(0, 1e-2, 1e-1))
nnfit = train(
  Embryonic.day ~ Fn1 + Timd2 + Gata4 + Sox7,
  data = embryoCells,
  method = "nnet",
  tuneGrid = tuneGrid,
  trControl = trnCtrl,
  metric = "Accuracy")

## -----------------------------------------------------------------------------
#| label: fig-ML-nnfit
#| column: margin
#| fig-width: 3.75
#| fig-height: 4.25
#| fig-cap: "Parameter tuning of the neural net by cross-validation."
nnfit
plot(nnfit)
predict(nnfit) |> head(10)

## -----------------------------------------------------------------------------
#| label: kernelsvm
#| eval: false
#| echo: false
## library("kernlab")
## kfunction = function(linear = 0, quadratic = 0)
## { k = function(v, w) { linear*sum((v)*(w)) + quadratic*sum((v^2)*(w^2)) }
##   class(k) = "kernel"
##   return(k) }
## subx = subx[, 2:3]
## svp = ksvm(subx, dftxy$tg, type = "C-svc", C = 100, kernel = kfunction(1, 0), scaled = c())
## plot(c(min(subx[,1]), max(subx[,1])), c(min(subx[,2]), max(subx[,2])),
##      type = 'n', xlab = 'x1', ylab = 'x2')
## ymat = ymatrix(svp)
## points(subx[-SVindex(svp), 1], subx[-SVindex(svp), 2],
##        pch = ifelse(ymat[-SVindex(svp)] < 0, 2, 1))
## points(subx[SVindex(svp), 1], subx[SVindex(svp), 2],
##        pch = ifelse(ymat[SVindex(svp)] < 0, 17, 16))
##
## # Extract w and b from the model
## w = colSums(coef(svp)[[1]] * subx[SVindex(svp), ])
## b = b(svp)
## # Draw the lines
## abline(b/w[2], -w[1]/w[2])
## abline((b+1)/w[2], -w[1]/w[2], lty = 2)
## abline((b-1)/w[2], -w[1]/w[2], lty = 2)
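
## -----------------------------------------------------------------------------
#| label: logloss-sketch
# A minimal numerical sketch of the log loss discussed in the devil.qmd box
# above: it becomes infinite when a prediction is completely confident
# (phat exactly 0 or 1) but wrong. The helper name `logloss` and the toy
# probabilities below are illustrative choices, not part of the chapter's
# analyses.
logloss = function(phat, y)
  -mean(y * log(phat) + (1 - y) * log(1 - phat))
logloss(phat = c(0.9, 0.7, 0.2), y = c(1, 1, 0))  # confident and right: small loss
logloss(phat = c(1.0, 0.7, 0.2), y = c(0, 1, 0))  # fully confident but wrong: Inf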