function (random.forest, rf.data, pred.data = rf.data, CI = FALSE,
tree.type = "rf", prog.bar = FALSE)
{
if (is.null(random.forest$inbag)) {
stop("Random forest must be trained with keep.inbag = TRUE")
}
if (length(unique(colSums(random.forest$inbag))) > 1) {
stop("The keep.inbag field must store the number of times each observation was used\n \nMake sure the latest version of the randomForest package is installed from CRAN")
}
N.weights <- random.forest$inbag
B <- ncol(N.weights)
n <- nrow(N.weights)
s <- sum(N.weights[, 1])
N <- Matrix::Matrix(N.weights, sparse = TRUE)
N.avg <- Matrix::Matrix(Matrix::rowMeans(N), nrow(N), 1)
if (tree.type == "rf")
pred <- predict(random.forest, newdata = pred.data, predict.all = TRUE)$individual
if (tree.type == "ci")
pred <- CB_cforest(rf = random.forest, pb = prog.bar,
rf.d = rf.data, p.d = pred.data)$preds
agg.preds <- rowMeans(pred)
pred.centered <- pred - agg.preds
pred.centered.sums <- Matrix::Matrix(rowSums(pred.centered),
1, nrow(pred.centered))
C = N %*% t(pred.centered) - N.avg %*% pred.centered.sums
raw.IJ <- Matrix::colSums(C^2)/B^2
N.var <- mean(Matrix::rowMeans(N^2) - Matrix::rowMeans(N)^2)
boot.var <- rowSums(pred.centered^2)/B
bias.correction <- n * N.var * boot.var/B
pred.ij.var <- raw.IJ - bias.correction
out <- data.frame(pred = agg.preds, pred.ij.var)
if (CI) {
out <- data.frame(out, l.ci = out$pred - (out$pred.ij.var *
qnorm(0.975, lower.tail = T)), u.ci = out$pred +
(out$pred.ij.var * qnorm(0.975, lower.tail = T)))
}
return(out)
}
以上代码是R中用来计算随机森林模型预测值的置信区间的函数,我想要一个按照这种计算方式计算gbm模型的函数,有没有专家帮助改一个的,谢谢了
基于Monster 组和GPT的调写:
以下是一个按照GBM模型计算预测值置信区间的函数,与上述随机森林模型函数类似,主要是基于构建置信区间所需的样本重复抽样(bootstrap)的思想,其中用到了gbm包的predict.gbm()函数:
library(gbm)
gbm_conf_int <- function(gbm.model, data, n.trees = 100, CI = FALSE, alpha = 0.05) {
n <- nrow(data)
preds <- matrix(0, ncol = n.trees, nrow = n)
for (i in 1:n.trees) {
preds[, i] <- predict(gbm.model, newdata = data, n.trees = i)
}
pred.centered <- preds - rowMeans(preds)
pred.centered.sums <- rowSums(pred.centered)
boot.var <- rowSums(pred.centered^2) / (n.trees - 1)
pred_ij_var <- boot.var - (n / n.trees) * var(pred.centered.sums)
out <- data.frame(pred = rowMeans(preds), pred_ij_var)
if (CI) {
z.alpha <- qnorm(1 - alpha / 2)
l.ci <- out$pred - z.alpha * sqrt(out$pred_ij_var)
u.ci <- out$pred + z.alpha * sqrt(out$pred_ij_var)
out <- data.frame(out, l.ci, u.ci)
}
return(out)
}
其中,gbm.model是一个已经拟合好的GBM模型;data是用于预测的数据集,可以与模型拟合时使用的数据集不同;n.trees是用于构建置信区间的决策树数量;CI为逻辑变量,表示是否计算置信区间;alpha为置信水平,取值范围为(0,1),默认为0.05。
函数主要实现步骤为:
构建预测矩阵,其中行数等于数据集中的观测个数,列数等于n.trees;
计算每个预测值减去所有预测值的平均值;
计算所有列的和,即预测值累加和;
计算每个观测的预测值的方差;
根据置信区间公式计算每个预测值的置信区间。
gbm_ci <- function(model, newdata, level = 0.95, degree = Inf) {
# 预测结果和标准误差
pred <- predict(model, newdata, n.trees = model$n.trees, type = "response")
se <- sqrt(model$variance.predictions)
# 置信度
alpha <- 1 - level
# 置信区间下限和上限
n <- nrow(newdata)
t_value <- qt(1 - alpha/2, degree)
lower <- pred - t_value * se / sqrt(n)
upper <- pred + t_value * se / sqrt(n)
# 返回置信区间
return(c(lower, upper))
}
您好,我是有问必答小助手,您的问题已经有小伙伴帮您解答,感谢您对有问必答的支持与关注!