通过R来完成随机森林模型拟合后,进行交叉验证
# Read the OTU abundance table; the first column supplies the row names.
otu <- read.table('otu_table.txt', sep = '\t', row.names = 1, header = TRUE, fill = TRUE)
# Read the sample grouping table and merge it with the abundances so the
# result has the layout randomForest expects (one row per sample, predictors
# as columns, grouping column(s) appended on the right).
group <- read.table('group.txt', sep = '\t', row.names = 1, header = TRUE, fill = TRUE)
# Transpose so that the columns of the input table become rows.
otu <- data.frame(t(otu))
# cbind() pairs rows purely by position, so at minimum the row counts must
# agree. NOTE(review): both tables are assumed to list samples in the same
# order -- confirm against the input files.
stopifnot(nrow(otu) == nrow(group))
otu_group <- cbind(otu, group)
# Split the full data set into a training set (70%) and a test set (30%).
# The sample count is taken from the data instead of being hard-coded (the
# original used 44), so the split stays valid when the input size changes.
set.seed(123)
select_train <- sample(nrow(otu_group), size = floor(0.7 * nrow(otu_group)))
otu_train <- otu_group[select_train, ]
otu_test <- otu_group[-select_train, ]
# Fit the random forest classifier (5000 trees) on the training set.
library(randomForest)
set.seed(123)
otu_train.forest <- randomForest(
  as.factor(groups) ~ .,
  data = otu_train,
  ntree = 5000,
  importance = TRUE
)
# Re-predict the training set itself and print the fraction of samples on
# the diagonal of the resulting table.
train_predict <- predict(otu_train.forest, otu_train)
compare_train <- table(train_predict, otu_train$groups)
sum(diag(compare_train) / sum(compare_train))
# Predict the held-out test set and cross-tabulate actual vs. predicted groups.
test_predict <- predict(otu_train.forest, otu_test)
compare_test <- table(
  otu_test$groups,
  test_predict,
  dnn = c('Actual', 'Predicted')
)
# Identify key OTUs: pull the importance measures out of the fitted forest.
importance_otu <- data.frame(importance(otu_train.forest))
head(importance_otu)
# Rank the rows by one of the importance measures, here
# "Mean Decrease Accuracy", largest first.
importance_otu <- importance_otu[
  order(importance_otu[["MeanDecreaseAccuracy"]], decreasing = TRUE),
]
head(importance_otu)
# Cross-validation to help choose how many OTUs to keep:
# 10-fold cross-validation, repeated 5 times.
#
# The response must be passed to rfcv() as a factor. The original call passed
# `otu_train$group`: the column is actually named `groups` (as used when the
# forest was fitted), so it was reached only through partial name matching,
# and it was not a factor. rfcv() then ran regression on a non-numeric
# response and stopped with
# "Error in y - ymean : non-numeric argument to binary operator".
# The predictors are every column except the last one.
# NOTE(review): this assumes `groups` is the last column of otu_train -- confirm.
set.seed(123)
otu_train.cv <- replicate(
  5,
  rfcv(
    otu_train[-ncol(otu_train)],
    as.factor(otu_train$groups),
    cv.fold = 10,
    step = 1.5
  ),
  simplify = FALSE
)
Error in y - ymean : non-numeric argument to binary operator
In addition: Warning messages:
1: In randomForest.default(trainx[idx != i, , drop = FALSE], trainy[idx != :
The response has five or fewer unique values. Are you sure you want to do regression?
2: In mean.default(y) : 参数不是数值也不是逻辑值:回覆NA
原本导入的数据为 txt 格式,改为 csv 格式后,运行到交叉验证这一步仍然出现同样的报错。
5 次重复十折交叉验证