R语言随机森林法补充缺失值


library(data.table)
library(tidyverse)
library(BGLR)
library(sommer)
library(asreml)
library(learnasreml)

rm(list=ls())
library(asreml)
library(ASRgenomics)
library(data.table)
M012 = fread("plink.raw")
M012[1:10,1:10]

m012 = as.matrix(M012[,!c(1:6)])
rownames(m012) = M012$IID
m012[1:10,1:10]


# 清洗基因组数据:maf,callrate
G_clean = qc.filtering(m012,marker.callrate = 0.1,ind.callrate = 0.1,maf = 0.05)
clean_m012 = G_clean$M.clean
dim(clean_m012)

#读取表型数据
dat = fread("phe_weight.csv",data.table = F)

dd = dat %>% drop_na("phe") %>% dplyr::select(ID =1 ,Sex = 2, yy = "phe")
gid = rownames(clean_m012)
loci = match(dd$ID,gid)
x1 = data.frame(ID = dd$ID)
head(x1)
clean_m012_1 = clean_m012[loci,]


means <- apply(clean_m012_1, 2, mean, na.rm = TRUE)#计算各变量的均值
clean_m012_1[is.na(clean_m012_1)] <- means[col(clean_m012_1)[is.na(clean_m012_1)]]#


library(BGLR)
bayes = BGLR(dd$yy,ETA = list(list(~dd$Sex,X=clean_m012_1,model="BayesA")),
             nIter = 1000,burnIn = 500,thin = 5,saveAt = "",
             S0 = NULL,R2 = 0.5, weights = NULL,verbose = TRUE,
             rmExistingFiles = TRUE, groups=NULL)

blup1 = data.frame(ID = dd$ID, blup = bayes$yHat)
head(blup1)


X = clean_m012_1
snp = bayes$ETA[[1]]$b
ghat = X %*% snp
blup2 = tiqu_blup(ghat)
head(blup2)

re = merge(blup1,blup2,by="ID")
head(re)
cor(re[,-1])

re = merge(dd,blup1,by = "ID")
head(re)
str(re)
cor(re[,3:4],use = "complete.obs")

#交叉验证

data=dd
pheno = "yy"
seed = 123
cross.validate=function(data, pheno, fold=5, seed=123, ...){
  cat("Performing",fold,"-fold cross-validation on",pheno,"with seed",seed,"\n")
  set.seed(seed)
  sets = cut(seq(1:nrow(data)), breaks = fold, labels = F)
  sets =  sample(sets, replace=F)    # Randomize
  
  # Go through and calculate
  accuracy=list()
  unbiased=list()
  for(s in sort(unique(sets))){
    tomask=sets==s                  # Identify which values to mask
    mydata=data                     # Copy values over
    mydata$MASKED = data[[pheno]]   # Make new phenotype so don't touch original    
    mydata$MASKED[tomask] = NA      # Mask phenotypes
    
    # Make model, Bayes_A
    library(BGLR)
    bayes = BGLR(mydata$MASKED,ETA = list(list(~mydata$Sex,X=clean_m012_1,model="BayesA")),
                 nIter = 5000,burnIn = 500,thin = 5,saveAt = "",
                 S0 = NULL,R2 = 0.5, weights = NULL,verbose = TRUE,
                 rmExistingFiles = TRUE, groups=NULL)
    
    blup1 = data.frame(ID = as.character(mydata$ID), blup = bayes$yHat)
    
    phe1 = mydata[tomask,c(1,3)] %>% rename(ID=1,y1=2) # 这里是1,3两列:ID和y1
    phe1$ID = as.character(phe1$ID)
    red1 = left_join(phe1,blup1,by="ID")
    # Tabulate accuracy 
    accuracy[[s]] = cor(red1$y1,red1$blup)
    unbiased[[s]] = coef(lm(red1$y1 ~ red1$blup))[[2]]
  }
  
  # Convert list back to vector and return
  acc_total = unlist(accuracy)    
  unb_total = unlist(unbiased)    
  re_total = list(acc = acc_total, unb = unb_total)
  return(re_total)
}

multi.validate=function(times, seed=123, ...){
  cat("Peforming",times,"cross-validations\n")
  accuracies = sapply(1:times, function(x){
    re=cross.validate(seed=x, ...)
    # ree = list(ac = map(re,mean), se = map(re,se))
    ree = map(re,mean)
    return(ree)
  })
  return(accuracies)
}



se <- function(x) {
  sd(x)/sqrt(length(x))
}



# y1
dd = dat %>% drop_na("phe") %>% dplyr::select(ID =1 ,Sex = 2, yy = "phe")
gid = rownames(clean_m012)
loci = match(dd$ID,gid)
x1 = data.frame(ID = dd$ID)
head(x1)
clean_m012_1 = clean_m012[loci,]


means <- apply(clean_m012_1, 2, mean, na.rm = TRUE)#计算各变量的均值
clean_m012_1[is.na(clean_m012_1)] <- means[col(clean_m012_1)[is.na(clean_m012_1)]]#

xx_phe = multi.validate(times = 2,seed=1,data = dd,fold = 5,pheno = "yy")
re_phe = NULL
re_phe[[1]] = xx_phe[seq(1,40,2)]  %>% unlist() %>% mean
re_phe[[2]] = xx_phe[seq(1,40,2)]  %>% unlist() %>% se
re_phe[[3]] = xx_phe[seq(2,40,2)]  %>% unlist() %>% mean
re_phe[[4]] = xx_phe[seq(2,40,2)]  %>% unlist() %>% se





re_phe
map(re_phe,mean)
map(re_phe,se)


re_phe$acc
# # [1] 0.4287721 0.2715412 0.1638310 0.3808796 0.1376268
# # 
re_phe$unb
# # [1] 1.9583870 1.2278015 0.4557312 1.5976151 0.4392364
# # 

你好,我想请问,为什么map(re_phe,se) re_phe$acc re_phe$unb这三个结果缺失呢

re_phe是一个列表,包含4个元素

re_phe[[1]] # accuracies mean
re_phe[[2]] # accuracies se
re_phe[[3]] # unbiased mean
re_phe[[4]] # unbiased se

但是你在后面试图提取re_phe$acc和re_phe$unb,而re_phe作为一个列表并没有名为acc和unb的元素,所以返回了NULL。map(re_phe, mean)和map(re_phe, se)也是作用于整个列表re_phe,而不是单个元素,所以也返回了NULL。如果要提取accuracy mean和se

map(re_phe[[1]], mean) 
map(re_phe[[2]], se)

对unbiased mean和se也同理

map(re_phe[[3]], mean)
map(re_phe[[4]], se) 

需要注意列表中元素的索引,不能直接当成数据框或向量进行提取。你如果想把结果都合并到一个向量

 bind_rows(re_phe) %>% pull(x)

这会把四个元素展开到同一个向量中,然后可以直接提取x列。所以主要是需要区分列表和数据框的不同,及其索引方式,这样可以正确获取元素。