library(data.table)
library(tidyverse)
library(BGLR)
library(sommer)
library(asreml)
library(learnasreml)
rm(list=ls())
library(asreml)
library(ASRgenomics)
library(data.table)
M012 = fread("plink.raw")
M012[1:10,1:10]
m012 = as.matrix(M012[,!c(1:6)])
rownames(m012) = M012$IID
m012[1:10,1:10]
# 清洗基因组数据:maf,callrate
G_clean = qc.filtering(m012,marker.callrate = 0.1,ind.callrate = 0.1,maf = 0.05)
clean_m012 = G_clean$M.clean
dim(clean_m012)
#读取表型数据
dat = fread("phe_weight.csv",data.table = F)
dd = dat %>% drop_na("phe") %>% dplyr::select(ID =1 ,Sex = 2, yy = "phe")
gid = rownames(clean_m012)
loci = match(dd$ID,gid)
x1 = data.frame(ID = dd$ID)
head(x1)
clean_m012_1 = clean_m012[loci,]
means <- apply(clean_m012_1, 2, mean, na.rm = TRUE)#计算各变量的均值
clean_m012_1[is.na(clean_m012_1)] <- means[col(clean_m012_1)[is.na(clean_m012_1)]]#
library(BGLR)
bayes = BGLR(dd$yy,ETA = list(list(~dd$Sex,X=clean_m012_1,model="BayesA")),
nIter = 1000,burnIn = 500,thin = 5,saveAt = "",
S0 = NULL,R2 = 0.5, weights = NULL,verbose = TRUE,
rmExistingFiles = TRUE, groups=NULL)
blup1 = data.frame(ID = dd$ID, blup = bayes$yHat)
head(blup1)
X = clean_m012_1
snp = bayes$ETA[[1]]$b
ghat = X %*% snp
blup2 = tiqu_blup(ghat)
head(blup2)
re = merge(blup1,blup2,by="ID")
head(re)
cor(re[,-1])
re = merge(dd,blup1,by = "ID")
head(re)
str(re)
cor(re[,3:4],use = "complete.obs")
#交叉验证
data=dd
pheno = "yy"
seed = 123
cross.validate=function(data, pheno, fold=5, seed=123, ...){
cat("Performing",fold,"-fold cross-validation on",pheno,"with seed",seed,"\n")
set.seed(seed)
sets = cut(seq(1:nrow(data)), breaks = fold, labels = F)
sets = sample(sets, replace=F) # Randomize
# Go through and calculate
accuracy=list()
unbiased=list()
for(s in sort(unique(sets))){
tomask=sets==s # Identify which values to mask
mydata=data # Copy values over
mydata$MASKED = data[[pheno]] # Make new phenotype so don't touch original
mydata$MASKED[tomask] = NA # Mask phenotypes
# Make model, Bayes_A
library(BGLR)
bayes = BGLR(mydata$MASKED,ETA = list(list(~mydata$Sex,X=clean_m012_1,model="BayesA")),
nIter = 5000,burnIn = 500,thin = 5,saveAt = "",
S0 = NULL,R2 = 0.5, weights = NULL,verbose = TRUE,
rmExistingFiles = TRUE, groups=NULL)
blup1 = data.frame(ID = as.character(mydata$ID), blup = bayes$yHat)
phe1 = mydata[tomask,c(1,3)] %>% rename(ID=1,y1=2) # 这里是1,3两列:ID和y1
phe1$ID = as.character(phe1$ID)
red1 = left_join(phe1,blup1,by="ID")
# Tabulate accuracy
accuracy[[s]] = cor(red1$y1,red1$blup)
unbiased[[s]] = coef(lm(red1$y1 ~ red1$blup))[[2]]
}
# Convert list back to vector and return
acc_total = unlist(accuracy)
unb_total = unlist(unbiased)
re_total = list(acc = acc_total, unb = unb_total)
return(re_total)
}
multi.validate=function(times, seed=123, ...){
cat("Peforming",times,"cross-validations\n")
accuracies = sapply(1:times, function(x){
re=cross.validate(seed=x, ...)
# ree = list(ac = map(re,mean), se = map(re,se))
ree = map(re,mean)
return(ree)
})
return(accuracies)
}
se <- function(x) {
sd(x)/sqrt(length(x))
}
# y1
dd = dat %>% drop_na("phe") %>% dplyr::select(ID =1 ,Sex = 2, yy = "phe")
gid = rownames(clean_m012)
loci = match(dd$ID,gid)
x1 = data.frame(ID = dd$ID)
head(x1)
clean_m012_1 = clean_m012[loci,]
means <- apply(clean_m012_1, 2, mean, na.rm = TRUE)#计算各变量的均值
clean_m012_1[is.na(clean_m012_1)] <- means[col(clean_m012_1)[is.na(clean_m012_1)]]#
xx_phe = multi.validate(times = 2,seed=1,data = dd,fold = 5,pheno = "yy")
re_phe = NULL
re_phe[[1]] = xx_phe[seq(1,40,2)] %>% unlist() %>% mean
re_phe[[2]] = xx_phe[seq(1,40,2)] %>% unlist() %>% se
re_phe[[3]] = xx_phe[seq(2,40,2)] %>% unlist() %>% mean
re_phe[[4]] = xx_phe[seq(2,40,2)] %>% unlist() %>% se
re_phe
map(re_phe,mean)
map(re_phe,se)
re_phe$acc
# # [1] 0.4287721 0.2715412 0.1638310 0.3808796 0.1376268
# #
re_phe$unb
# # [1] 1.9583870 1.2278015 0.4557312 1.5976151 0.4392364
# #
你好,我想请问,为什么map(re_phe,se) re_phe$acc re_phe$unb这三个结果缺失呢
re_phe是一个列表,包含4个元素
re_phe[[1]] # accuracies mean
re_phe[[2]] # accuracies se
re_phe[[3]] # unbiased mean
re_phe[[4]] # unbiased se
但是你在后面试图提取re_phe$acc和re_phe$unb,而re_phe作为一个列表并没有名为acc和unb的元素,所以返回了NULL。map(re_phe, mean)和map(re_phe, se)也是作用于整个列表re_phe,而不是单个元素,所以也返回了NULL。如果要提取accuracy mean和se
map(re_phe[[1]], mean)
map(re_phe[[2]], se)
对unbiased mean和se也同理
map(re_phe[[3]], mean)
map(re_phe[[4]], se)
需要注意列表中元素的索引,不能直接当成数据框或向量进行提取。你如果想把结果都合并到一个向量
bind_rows(re_phe) %>% pull(x)
这会把四个元素展开到同一个向量中,然后可以直接提取x列。所以主要是需要区分列表和数据框的不同,及其索引方式,这样可以正确获取元素。