library(survival)
library(caret)
library(glmnet)
library(survminer)
library(timeROC)
coxPfilter=0.05 #cox方法显著性过滤标准
setwd("C:\Users\Desktop\model") #设置工作目录
rt=read.table("TCGA.expTime.txt", header=T, sep="\t", check.names=F, row.names=1) #读取输入文件
rt$futime[rt$futime<=0]=1
rt$futime=rt$futime/365
#对数据进行分组,构建模型
n=1 #分组的次数
for(i in 1:n){
inTrain=createDataPartition(y=rt[,3], p=0.7, list=F)
train=rt[inTrain,]
test=rt[-inTrain,]
trainOut=cbind(id=row.names(train),train)
testOut=cbind(id=row.names(test),test)
#单因素cox分析
outUniTab=data.frame()
sigGenes=c("futime","fustat")
for(i in colnames(train[,3:ncol(train)])){
#cox分析
cox <- coxph(Surv(futime, fustat) ~ train[,i], data = train)
coxSummary = summary(cox)
coxP=coxSummary$coefficients[,"Pr(>|z|)"]
#保留显著性基因
if(coxP<coxPfilter){
sigGenes=c(sigGenes,i)
outUniTab=rbind(outUniTab,
cbind(id=i,
HR=coxSummary$conf.int[,"exp(coef)"],
HR.95L=coxSummary$conf.int[,"lower .95"],
HR.95H=coxSummary$conf.int[,"upper .95"],
pvalue=coxSummary$coefficients[,"Pr(>|z|)"])
)
}
}
uniSigExp=train[,sigGenes]
uniSigExpOut=cbind(id=row.names(uniSigExp),uniSigExp)
if(length(sigGenes)<5){next}
#lasso回归
x=as.matrix(uniSigExp[,c(3:ncol(uniSigExp))])
y=data.matrix(Surv(uniSigExp$futime,uniSigExp$fustat))
fit <- glmnet(x, y, family = "cox", maxit = 1000)
cvfit <- cv.glmnet(x, y, family="cox", maxit = 1000)
coef <- coef(fit, s = cvfit$lambda.min)
index <- which(coef != 0)
actCoef <- coef[index]
lassoGene=row.names(coef)[index]
lassoSigExp=uniSigExp[,c("futime", "fustat", lassoGene)]
lassoSigExpOut=cbind(id=row.names(lassoSigExp), lassoSigExp)
geneCoef=cbind(Gene=lassoGene, Coef=actCoef)
if(nrow(geneCoef)<2){next}
multiCox <- coxph(Surv(futime, fustat) ~ ., data = lassoSigExp)
multiCox=step(multiCox,direction = "both")
multiCoxSum=summary(multiCox)
#输出模型公式
outMultiTab=data.frame()
outMultiTab=cbind(
coef=multiCoxSum$coefficients[,"coef"],
HR=multiCoxSum$conf.int[,"exp(coef)"],
HR.95L=multiCoxSum$conf.int[,"lower .95"],
HR.95H=multiCoxSum$conf.int[,"upper .95"],
pvalue=multiCoxSum$coefficients[,"Pr(>|z|)"])
outMultiTab=cbind(id=row.names(outMultiTab),outMultiTab)
#输出train组风险文件
riskScore=predict(multiCox,type="risk",newdata=train) #利用train得到模型预测train样品风险
coxGene=rownames(multiCoxSum$coefficients)
coxGene=gsub("`","",coxGene)
outCol=c("futime","fustat",coxGene)
medianTrainRisk=median(riskScore)
risk=as.vector(ifelse(riskScore>medianTrainRisk,"high","low"))
trainRiskOut=cbind(id=rownames(cbind(train[,outCol],riskScore,risk)),cbind(train[,outCol],riskScore,Risk=risk))
#输出test组风险文件
riskScoreTest=predict(multiCox,type="risk",newdata=test) #利用train得到模型预测test样品风险
riskTest=as.vector(ifelse(riskScoreTest>medianTrainRisk,"high","low"))
testRiskOut=cbind(id=rownames(cbind(test[,outCol],riskScoreTest,riskTest)),cbind(test[,outCol],riskScore=riskScoreTest,Risk=riskTest))
#输出GEO的风险值
GEO=read.table("geo.expTime.txt", header=T, sep="\t", check.names=F, row.names=1)
GEO$futime=GEO$futime/365
geoScore=predict(multiCox, type="risk", newdata=GEO)
geoRisk=as.vector(ifelse(geoScore>medianTrainRisk, "high", "low"))
GEO=cbind(GEO[,outCol], riskScore=as.vector(geoScore), Risk=geoRisk)
geoRiskOut=cbind(id=rownames(GEO), GEO)
#生存差异pvalue
diff=survdiff(Surv(futime, fustat) ~Risk,data = trainRiskOut)
pValue=1-pchisq(diff$chisq, df=1)
diffTest=survdiff(Surv(futime, fustat) ~Risk,data = testRiskOut)
pValueTest=1-pchisq(diffTest$chisq, df=1)
diffGEO=survdiff(Surv(futime, fustat) ~Risk, data=GEO)
pValueGEO=1-pchisq(diffGEO$chisq, df=1)
#ROC曲线下面积
predictTime=1 #预测时间
roc=timeROC(T=train$futime, delta=train$fustat,
marker=riskScore, cause=1,
times=c(predictTime), ROC=TRUE)
rocTest=timeROC(T=test$futime, delta=test$fustat,
marker=riskScoreTest, cause=1,
times=c(predictTime), ROC=TRUE)
if((pValue<0.01) & (roc$AUC[2]>0.65) & (pValueTest<0.049) & (rocTest$AUC[2]>0.63) & (pValueGEO<0.049)){
#输出分组结果
write.table(trainOut,file="data.train.txt",sep="\t",quote=F,row.names=F)
write.table(testOut,file="data.test.txt",sep="\t",quote=F,row.names=F)
#输出单因素结果
write.table(outUniTab,file="uni.trainCox.txt",sep="\t",row.names=F,quote=F)
write.table(uniSigExpOut,file="uni.SigExp.txt",sep="\t",row.names=F,quote=F)
#lasso结果
write.table(lassoSigExpOut,file="lasso.SigExp.txt",sep="\t",row.names=F,quote=F)
pdf("lasso.lambda.pdf")
plot(fit, xvar = "lambda", label = TRUE)
dev.off()
pdf("lasso.cvfit.pdf")
plot(cvfit)
abline(v=log(c(cvfit$lambda.min,cvfit$lambda.1se)), lty="dashed")
dev.off()
#输出多因素结果
outMultiTab=outMultiTab[,1:2]
write.table(outMultiTab,file="multiCox.txt",sep="\t",row.names=F,quote=F)
write.table(trainRiskOut,file="risk.TCGAtrain.txt",sep="\t",quote=F,row.names=F)
write.table(testRiskOut,file="risk.TCGAtest.txt",sep="\t",quote=F,row.names=F)
write.table(geoRiskOut,file="risk.GEO.txt",sep="\t",quote=F,row.names=F)
#所有样品的风险值
allRiskOut=rbind(trainRiskOut, testRiskOut)
write.table(allRiskOut,file="risk.TCGAall.txt",sep="\t",quote=F,row.names=F)
break
}
}