3.1 Survival data and analysis

Tools for survival analysis in R are provided by the survival package. Let’s start with a survival analysis of the lung data set: survival of patients with advanced lung cancer from the North Central Cancer Treatment Group. See ?lung for details.

## Survival analysis of the `lung` data set: Kaplan-Meier curves, a rank test
## between sexes and two Cox regression models.
library(survival)

## create a survival object
## lung$status: 1-censored, 2-dead  -> 0-censored, 1-dead
## (comparing with 2 yields the TRUE/FALSE event indicator expected by Surv)
sData = Surv(lung$time, event = lung$status == 2)

## Let's visualize it: Kaplan-Meier estimate for the whole cohort
fit = survfit(sData ~ 1)
## NOTE(review): the argument name in front of "= TRUE" was lost in the
## original plot calls; conf.int (draw confidence bands) is assumed - confirm.
plot(fit, conf.int = TRUE, mark.time = TRUE)

## Let's visualize it for male/female (one curve per level of lung$sex,
## see ?lung for the coding)
fit_sex = survfit(sData ~ lung$sex)
plot(fit_sex, col = c("blue", "red"), conf.int = TRUE, mark.time = TRUE)

## Rank test for survival data: do the two sexes differ in survival?
diff_sex = survdiff(sData ~ lung$sex)
print(diff_sex)

## build Cox regression model with sex as the only covariate
mod1 = coxph(sData ~ sex, data = lung)
print(summary(mod1))
## build Cox regression model with sex and age as covariates
mod2 = coxph(sData ~ sex + age, data = lung)
print(summary(mod2))
3.2 Survival analysis with transcriptomics data

Now let’s take a transcriptomics dataset from TCGA: skin cutaneous melanoma (SKCM).

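Download data (the code below expects an object `Data` with a gene expression matrix `Data$x`, genes in rows and patients in columns, and a clinical annotation table `Data$meta` containing the columns `surv_time` and `surv_event`):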


Now, let’s work on the dataset.

## Univariable Cox regression for every expressed gene.
## Reads `Data` (expression in Data$x, survival annotation in Data$meta) and
## produces `Sur` (survival object), `X` (filtered expression) and `Res`
## (per-gene log hazard ratio, p-value and FDR).
Sur = Surv(time = Data$meta$surv_time, event = Data$meta$surv_event)
## keep expressed genes: rows whose maximum value exceeds 5
## (drop = FALSE keeps X two-dimensional even if a single gene passes)
ikeep = apply(Data$x, 1, max) > 5
X = Data$x[ikeep, , drop = FALSE]

## build container and Cox regression for each gene
## (NA_real_ makes LHR a numeric column from the start)
Res = data.frame(Gene = rownames(X), LHR = NA_real_, PV = 1, FDR = 1)
rownames(Res) = rownames(X)
for (ig in seq_len(nrow(X))){
    mod = coxph(Sur ~ X[ig,])
    ## summarise once and reuse it for both extracted values
    smod = summary(mod)
    ## log hazard ratio of the gene
    Res$LHR[ig] = smod$coefficients[,"coef"]
    ## p-value of the score test
    Res$PV[ig] = smod$sctest["pvalue"]
    ## write message every 1000 genes
    ## NOTE(review): the original progress message was lost; printing the
    ## gene index is a reconstruction.
    if(ig%%1000 == 0){
        print(ig)
        flush.console()
    }
}
## correct the p-values for multiple testing
Res$FDR = p.adjust(Res$PV, method="fdr")

## genes significantly associated with survival
isig = Res$FDR<0.01
## make a KM plot
## gene -> factor: split patients at the median expression of the gene with
## the smallest FDR ("low" = at or below the median, "high" = above)
ibest = which.min(Res$FDR)
expr_best = X[ibest,]
fgene = factor(expr_best > median(expr_best),
               levels = c(FALSE, TRUE), labels = c("low", "high"))
## NOTE(review): the argument name in front of "= TRUE" was lost in the
## original call; conf.int (draw confidence bands) is assumed - confirm.
plot(survfit(Sur ~ fgene), col=c(4,2), lwd=2, conf.int = TRUE, mark.time=TRUE,
     main=sprintf("Best gene %s",rownames(X)[ibest]))

## lets combine genes
## score of a patient = sum over significant genes of (log hazard ratio x
## expression); the weights are recycled down every column of the selected
## rows, so colSums gives the same value as summing patient by patient
score = colSums(X[isig, , drop = FALSE] * Res$LHR[isig])

## Cox regression on the combined score
mod = coxph(Sur~score)
print(summary(mod))
## make a KM plot
## score -> factor: "low" = at or below the median score, "high" = above
fscore = factor(score > median(score),
                levels = c(FALSE, TRUE), labels = c("low", "high"))
## NOTE(review): the argument name in front of "= TRUE" was lost in the
## original call; conf.int (draw confidence bands) is assumed - confirm.
plot(survfit(Sur ~ fscore), col=c(4,2), lwd=2, conf.int = TRUE, mark.time=TRUE,
     main="Combined score")

Exercise 3

Look at the melanoma data set from the boot package (you may need to install it). Investigate the dataset. Perform a survival analysis and identify the factors affecting survival.

For the SKCM dataset, investigate other factors that can influence survival.

