You should have loaded the following data:
miR & mRNA - microRNA and mRNA
profiles of A375 cell line after stimulation with IFNg (adapted from
[Nazarov et al, Nucleic Acid Research, 2013])If not, please load:
## load the script from Internet
source("http://r.modas.lu/readAMD.r")
## miRNA data
miR = readAMD("http://edu.modas.lu/data/txt/mirna_ifng.amd.txt",stringsAsFactors = TRUE)
## mRNA data
mRNA = readAMD("http://edu.modas.lu/data/txt/mrna_ifng.amd.txt",
stringsAsFactors = TRUE,
index.column="GeneSymbol",
sum.func="mean")
Let us try to cluster standard iris dataset.We will use
PCA to present samples in 2D space, and next cluster with a standard
kmeans function.
centers - number of clusters (k);
to make analysis more reproducible - increase
nstart
par(mfcol=c(1,2))
## define shape - actual species
pch = c(15,16,17)[as.integer(iris$Species)]
PC = prcomp(iris[,-5],scale=TRUE)
plot(PC$x[,"PC1"],PC$x[,"PC2"],col=as.integer(iris$Species),pch=pch,cex=1.2,
main="Flowers in PC-space: Species as Ground Truth")
## clustering -> 3 clusters
str(kmeans(iris[,-5],centers=3,nstart=10))
## List of 9
## $ cluster : int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
## $ centers : num [1:3, 1:4] 6.85 5.01 5.9 3.07 3.43 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 681
## $ withinss : num [1:3] 23.9 15.2 39.8
## $ tot.withinss: num 78.9
## $ betweenss : num 603
## $ size : int [1:3] 38 50 62
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
cl = kmeans(iris[,-5],centers=3,nstart=10)$cluster
plot(PC$x[,"PC1"],PC$x[,"PC2"],col=cl+1,pch=pch,cex=1.2,
main="Flowers in PC-space: k-means, k=3")

Try to change number of clusters.
par(mfcol=c(1,2))
cl = kmeans(iris[,-5],centers=2,nstart=10)$cluster
plot(PC$x[,"PC1"],PC$x[,"PC2"],col=cl+1,pch=pch,cex=1.2, main="Flowers in PC-space: k-means, k=2")
cl = kmeans(iris[,-5],centers=4,nstart=10)$cluster
plot(PC$x[,"PC1"],PC$x[,"PC2"],col=cl+1,pch=pch,cex=1.2, main="Flowers in PC-space: k-means, k=4")
We
can also cluster features - usually not very interesting with k-means.
Working with
source("http://r.modas.lu/plotPCA.r")
## Let's scale expression
Z = mRNA$X
Z[,] = t(scale(t(Z)))
PC2 = prcomp(Z,scale=TRUE)
genecl = kmeans(Z,centers=4,nstart=10)$cluster
par(mfcol=c(1,2))
plotPCA(t(Z),pc = PC2,col=genecl+1, cex=0.5, main = "Stand.expr., 4 cl.")
plotPCA(t(Z),pc = PC2, ipc=(3:4), col=genecl+1, cex=0.5, main = "Stand.expr., 4 cl.")

## clustering of the samples
hc = hclust(dist(iris[,-5]))
plot(hc,cex=0.6)

We need to “cut the tree” in order to get clusting:
## "cut the tree" to get defined clusters
cl = cutree(hc,k=3)
plot(PC$x[,"PC1"],PC$x[,"PC2"],col=cl+1,pch=pch,cex=1.2, main="Flowers, hclust with 3 cl.")

## play with k...
Let’s play with number of clusters
par(mfcol=c(1,2))
cl = cutree(hc,k=2)
plot(PC$x[,"PC1"],PC$x[,"PC2"],col=cl+1,pch=pch,cex=1.2, main="Flowers, hclust with 2 cl.")
cl = cutree(hc,k=4)
plot(PC$x[,"PC1"],PC$x[,"PC2"],col=cl+1,pch=pch,cex=1.2, main="Flowers, hclust with 4 cl.")

Hierarchical clustering allows more freedom (same tree can be cut at diferent depth), but it is not robust - removing one sample will change clustering! Usually people apply consensus hierarchical clustering to get defined and robust clusters.
Let’s install ant test package ConsensusClusterPlus.
Instead of iris, we will test it on mRNA data
#BiocManager::install("ConsensusClusterPlus")
library(ConsensusClusterPlus)
## select the most variable genes
thr = quantile(apply(mRNA$X,1,sd),0.95) ## thr <- st.dev. for 5% the most variable genes
ikeep = apply(mRNA$X,1,sd)> thr ## index of genes with st.dev > thr
table(ikeep) ## check numbers
## ikeep
## FALSE TRUE
## 19134 1007
## calculate standardized gene expression for the most variable genes
Z = mRNA$X[ikeep,]
Z[] = t(scale(t(Z)))
PC = prcomp(t(Z))
## you can skip this line!
setwd("d:/data/r")
## cons clustering of the samples
hc = ConsensusClusterPlus(t(Z),
maxK=6,
reps=100,
pItem=0.8,
pFeature=1,
title="CC_hc",
clusterAlg="hc",
distance="euclidean",
seed=1234567890,
plot="png")
## get clusters
cl=hc[[5]]$consensusClass
## plot
plotPCA(t(Z),col = cl+1)

Now we can calculate and plot mean expression for the clusters
## select 3 larger clusters
summary(as.factor(cl))
## 1 2 3 4 5
## 560 302 2 142 1
icl = order(summary(as.factor(cl)),decreasing=TRUE)[1:3]
## calculate mean profile over standardized genes for each cluster
z1 = apply(Z[cl == icl[1],],2,mean)
z2 = apply(Z[cl == icl[2],],2,mean)
z3 = apply(Z[cl == icl[3],],2,mean)
## plot results
par(mfcol = c(1,3))
boxplot(z1~mRNA$meta$time,las=2,col=1:nlevels(mRNA$meta$time), main="Mean expression in cl.1")
boxplot(z2~mRNA$meta$time,las=2,col=1:nlevels(mRNA$meta$time), main="Mean expression in cl.2")
boxplot(z3~mRNA$meta$time,las=2,col=1:nlevels(mRNA$meta$time), main="Mean expression in cl.3")

Note: despite many methods exist, from my experience
- there in no magic pill! Let’s calculate number of clusters for the
most variable mRNA in IFNg experiment. We will use package
factoextra. The standard methods are:
Silhouette - how similar each object is to other object of his cluster in comparison to other objects
WSS or WCSS - within cluster sum of squares
Gap statistics - gap statistics of Tibshirani
#install.packages("factoextra")
library(factoextra)
## this can be slow.. try it:
fviz_nbclust(Z, kmeans, method = "silhouette", print.summary =FALSE,verbose =FALSE)

fviz_nbclust(Z, kmeans, method = "wss", print.summary =FALSE,verbose =FALSE)

fviz_nbclust(Z, kmeans, method = "gap_stat", print.summary =FALSE,verbose =FALSE)

One of the most used methods for visualizing omics data is
heatmap. Here we will use heatmaps from the package
pheatmap.
#install.packages("pheatmap")
library(pheatmap)
## let's use previously defined markers
pheatmap(Z,
annotation_col = mRNA$meta,
fontsize_row = 2)

You could also try standard heatmap() and
gplots::hearmap.2(). They give more control over distance
measures and clustering.
Working with
Let’s read and visualize pre-analyzed dataset (GSE81547) - single-cell experiment on samples taken from pancreas. Data were downloaded, PCA-transformed and t-SNE was calculated on the first 100 components.
Panc = read.table("http://edu.modas.lu/data/txt/tsne_normpancreas.txt",sep="\t",header=TRUE,row.names=1,as.is=FALSE)
str(Panc)
## 'data.frame': 2544 obs. of 7 variables:
## $ individual : Factor w/ 8 levels "DID_scRSq01",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
## $ age : int 21 21 21 21 21 21 21 21 21 21 ...
## $ nreads : int 447263 412082 167913 274448 323913 419043 600361 302079 262397 227087 ...
## $ inferred.cell.type: Factor w/ 7 levels "acinar cell",..: 4 4 1 4 4 4 3 3 1 1 ...
## $ tsne1 : num 33.2 32.6 20 31.8 31 ...
## $ tsne2 : num 14 15.2 -51.6 14 16.4 ...
## color - inferred cell type
col = as.integer(Panc$inferred.cell.type)+1
## shape - individual
pch=13+as.integer(Panc$individual)
## plot & legend
plot(Panc$tsne1,Panc$tsne2,pch=pch,col=col,main="tSNE with Inferred Cell Type",xlim=c(-45,50))
legend("bottomright",levels(Panc$inferred.cell.type),pch=15,col=1+(1:nlevels(Panc$inferred.cell.type)))
legend("bottomleft","shape - patient")

For single-cell data DBSCAN clustering is often used. We will work
with package dbscan:
#install.packages("dbscan")
library(dbscan)
cl = dbscan(Panc[,c("tsne1","tsne2")],eps=2,minPts=10)$cluster
## cl==0 means no cluster - grey. Otherwise use color from rainbow()
col=c("#00000033",rainbow(max(cl),alpha=0.5))[cl+1]
plot(Panc$tsne1,Panc$tsne2,pch=19,col=col,main="tSNE Clusterng with DBSCAN")
