1.2.1. Merged: Annotation-Metadata-Data
This is a custom format I used together with some collaborators to
simplify sample annotation
Example:
## load the function from Internet
source("http://r.modas.lu/readAMD.r")
## read AMD-file from Internet:
miR = readAMD("http://edu.modas.lu/data/txt/mirna_ifng.amd.txt",stringsAsFactors=TRUE)
str(miR)
## List of 5
## $ ncol: int 20
## $ nrow: int 2226
## $ anno:'data.frame': 2226 obs. of 7 variables:
## ..$ ID : chr [1:2226] "hp_hsa-let-7a-1_st" "hp_hsa-let-7a-1_x_st" "hp_hsa-let-7a-2_st" "hp_hsa-let-7a-2_x_st" ...
## ..$ Alignments : chr [1:2226] "9:96938239-96938318 (+)" "9:96938239-96938318 (+)" "11:122017230-122017301 (-)" "11:122017230-122017301 (-)" ...
## ..$ Sequence : chr [1:2226] "TGGGATGAGGTAGTAGGTTGTATAGTTTTAGGGTCACACCCACCACTGGGAGATAACTATACAATCTACTGTCTTTCCTA" "TGGGATGAGGTAGTAGGTTGTATAGTTTTAGGGTCACACCCACCACTGGGAGATAACTATACAATCTACTGTCTTTCCTA" "AGGTTGAGGTAGTAGGTTGTATAGTTTAGAATTACATCAAGGGAGATAACTGTACAGCCTCCTAGCTTTCCT" "AGGTTGAGGTAGTAGGTTGTATAGTTTAGAATTACATCAAGGGAGATAACTGTACAGCCTCCTAGCTTTCCT" ...
## ..$ Sequence.Length : int [1:2226] 80 80 72 72 74 83 83 84 84 87 ...
## ..$ Sequence.Type : chr [1:2226] "stem-loop" "stem-loop" "stem-loop" "stem-loop" ...
## ..$ Species.Scientific.Name : chr [1:2226] "Homo sapiens" "Homo sapiens" "Homo sapiens" "Homo sapiens" ...
## ..$ Transcript.ID.Array.Design.: chr [1:2226] "hsa-let-7a-1" "hsa-let-7a-1" "hsa-let-7a-2" "hsa-let-7a-2" ...
## $ meta:'data.frame': 20 obs. of 2 variables:
## ..$ time : Factor w/ 10 levels "T00","T005","T03",..: 1 1 2 2 3 3 4 4 5 5 ...
## ..$ replicate: Factor w/ 2 levels "r1","r2": 1 2 1 2 1 2 1 2 1 2 ...
## $ X : num [1:2226, 1:20] 2.75 1.318 1.995 1.696 0.906 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2226] "hp_hsa-let-7a-1_st" "hp_hsa-let-7a-1_x_st" "hp_hsa-let-7a-2_st" "hp_hsa-let-7a-2_x_st" ...
## .. ..$ : chr [1:20] "T000.1" "T000.2" "T005.1" "T005.2" ...
## reading & parsing large text files Internet can be long.
## => often it is better to download and read text-file from local storage.
## However, readAMD() includes downloading
mRNA = readAMD("http://edu.modas.lu/data/txt/mrna_ifng.amd.txt",stringsAsFactors=TRUE)
str(mRNA)
## List of 5
## $ ncol: int 17
## $ nrow: int 33297
## $ anno:'data.frame': 33297 obs. of 8 variables:
## ..$ ID : int [1:33297] 7892501 7892502 7892503 7892504 7892505 7892506 7892507 7892508 7892509 7892510 ...
## ..$ gene_assignment: chr [1:33297] "---" "---" "---" "---" ...
## ..$ GeneSymbol : chr [1:33297] "" "" "" "" ...
## ..$ RefSeq : chr [1:33297] "---" "---" "---" "---" ...
## ..$ seqname : chr [1:33297] "---" "---" "---" "---" ...
## ..$ start : int [1:33297] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ stop : int [1:33297] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ strand : chr [1:33297] "---" "---" "---" "---" ...
## $ meta:'data.frame': 17 obs. of 3 variables:
## ..$ time : Factor w/ 7 levels "T00","T03","T12",..: 1 1 2 2 3 3 4 4 4 5 ...
## ..$ treatment: Factor w/ 3 levels "IFNg","IFNg_JII",..: 3 3 1 1 1 1 1 1 1 1 ...
## ..$ replicate: Factor w/ 3 levels "r1","r2","r3": 1 2 1 2 1 2 1 2 3 1 ...
## $ X : num [1:33297, 1:17] 9.21 7.64 5 8.57 7.1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:33297] "7892501" "7892502" "7892503" "7892504" ...
## .. ..$ : chr [1:17] "T00.1" "T00.2" "T03.1" "T03.2" ...
It looks like many probesets (rows) are not annotated… Let’s
summarize the data to GeneSymbol
.
mRNA = readAMD("d:/data/r/mrna_ifng.amd.txt", ## put you path to the file
stringsAsFactors=TRUE,
index.column="GeneSymbol",
sum.func="mean")
str(mRNA)
## List of 5
## $ ncol: int 17
## $ nrow: int 20141
## $ anno:'data.frame': 20141 obs. of 8 variables:
## ..$ GeneSymbol : chr [1:20141] "" "A1BG" "A1CF" "A2BP1" ...
## ..$ ID : chr [1:20141] "7892501,7892502,7892503,7892504,7892505,7892506,7892507,7892508,7892509,7892510,7892511,7892512,7892513,7892514"| __truncated__ "8039748" "7933640" "7993083,7993110" ...
## ..$ gene_assignment: chr [1:20141] "---," "NM_130786 // A1BG // alpha-1-B glycoprotein // 19q13.4 // 1 /// NM_198458 // ZNF497 // zinc finger protein 497 "| __truncated__ "NM_138933 // A1CF // APOBEC1 complementation factor // 10q11.23 // 29974 /// NM_014576 // A1CF // APOBEC1 compl"| __truncated__ "NM_018723 // A2BP1 // ataxin 2-binding protein 1 // 16p13.3 // 54715 /// NM_145893 // A2BP1 // ataxin 2-binding"| __truncated__ ...
## ..$ RefSeq : chr [1:20141] "---," "NM_130786" "NM_138933" "NM_018723,ENST00000432184" ...
## ..$ seqname : chr [1:20141] "---,chr1,chr10,,chr11,chr12,chr13,chrUn_gl000212,chr14,chr15,chr16,chr17,chr17_gl000205_random,chr18,chr19,chr2"| __truncated__ "chr19" "chr10" "chr16" ...
## ..$ start : chr [1:20141] " 0, 53049, 63015, 564951, 566062, 568844, 1102484, 1103243, 1104373, 2316811, 6600914, "| __truncated__ " 58856544" " 52566314" " 6069132, 6753884" ...
## ..$ stop : chr [1:20141] " 0, 54936, 63887, 565019, 566129, 568913, 1102578, 1103332, 1104471, 2319922, 6600994, "| __truncated__ " 58867591" " 52645435" " 7763340, 6755559" ...
## ..$ strand : chr [1:20141] "---,+,-," "-" "-" "+" ...
## $ meta:'data.frame': 17 obs. of 2 variables:
## ..$ time : Factor w/ 7 levels "T00","T03","T12",..: 1 1 2 2 3 3 4 4 4 5 ...
## ..$ replicate: Factor w/ 3 levels "r1","r2","r3": 1 2 1 2 1 2 1 2 3 1 ...
## $ X : num [1:20141, 1:17] 6.6 6.15 5.1 4.64 5.78 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:20141] "" "A1BG" "A1CF" "A2BP1" ...
## .. ..$ : chr [1:17] "T00.1" "T00.2" "T03.1" "T03.2" ...
1.2.2. Read MaxQuant results (proteomics)
Another example - MaxQuant file, which is the result of spectra
analysis in MS/MS spectrometry. It contains inferred protein groups in
the samples with a lot of other annotation.
You can read such data using read.table()
and then
process, or write (ask to write) a small function for data import:
source("http://r.modas.lu/readMaxQuant.r") ## function
Prot = readMaxQuant("http://edu.modas.lu/data/txt/proteingroups.txt")
## 1484 features were read.
## Reverse and contaminants are removed, resulting in 1433 features
## Remove 4 uninformative features
## ` LFQ.intensity. ` was extracted for 12 samples.
str(Prot)
## List of 6
## $ nf : int 1429
## $ ns : int 12
## $ Anno:'data.frame': 1429 obs. of 5 variables:
## ..$ Majority.protein.IDs: chr [1:1429] "H0Y9X3;A0A024QZ42;A0A087WZ38;O75340" "P06493;A0A024QZP7;A0A087WZZ9" "A0A024R4E5;Q00341;H0Y394;H7C2D1" "A0A024R4M0;P46781;B5MCT8;C9JM19" ...
## ..$ Protein.names : chr [1:1429] "Programmed cell death protein 6" "Cyclin-dependent kinase 1" "Vigilin" "40S ribosomal protein S9" ...
## ..$ Gene.names : chr [1:1429] "PDCD6" "CDK1;CDC2" "HDLBP" "RPS9" ...
## ..$ Fasta.headers : chr [1:1429] "tr|H0Y9X3|H0Y9X3_HUMAN Programmed cell death protein 6 (Fragment) OS=Homo sapiens OX=9606 GN=PDCD6 PE=1 SV=1;tr"| __truncated__ "sp|P06493|CDK1_HUMAN Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3;tr|A0A024QZP7|A0A024QZ"| __truncated__ "tr|A0A024R4E5|A0A024R4E5_HUMAN High density lipoprotein binding protein (Vigilin), isoform CRA_a OS=Homo sapien"| __truncated__ "tr|A0A024R4M0|A0A024R4M0_HUMAN 40S ribosomal protein S9 OS=Homo sapiens OX=9606 GN=RPS9 PE=1 SV=1;sp|P46781|RS9"| __truncated__ ...
## ..$ Number.of.proteins : int [1:1429] 4 29 12 6 5 2 2 9 9 2 ...
## $ Meta:'data.frame': 12 obs. of 1 variable:
## ..$ ID: chr [1:12] "136.R1" "136.R2" "136.R3" "136.R4" ...
## $ X0 : num [1:1429, 1:12] 0.00 0.00 0.00 1.08e+08 2.20e+06 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:1429] "H0Y9X3;A0A024QZ42;A0A087WZ38;O75340" "P06493;A0A024QZP7;A0A087WZZ9;E5RIU6;E7ESI2;F5H6Z0;A0A087WZU2;E5RGN0;G3V5T9;E7EUK8;K7ELV5;H0YAZ9;K7EJ83;F8VYH9;F"| __truncated__ "A0A024R4E5;Q00341;H0Y394;H7C2D1;C9JT62;C9JHS7;C9JK79;C9JHZ8;C9JES8;C9JZI8;C9J5E5;C9JIZ1" "A0A024R4M0;P46781;B5MCT8;C9JM19;F2Z3C0;A8MXK4" ...
## .. ..$ : chr [1:12] "136.R1" "136.R2" "136.R3" "136.R4" ...
## $ LX : num [1:1429, 1:12] 0 0 0 7.68 2.36 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:1429] "H0Y9X3;A0A024QZ42;A0A087WZ38;O75340" "P06493;A0A024QZP7;A0A087WZZ9;E5RIU6;E7ESI2;F5H6Z0;A0A087WZU2;E5RGN0;G3V5T9;E7EUK8;K7ELV5;H0YAZ9;K7EJ83;F8VYH9;F"| __truncated__ "A0A024R4E5;Q00341;H0Y394;H7C2D1;C9JT62;C9JHS7;C9JK79;C9JHZ8;C9JES8;C9JZI8;C9J5E5;C9JIZ1" "A0A024R4M0;P46781;B5MCT8;C9JM19;F2Z3C0;A8MXK4" ...
## .. ..$ : chr [1:12] "136.R1" "136.R2" "136.R3" "136.R4" ...