I recently launched a PCA with the prcomp () function in R, and now I need to (objectively) decide which samples from two different groups are outliers and should be removed from further analysis.
Earlier, I saw PCA plots where confidence / variance ellipses (not sure of terminology) are placed around the samples, leaving outside those considered outliers (for example, say something like more than three standard deviations from the cluster center). How can I achieve something like this in R?
Note. I looked a little at the βcarβ , but itβs still unclear how to use data.ellipse for eg projection graph of PC1 and PC2. Any help / related resources appreciated!
Thanks!
Edit: the R object that I am using and one of the graphs that I would like to use for marking:
countsTable <- read.table('sample.txt', header=T) transpose.counts.table <- t(countsTable) input.for.pca <- transpose.counts.table[, colSums(abs(transpose.counts.table)) != 0] my.prc <- prcomp(input.for.pca, center=T, scale=T) pdf("PCA_Results_PC1_PC2_prcomp_counts.pdf") plot(my.prc$x[,1], my.prc$x[,2], type='p', cex=0.0, pch=20, main="PCA: Samples' projection on PC1 and PC2 (raw counts)", xlab="PC1", ylab="PC2") text(my.prc$x[,1], my.prc$x[,2], labels=rownames(my.prc$x), cex=1.2) dev.off()
Updated input.for.pca object that contains a categorical type column:
> dput(input.for.pca) structure(list(A1BG = c(190L, 125L, 95L, 115L, 483L, 94L, 87L, 211L, 153L, 135L, 116L, 110L, 75L, 159L, 148L, 159L, 177L, 103L, 103L, 88L, 112L, 87L, 272L, 100L, 313L, 169L, 130L, 164L, 114L, 154L, 168L, 197L, 125L, 95L, 118L, 154L, 197L, 203L, 184L, 86L, 142L, 111L, 140L, 63L), A1BG.AS1 = c(77L, 94L, 53L, 52L, 56L, 67L, 55L, 112L, 95L, 51L, 28L, 50L, 35L, 87L, 44L, 93L, 44L, 16L, 21L, 24L, 42L, 43L, 159L, 59L, 125L, 108L, 50L, 68L, 55L, 81L, 81L, 39L, 64L, 67L, 66L, 57L, 114L, 82L, 51L, 21L, 126L, 24L, 53L, 3L), A1CF = c(1L, 3L, 3L, 2L, 0L, 0L, 1L, 5L, 15L, 0L, 1L, 1L, 2L, 1L, 0L, 0L, 3L, 0L, 2L, 1L, 0L, 1L, 2L, 0L, 0L, 1L, 0L, 3L, 2L, 0L, 0L, 6L, 1L, 0L, 0L, 0L, 5L, 1L, 4L, 0L, 2L, 2L, 2L, 0L), A2LD1 = c(94L, 51L, 52L, 57L, 64L, 40L, 48L, 61L, 83L, 53L, 49L, 31L, 40L, 66L, 50L, 43L, 54L, 14L, 73L, 58L, 50L, 36L, 132L, 88L, 96L, 73L, 47L, 73L, 100L, 49L, 40L, 54L, 34L, 34L, 45L, 56L, 77L, 66L, 90L, 62L, 67L, 47L, 80L, 9L), A2M = c(4407L, 4755L, 1739L, 2049L, 3219L, 2598L, 2531L, 3894L, 2067L, 2703L, 3776L, 774L, 3129L, 2924L, 1997L, 5803L, 3147L, 5472L, 9608L, 3315L, 6164L, 1250L, 5911L, 4688L, 2775L, 4561L, 7165L, 3605L, 8228L, 4835L, 7124L, 4689L, 5306L, 3643L, 3190L, 3290L, 4932L, 1990L, 9610L, 7476L, 4533L, 4035L, 3275L, 1326L), A2ML1 = c(195L, 207L, 63L, 291L, 24L, 126L, 168L, 251L, 39L, 145L, 213L, 126L, 179L, 169L, 141L, 272L, 185L, 115L, 588L, 156L, 111L, 45L, 301L, 182L, 155L, 146L, 91L, 160L, 155L, 73L, 44L, 103L, 182L, 71L, 164L, 405L, 245L, 165L, 162L, 317L, 188L, 153L, 228L, 11L), A4GALT = c(191L, 86L, 64L, 200L, 39L, 118L, 106L, 64L, 11L, 40L, 144L, 53L, 134L, 101L, 138L, 138L, 214L, 138L, 406L, 145L, 497L, 72L, 473L, 86L, 41L, 213L, 172L, 77L, 657L, 73L, 123L, 126L, 106L, 44L, 125L, 106L, 56L, 114L, 756L, 328L, 151L, 210L, 213L, 42L), A4GNT = c(3L, 3L, 0L, 5L, 7L, 1L, 0L, 2L, 4L, 3L, 0L, 0L, 0L, 2L, 2L, 2L, 3L, 0L, 1L, 0L, 1L, 2L, 2L, 5L, 4L, 4L, 1L, 1L, 2L, 1L, 1L, 0L, 2L, 2L, 3L, 3L, 5L, 2L, 3L, 2L, 0L, 0L, 1L, 0L), AA06 = c(0L, 0L, 0L, 2L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), AAA1 = c(1L, 5L, 5L, 4L, 0L, 1L, 5L, 0L, 0L, 1L, 0L, 2L, 1L, 12L, 1L, 5L, 6L, 0L, 3L, 2L, 0L, 0L, 14L, 2L, 3L, 0L, 3L, 4L, 0L, 7L, 3L, 4L, 0L, 1L, 4L, 1L, 8L, 8L, 1L, 2L, 4L, 2L, 1L, 1L), AAAS = c(829L, 1042L, 844L, 805L, 1700L, 953L, 809L, 1052L, 1266L, 781L, 618L, 929L, 699L, 992L, 1001L, 1423L, 845L, 1054L, 808L, 711L, 938L, 756L, 1384L, 944L, 1689L, 1052L, 703L, 890L, 1293L, 727L, 804L, 1227L, 668L, 794L, 835L, 877L, 1514L, 1287L, 1435L, 941L, 1115L, 868L, 923L, 288L), AACS = c(2350L, 1953L, 1884L, 1702L, 421L, 1530L, 1435L, 3619L, 815L, 1320L, 859L, 1708L, 1096L, 2124L, 1029L, 1930L, 1241L, 724L, 867L, 893L, 1797L, 447L, 4854L, 1670L, 2675L, 2471L, 1874L, 1620L, 2515L, 3156L, 2079L, 1345L, 1684L, 1615L, 1650L, 1386L, 3470L, 1958L, 2278L, 1076L, 3459L, 1115L, 1369L, 121L), AACSP1 = c(19L, 6L, 11L, 13L, 1L, 11L, 13L, 27L, 5L, 12L, 4L, 7L, 4L, 6L, 5L, 18L, 17L, 0L, 7L, 6L, 4L, 1L, 19L, 16L, 30L, 11L, 12L, 20L, 11L, 10L, 11L, 3L, 4L, 10L, 16L, 4L, 8L, 7L, 10L, 5L, 18L, 6L, 5L, 0L), AADAC = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 2L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L), AADACL2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AADACL3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AADACL4 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 3L, 0L, 0L, 0L, 1L, 0L), AADAT = c(387L, 416L, 297L, 392L, 682L, 422L, 287L, 704L, 50L, 373L, 306L, 234L, 225L, 340L, 220L, 443L, 387L, 324L, 304L, 261L, 259L, 181L, 801L, 428L, 925L, 498L, 270L, 524L, 654L, 472L, 334L, 395L, 414L, 440L, 318L, 306L, 645L, 418L, 350L, 277L, 468L, 302L, 298L, 48L), AAGAB = c(1235L, 1231L, 1026L, 981L, 477L, 877L, 808L, 2217L, 764L, 914L, 670L, 974L, 538L, 1362L, 492L, 1078L, 764L, 297L, 582L, 615L, 923L, 307L, 3055L, 1195L, 1673L, 1673L, 1070L, 1052L, 1761L, 2198L, 1221L, 813L, 1050L, 997L, 865L, 930L, 2065L, 1190L, 1243L, 578L, 1931L, 664L, 874L, 75L), AAK1 = c(6457L, 6538L, 4706L, 4917L, 1252L, 4055L, 4063L, 11627L, 9127L, 3604L, 2439L, 4221L, 3968L, 5065L, 2450L, 5690L, 3065L, 1082L, 2756L, 2886L, 3763L, 1360L, 15237L, 4771L, 7881L, 8349L, 5177L, 4888L, 6532L, 7856L, 5373L, 3487L, 4885L, 4461L, 3893L, 4152L, 9055L, 4656L, 4501L, 2598L, 8079L, 3187L, 3655L, 337L), AAMP = c(2282L, 2585L, 2113L, 2197L, 2226L, 1776L, 2097L, 3614L, 2494L, 2215L, 1707L, 2109L, 1740L, 2620L, 1703L, 2357L, 1965L, 1697L, 1724L, 1623L, 2299L, 1109L, 5555L, 2550L, 4239L, 3149L, 2127L, 2487L, 3966L, 2817L, 2043L, 1967L, 2092L, 2031L, 2123L, 2661L, 4203L, 2884L, 3224L, 1678L, 3876L, 1963L, 2362L, 473L), AANAT = c(33L, 51L, 14L, 26L, 23L, 12L, 36L, 14L, 27L, 24L, 30L, 17L, 11L, 45L, 31L, 28L, 23L, 67L, 77L, 26L, 44L, 17L, 86L, 70L, 16L, 39L, 10L, 27L, 20L, 22L, 23L, 20L, 10L, 12L, 18L, 28L, 41L, 40L, 85L, 40L, 48L, 30L, 46L, 8L), AARS = c(6383L, 9377L, 6772L, 8134L, 5605L, 4734L, 5902L, 13757L, 6832L, 6566L, 4009L, 5377L, 7209L, 7749L, 4105L, 6969L, 5120L, 5484L, 5486L, 4935L, 6604L, 3151L, 24172L, 7615L, 12786L, 12676L, 7009L, 8208L, 11328L, 11550L, 7054L, 4789L, 6547L, 6686L, 6109L, 6456L, 14576L, 8317L, 8057L, 4626L, 13162L, 5801L, 6090L, 1498L), AARS2 = c(1032L, 858L, 687L, 735L, 527L, 655L, 641L, 1480L, 1713L, 753L, 561L, 541L, 459L, 819L, 462L, 867L, 605L, 404L, 571L, 497L, 637L, 343L, 1761L, 1082L, 1379L, 815L, 841L, 844L, 1150L, 1121L, 973L, 665L, 696L, 672L, 824L, 511L, 1313L, 861L, 998L, 626L, 1258L, 555L, 623L, 115L), AARSD1 = c(918L, 1218L, 793L, 877L, 573L, 867L, 916L, 2030L, 1198L, 1015L, 715L, 909L, 437L, 1245L, 566L, 1083L, 985L, 325L, 584L, 621L, 871L, 353L, 2033L, 887L, 1412L, 1205L, 1143L, 1037L, 1592L, 1413L, 1183L, 1216L, 1121L, 888L, 1021L, 846L, 2189L, 1182L, 1412L, 656L, 1708L, 797L, 988L, 80L), type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("control", "diseased"), class = "factor")), .Names = c("A1BG", "A1BG.AS1", "A1CF", "A2LD1", "A2M", "A2ML1", "A4GALT", "A4GNT", "AA06", "AAA1", "AAAS", "AACS", "AACSP1", "AADAC", "AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB", "AAK1", "AAMP", "AANAT", "AARS", "AARS2", "AARSD1", "type"), row.names = c("C_2", "C_4", "C_6", "C_8", "C_9", "C_10", "C_14", "C_15", "C_18", "C_21", "C_29", "P_3", "P_6", "P_13", "P_15", "P_18", "P_19", "P_21", "P_22", "P_29", "P_31", "C_3", "C_5", "C_11", "C_12", "C_13", "C_16", "C_17", "C_19", "C_20", "C_22", "C_23", "C_24", "C_25", "C_26", "P_14", "P_16", "P_20", "P_23", "P_26", "P_27", "P_28", "P_30", "P_33"), class = "data.frame")
Thanks to entering DWin, I looked into the FactoMineR package, which was able to calculate the type of trust ellipses I was asking about. This is the code used:
res.pca <- PCA(input.for.pca, scale.unit=T, ncp=5, quali.sup = 26, graph = F) concat = cbind.data.frame(input.for.pca[,26], res.pca$ind$coord) ellipse.coord = coord.ellipse(concat, level.conf = 0.99999, bary=T) plot.PCA(res.pca, ellipse = ellipse.coord, axes=c(1, 2), choix="ind", habillage=26)
You may notice the level.conf parameter for the coord.ellipse function. By changing this default option to 0.95, I was able to increase the size of the ellipses.
I found this link useful for understanding how to work with FactoMineR.