PCA in R: prcomp and trust ellipses

I recently launched a PCA with the prcomp () function in R, and now I need to (objectively) decide which samples from two different groups are outliers and should be removed from further analysis.

Earlier, I saw PCA plots where confidence / variance ellipses (not sure of terminology) are placed around the samples, leaving outside those considered outliers (for example, say something like more than three standard deviations from the cluster center). How can I achieve something like this in R?

Note. I looked a little at the β€œcar” , but it’s still unclear how to use data.ellipse for eg projection graph of PC1 and PC2. Any help / related resources appreciated!

Thanks!

Edit: the R object that I am using and one of the graphs that I would like to use for marking:

countsTable <- read.table('sample.txt', header=T) transpose.counts.table <- t(countsTable) input.for.pca <- transpose.counts.table[, colSums(abs(transpose.counts.table)) != 0] my.prc <- prcomp(input.for.pca, center=T, scale=T) pdf("PCA_Results_PC1_PC2_prcomp_counts.pdf") plot(my.prc$x[,1], my.prc$x[,2], type='p', cex=0.0, pch=20, main="PCA: Samples' projection on PC1 and PC2 (raw counts)", xlab="PC1", ylab="PC2") text(my.prc$x[,1], my.prc$x[,2], labels=rownames(my.prc$x), cex=1.2) dev.off() 

Updated input.for.pca object that contains a categorical type column:

 > dput(input.for.pca) structure(list(A1BG = c(190L, 125L, 95L, 115L, 483L, 94L, 87L, 211L, 153L, 135L, 116L, 110L, 75L, 159L, 148L, 159L, 177L, 103L, 103L, 88L, 112L, 87L, 272L, 100L, 313L, 169L, 130L, 164L, 114L, 154L, 168L, 197L, 125L, 95L, 118L, 154L, 197L, 203L, 184L, 86L, 142L, 111L, 140L, 63L), A1BG.AS1 = c(77L, 94L, 53L, 52L, 56L, 67L, 55L, 112L, 95L, 51L, 28L, 50L, 35L, 87L, 44L, 93L, 44L, 16L, 21L, 24L, 42L, 43L, 159L, 59L, 125L, 108L, 50L, 68L, 55L, 81L, 81L, 39L, 64L, 67L, 66L, 57L, 114L, 82L, 51L, 21L, 126L, 24L, 53L, 3L), A1CF = c(1L, 3L, 3L, 2L, 0L, 0L, 1L, 5L, 15L, 0L, 1L, 1L, 2L, 1L, 0L, 0L, 3L, 0L, 2L, 1L, 0L, 1L, 2L, 0L, 0L, 1L, 0L, 3L, 2L, 0L, 0L, 6L, 1L, 0L, 0L, 0L, 5L, 1L, 4L, 0L, 2L, 2L, 2L, 0L), A2LD1 = c(94L, 51L, 52L, 57L, 64L, 40L, 48L, 61L, 83L, 53L, 49L, 31L, 40L, 66L, 50L, 43L, 54L, 14L, 73L, 58L, 50L, 36L, 132L, 88L, 96L, 73L, 47L, 73L, 100L, 49L, 40L, 54L, 34L, 34L, 45L, 56L, 77L, 66L, 90L, 62L, 67L, 47L, 80L, 9L), A2M = c(4407L, 4755L, 1739L, 2049L, 3219L, 2598L, 2531L, 3894L, 2067L, 2703L, 3776L, 774L, 3129L, 2924L, 1997L, 5803L, 3147L, 5472L, 9608L, 3315L, 6164L, 1250L, 5911L, 4688L, 2775L, 4561L, 7165L, 3605L, 8228L, 4835L, 7124L, 4689L, 5306L, 3643L, 3190L, 3290L, 4932L, 1990L, 9610L, 7476L, 4533L, 4035L, 3275L, 1326L), A2ML1 = c(195L, 207L, 63L, 291L, 24L, 126L, 168L, 251L, 39L, 145L, 213L, 126L, 179L, 169L, 141L, 272L, 185L, 115L, 588L, 156L, 111L, 45L, 301L, 182L, 155L, 146L, 91L, 160L, 155L, 73L, 44L, 103L, 182L, 71L, 164L, 405L, 245L, 165L, 162L, 317L, 188L, 153L, 228L, 11L), A4GALT = c(191L, 86L, 64L, 200L, 39L, 118L, 106L, 64L, 11L, 40L, 144L, 53L, 134L, 101L, 138L, 138L, 214L, 138L, 406L, 145L, 497L, 72L, 473L, 86L, 41L, 213L, 172L, 77L, 657L, 73L, 123L, 126L, 106L, 44L, 125L, 106L, 56L, 114L, 756L, 328L, 151L, 210L, 213L, 42L), A4GNT = c(3L, 3L, 0L, 5L, 7L, 1L, 0L, 2L, 4L, 3L, 0L, 0L, 0L, 2L, 2L, 2L, 3L, 0L, 1L, 0L, 1L, 2L, 2L, 5L, 4L, 4L, 1L, 1L, 2L, 1L, 1L, 0L, 2L, 2L, 3L, 3L, 5L, 2L, 3L, 2L, 0L, 0L, 1L, 0L), AA06 = c(0L, 0L, 0L, 2L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), AAA1 = c(1L, 5L, 5L, 4L, 0L, 1L, 5L, 0L, 0L, 1L, 0L, 2L, 1L, 12L, 1L, 5L, 6L, 0L, 3L, 2L, 0L, 0L, 14L, 2L, 3L, 0L, 3L, 4L, 0L, 7L, 3L, 4L, 0L, 1L, 4L, 1L, 8L, 8L, 1L, 2L, 4L, 2L, 1L, 1L), AAAS = c(829L, 1042L, 844L, 805L, 1700L, 953L, 809L, 1052L, 1266L, 781L, 618L, 929L, 699L, 992L, 1001L, 1423L, 845L, 1054L, 808L, 711L, 938L, 756L, 1384L, 944L, 1689L, 1052L, 703L, 890L, 1293L, 727L, 804L, 1227L, 668L, 794L, 835L, 877L, 1514L, 1287L, 1435L, 941L, 1115L, 868L, 923L, 288L), AACS = c(2350L, 1953L, 1884L, 1702L, 421L, 1530L, 1435L, 3619L, 815L, 1320L, 859L, 1708L, 1096L, 2124L, 1029L, 1930L, 1241L, 724L, 867L, 893L, 1797L, 447L, 4854L, 1670L, 2675L, 2471L, 1874L, 1620L, 2515L, 3156L, 2079L, 1345L, 1684L, 1615L, 1650L, 1386L, 3470L, 1958L, 2278L, 1076L, 3459L, 1115L, 1369L, 121L), AACSP1 = c(19L, 6L, 11L, 13L, 1L, 11L, 13L, 27L, 5L, 12L, 4L, 7L, 4L, 6L, 5L, 18L, 17L, 0L, 7L, 6L, 4L, 1L, 19L, 16L, 30L, 11L, 12L, 20L, 11L, 10L, 11L, 3L, 4L, 10L, 16L, 4L, 8L, 7L, 10L, 5L, 18L, 6L, 5L, 0L), AADAC = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 2L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L), AADACL2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AADACL3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AADACL4 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 3L, 0L, 0L, 0L, 1L, 0L), AADAT = c(387L, 416L, 297L, 392L, 682L, 422L, 287L, 704L, 50L, 373L, 306L, 234L, 225L, 340L, 220L, 443L, 387L, 324L, 304L, 261L, 259L, 181L, 801L, 428L, 925L, 498L, 270L, 524L, 654L, 472L, 334L, 395L, 414L, 440L, 318L, 306L, 645L, 418L, 350L, 277L, 468L, 302L, 298L, 48L), AAGAB = c(1235L, 1231L, 1026L, 981L, 477L, 877L, 808L, 2217L, 764L, 914L, 670L, 974L, 538L, 1362L, 492L, 1078L, 764L, 297L, 582L, 615L, 923L, 307L, 3055L, 1195L, 1673L, 1673L, 1070L, 1052L, 1761L, 2198L, 1221L, 813L, 1050L, 997L, 865L, 930L, 2065L, 1190L, 1243L, 578L, 1931L, 664L, 874L, 75L), AAK1 = c(6457L, 6538L, 4706L, 4917L, 1252L, 4055L, 4063L, 11627L, 9127L, 3604L, 2439L, 4221L, 3968L, 5065L, 2450L, 5690L, 3065L, 1082L, 2756L, 2886L, 3763L, 1360L, 15237L, 4771L, 7881L, 8349L, 5177L, 4888L, 6532L, 7856L, 5373L, 3487L, 4885L, 4461L, 3893L, 4152L, 9055L, 4656L, 4501L, 2598L, 8079L, 3187L, 3655L, 337L), AAMP = c(2282L, 2585L, 2113L, 2197L, 2226L, 1776L, 2097L, 3614L, 2494L, 2215L, 1707L, 2109L, 1740L, 2620L, 1703L, 2357L, 1965L, 1697L, 1724L, 1623L, 2299L, 1109L, 5555L, 2550L, 4239L, 3149L, 2127L, 2487L, 3966L, 2817L, 2043L, 1967L, 2092L, 2031L, 2123L, 2661L, 4203L, 2884L, 3224L, 1678L, 3876L, 1963L, 2362L, 473L), AANAT = c(33L, 51L, 14L, 26L, 23L, 12L, 36L, 14L, 27L, 24L, 30L, 17L, 11L, 45L, 31L, 28L, 23L, 67L, 77L, 26L, 44L, 17L, 86L, 70L, 16L, 39L, 10L, 27L, 20L, 22L, 23L, 20L, 10L, 12L, 18L, 28L, 41L, 40L, 85L, 40L, 48L, 30L, 46L, 8L), AARS = c(6383L, 9377L, 6772L, 8134L, 5605L, 4734L, 5902L, 13757L, 6832L, 6566L, 4009L, 5377L, 7209L, 7749L, 4105L, 6969L, 5120L, 5484L, 5486L, 4935L, 6604L, 3151L, 24172L, 7615L, 12786L, 12676L, 7009L, 8208L, 11328L, 11550L, 7054L, 4789L, 6547L, 6686L, 6109L, 6456L, 14576L, 8317L, 8057L, 4626L, 13162L, 5801L, 6090L, 1498L), AARS2 = c(1032L, 858L, 687L, 735L, 527L, 655L, 641L, 1480L, 1713L, 753L, 561L, 541L, 459L, 819L, 462L, 867L, 605L, 404L, 571L, 497L, 637L, 343L, 1761L, 1082L, 1379L, 815L, 841L, 844L, 1150L, 1121L, 973L, 665L, 696L, 672L, 824L, 511L, 1313L, 861L, 998L, 626L, 1258L, 555L, 623L, 115L), AARSD1 = c(918L, 1218L, 793L, 877L, 573L, 867L, 916L, 2030L, 1198L, 1015L, 715L, 909L, 437L, 1245L, 566L, 1083L, 985L, 325L, 584L, 621L, 871L, 353L, 2033L, 887L, 1412L, 1205L, 1143L, 1037L, 1592L, 1413L, 1183L, 1216L, 1121L, 888L, 1021L, 846L, 2189L, 1182L, 1412L, 656L, 1708L, 797L, 988L, 80L), type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("control", "diseased"), class = "factor")), .Names = c("A1BG", "A1BG.AS1", "A1CF", "A2LD1", "A2M", "A2ML1", "A4GALT", "A4GNT", "AA06", "AAA1", "AAAS", "AACS", "AACSP1", "AADAC", "AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB", "AAK1", "AAMP", "AANAT", "AARS", "AARS2", "AARSD1", "type"), row.names = c("C_2", "C_4", "C_6", "C_8", "C_9", "C_10", "C_14", "C_15", "C_18", "C_21", "C_29", "P_3", "P_6", "P_13", "P_15", "P_18", "P_19", "P_21", "P_22", "P_29", "P_31", "C_3", "C_5", "C_11", "C_12", "C_13", "C_16", "C_17", "C_19", "C_20", "C_22", "C_23", "C_24", "C_25", "C_26", "P_14", "P_16", "P_20", "P_23", "P_26", "P_27", "P_28", "P_30", "P_33"), class = "data.frame") 

Thanks to entering DWin, I looked into the FactoMineR package, which was able to calculate the type of trust ellipses I was asking about. This is the code used:

 res.pca <- PCA(input.for.pca, scale.unit=T, ncp=5, quali.sup = 26, graph = F) concat = cbind.data.frame(input.for.pca[,26], res.pca$ind$coord) ellipse.coord = coord.ellipse(concat, level.conf = 0.99999, bary=T) plot.PCA(res.pca, ellipse = ellipse.coord, axes=c(1, 2), choix="ind", habillage=26) 

You may notice the level.conf parameter for the coord.ellipse function. By changing this default option to 0.95, I was able to increase the size of the ellipses.

I found this link useful for understanding how to work with FactoMineR.

+4
source share
1 answer

In the absence of data for work, I can suggest looking at the FactoMineR package, which prescribes some kind of PCA graph with additional ellipses: plot.PCA "Draw the graphs of the main component (PCA): Setting the" ellipse "argument for a value other than NULL should : "draw ellipses around individuals and use coord.ellipse results coord.ellipse "

Working with your data with FactoMiner::PCA I can get the same plot as the results of your prcomp call.

 require(FactoMineR) PCAres <-PCA(input.for.pca) # draws two plots as a side-effect 

I cannot get data ellipses using my inline argument in a string. Having studied the example of this procedure on the help page, I believe that this is due to the fact that identifying the group membership requires a factor category identifier so that the component values ​​can be labels and ellipses drawn around the group centroids.

+3
source

All Articles