How to match 2 columns of dataframe and extract column values ​​and column names?

I have a matrix called mymat . I have a vector called geno <- c("01","N1","11","1N","10") . I have another table called key.table . I want that I want to map the key column in key.table to the key column in mymat and if the column values ​​in any of the corresponding rows have any of the geno elements I want to extract the name of this column from mymat along with the corresponding geno element and insert it into a new column in matched.extract in key.table in the corresponding rows for each key and get the result.

  mymat <- structure(c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118", "0N", "0N", "1N", "0N", "0N", "00", "00", "00", "11", "10", "00", "00", "1N", "0N", "00"), .Dim = c(5L, 4L), .Dimnames = list( c("34", "35", "36", "37", "38"), c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R"))) key.table<- structure(c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118", "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125", "chr5:12127", "chr5:12129", "9920068", "9920069", "9920070", "9920071", "9920072", "9920073", "9920074", "9920075", "9920076", "9920077", "9920078"), .Dim = c(11L, 2L), .Dimnames = list(c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"), c("key", "variantId"))) 

Result

  key variantId matched.extract 34 "chr5:12111" "9920068" NA 35 "chr5:12111" "9920069" NA 36 "chr5:12113" "9920070" AMLM12001KP (1N),AMLM12014N-R (1N) 37 "chr5:12114" "9920071" AMAS-11.3-Diagnostic (11) 38 "chr5:12118" "9920072" AMAS-11.3-Diagnostic (10) 39 "chr5:12122" "9920073" NA 40 "chr5:12123" "9920074" NA 41 "chr5:12123" "9920075" NA 42 "chr5:12125" "9920076" NA 43 "chr5:12127" "9920077" NA 44 "chr5:12129" "9920078" NA 
+6
source share
3 answers

Using data.table, I would approach like this:

 library(data.table) # convert the 'key.table' matrix to a data.table kt <- as.data.table(key.table, keep.rownames=TRUE) # convert the 'mymat' matrix to a data.table and melt into long format # filter on the needed geno-types # paste the needed values together into the requested format mm <- melt(as.data.table(mymat, keep.rownames=TRUE), id=c("rn","key"))[value %in% c("1N","11","10"), val := paste0(variable," (",value,")") ][, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn,key) ][val=="", val:=NA] # join the 'mm' and 'kt' data.tables kt[mm, matched := val, on=c("rn","key")] 

which gives:

 > kt rn key variantId matched 1: 34 chr5:12111 9920068 NA 2: 35 chr5:12111 9920069 NA 3: 36 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N) 4: 37 chr5:12114 9920071 AMAS-11.3-Diagnostic (11) 5: 38 chr5:12118 9920072 AMAS-11.3-Diagnostic (10) 6: 39 chr5:12122 9920073 NA 7: 40 chr5:12123 9920074 NA 8: 41 chr5:12123 9920075 NA 9: 42 chr5:12125 9920076 NA 10: 43 chr5:12127 9920077 NA 11: 44 chr5:12129 9920078 NA 

Explanation:

  • kt <- as.data.table(key.table, keep.rownames=TRUE) converts the matrix key.table to data.table (which is an extended data.frame ) and stores the names of the growths in the rn column.
  • mm <- melt(as.data.table(mymat, keep.rownames=TRUE), id=c("rn","key")) converts the mymat matrix into data.table , saves the data.table names in the rn column and melts the table data.table in a long format.
  • part of [value %in% c("1N","11","10"), val := paste0(variable," (",value,")")] will insert variable values ​​(which were columns in mymat ) with value values ​​only in those cases when value is 1N , 11 or 10 .
  • part [, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn,key)] will insert rows not NA val together with the variables rn and key .
  • part [val=="", val:=NA] converts empty strings for val to NA values
  • finally kt[mm, matched := val, on=c("rn","key")] updates kt -data.table with a link to the val values ​​of the mm -data.table table for the corresponding rn and key variables.

WARNING When using data.table, it is better not to use key as the variable name, since key also a parameter in data.table . See ?key for more information.

+7
source

I am not familiar with dplyr functions. You can try the basic merge function R:

 mm <- merge(key.table,mymat,by="key",all.x=T) mm 

To insert column names into a fabric type:

 get.geno <- function(x,y) ifelse(!x %in% c("00","0N") & !is.na(x), paste0(y," (",x,")"), NA) a <- t(apply(mm[,3:5], 1, get.geno, colnames(mm)[3:5])) 

final information frame:

 mm$result <- apply(a, 1, function(x) paste(x[!is.na(x)] ,collapse=",")) mm[, -3:-5] key variantId result 1 chr5:12111 9920068 2 chr5:12111 9920068 3 chr5:12111 9920069 4 chr5:12111 9920069 5 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N) 6 chr5:12114 9920071 AMAS-11.3-Diagnostic (11) 7 chr5:12118 9920072 AMAS-11.3-Diagnostic (10) 8 chr5:12122 9920073 9 chr5:12123 9920074 10 chr5:12123 9920075 11 chr5:12125 9920076 12 chr5:12127 9920077 13 chr5:12129 9920078 
+4
source

Not quite sure what you want, but it may be close to this:

 library(reshape2) mymat <- structure( c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118", "0N", "0N", "1N", "0N", "0N", "00", "00", "00", "11", "10", "00", "00", "1N", "0N", "00"), .Dim = c(5L, 4L), .Dimnames = list( c("34", "35", "36", "37", "38"), c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R"))) key.table<- structure( c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118", "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125", "chr5:12127", "chr5:12129", "9920068", "9920069", "9920070", "9920071", "9920072", "9920073", "9920074", "9920075", "9920076", "9920077", "9920078"), .Dim = c(11L, 2L), .Dimnames = list( c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"), c("key", "variantId"))) # work with dataframes mmdf <- data.frame(mymat) ktdf <- data.frame(key.table) tdf <- merge(mmdf,ktdf,by="key") mltdf <- melt(tdf,id.vars=c("key","variantId")) mltdf1 <- mltdf[mltdf$value != "0N" & mltdf$value != "00" ,] mltdf1 

Yielding:

  key variantId variable value 5 chr5:12113 9920070 AMLM12001KP 1N 13 chr5:12114 9920071 AMAS.11.3.Diagnostic 11 14 chr5:12118 9920072 AMAS.11.3.Diagnostic 10 19 chr5:12113 9920070 AMLM12014N.R 1N 
+1
source

All Articles