If fullfiles two rules store names in a vector

Question

If fullfiles two rules store names in a vector

Start with the data:

> dput(Data1) structure(list(X1 = structure(c(17L, 14L, 20L, 16L, 1L, 2L, 3L, 4L, 15L, 8L, 9L, 10L, 11L, 12L, 13L, 21L, 22L, 23L, 18L, 19L, 5L, 6L, 7L), .Label = c("Astra_1", "Astra_2", "Astra_3", "Astra_4", "Audi_1", "Audi_2", "Audi_3", "BMW_1", "BMW_2", "BMW_3", "BMW_4", "BMW_5", "Fiat_1", "Mazda_2", "Mercedes_1", "Nexia_1", "Porsche_1", "Scania_1", "Scania_2", "Tico_1", "VW_1", "VW_2", "VW_3"), class = "factor"), X2 = structure(c(2L, 3L, 10L, 7L, 8L, 12L, 9L, 14L, 11L, 4L, 5L, 6L, 15L, 13L, 4L, 5L, 9L, 14L, 11L, 1L, 3L, 10L, 16L), .Label = c("Astra_1", "Astra_3", "Astra_4", "Audi_1", "Audi_2", "Audi_3", "BMW_1", "BMW_2", "Mazda_2", "Mercedes_1", "Nexia_1", "Porsche_1", "Scania_2", "Tico_1", "VW_2", "VW_3" ), class = "factor"), AUC_1 = c(5860133.702, 1296009.939, 333123.4932, 250348.9407, 1376193.334, 4080502.863, 3777603.233, 3503973.487, 99101538.62, 231873.8462, 87258.75465, 147430.9913, 1028986.892, 1451482.832, 8136.72382, 25311.41683, 131352.7137, 565410.8186, 30196.23792, 70184.82268, 2526321.019, 381643.2138, 819687.9824), AUC_2 = c(4849720.322, 928980.4715, 320547.6185, 223287.2029, 1340641.323, 4720329.699, 4369150.434, 3371021.243, 108591253.3, 266489.7601, 85384.84604, 165726.7626, 1052130.559, 1470876.65, 9499.927679, 49309.74984, 138482.765, 444600.7911, 25132.73714, 55453.67019, 2038911.81, 422559.3293, 1445477.433 ), ratio = c(1.20834467, 1.395088463, 1.03923247, 1.121196994, 1.02651866, 0.864452935, 0.864608186, 1.039439753, 0.91261069, 0.87010415, 1.021946618, 0.889602795, 0.978003046, 0.98681479, 0.856503765, 0.513314647, 0.948513078, 1.271726974, 1.201470327, 1.265647926, 1.2390536, 0.90317072, 0.567070757), Country = structure(c(1L, 1L, 2L, 3L, 5L, 1L, 5L, 1L, 4L, 7L, 4L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 8L, 8L, 6L, 6L, 7L), .Label = c("France", "Germany", "Italy", "Norway", "Poland", "Spain", "Sweden", "Ukraine" ), class = "factor"), Comp = structure(c(3L, 5L, 16L, 9L, 8L, 9L, 12L, 14L, 4L, 15L, 11L, 14L, 16L, 17L, 10L, 10L, 12L, 13L, 1L, 2L, 5L, 6L, 7L), .Label = c("11,12", "12,13", "12,13,14", "14,15", "14,15,16", "15,16,17", "16,17,18", "2,3", "2,3,4", "3,4", "3,4,5", "4,5,6", "5,6", "5,6,7", "5,6,7,8", "6,7,8", "7,8,9"), class = "factor")), .Names = c("X1", "X2", "AUC_1", "AUC_2", "ratio", "Country", "Comp"), class = "data.frame", row.names = c(NA, -23L))

The data chapter looks like this:

  X1 X2 AUC_1 AUC_2 ratio Country Comp 1 Porsche_1 Astra_3 5860133.7 4849720.3 1.2083447 France 12,13,14 2 Mazda_2 Astra_4 1296009.9 928980.5 1.3950885 France 14,15,16 3 Tico_1 Mercedes_1 333123.5 320547.6 1.0392325 Germany 6,7,8 4 Nexia_1 BMW_1 250348.9 223287.2 1.1211970 Italy 2,3,4 5 Astra_1 BMW_2 1376193.3 1340641.3 1.0265187 Poland 2,3 6 Astra_2 Porsche_1 4080502.9 4720329.7 0.8644529 France 2,3,4

Now we will focus on the last two columns: Country and Comp . I would like to extract all the rows that contain the same country, and compare if any of the numbers in the Comp column are the same, the rows from X1 and X2 should be stored together - perhaps in separate vectors or in a matrix. It is possible that one row may belong to different "clusters" / "vectors".

An example of the desired conclusion. This is just an example and clustering is completely random. Any output visualization method is valid.

  Country 1 2 3 4 5 6 1 France Astra_3 Scania_2 Tico_1 NA NA NA 2 Poland Astra_4 Mazda_2 VW_3 Tico_2 NA NA 3 Sweden Mercedes_1 BMW_1 BMW_2 Audi_1 VW_3 NA 4 Norway BMW_1 Astra_1 Scania_2 Audi_3 NA NA

+6

r

Shaxi liver Apr 14 '16 at 10:48

source share

2 answers

Assuming dat is your data.

 library(data.table) library(stringr) setDT(dat) dat[, `:=`(X1 = as.character(X1), X2 = as.character(X2), Comp = str_split(as.character(Comp), ","))] dat[, lapply(.SD, unlist), by = 1:nrow(dat) ][, .(X = paste(sort(unique(c(X1, X2))), collapse = ",")), by = .(Country, Comp) ][, .(SharedComp = paste(Comp, collapse = ",")), by = .(Country, X)] -> result head(result) Country X SharedComp 1: France Astra_3,Porsche_1 12,13 2: France Astra_3,Astra_4,Mazda_2,Porsche_1 14 3: France Astra_4,Mazda_2 15,16 4: Germany Mercedes_1,Tico_1 6,7,8 5: Italy BMW_1,Nexia_1 2,3,4 6: Poland Astra_1,BMW_2 2,3

If you want the result to be more like your question, you need to do some permutation.

 dcast(result[, .(Country, SharedComp, X = str_split(X, ",")) ][, lapply(.SD, unlist), by = 1:nrow(result) ][, i := seq_len(.N), by = nrow], nrow + Country ~ i, value.var = "X") nrow Country 1 2 3 4 5 6 7 8 1: 1 France Astra_3 Porsche_1 NA NA NA NA NA NA 2: 2 France Astra_3 Astra_4 Mazda_2 Porsche_1 NA NA NA NA 3: 3 France Astra_4 Mazda_2 NA NA NA NA NA NA 4: 4 Germany Mercedes_1 Tico_1 NA NA NA NA NA NA 5: 5 Italy BMW_1 Nexia_1 NA NA NA NA NA NA 6: 6 Poland Astra_1 BMW_2 NA NA NA NA NA NA --- 11: 11 Sweden Audi_1 Audi_3 BMW_1 BMW_3 NA NA NA NA 12: 12 Sweden Audi_1 Audi_3 BMW_1 BMW_3 BMW_4 VW_2 NA NA 13: 13 Sweden Audi_1 Audi_3 BMW_1 BMW_3 BMW_4 BMW_5 Scania_2 VW_2 --- 25: 25 Spain Audi_2 Mercedes_1 NA NA NA NA NA NA 26: 26 Sweden Audi_3 VW_3 NA NA NA NA NA NA nrow Country 1 2 3 4 5 6 7 8

+3

inscaven Apr 19 '16 at 10:44

source share

Queak · Accepted Answer · 2016-04-18T20:32:30+0000

I suppose you want: find all lines with a given country, say, in Spain. Then in these lines take all the lines where a certain number appears in the Comp column, for example. 4. On these lines, extract the contents of columns X1 and X2 and connect them.

Perhaps you need this code:

 countries <- levels(data[,"Country"]) results <- list() cn <- 1 for (i in 1:length(countries)) { # find all row numbers with that country: idx <- which(data[,"Country"] == countries[i]) # get all numbers which occur for that country: numbers <- unique(as.numeric(unlist(strsplit(as.character(data[idx,"Comp"]), ",")))) for (j in 1:length(numbers)) { # split all the numbers in the column "Comp" by ",": CompList <- strsplit(as.character(data[idx,"Comp"]), ",") # get all the row numbers for that country where numbers[j] is contained in the column "Comp": rows <- idx[unlist(lapply(CompList, function(x) {any(x == as.character(numbers[j]))}))] # assuming you want a number in the column "Comp" to occur at least in two rows: if (length(rows) > 1) { results[[cn]] <- list("Country"= countries[i], "Cars"= as.vector(as.matrix(data[rows, c("X1", "X2")])), "ValueOfComp"=numbers[j]) cn <- cn + 1 } } }

This gives you something like the following:

 > results [[1]] [[1]]$Country [1] "France" [[1]]$Cars [1] "Porsche_1" "Mazda_2" "Astra_3" "Astra_4" [[1]]$ValueOfComp [1] 14 [[2]] [[2]]$Country [1] "Spain" [[2]]$Cars [1] "Fiat_1" "VW_1" "Audi_1" "Audi_2" [[2]]$ValueOfComp [1] 3

If fullfiles two rules store names in a vector

More articles: