For each value, determine whether another column contains the larger or smaller number

I am reviewing some data for bolts. If I had, for example,

diameter thread 1 4 1 6 1 4 2 5 2 7 3 9 

I want to create a new column that tells me whether it is the largest or smallest thread for each diameter. For each diameter there are no more than two thread sizes, but sometimes there is only one, and in that case I would like it to be labeled as large. For instance:

 diameter thread size 1 4 small 1 6 large 1 4 small 2 5 small 2 7 large 3 9 large 
+5
source share
5 answers

This is pretty easy using dplyr:

 library(dplyr)

 # Example bolts: one row per (diameter, thread) combination.
 data <- data.frame(
   diameter = c(1, 1, 2, 2, 3),
   thread = c(4, 6, 5, 7, 9)
 )

 # Within each diameter, the row holding the largest thread is "large" and
 # every other row is "small". A diameter with a single thread is therefore
 # "large", because that thread is its own maximum.
 data %>%
   group_by(diameter) %>%
   mutate(size = ifelse(thread == max(thread), "large", "small"))

 # Printed result:
 #   diameter thread  size
 #      (dbl)  (dbl) (chr)
 # 1        1      4 small
 # 2        1      6 large
 # 3        2      5 small
 # 4        2      7 large
 # 5        3      9 large
+5
source

How about this (using base R):

 # Label each row "large" when its thread is the largest thread of its OWN
 # diameter, and "small" otherwise.
 #
 # Testing `dt$thread %in% <all group maxima>` is not enough: a thread that is
 # not the maximum of its diameter would still be labelled "large" whenever it
 # happens to equal the maximum of some other diameter. Each row must be
 # compared with the maximum of its own group.

 # Largest thread per diameter (one row per diameter).
 group_max <- aggregate(thread ~ diameter, data = dt, FUN = max)

 # Align the per-diameter maxima with the rows of dt.
 own_max <- group_max$thread[match(dt$diameter, group_max$diameter)]

 # A diameter with a single thread is labelled "large", because that thread
 # is its own maximum.
 dt$size <- ifelse(dt$thread == own_max, "large", "small")

OUTPUT

  diameter thread size 1 1 4 small 2 1 6 large 3 1 4 small 4 2 5 small 5 2 7 large 6 3 9 large 

DATA

 # Sample data: six bolts with integer diameter and thread columns.
 # Diameter 1 contains a repeated thread (4), diameter 3 has a single thread.
 dt <- data.frame(
   diameter = c(1L, 1L, 1L, 2L, 2L, 3L),
   thread = c(4L, 6L, 4L, 5L, 7L, 9L)
 )

BENCHMARK

 library(dplyr)
 library(microbenchmark)

 # Sample data shared by every candidate implementation.
 dt <- structure(
   list(
     diameter = c(1L, 1L, 1L, 2L, 2L, 3L),
     thread = c(4L, 6L, 4L, 5L, 7L, 9L)
   ),
   .Names = c("diameter", "thread"),
   class = "data.frame",
   row.names = c(NA, -6L)
 )

 # Each func_* below takes a data frame with `diameter` and `thread` columns
 # and returns it with an added `size` column: "large" for the largest thread
 # within a diameter, "small" otherwise.

 # dplyr: grouped mutate.
 func_ZachTurn <- function(data) {
   data %>%
     group_by(diameter) %>%
     mutate(size = ifelse(thread == max(thread), "large", "small"))
 }

 # base R: aggregate() for the per-diameter maxima, then compare each row with
 # the maximum of its OWN diameter. Testing `thread %in% <all maxima>` would
 # also mark a thread that merely equals another diameter's maximum.
 func_m0h3n <- function(dt) {
   group_max <- aggregate(thread ~ diameter, data = dt, FUN = max)
   own_max <- group_max$thread[match(dt$diameter, group_max$diameter)]
   dt$size <- ifelse(dt$thread == own_max, "large", "small")
   dt
 }

 # data.table: grouped assignment by reference on a private copy.
 # as.data.table() is used instead of setDT() because setDT() converts its
 # argument by reference, which would alter the caller's data frame and leak
 # into the other candidates' runs.
 func_Psidom <- function(df) {
   df <- data.table::as.data.table(df)
   df[,
     size := c("small", "large")[(thread == max(thread)) + 1L],
     by = .(diameter)
   ]
   df[]
 }

 # Helper for func_docendo.discimus: returns index 2 ("large") for the group
 # maximum (or for a single-element group) and index 1 ("small") otherwise.
 f <- function(x) (if (length(x) == 1) 1L else x == max(x)) + 1L

 # base R: ave() applies f within each diameter group.
 func_docendo.discimus <- function(dat) {
   dat$size <- c("small", "large")[ave(dat$thread, dat$diameter, FUN = f)]
   dat
 }

 # base R: split / lapply / unsplit; note that `size` is returned as a factor.
 func_Ernest.A <- function(df) {
   labels_by_group <- lapply(
     split(df$thread, df$diameter),
     function(x) ifelse(x == max(x), "large", "small")
   )
   df$size <- factor(unsplit(labels_by_group, df$diameter))
   df
 }

 # Sanity check: every implementation agrees with the dplyr result.
 r <- func_ZachTurn(dt)
 all(r == func_m0h3n(dt))
 # [1] TRUE
 all(r == func_docendo.discimus(dt))
 # [1] TRUE
 all(r == func_Ernest.A(dt))
 # [1] TRUE
 all(r == as.data.frame(func_Psidom(dt)))
 # [1] TRUE

 microbenchmark(
   func_ZachTurn(dt),
   func_m0h3n(dt),
   func_docendo.discimus(dt),
   func_Ernest.A(dt),
   func_Psidom(dt)
 )
 # Timings as published with the original answers; re-run the benchmark to get
 # figures for the corrected func_m0h3n and func_Psidom.
 # Unit: microseconds
 #                       expr      min       lq      mean   median        uq      max neval
 #          func_ZachTurn(dt) 3477.835 3609.147 3833.5482 3679.079 3860.6490 7136.169   100
 #             func_m0h3n(dt) 4436.367 4601.042 4879.2726 4743.474 4859.8150 8578.031   100
 #  func_docendo.discimus(dt)  854.168  923.673  999.2991  956.180  992.9645 4422.252   100
 #          func_Ernest.A(dt) 1032.101 1086.636 1165.4361 1129.195 1167.9040 4882.057   100
 #            func_Psidom(dt) 1537.245 1622.577 1731.0602 1678.822 1742.3395 5424.840   100
+3
source

Here's a base R option, using ave to group by diameter. I also create a small helper function f to improve readability.

 # Helper f: maps the values of one group to an index into
 # c("small", "large"). A single-element group yields 2 ("large"); otherwise
 # the group maximum yields 2 and every other value yields 1.
 f <- function(x) {
   if (length(x) == 1) {
     return(2L)
   }
   1L + (x == max(x))
 }

 # Apply f to thread within each diameter group and translate the resulting
 # indices into labels.
 dat[["size"]] <- c("small", "large")[
   ave(dat[["thread"]], dat[["diameter"]], FUN = f)
 ]

 # Result:
 #   diameter thread  size
 # 1        1      4 small
 # 2        1      6 large
 # 3        2      5 small
 # 4        2      7 large
 # 5        3      9 large
+2
source

Here is the dplyr solution:

 library(dplyr)

 # Within each diameter, the largest thread is "large" and every other thread
 # is "small". The comparison is made against max(thread), not min(thread):
 # a diameter with only one thread must be labelled "large", and that single
 # thread is both the minimum and the maximum of its group.
 df %>%
   group_by(diameter) %>%
   mutate(size = ifelse(thread == max(thread), "large", "small"))

 # Printed result:
 # Source: local data frame [5 x 3]
 # Groups: diameter [3]
 #
 #   diameter thread  size
 #      (int)  (int) (chr)
 # 1        1      4 small
 # 2        1      6 large
 # 3        2      5 small
 # 4        2      7 large
 # 5        3      9 large

Or using data.table :

 # Convert df to a data.table in place.
 data.table::setDT(df)

 # Add `size` by reference, computed per diameter: the logical
 # (thread == max(thread)) is shifted to 1/2 and used as an index into the
 # label vector, so the group maximum becomes "large" and the rest "small".
 df[,
   size := c("small", "large")[1L + (thread == max(thread))],
   by = .(diameter)
 ]
 df

 # Printed result:
 #    diameter thread  size
 # 1:        1      4 small
 # 2:        1      6 large
 # 3:        2      5 small
 # 4:        2      7 large
 # 5:        3      9 large
+1
source

Using split + ifelse + unsplit

 # Split thread by diameter, label every value of a group that is below the
 # group maximum as "small" and the maximum itself as "large", then put the
 # labels back in the original row order. The result is stored as a factor.
 df[["size"]] <- factor(
   unsplit(
     lapply(
       split(df[["thread"]], df[["diameter"]]),
       function(v) {
         ifelse(v < max(v), "small", "large")
       }
     ),
     df[["diameter"]]
   )
 )
+1
source

All Articles