To display the comparison of a row vector with each row of a data block in R?

Suppose I have a data frame that comes from reading in the following file Foo.csv

A,B,C
1,2,3
2,2,4
1,7,3

I would like to count the number of matching elements between the first line and subsequent lines. For example, the first line corresponds to the second line in one position and corresponds to the third line in two positions. Here is some code that will achieve the desired effect.

foo = read.csv("Foo.csv")                      

numDiffs = rep(0,dim(foo)[1])                  
for (i in 2:dim(foo)[1]) {                     
   numDiffs[i] = sum(foo[i,] == foo[1,])       
}                                              
print(numDiffs)                                

My question is: can this be vectorized in order to kill the cycle and possibly shorten the working time ? My first attempt is given below, but it leaves an error because it is ==not defined for this type of comparison.

colSums(foo == foo[1,])
+4
source share
4 answers
> rowSums(sapply(foo, function(x) c(0,x[1] == x[2:nrow(foo)])))
[1] 0 1 2
+4

, , - :

as.vector(c(0, rowSums(foo[rep(1, nrow(foo) - 1), ] == foo[-1, ])))
# [1] 0 1 2

, data.frame .


, . "N", data.frame. @nacnudus .

set.seed(1)
N <- 10000000
mydf <- data.frame(matrix(sample(10, N, replace = TRUE), ncol = 10))
dim(mydf)
# [1] 1000000      10

fun1 <- function(data) rowSums(sapply(data, function(x) c(0,x[1] == x[2:nrow(data)])))
fun2 <- function(data) as.vector(c(0, rowSums(data[rep(1, nrow(data) - 1), ] == data[-1, ])))
fun3 <- function(data) {
  bar <- as.matrix(data)
  c(0, rowSums(t(t(bar[-1, ]) == bar[1, ])))
}

library(microbenchmark)

## On your original sample data
microbenchmark(fun1(foo), fun2(foo), fun3(foo))
# Unit: microseconds
#       expr     min       lq   median       uq     max neval
#  fun1(foo) 109.903 119.0975 122.5185 127.0085 228.785   100
#  fun2(foo) 333.984 354.5110 367.1260 375.0370 486.650   100
#  fun3(foo) 233.490 250.8090 264.7070 269.8390 518.295   100

## On the sample data created above--I don't want to run this 100 times!
system.time(fun1(mydf))
#    user  system elapsed 
#   15.53    0.06   15.60
system.time(fun2(mydf))
#    user  system elapsed 
#    2.05    0.01    2.06 
system.time(fun3(mydf))
#    user  system elapsed 
#    0.32    0.00    0.33 

, Codoremifa vapply sapply, ! 15 0,24 1 .

fun4 <- function(data) {
  rowSums(vapply(data, function(x) c(0, x[1] == x[2:nrow(data)]), 
                 vector("numeric", length=nrow(data))))
} 

microbenchmark(fun3(mydf), fun4(mydf), times = 20)
# Unit: milliseconds
#        expr      min       lq   median       uq      max neval
#  fun3(mydf) 369.5957 422.9507 438.8742 462.6958 486.3757    20
#  fun4(mydf) 238.1093 316.9685 323.0659 328.0969 341.5154    20
+4

:

bar <- as.matrix(foo)
c(0, rowSums(t(t(bar[-1, ]) == bar[1, ])))
# [1] 0 1 2

t() , - , .

+4

eh, , .

c(foo[1,]) == foo
#         A     B     C
#[1,]  TRUE  TRUE  TRUE
#[2,] FALSE  TRUE FALSE
#[3,]  TRUE FALSE  TRUE

.. foo[1,,drop=TRUE] == foo...

, ...

rowSums( c( foo[1,] ) == foo[-1,] )
#[1] 3 1 2

, f[1,] - data.frame. == , . , vapply, @AnandaMahto .

fun3 fun4 @AnandaMahto , data.frame, my.df...

microbenchmark(fun3(mydf), fun4(mydf), fun6(mydf) , times = 20)
#Unit: milliseconds
#       expr      min       lq   median       uq      max neval
# fun3(mydf) 320.7485 344.9249 356.1657 365.7576 399.5334    20
# fun4(mydf) 299.6660 313.7105 319.1700 327.8196 555.4625    20
# fun6(mydf) 196.8244 241.4866 252.6311 258.8501 262.7968    20

fun6 ...

fun6 <- function(data) rowSums( c( data[1,] ) == data )
+4

All Articles