How to add a column that counts duplicates in a sequence?

Question

How to add a column that counts duplicates in a sequence?

I want to add a column to a data frame (integrates2) that counts duplicates in a sequence. The following is the data:

name    program  date of contact   helper column
John     ffp        10/11/2014          2
John     TP         10/27/2014          2
Carlos   TP         11/19/2015          3
Carlos   ffp        12/1/2015           3
Carlos   wfd        12/31/2015          3
Jen      ffp        9/9/2014            2
Jen      TP         9/30/2014           2

This is a list of people who attended certain programs on certain dates. I added a helper column to count duplicates and sort contact dates. I am looking to count combinations of existing programs (e.g. ffp-tp, tp-ffp-wfd).

To do this, I want to implement the following code to transpose ordered combinations using a new column named "program2":

 # Transpose the programs: one row per name, one column per program2 value.
 # library() stops with an error if reshape2 is missing, unlike require().
 library(reshape2)
 dcast(integrates2, name ~ program2, value.var = "program")

Then I plan to use the following code to convert the result into a table and data frame and count the frequencies:

 # Tabulate the data frame, then flatten the counts into a data frame.
 res <- table(integrates2)
 resdf <- as.data.frame(res)

: dataframe ext

After adding the "program2" column, the data should look like this:

  Name    program  date of contact   helper column   program2
  John     ffp        10/11/2014          2             1
  John     TP         10/27/2014          2             2
  Carlos   TP         11/19/2015          3             1
  Carlos   ffp        12/1/2015           3             2
  Carlos   wfd        12/31/2015          3             3

Then, after transposing by "program2" and counting the frequencies, the final result should look like this:

    program  pro1   pro2   freq      
     ffp     tp             2   
     TP      ffp    wfd     1

, , , . !

+4

r

LoF10 14 . '16 0:02

2

:

dplyr,

library(dplyr)
# Spread each person's programs (in row order) into three positional columns,
# then count how many people share each ordered combination.
integrates2 %>%
  group_by(name) %>%
  summarise(
    prg1 = program[1],
    prg2 = program[2],
    prg3 = program[3]
  ) %>%
  select(prg1, prg2, prg3) %>%
  group_by(prg1, prg2, prg3) %>%
  summarise(freq = n())

Source: local data frame [2 x 4]
Groups: prg1, prg2 [?]

    prg1   prg2   prg3  freq
  (fctr) (fctr) (fctr) (int)
1    ffp     TP     NA     2
2     TP    ffp    wfd     1

mydf2

Source: local data frame [3 x 4]
Groups: prg1, prg2 [?]

   prg1  prg2  prg3  freq
  (chr) (chr) (chr) (int)
1   ffp    TP    NA     1
2    TP   ffp    NA     1
3   wfd    TP   ffp     1

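group_by on name to work per person;
summarise to spread each person's program values into prg1, prg2 and prg3;
select to drop name, keeping only the prg* columns;
group_by all the prg* columns,
and summarise to add freq, the number of people with each combination.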

R, , ( , ):

# Count each ordered program sequence, one sequence per person.
# split() keeps row order within each name, so programs are pasted in the
# order they appear in integrates2.
tab <- table(vapply(split(integrates2$program, integrates2$name),
                    function(x) paste(x, collapse = "-"),
                    character(1)))
# Break each distinct sequence back into its individual programs.
prgs <- strsplit(names(tab), "-")
# Pad shorter sequences with NA so every row has the same width.
max_len <- max(lengths(prgs))
programs <- do.call(rbind, lapply(prgs, function(x) {
  c(x, rep(NA, max_len - length(x)))
}))
# Attach the counts as the last column, then name the columns prgm1..prgmK, freq.
programs <- cbind(as.data.frame(programs), matrix(tab))
names(programs) <- c(paste0("prgm", seq_len(length(programs) - 1)), "freq")

, :

# One "-"-joined program sequence per name, tabulated.
table(sapply(split(integrates2$program, integrates2$name),
             paste, collapse = '-'))

ffp-TP TP-ffp-wfd 
     2          1

as.matrix,

           [,1]
ffp-TP        2
TP-ffp-wfd    1

:

With reshape2, use dcast to build a data.frame of programs (one row per name; the name column itself is dropped with [,-1]):

library(reshape2)
# Cast to one row per name with one column per program, then drop the leading
# name column. drop = FALSE keeps a data.frame even when only one program
# column remains (plain [, -1] would collapse it to a vector).
programs <- dcast(integrates2, name ~ program, value.var = "program")
programs <- programs[, -1, drop = FALSE]

programs :

> programs
  ffp TP  wfd
1 ffp TP  wfd
2 ffp TP <NA>
3 ffp TP <NA>

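Then use dplyr to group programs by all of its columns (equivalent to group_by(ffp, TP, wfd), but without hard-coding the names) and summarise, counting rows with n():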

library(dplyr)
# Group by every column of programs without hard-coding the names, then count
# the rows in each group. group_by_() is deprecated; across(all_of()) is the
# supported way to group by a character vector of names (needs dplyr >= 1.0).
programs %>%
  group_by(across(all_of(names(programs)))) %>%
  summarise(freq = n())

Source: local data frame [2 x 4]
Groups: ffp, TP [?]

    ffp    TP   wfd  freq
  (chr) (chr) (chr) (int)
1   ffp    TP   wfd     1
2   ffp    TP    NA     2

0

alistaire 14 . '16 1:40

jazzurro · Accepted Answer · 2016-03-14T01:27:03+0000

, , . , . , , .

# Collapse each person's programs into one "-"-joined sequence, then count
# how many people share each sequence.
# Note: setDT() converts mydf to a data.table by reference.
setDT(mydf)[
  , .(type = paste(program, collapse = "-")), by = name
][
  , .(total = .N), by = type
]

#         type total
#1:     ffp-TP     2
#2: TP-ffp-wfd     1

If you want each program in its own column, use cSplit() from the splitstackshape package.

# Same per-sequence counts as before, kept in temp so they can be split.
temp <- setDT(mydf)[
  , list(type = paste(program, collapse = "-")), by = name
][
  , list(total = .N), by = type
]

# Split the "-"-joined sequence into one column per position.
cSplit(temp, splitCols = "type", sep = "-")

#   total type_1 type_2 type_3
#1:     2    ffp     TP     NA
#2:     1     TP    ffp    wfd

dplyr:

# Build one "-"-joined program sequence per name, then count each sequence.
mydf %>%
  group_by(name) %>%
  summarise(type = paste(program, collapse = "-")) %>%
  count(type)

#        type     n
#       (chr) (int)
#1     ffp-TP     2
#2 TP-ffp-wfd     1

DATA

# Sample data: one row per (person, program) contact, in contact order.
# stringsAsFactors = FALSE keeps the text columns as character on R < 4.0.
mydf <- data.frame(
  name = c("John", "John", "Carlos", "Carlos", "Carlos", "Jen", "Jen"),
  program = c("ffp", "TP", "TP", "ffp", "wfd", "ffp", "TP"),
  dateOfContact = c(
    "10/11/2014", "10/27/2014", "11/19/2015", "12/1/2015",
    "12/31/2015", "9/9/2014", "9/30/2014"
  ),
  helperColumn = c(2L, 2L, 3L, 3L, 3L, 2L, 2L),
  stringsAsFactors = FALSE
)

How to add a column that counts duplicates in a sequence?

:

:

More articles: