R: select the first of n consecutive lines above a certain threshold value

Question

R: select the first of n consecutive lines above a certain threshold value

I have a data frame with MRN, dates and test value.

For each MRN, I need to select the first row of the first run of three consecutive values at or above 0.5.

This is an example version of the data:

   MRN Collected_Date   ANC
1  001     2015-01-02 0.345
2  001     2015-01-03 0.532
3  001     2015-01-04 0.843
4  001     2015-01-05 0.932
5  002     2015-03-03 0.012
6  002     2015-03-05 0.022
7  002     2015-03-06 0.543
8  002     2015-03-07 0.563
9  003     2015-08-02 0.343
10 003     2015-08-03 0.500
11 003     2015-08-04 0.734
12 003     2015-08-05 0.455
13 004     2014-01-02 0.001
14 004     2014-01-03 0.500
15 004     2014-01-04 0.562
16 004     2014-01-05 0.503

Code example:

# Example data: four patients (MRN), each with four dated ANC measurements.
mrn_ids <- NULL  # placeholder removed below; keeps the data.frame call readable
df <- data.frame(
    MRN = rep(c('001', '002', '003', '004'), each = 4),
    Collected_Date = as.Date(
        c('01-02-2015', '01-03-2015', '01-04-2015', '01-05-2015',
          '03-03-2015', '03-05-2015', '03-06-2015', '03-07-2015',
          '08-02-2015', '08-03-2015', '08-04-2015', '08-05-2015',
          '01-02-2014', '01-03-2014', '01-04-2014', '01-05-2014'),
        format = '%m-%d-%Y'
    ),
    ANC = as.numeric(
        c('0.345', '0.532', '0.843', '0.932',
          '0.012', '0.022', '0.543', '0.563',
          '0.343', '0.500', '0.734', '0.455',
          '0.001', '0.500', '0.562', '0.503')
    )
)
rm(mrn_ids)

I am currently using a very inconvenient approach: I compute each row's date difference from the group's first date, filter for all values >= 0.5, and then use the `lag` function to add each remaining row's difference to the previous one, which helps to pick the date of the THIRD value. Then I subtract two days to get the date of the first value:

   # Asker's original attempt: locate the third high value by date arithmetic,
   # then step back two days to report the date of the first one.
   df %>%
     group_by(MRN) %>%
     # Days elapsed since the first collection date within each MRN.
     mutate(days_diff = abs(Collected_Date - Collected_Date[1])) %>%
     filter(ANC >= 0.5) %>%
     # Offset of each remaining row plus the offset of the remaining row before it.
     mutate(days = days_diff + lag(days_diff)) %>%
     # A sum of 5 arises e.g. when the two rows sit 2 and 3 days after the start.
     filter(days == 5) %>%
     mutate(Collected_Date = Collected_Date - 2) %>%
     select(MRN, Collected_Date)

Output:

Source: local data frame [2 x 2] Groups: MRN

  MRN Collected_Date
1 001     2015-01-03
2 004     2014-01-03

There should be a simpler / more elegant way. In addition, it does not give accurate results if there are gaps between the test dates.

Desired output:

   MRN Collected_Date   ANC     
1  001     2015-01-03 0.532
2  004     2014-01-03 0.500

So basically, if there are three consecutive values >= 0.5, I need the date of the FIRST value.

If there are no three consecutive values >= 0.5, NA should be returned.

!

+4

r dataframe dplyr row

col. slade 12 . '15 22:35

4

:

rle, 3

# Example data: four patients (MRN), each with four dated ANC measurements.
df <- data.frame(
    MRN = rep(c('001', '002', '003', '004'), each = 4),
    Collected_Date = as.Date(
        c('01-02-2015', '01-03-2015', '01-04-2015', '01-05-2015',
          '03-03-2015', '03-05-2015', '03-06-2015', '03-07-2015',
          '08-02-2015', '08-03-2015', '08-04-2015', '08-05-2015',
          '01-02-2014', '01-03-2014', '01-04-2014', '01-05-2014'),
        format = '%m-%d-%Y'
    ),
    ANC = as.numeric(
        c('0.345', '0.532', '0.843', '0.932',
          '0.012', '0.022', '0.543', '0.563',
          '0.343', '0.500', '0.734', '0.455',
          '0.001', '0.500', '0.562', '0.503')
    )
)

# Keep, per MRN, the first row that belongs to a run of at least three
# consecutive ANC values at or above 0.5.
df[as.logical(ave(df$ANC, df$MRN, FUN = function(v) {
    above <- v >= .5
    # Length of the run each element belongs to, repeated per element.
    run_len <- with(rle(above), rep(lengths, lengths))
    # The running count of qualifying rows equals 1 only on the first of them.
    cumsum(above & run_len >= 3) == 1
})), ]

#    MRN Collected_Date   ANC 
# 2  001     2015-01-03 0.532
# 14 004     2014-01-03 0.500

,

# Variant that only accepts runs of exactly three consecutive ANC values
# at or above 0.5, keeping the first row of such a run per MRN.
df[as.logical(ave(df$ANC, df$MRN, FUN = function(v) {
    above <- v >= .5
    # Length of the run each element belongs to, repeated per element.
    run_len <- with(rle(above), rep(lengths, lengths))
    cumsum(above & run_len == 3) == 1
})), ]

# Repeat row 4 and row 5 three extra times each, then rerun the
# "run of at least three" selection on the enlarged data.
df <- df[c(1:4, rep(4, 3), rep(5, 3), 5:16), ]
df[as.logical(ave(df$ANC, df$MRN, FUN = function(v) {
    above <- v >= .5
    run_len <- with(rle(above), rep(lengths, lengths))
    cumsum(above & run_len >= 3) == 1
})), ]

#    MRN Collected_Date   ANC
# 2  001     2015-01-03 0.532
# 14 004     2014-01-03 0.500

+3

rawr 13 . '15 5:30

, x, , :

#' Running length of the current streak of values at or above a threshold
#'
#' For every position in `x`, counts how many consecutive elements up to and
#' including that position are `>= threshold`. The count resets to 0 at any
#' element below the threshold. A missing value is treated as not meeting the
#' threshold, so it also resets the count.
#'
#' @param x A numeric vector (may be empty).
#' @param threshold A single number; values `>= threshold` extend the streak.
#' @return An integer vector the same length as `x` holding the streak length
#'   at each position.
high_run <- function(x, threshold) {
    # NA never counts as "high"; this keeps one missing value from
    # poisoning every later position.
    high <- !is.na(x) & x >= threshold
    # sequence() restarts at 1 at the start of each run; multiplying by the
    # indicator zeroes the positions that belong to "not high" runs.
    sequence(rle(high)$lengths) * high
}

, :

#' Index at which the first streak of a given length begins
#'
#' @param x A numeric vector.
#' @param threshold Values `>= threshold` count towards a streak.
#' @param run Required streak length.
#' @return The position in `x` where the first streak reaching length `run`
#'   starts, or `NA` when `high_run()` never reports a streak of that length.
high_run_start <- function(x, threshold, run){
    # First position whose running streak length equals `run`, i.e. the
    # last element of the first sufficiently long streak.
    streak_end <- match(run, high_run(x, threshold))
    # Step back to the first element of that streak.
    streak_end - run + 1
}

:

> df %>% group_by(MRN) %>%
+ filter(row_number()==high_run_start(ANC,0.5,3))
Source: local data frame [2 x 3]
Groups: MRN

  MRN Collected_Date   ANC
1 001     2015-01-03 0.532
2 004     2014-01-03 0.500

+2

tegancp 13 . '15 1:30

a ddply (, %>%, , , ).

, "" , , ( , ) .

, rle ( ), "" ANC >= 0.5, 3. "" . r.

r.i , 3 , TRUE.

x, sum , , 1, ( sum(r$lengths[1:(r.i - 1)]) +1).

ddply(df, .(MRN), function(x) {
    # Run-length encode the "ANC at or above 0.5" indicator for this MRN.
    r <- rle(x$ANC >= 0.5)
    # Position within r of the first TRUE run spanning at least 3 rows
    # (NA when there is no such run).
    r.i <- which(r$values & r$lengths >= 3)[1]
    if (is.na(r.i)) {
        # No qualifying run: contribute no row for this MRN.
        return(NULL)
    }
    # Rows consumed by all earlier runs, plus one, is the first row of the run.
    x[sum(r$lengths[seq_len(r.i - 1)]) + 1, ]
})

, , . x <- subset(df, MRN == '001') , , r, r.i.

+1

mathematical.coffee 13 . '15 0:02

chappers · Accepted Answer · 2015-07-12T23:29:30+0000

- zoo dplyr. zoo rollapply, .

, .

# For each MRN, keep the first row whose value and the two values after it
# are all at or above 0.5.
df %>% group_by(MRN) %>%
  # Minimum over the current row and the next two rows; the last two rows
  # of each group have no full window and get NA. The minimum goes into a
  # helper column so the row's own ANC value is reported unchanged.
  mutate(window_min = rollapply(ANC, width = 3, min, align = "left",
                                fill = NA, na.rm = TRUE)) %>%
  filter(window_min >= 0.5) %>%
  filter(row_number() == 1) %>%
  select(-window_min)

#   MRN Collected_Date   ANC
# 1 001     2015-01-03 0.532
# 2 004     2014-01-03 0.500

rollapply . , , :

# Effect of `align` on where the NA padding lands for a width-3 rolling min.
# align = "left": padding at the end.
rollapply(1:6, width = 3, FUN = min, fill = NA, align = "left")
# [1]  1  2  3  4 NA NA
# align = "center": padding at both ends.
rollapply(1:6, width = 3, FUN = min, fill = NA, align = "center")
# [1] NA  1  2  3  4 NA
# align = "right": padding at the start.
rollapply(1:6, width = 3, FUN = min, fill = NA, align = "right")
# [1] NA NA  1  2  3  4

, , .

, .

R: select the first of n consecutive lines above a certain threshold value

More articles: