R: Replace NA values in several columns of the first data frame with values from the second data frame after matching rows by date

This is my attempt to restore a question that appeared yesterday, which I spent most of the morning solving but can no longer find. Two datasets, df1 and the smaller df2, were provided with the same column names, along with a request to replace only the NA values in the rows where the `date` column matched. I suppose a merge could do this, and perhaps be less cumbersome, but I was looking for a `match()`-and-indexing strategy and eventually found it:

# Target data frame: eight dates and five integer value columns with gaps (NA).
df1 <- data.frame(
  date   = c(20040101L, 20040115L, 20040131L, 20040205L,
             20040228L, 20040301L, 20040315L, 20040331L),
  X11A   = c(100L, 200L, NA, NA, NA, 150L, NA, NA),
  X11A.1 = c(150L, NA, 165L, NA, NA, 155L, NA, NA),
  X21B   = c(NA, 200L, 180L, NA, NA, 170L, 180L, NA),
  X3CC   = c(NA, NA, 190L, NA, NA, 150L, 190L, 175L),
  X3CC.1 = c(140L, NA, 190L, NA, NA, 160L, 200L, 180L)
)

# Replacement data frame: three dates (not in df1's order), same columns as df1.
df2 <- data.frame(
  date   = c(20040228L, 20040131L, 20040331L),
  X11A   = c(140L, 170L, NA),
  X11A.1 = c(145L, NA, 145L),
  X21B   = c(165L, NA, 160L),
  X3CC   = c(150L, NA, NA),
  X3CC.1 = c(155L, NA, NA)
)

What was actually provided:

df1:

  date       11A    11A    21B    3CC    3CC
 20040101    100    150     NA     NA    140
 20040115    200     NA    200     NA     NA
 20040131     NA    165    180    190    190
 20040205     NA     NA     NA     NA     NA
 20040228     NA     NA     NA     NA     NA
 20040301    150    155    170    150    160
 20040315     NA      NA    180    190    200
 20040331     NA      NA     NA    175    180

df2:

 date        11A    11A    21B    3CC    3CC
 20040228    140    145    165    150    155
 20040131    170     NA     NA     NA     NA
 20040331     NA    145    160     NA     NA
+4
source share
2 answers

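Apply `is.na` to the "target" data frame (df1) and keep only the rows selected by `match` on `date`. Calling `which` with `arr.ind = TRUE` then gives the row/column positions of the NA cells, which can be used with both `[<-` and `[`: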

# Row/column positions of the NA cells of df1, restricted to the df1 rows whose
# date appears in df2. The row numbers count within that subset, i.e. they are
# row numbers of df2. drop = FALSE keeps the subset a matrix when df2 has a
# single row, so which() still returns a two-column ("row", "col") matrix.
valpos <- which(
  is.na(df1)[match(df2$date, df1$date), , drop = FALSE],
  arr.ind = TRUE
)

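Those row numbers are relative to the "replacement" data frame (df2), so they have to be mapped back to the corresponding rows of the "target" data frame (df1):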

# valpos rows count within df2; look up the df1 row each one corresponds to and
# keep the column numbers unchanged (assumes df1 and df2 share column order).
df1_rows <- match(df2$date, df1$date)
targpos <- cbind(df1_rows[valpos[, "row"]], valpos[, "col"])

Then do the assignment:

> df1[targpos] <- df2[valpos]
> df1
      date X11A X11A.1 X21B X3CC X3CC.1
1 20040101  100    150   NA   NA    140
2 20040115  200     NA  200   NA     NA
3 20040131  170    165  180  190    190
4 20040205   NA     NA   NA   NA     NA
5 20040228  140    145  165  150    155
6 20040301  150    155  170  150    160
7 20040315   NA     NA  180  190    200
8 20040331   NA    145  160  175    180

, . , .

+4

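This solution assumes that (1) rows of df2 are matched to rows of df1 by `date`, and (2) columns are matched by name between the two data.frames. For every matched cell where df1 is NA, the value is taken from the same cell of df2.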

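Notes:

  • Columns are paired by name; the first column of each data frame is treated as the key and skipped.
  • If a cell is NA in df1, it receives the value from df2, even when that value is itself NA.
  • Non-NA cells are never overwritten. IOW, df1 keeps every value it already has, and only its gaps are filled.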

# Row of df1 that each df2 row maps to; NA when the date is absent from df1.
rms <- match(df2$date, df1$date)
# df2 rows without a counterpart in df1 are skipped: an NA row index would make
# the subassignment below fail.
hit <- !is.na(rms)
# Value columns present in both data frames (the first column is the key).
cms <- intersect(names(df1)[-1L], names(df2)[-1L])
for (cm in cms) {
  # n flags the df2 rows whose matching df1 cell in this column is NA.
  n <- hit
  n[hit] <- is.na(df1[[cm]][rms[hit]])
  df1[[cm]][rms[n]] <- df2[[cm]][n]
}
df1
##       date X11A X11A.1 X21B X3CC X3CC.1
## 1 20040101  100    150   NA   NA    140
## 2 20040115  200     NA  200   NA     NA
## 3 20040131  170    165  180  190    190
## 4 20040205   NA     NA   NA   NA     NA
## 5 20040228  140    145  165  150    155
## 6 20040301  150    155  170  150    160
## 7 20040315   NA     NA  180  190    200
## 8 20040331   NA    145  160  175    180

library(microbenchmark);

# Fill NA cells of df1 with the values at the same position in df2, pairing
# rows on the `date` column (matrix-indexing approach).
#
# df1: data frame to fill; needs a `date` column.
# df2: data frame with replacement values; needs a `date` column. Cells are
#      addressed by column number, so the column layout is assumed to match df1.
# Returns df1 with the NA cells of the matched rows replaced by df2's values.
`42` <- function(df1, df2) {
  # Row of df1 for each row of df2 (NA when the date does not occur in df1).
  target_rows <- match(df2$date, df1$date)
  # drop = FALSE keeps a matrix even when df2 has a single row; without it
  # which() returns a plain vector and the [, "row"] lookups below fail.
  valpos <- which(is.na(df1)[target_rows, , drop = FALSE], arr.ind = TRUE)
  # Nothing to fill: hand df1 back without attempting an empty assignment.
  if (nrow(valpos) == 0L) {
    return(df1)
  }
  # valpos rows are df2 row numbers; translate them to df1 row numbers.
  targpos <- cbind(target_rows[valpos[, "row"]], valpos[, "col"])
  df1[targpos] <- df2[valpos]
  df1
}
# Fill NA cells of df1 with values from df2, pairing rows on the `date` column
# and columns by name (column-wise loop approach).
#
# df1: data frame to fill; its first column is skipped as the key column.
# df2: data frame with replacement values; its first column is skipped too.
# Returns df1 with NA cells of matched rows replaced, for every value column
# the two data frames have in common.
bgoldst <- function(df1, df2) {
  # Row of df1 for each row of df2; NA when the date does not occur in df1.
  rms <- match(df2$date, df1$date)
  # Unmatched df2 rows are ignored: an NA row index would make the
  # subassignment below fail.
  matched <- !is.na(rms)
  cms <- intersect(names(df1)[-1L], names(df2)[-1L])
  for (cm in cms) {
    # fill flags the df2 rows whose matching df1 cell in this column is NA.
    fill <- matched
    fill[matched] <- is.na(df1[[cm]][rms[matched]])
    df1[[cm]][rms[fill]] <- df2[[cm]][fill]
  }
  df1
}

# Check that both implementations agree on the small example, then time them.
identical(`42`(df1, df2), bgoldst(df1, df2))
## [1] TRUE
microbenchmark(`42`(df1, df2), bgoldst(df1, df2))
## Unit: microseconds
##               expr     min       lq     mean   median       uq      max neval
##     `42`(df1, df2) 297.219 309.1935 340.1425 319.0295 333.9975 1236.771   100
##  bgoldst(df1, df2) 175.766 181.7530 192.9317 188.1670 198.2180  316.463   100

# Larger synthetic data set for benchmarking.
set.seed(1L)
NR1 <- 10000L   # rows in df1
NC1 <- 300L     # value columns in df1
NR2 <- 1000L    # rows in df2
NC2 <- 300L     # value columns in df2
probNA1 <- 0.5  # probability that a df1 cell is NA
probNA2 <- 0.1  # probability that a df2 cell is NA

# df1: NR1 distinct dates, sorted, drawn from NR1 * 5 consecutive days and
# stored as yyyymmdd integers.
df1 <- data.frame(
  date = as.integer(format(
    sort(sample(
      seq(as.Date("2004-01-01"), by = 1L, length.out = NR1 * 5L),
      NR1
    )),
    "%Y%m%d"
  ))
)
# Value cells: NA with probability probNA1, otherwise uniform over 100:200.
df1[paste0("X", seq_len(NC1))] <- matrix(
  sample(
    c(NA, 100:200), NR1 * NC1,
    replace = TRUE,
    prob = c(probNA1, rep((1 - probNA1) / 101, 101L))
  ),
  NR1
)

# df2: NR2 dates sampled from df1, so every df2 date has a match in df1.
df2 <- data.frame(date = sample(df1$date, NR2))
df2[paste0("X", seq_len(NC2))] <- matrix(
  sample(
    c(NA, 100:200), NR2 * NC2,
    replace = TRUE,
    prob = c(probNA2, rep((1 - probNA2) / 101, 101L))
  ),
  NR2
)

# Same agreement check and timing, now on the large synthetic data.
identical(`42`(df1, df2), bgoldst(df1, df2))
## [1] TRUE
microbenchmark(`42`(df1, df2), bgoldst(df1, df2))
## Unit: milliseconds
##               expr       min        lq      mean    median        uq       max neval
##     `42`(df1, df2) 149.61503 194.66606 216.16916 231.25129 233.68079 277.24701   100
##  bgoldst(df1, df2)  29.17145  31.32318  37.85904  32.15154  33.24013  75.47765   100
0

All Articles