## Shot 1: got something working
## extract(): read one sysmon report and return its run date, CPU figure and
## transaction count as a one-row data.frame.
##
## Arguments:
##   filenam  path of the report file (handed to readLines()).
## Value:
##   A one-row data.frame with character columns Date, Cpu and Count.
##   A field whose marker line cannot be located is returned as NA and a
##   warning names the file and the marker.
## Layout assumptions (taken from the sample report -- confirm against real
## files):
##   * Date:  tokens 5 to 8 of the 2nd line after the first line containing
##            "current_run".
##   * Cpu:   token 2 of the line after the first "Server Summary" line that
##            follows the first "Engine Utilization (Tick %)" line.
##   * Count: token 5 of the 2nd line after the first line containing
##            "Transaction Summary".
extract <- function(filenam = "file.txt") {
  txt <- readLines(filenam)

  ## Tokens of the line sitting `offset` lines below the first line that
  ## contains `marker` and comes after line number `after`.
  ## Lines are split on single spaces; empty tokens produced by runs of
  ## spaces are dropped. Returns character(0), with a warning, when no such
  ## line exists in the file.
  tokens_below <- function(marker, offset, after = 0L) {
    hits <- grep(marker, txt, fixed = TRUE)
    target <- hits[hits > after][1L] + offset
    if (is.na(target) || target > length(txt)) {
      warning(
        sprintf("%s: no line found %d line(s) below marker '%s'",
                filenam, offset, marker),
        call. = FALSE
      )
      return(character(0))
    }
    parts <- strsplit(txt[target], " ", fixed = TRUE)[[1L]]
    parts[nzchar(parts)]
  }

  ## Date of the current run.
  date_tokens <- tokens_below("current_run", 2L)
  date_current_run <- if (length(date_tokens) == 0L) {
    NA_character_ # avoid pasting together the literal string "NA NA NA NA"
  } else {
    paste(date_tokens[5:8], collapse = " ")
  }

  ## Cpu: only "Server Summary" lines after the engine-utilization header
  ## are considered.
  engine_marker <- "Engine Utilization (Tick %)"
  engine_at <- grep(engine_marker, txt, fixed = TRUE)[1L]
  if (is.na(engine_at)) {
    warning(sprintf("%s: marker '%s' not found", filenam, engine_marker),
            call. = FALSE)
    cpu_tokens <- character(0)
  } else {
    cpu_tokens <- tokens_below("Server Summary", 1L, after = engine_at)
  }
  Cpu <- cpu_tokens[2L] # NA when the line is missing or too short

  ## Transaction count.
  count <- tokens_below("Transaction Summary", 2L)[5L]

  data.frame(Date = date_current_run, Cpu = Cpu, Count = count,
             stringsAsFactors = FALSE)
}

## Demo driver. It only runs when the sample report is present, so sourcing
## this file just to reuse extract() does not fail.
if (file.exists("file.txt")) {
  print(extract("file.txt"))

  ## file.list <- dir("./")
  file.list <- rep("file.txt", 3)
  merged <- do.call("rbind", lapply(file.list, extract))
  print(merged)

  ## Rough timing over 2000 reads of the same file.
  file.list <- rep("file.txt", 2000)
  print(system.time(merged <- do.call("rbind", lapply(file.list, extract))))
  ## runs in about 2.5 secs on my laptop
} else {
  message("file.txt not found: skipping the extract() demo")
}
## Shot 2: first attempt to extract a (potentially variable) number of device columns
## extractv2(): like extract(), plus one extra column per device holding the
## device's "Total I/Os" count (wide format).
##
## Arguments:
##   filenam  path of the report file (handed to readLines()).
## Value:
##   A one-row data.frame with character columns Date, Cpu, Count followed by
##   one character column per device, named after the device and sorted by
##   device name so that files listing the same devices in a different order
##   produce the same column layout. A report without any "Device:" line
##   yields only Date, Cpu and Count.
##   Fields that cannot be located are NA and a warning is raised.
## Layout assumptions (taken from the sample report -- confirm against real
## files):
##   * Date / Cpu / Count: same positions as in extract().
##   * Every line containing "Device:" opens a block describing one device;
##     the device name is on the line right after it.
##   * The count is token 5 of the *last* "Total I/Os" line of the block
##     (i.e. the 4th table column if the "Total I/Os" label is column 1).
extractv2 <- function(filenam = "file2.txt") {
  txt <- readLines(filenam)

  ## Split one line on single spaces, dropping the empty tokens produced by
  ## runs of spaces.
  split_tokens <- function(line) {
    parts <- strsplit(line, " ", fixed = TRUE)[[1L]]
    parts[nzchar(parts)]
  }

  ## Tokens of the line `offset` lines below the first line that contains
  ## `marker` and comes after line number `after`; character(0) plus a
  ## warning when there is no such line.
  tokens_below <- function(marker, offset, after = 0L) {
    hits <- grep(marker, txt, fixed = TRUE)
    target <- hits[hits > after][1L] + offset
    if (is.na(target) || target > length(txt)) {
      warning(
        sprintf("%s: no line found %d line(s) below marker '%s'",
                filenam, offset, marker),
        call. = FALSE
      )
      return(character(0))
    }
    split_tokens(txt[target])
  }

  ## ---- fixed part: Date, Cpu, Count -------------------------------------
  date_tokens <- tokens_below("current_run", 2L)
  date_current_run <- if (length(date_tokens) == 0L) {
    NA_character_
  } else {
    paste(date_tokens[5:8], collapse = " ")
  }

  engine_marker <- "Engine Utilization (Tick %)"
  engine_at <- grep(engine_marker, txt, fixed = TRUE)[1L]
  if (is.na(engine_at)) {
    warning(sprintf("%s: marker '%s' not found", filenam, engine_marker),
            call. = FALSE)
    cpu_tokens <- character(0)
  } else {
    cpu_tokens <- tokens_below("Server Summary", 1L, after = engine_at)
  }
  Cpu <- cpu_tokens[2L]

  count <- tokens_below("Transaction Summary", 2L)[5L]

  ## ---- variable part: one column per device -----------------------------
  dev_headers <- grep("Device:", txt, fixed = TRUE)
  nblocks <- length(dev_headers)
  if (nblocks == 0L) {
    ## No device section at all: nothing to append.
    return(data.frame(stringsAsFactors = FALSE, check.names = FALSE,
                      Date = date_current_run, Cpu = Cpu, Count = count))
  }

  ## A. counts: a block runs from its "Device:" line up to the next one
  ## (or to the end of the file for the last block).
  io_lines <- grep("Total I/Os", txt, fixed = TRUE)
  block_end <- c(dev_headers[-1L], Inf)
  last_io <- vapply(seq_len(nblocks), function(b) {
    inside <- io_lines[io_lines > dev_headers[b] & io_lines < block_end[b]]
    if (length(inside) == 0L) NA_integer_ else inside[length(inside)]
  }, integer(1))
  if (anyNA(last_io)) {
    warning(
      sprintf("%s: %d device block(s) without a 'Total I/Os' line",
              filenam, sum(is.na(last_io))),
      call. = FALSE
    )
  }
  IOs_counts <- vapply(last_io, function(i) {
    if (is.na(i)) NA_character_ else split_tokens(txt[i])[5L]
  }, character(1))

  ## B. device names: the line following each "Device:" header. A name made
  ## of several tokens is kept as one string; an unreadable name gets a
  ## positional placeholder so that it can still serve as a column name.
  device_names <- vapply(seq_len(nblocks), function(b) {
    at <- dev_headers[b] + 1L
    parts <- if (at > length(txt)) character(0) else split_tokens(txt[at])
    if (length(parts) == 0L) {
      warning(sprintf("%s: device block %d has no name line", filenam, b),
              call. = FALSE)
      paste0("device_", b)
    } else {
      paste(parts, collapse = " ")
    }
  }, character(1))

  ## Single-row data.frame with the device names as column names.
  ## check.names = FALSE keeps R from mangling names such as "/dev/sda".
  ord <- order(device_names)
  devices <- as.data.frame(
    structure(as.list(IOs_counts[ord]), names = device_names[ord]),
    check.names = FALSE, stringsAsFactors = FALSE
  )

  data.frame(stringsAsFactors = FALSE, check.names = FALSE,
             Date = date_current_run, Cpu = Cpu, Count = count,
             devices)
}

## Demo driver. It only runs when the sample report is present, so sourcing
## this file just to reuse extractv2() does not fail.
if (file.exists("file2.txt")) {
  print(extractv2("file2.txt"))

  ## WATCH OUT:
  ## merging will ONLY work if all devices have the same names across sysmon
  ## files!!
  file.list <- rep("file2.txt", 3)
  merged <- do.call("rbind", lapply(file.list, extractv2))
  print(merged)
} else {
  message("file2.txt not found: skipping the extractv2() demo")
}
## Shot 3: extract two tables, one with a single row and the second with a variable number of rows (depending on which devices are listed in each sysmon file).
## extractv3(): read one sysmon report and return two tables: a fixed
## one-row summary and a long-format table with one row per device.
##
## Arguments:
##   filenam  path of the report file (handed to readLines()).
## Value:
##   A list with two data.frames:
##     fixed     one row, character columns Date, Cpu, Count;
##     variable  columns date_current_run, device_names, IOs_counts with one
##               row per "Device:" block, in file order. When the report has
##               no "Device:" line it holds a single row whose device_names
##               and IOs_counts are NA.
##   Fields that cannot be located are NA and a warning is raised.
## Layout assumptions (taken from the sample report -- confirm against real
## files):
##   * Date:  tokens 5 to 8 of the 2nd line after the first "current_run".
##   * Cpu:   token 2 of the line after the first "Server Summary" line that
##            follows the first "Engine Utilization (Tick %)" line.
##   * Count: token 5 of the 2nd line after the first "Transaction Summary".
##   * Every line containing "Device:" opens a block describing one device;
##     the device name is on the line right after it.
##   * The I/O count is token 5 of the *last* "Total I/Os" line of the block
##     (i.e. the 4th table column if the "Total I/Os" label is column 1).
extractv3 <- function(filenam = "file2.txt") {
  txt <- readLines(filenam)

  ## Split one line on single spaces, dropping the empty tokens produced by
  ## runs of spaces.
  split_tokens <- function(line) {
    parts <- strsplit(line, " ", fixed = TRUE)[[1L]]
    parts[nzchar(parts)]
  }

  ## Tokens of the line `offset` lines below the first line that contains
  ## `marker` and comes after line number `after`; character(0) plus a
  ## warning when there is no such line.
  tokens_below <- function(marker, offset, after = 0L) {
    hits <- grep(marker, txt, fixed = TRUE)
    target <- hits[hits > after][1L] + offset
    if (is.na(target) || target > length(txt)) {
      warning(
        sprintf("%s: no line found %d line(s) below marker '%s'",
                filenam, offset, marker),
        call. = FALSE
      )
      return(character(0))
    }
    split_tokens(txt[target])
  }

  ## ---- first part of the output: fixed three-column structure -----------
  date_tokens <- tokens_below("current_run", 2L)
  date_current_run <- if (length(date_tokens) == 0L) {
    NA_character_
  } else {
    paste(date_tokens[5:8], collapse = " ")
  }

  engine_marker <- "Engine Utilization (Tick %)"
  engine_at <- grep(engine_marker, txt, fixed = TRUE)[1L]
  if (is.na(engine_at)) {
    warning(sprintf("%s: marker '%s' not found", filenam, engine_marker),
            call. = FALSE)
    cpu_tokens <- character(0)
  } else {
    cpu_tokens <- tokens_below("Server Summary", 1L, after = engine_at)
  }
  Cpu <- cpu_tokens[2L]

  count <- tokens_below("Transaction Summary", 2L)[5L]

  fixed <- data.frame(stringsAsFactors = FALSE,
                      Date = date_current_run, Cpu = Cpu, Count = count)

  ## ---- second part: one row per device ----------------------------------
  dev_headers <- grep("Device:", txt, fixed = TRUE)
  nblocks <- length(dev_headers)

  if (nblocks == 0L) {
    ## No device section: keep a placeholder row so the date still shows up
    ## in the merged table.
    variable <- data.frame(stringsAsFactors = FALSE,
                           date_current_run = date_current_run,
                           device_names = NA,
                           IOs_counts = NA)
  } else {
    io_lines <- grep("Total I/Os", txt, fixed = TRUE)

    ## A. counts: a block runs from its "Device:" line up to the next one
    ## (or to the end of the file for the last block); take the last
    ## "Total I/Os" line inside it, NA when the block has none.
    block_end <- c(dev_headers[-1L], Inf)
    last_io <- vapply(seq_len(nblocks), function(b) {
      inside <- io_lines[io_lines > dev_headers[b] & io_lines < block_end[b]]
      if (length(inside) == 0L) NA_integer_ else inside[length(inside)]
    }, integer(1))

    if (length(io_lines) == 0L) {
      warning(
        sprintf(
          "WEIRD datapoint at date %s: I have %d devices but 0 I/O lines??",
          date_current_run, nblocks
        ),
        call. = FALSE
      )
    } else if (anyNA(last_io)) {
      warning(
        sprintf("%s: %d device block(s) without a 'Total I/Os' line",
                filenam, sum(is.na(last_io))),
        call. = FALSE
      )
    }

    IOs_counts <- vapply(last_io, function(i) {
      if (is.na(i)) NA_character_ else split_tokens(txt[i])[5L]
    }, character(1))

    ## B. device names: the line following each "Device:" header. A name
    ## made of several tokens is kept as one string; a missing name is NA.
    device_names <- vapply(dev_headers + 1L, function(at) {
      parts <- if (at > length(txt)) character(0) else split_tokens(txt[at])
      if (length(parts) == 0L) NA_character_ else paste(parts, collapse = " ")
    }, character(1))

    ## Three columns: date, device, count.
    variable <- data.frame(stringsAsFactors = FALSE,
                           date_current_run = rep(date_current_run, nblocks),
                           device_names = device_names,
                           IOs_counts = IOs_counts)
  }

  list(fixed = fixed, variable = variable)
}

## Demo driver. It only runs when all sample reports are present, so sourcing
## this file just to reuse extractv3() does not fail.
file.list <- c("file.txt", "file2.txt", "file3.txt")
if (all(file.exists(file.list))) {
  print(extractv3("file2.txt"))

  res <- lapply(file.list, extractv3)
  fixed.merged <- do.call("rbind", lapply(res, function(r) r$fixed))
  print(fixed.merged)
  variable.merged <- do.call("rbind", lapply(res, function(r) r$variable))
  print(variable.merged)
} else {
  message("sample sysmon files not found: skipping the extractv3() demo")
}