R - abbreviate runs of repeated amino acids, for example NNNN -> (N4)

I compute haplotypes from several sequences and get repeat segments such as RNNNNNNNT and RNNNT. There are many such variants, which makes the data difficult to read.

The data is listed below. I would like to generate the haplotypes_2 column from haplotypes_1, as shown:

hap_code  haplotypes_1            haplotypes_2
1         SKNNNRNNNNNKNNNNNNNKF   SK(N3)R(N5)K(N7)KF
2         SKNNNNNNNNNKNNNNNNNNKF  SK(N9)K(N8)KF
3         SKNNNNNNNNNNNNNNNNKF    SK(N16)KF
+8
regex r
source share
6 answers

Almost the same as @db's answer, but split into a couple of functions so that it is reusable and easier to read:

 #' Abbreviate runs of repeated letters in one split-up sequence.
 #'
 #' @param type_letters Character vector with one letter per element.
 #' @return A single string in which every run longer than one letter is
 #'   written as "(<letter><run length>)"; single letters are kept unchanged.
 abbreviate_letters <- function(type_letters) {
   runs <- rle(type_letters)
   run_codes <- ifelse(
     runs[["lengths"]] == 1,
     yes = runs[["values"]],
     no = paste0("(", runs[["values"]], runs[["lengths"]], ")")
   )
   paste0(run_codes, collapse = "")
 }

 #' Condense every haplotype string by abbreviating its repeated letters.
 #'
 #' @param haplotype_long Character vector (or factor) of haplotype strings.
 #' @return Character vector of the same length as `haplotype_long`, carrying
 #'   the same names. Missing inputs stay `NA_character_`.
 condense_haplotype <- function(haplotype_long) {
   # Remember the names first: as.character() may drop them.
   input_names <- names(haplotype_long)
   haplotype_long <- as.character(haplotype_long)

   # Only non-missing entries are split and abbreviated; otherwise an NA
   # would be pasted into the literal string "NA".
   condensed <- rep(NA_character_, length(haplotype_long))
   is_present <- !is.na(haplotype_long)
   condensed[is_present] <- vapply(
     X = strsplit(haplotype_long[is_present], split = ""),
     FUN = abbreviate_letters,
     FUN.VALUE = character(1),
     USE.NAMES = FALSE
   )

   names(condensed) <- input_names
   condensed
 }

 haplotypes <- c(
   "SKNNNRNNNNNKNNNNNNNKF",
   "SKNNNNNNNNNKNNNNNNNNKF",
   "SKNNNNNNNNNNNNNNNNKF"
 )
 condense_haplotype(haplotypes)
 # [1] "SK(N3)R(N5)K(N7)KF" "SK(N9)K(N8)KF"      "SK(N16)KF"
+1
source share

Using stringr and a custom function:

 library(stringr)

 # Build the abbreviation for one matched run: the run's first letter followed
 # by the run's length, wrapped in parentheses (e.g. "NNN" -> "(N3)").
 replace_string <- function(x) {
   run_letter <- str_sub(x, end = 1L)
   run_length <- str_length(x)
   sprintf("(%s%i)", run_letter, run_length)
 }

 # Every run of one or more N in haplotypes_1 is passed to replace_string()
 # and the rewritten strings are stored in a new column. df1 must already
 # exist with a haplotypes_1 column.
 df1[["hapnew"]] <- str_replace_all(
   string = df1[["haplotypes_1"]],
   pattern = "N+",
   replacement = replace_string
 )

 # Printed df1 afterwards:
 #   hap_code           haplotypes_1       haplotypes_2             hapnew
 # 1        1  SKNNNRNNNNNKNNNNNNNKF SK(N3)R(N5)K(N7)KF SK(N3)R(N5)K(N7)KF
 # 2        2 SKNNNNNNNNNKNNNNNNNNKF      SK(N9)K(N8)KF      SK(N9)K(N8)KF
 # 3        3   SKNNNNNNNNNNNNNNNNKF          SK(N16)KF          SK(N16)KF
+7
source share
 x <- c("SKNNNRNNNNNKNNNNNNNKF", "SKNNNNNNNNNKNNNNNNNNKF", "SKNNNNNNNNNNNNNNNNKF")

 # Abbreviate runs of repeated letters in each string of `sequences`.
 # Runs longer than one letter become "(<letter><run length>)"; single letters
 # are kept as they are. Always returns a character vector with one element
 # per input string (vapply keeps the return type stable for empty input).
 collapse_runs <- function(sequences) {
   vapply(
     strsplit(sequences, ""),
     function(mystr) {
       runs <- rle(mystr)
       is_repeat <- runs$lengths > 1
       paste0(
         ifelse(is_repeat, paste0("(", runs$values, runs$lengths, ")"), runs$values),
         collapse = ""
       )
     },
     character(1)
   )
 }

 collapse_runs(x)
 #[1] "SK(N3)R(N5)K(N7)KF" "SK(N9)K(N8)KF"      "SK(N16)KF"
+5
source share

Here is an option using gsubfn and str_count

 library(stringr)
 library(gsubfn)

 # Each run of one or more N is handed to the replacement function, which
 # returns the run's first letter and its character count in parentheses.
 # df1 must already exist with a haplotypes_1 column.
 gsubfn(
   pattern = "N+",
   replacement = function(x) {
     run_letter <- substr(x, 1, 1)
     paste0("(", run_letter, str_count(x), ")")
   },
   x = df1$haplotypes_1
 )
 #[1] "SK(N3)R(N5)K(N7)KF" "SK(N9)K(N8)KF"      "SK(N16)KF"

Or, as suggested by G. G. Grothendieck, str_count can be replaced with nchar

 # Same replacement with base helpers only (gsubfn must still be attached):
 # "%0.1s" prints at most the first character of the matched run and "%d"
 # prints the run's length as given by nchar().
 gsubfn(
   pattern = "N+",
   replacement = function(x) sprintf("(%0.1s%d)", x, nchar(x)),
   x = df1$haplotypes_1
 )
 #[1] "SK(N3)R(N5)K(N7)KF" "SK(N9)K(N8)KF"      "SK(N16)KF"
+2
source share
 #' Abbreviate runs of repeated letters with an explicit loop.
 #'
 #' Walks through the letters of `string` one at a time, counting how often
 #' the current letter repeats. When the letter changes (and once more at the
 #' end, for the final run) the finished run is emitted: runs longer than one
 #' letter as "(<letter><count>)", single letters unchanged.
 #'
 #' @param string A single character string.
 #' @return A single character string; `NA` input yields `NA_character_`.
 compress_runs <- function(string) {
   stopifnot(is.character(string), length(string) == 1L)
   if (is.na(string)) {
     return(NA_character_)
   }

   # A string has to be split before its letters can be looped over.
   chars <- strsplit(string, split = "")[[1]]
   if (length(chars) == 0L) {
     return("")
   }

   format_run <- function(letter, n) {
     if (n > 1L) paste0("(", letter, n, ")") else letter
   }

   # There can never be more runs than letters, so preallocate that many slots.
   pieces <- character(length(chars))
   n_pieces <- 0L

   prev <- chars[1]
   count <- 1L
   for (current in chars[-1]) {
     if (current == prev) {
       count <- count + 1L
     } else {
       n_pieces <- n_pieces + 1L
       pieces[n_pieces] <- format_run(prev, count)
       prev <- current
       count <- 1L # start counting the new run from scratch
     }
   }

   # The last run is still pending when the loop ends.
   n_pieces <- n_pieces + 1L
   pieces[n_pieces] <- format_run(prev, count)

   paste0(pieces[seq_len(n_pieces)], collapse = "")
 }

This follows up on my comment; the code may not be exactly right, but it shows the essential idea.

+1
source share

I suggest one more:

 df <- data.frame(
   haplotypes_1 = c(
     "SKNNNRNNNNNKNNNNNNNKF",
     "SKNNNNNNNNNKNNNNNNNNKF",
     "SKNNNNNNNNNNNNNNNNKF"
   )
 )

 # Abbreviate every run of two or more N as "(N<run length>)" and keep all
 # other letters (including a lone N) as they are.
 # Each string is split into single letters before rle() is applied, so every
 # N is counted. Splitting on "N" instead would give one fewer empty piece
 # than there are Ns and under-count each run by one.
 df$haplotypes_2 <- vapply(
   as.character(df$haplotypes_1), # also copes with a factor column
   function(q) {
     runs <- rle(strsplit(q, "")[[1]])
     is_n_run <- runs$values == "N" & runs$lengths > 1
     pieces <- ifelse(
       is_n_run,
       paste0("(N", runs$lengths, ")"),
       strrep(runs$values, runs$lengths) # restore other runs verbatim
     )
     paste(pieces, collapse = "")
   },
   character(1),
   USE.NAMES = FALSE
 )
+1
source share

All Articles