Compute all permutations of "CTCCT" and combine them into a regular expression:
CCCTT|CCTCT|CCTTC|CTCCT|CTCTC|CTTCC|TCCCT|TCCTC|TCTCC|TTCCC
This pattern can be optimized by factoring out the common prefixes:
C(?:C(?:T(?:CT|TC)|CTT)|T(?:C(?:CT|TC)|TCC))|T(?:C(?:C(?:CT|TC)|TCC)|TCCC)
// Find the first 5-character window in the sample sequence that is a
// permutation of "CTCCT" and print the resulting match object.
const regex = /C(?:C(?:T(?:CT|TC)|CTT)|T(?:C(?:CT|TC)|TCC))|T(?:C(?:C(?:CT|TC)|TCC)|TCCC)/g;
const string = "TGATGCCGTCCCCTCAACTTGAGTGCTCCTAATGCGTTGC";
const firstMatch = regex.exec(string);
console.log(firstMatch);
This pattern does not find overlapping matches; e.g., only one match is found in CCCTTCCC, even though it contains several overlapping permutations.
To find overlapping matches, use lookahead:
C(?=C(?=T(?=CT|TC)|CTT)|T(?=C(?=CT|TC)|TCC))|T(?=C(?=C(?=CT|TC)|TCC)|TCCC)
// Print every (possibly overlapping) 5-character window of `string` that is
// a permutation of "CTCCT", together with the index where it starts.
//
// Each alternative consumes only its first character and verifies the other
// four inside lookaheads, so after a match the next search resumes just one
// position further on and overlapping windows are still found.
const regex = /C(?=C(?=T(?=CT|TC)|CTT)|T(?=C(?=CT|TC)|TCC))|T(?=C(?=C(?=CT|TC)|TCC)|TCCC)/g;
const string = "CCCTTCCC";

// Declared explicitly: assigning to an undeclared `match` creates an implicit
// global and throws a ReferenceError in strict mode / ES modules.
let match;
while ((match = regex.exec(string)) !== null) {
  // The match itself is one character long, so slice the full window by hand.
  console.log(match.index, string.substring(match.index, match.index + 5));
}
The regex approach is only practical for a fairly small number of permutations. If you want to match segments of arbitrary size, use a solution without regular expressions:
function c3t2_optimized(str) { var c = 0, t = 0; for (var i = 0; i < str.length; ++i) { var last = str.charAt(i); if (last == 'C') ++c; else if (last == 'T') ++t; if (i > 4) { var first = str.charAt(i - 5); if (first == 'C') --c; else if (first == 'T') --t; } if (c == 3 && t == 2) return i - 4; } return -1; } var string = "TGATGCCGTCCCCTCAACTTGAGTGCTCCTAATGCGTTGC"; console.log(c3t2_optimized(string));
Or the same as above, but written as a generator that yields all (overlapping) matches:
/**
 * Lazily yields the start index of every 5-character window of `str` that
 * contains exactly three 'C' characters and two 'T' characters (in any
 * order). Overlapping windows are reported individually.
 *
 * Slides a fixed-size window over the string while keeping running counts of
 * 'C' and 'T'.
 *
 * @param {string} str - Sequence to scan.
 * @yields {number} Start index of each matching window, in ascending order.
 */
function* c3t2_optimized(str) {
  let c = 0;
  let t = 0;
  for (let i = 0; i < str.length; ++i) {
    // Character entering the window on the right.
    const last = str.charAt(i);
    if (last === 'C') ++c;
    else if (last === 'T') ++t;

    // From index 5 onwards the window is full: drop the character leaving it.
    if (i > 4) {
      const first = str.charAt(i - 5);
      if (first === 'C') --c;
      else if (first === 'T') --t;
    }

    if (c === 3 && t === 2) yield i - 4;
  }
}

const string = "CCCTTCCC";
// The loop variable is declared explicitly: assigning to an undeclared `i`
// creates an implicit global and throws a ReferenceError in strict mode /
// ES modules.
for (const i of c3t2_optimized(string)) {
  console.log(i, string.substring(i, i + 5));
}
Performance Comparison: https://jsfiddle.net/24qguege/7/
Firefox 47:
- 68.83ms - regex (see above)
- 97.51ms - non-regex solution (see above)
- 9582.39ms - Andrew Rueckert's answer (more readable)
source share