JavaScript strings are sequences of 16-bit code units (UCS-2 style), but they can represent Unicode code points outside the Basic Multilingual Plane (the BMP covers U+0000 - U+D7FF and U+E000 - U+FFFF) using two 16-bit code units (a UTF-16 surrogate pair): the first, the high surrogate, lies in the range U+D800 - U+DBFF, and the second, the low surrogate, lies in the range U+DC00 - U+DFFF.
Based on this, it is easy to determine whether a string contains any characters that lie outside the Basic Multilingual Plane (which is what I think you are asking: you want to determine whether the string contains any characters outside the range of code points that JavaScript represents as a single character):
/**
 * Reports whether a string contains at least one UTF-16 surrogate pair,
 * i.e. at least one character outside the Basic Multilingual Plane.
 *
 * A pair is a high surrogate (U+D800 - U+DBFF) immediately followed by a
 * low surrogate (U+DC00 - U+DFFF). Unpaired surrogates are malformed UTF-16
 * rather than astral characters, so they do not count as a match.
 *
 * @param {string} str - The string to inspect.
 * @returns {boolean} true if `str` contains a well-formed surrogate pair.
 */
function containsSurrogatePair(str) {
    // A fresh, non-global regex literal keeps .test() stateless between calls.
    return /[\uD800-\uDBFF][\uDC00-\uDFFF]/.test(str);
}

// Usage examples
alert( containsSurrogatePair("foo") ); // false
alert( containsSurrogatePair("f𝌆") ); // true
Working out exactly which code points are in your string is a bit more complicated and requires a UTF-16 decoder. The following converts a string to an array of Unicode code points:
/**
 * Decodes a JavaScript (UTF-16) string into an array of Unicode code points.
 *
 * Well-formed surrogate pairs are combined into a single code point in the
 * range U+10000 - U+10FFFF. An unpaired surrogate (a high surrogate not
 * followed by a low surrogate, or a low surrogate on its own) is emitted
 * as its own code unit value and never consumes the code unit after it.
 *
 * @param {string} str - The string to decode.
 * @returns {number[]} The code points of `str`, in order.
 */
var getStringCodePoints = (function() {
    // High (leading) surrogates occupy U+D800 - U+DBFF.
    function isHighSurrogate(charCode) {
        return (charCode & 0xFC00) === 0xD800;
    }

    // Low (trailing) surrogates occupy U+DC00 - U+DFFF.
    function isLowSurrogate(charCode) {
        return (charCode & 0xFC00) === 0xDC00;
    }

    // Each surrogate carries 10 payload bits; the pair encodes (code point - 0x10000).
    function surrogatePairToCodePoint(charCode1, charCode2) {
        return ((charCode1 & 0x3FF) << 10) + (charCode2 & 0x3FF) + 0x10000;
    }

    // Read string in code unit by code unit and create an array of code points
    return function(str) {
        var codePoints = [], i = 0, charCode, nextCharCode;
        while (i < str.length) {
            charCode = str.charCodeAt(i);
            if (isHighSurrogate(charCode) && i + 1 < str.length) {
                nextCharCode = str.charCodeAt(i + 1);
                if (isLowSurrogate(nextCharCode)) {
                    codePoints.push(surrogatePairToCodePoint(charCode, nextCharCode));
                    i += 2; // both halves of the pair have been consumed
                    continue;
                }
            }
            // BMP character or unpaired surrogate: pass the code unit through.
            codePoints.push(charCode);
            ++i;
        }
        return codePoints;
    };
})();

// Usage example
alert( getStringCodePoints("f𝌆").join(",") ); // 102,119558
Tim Down 03 Feb 2018-11-11T00: 00Z
source share