The function below decodes one code point from a UTF-16 string and returns how many UTF-16 units the string pointer should be advanced by (that is, how many units were consumed). Note that xs_utf16 is an unsigned short. Additional information: http://sree.kotay.com/2006/12/unicode-is-pain-in.html
// Constants used when decoding UTF-16 surrogate pairs.
enum
{
    xs_UTF_Max          = 0x0010FFFFUL,  // largest value accepted as a decoded code point
    xs_UTF_Replace      = 0x0000FFFDUL,  // value stored in `code` when decoding fails
    xs_UTF16_HalfBase   = 0x00010000UL,  // offset added to the combined surrogate payload
    xs_UTF16_HighStart  = 0x0000D800UL,  // first high (leading) surrogate
    xs_UTF16_HighEnd    = 0x0000DBFFUL,  // last high (leading) surrogate
    xs_UTF16_LowStart   = 0x0000DC00UL,  // first low (trailing) surrogate
    xs_UTF16_LowEnd     = 0x0000DFFFUL,  // last low (trailing) surrogate
    xs_UTF16_MaxUCS2    = 0x0000FFFFUL,  // largest value that fits in a single unit
    xs_UTF16_HalfMask   = 0x000003FFUL,  // mask for the 10 payload bits of a surrogate
    xs_UTF16_HalfShift  = 10             // payload bits contributed by each surrogate
};

// Decodes one code point from the start of a UTF-16 buffer.
//
// Parameters:
//   code   - receives the decoded value; 0 for null/empty input,
//            xs_UTF_Replace when an error is detected.
//   str    - pointer to the first UTF-16 unit; may be null.
//   len    - number of units available at `str`.
//   strict - when true, surrogate pairs are validated and malformed input is
//            reported as an error; when false, the pair is combined without
//            validation.
//
// Returns the number of units consumed:
//   0 - null/empty input, or (strict only) a decode error;
//   1 - a single non-surrogate unit, or (non-strict only) a surrogate that
//       has no following unit (`code` is xs_UTF_Replace in that case);
//   2 - a surrogate pair.
int32 xs_UTF16Decode (uint32 &code, const xs_utf16* str, int32 len, bool strict)
{
    if (str == 0 || len == 0)
    {
        code = 0;
        return 0;
    }

    uint32 c1 = str[0];

    // note: many implementations test from HighStart to HighEnd,
    //       this may be a partial code point, and is incorrect(?)
    //       trivial checking should exclude the WHOLE surrogate range
    if (c1 < xs_UTF16_HighStart || c1 > xs_UTF16_LowEnd)
    {
        // A unit outside the surrogate range is the code point itself.
        // (Previously `code` was left unwritten on this path.)
        code = c1;
        return 1;
    }

    // From here on c1 is a surrogate; starting in the low range is really
    // an error, but that is only diagnosed in strict mode below.

    // A surrogate needs a second unit: fail if the buffer ends here or the
    // next unit is the 0 terminator.
    if (len <= 1 || str[1] == 0)
    {
        code = xs_UTF_Replace;
        return strict ? 0 : 1;
    }

    uint32 c2 = str[1];
    code = ((c1 - xs_UTF16_HighStart) << xs_UTF16_HalfShift)
         + (c2 - xs_UTF16_LowStart)
         + xs_UTF16_HalfBase;

    // Non-strict mode performs no validation: if c1 is not a high surrogate
    // or c2 is not a low surrogate, `code` is whatever the arithmetic above
    // produced (assuming uint32 is an unsigned type, the subtraction wraps).
    if (strict == false)
        return 2;

    // check for errors
    if (c1 >= xs_UTF16_LowStart && c1 <= xs_UTF16_LowEnd)
    {
        // sequence starts with a low (trailing) surrogate
        code = xs_UTF_Replace;
        return 0;
    }
    if (c2 < xs_UTF16_LowStart || c2 > xs_UTF16_LowEnd)
    {
        // second unit is not a low surrogate
        code = xs_UTF_Replace;
        return 0;
    }
    if (code > xs_UTF_Max)
    {
        // decoded value is above the accepted maximum
        code = xs_UTF_Replace;
        return 0;
    }

    // success
    return 2;
}
source share