. :
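- the data was cp1252, was treated as cp1252, and was converted to utf8
- the data was utf8, was treated as cp1252, and was converted to utf8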
(, )
, , ?
-, , cp1252 unicode. , (, 0x9D), cp1252.
cp1252 utf8, - , cp1252. , , . , , . " " .
-, utf-8, :
$ perl -CO -MEncode -e '$a=decode("utf-8",
"\xC3\xA2\xE2\x82\xAC\xC5\x93" .
"four score" .
"\xC3\xA2\xE2\x82\xAC\xC2\x9D");
for $c (split(//,$a)) {printf "%x ",ord($c);}' | fmt
:
e2 20ac 153 66 6f 75 72 20 73 63 6f 72 65 e2 20ac 9d
("fmt" is a Unix utility that re-wraps text; it is used here only to keep the output tidy.)
cp1252, unicode cp1252, , . ( , ). , , , utf8.
$ perl -CO -MEncode -e '$a=decode("utf-8",
"\xC3\xA2\xE2\x82\xAC\xC5\x93" .
"four score" .
"\xC3\xA2\xE2\x82\xAC\xC2\x9D");
$a=encode("cp-1252", $a, sub { chr($_[0]) } );
for $c (split(//,$a)) {printf "%x ",ord($c);}' | fmt
- , .
:
e2 80 9c 66 6f 75 72 20 73 63 6f 72 65 e2 80 9d
utf8. ? , perl utf8:
$ perl -CO -MEncode -e '$a=decode("utf-8",
"\xC3\xA2\xE2\x82\xAC\xC5\x93" .
"four score" .
"\xC3\xA2\xE2\x82\xAC\xC2\x9D");
$a=encode("cp-1252", $a, sub { chr($_[0]) } );
$a=decode("utf-8", $a, 1);
for $c (split(//,$a)) {printf "%x ",ord($c);}' | fmt
"1" , , . :
201c 66 6f 75 72 20 73 63 6f 72 65 201d
:
$ perl -CO -MEncode -e '$a=decode("utf-8",
"\xC3\xA2\xE2\x82\xAC\xC5\x93" .
"four score" .
"\xC3\xA2\xE2\x82\xAC\xC2\x9D");
$a=encode("cp-1252", $a, sub { chr($_[0]) } );
$a=decode("utf-8", $a, 1);
print "$a\n"'
"four score"
, :
- mysql. $bytestream.
- $bytestream utf8:
- $bytestream $good
- $bytestream - -ASCII (.. 0x80), while... valid utf8.
- $bytestream "demangle ($ bytestream)", . cp1252-to-utf8, , , .
- $good , undef. $good , , $bytestream cp1252 utf8. (, , 2 ..)
.
# Try to undo one layer of "utf8 read as cp1252 and re-encoded as utf8".
#
# Argument: a byte string.
# Returns:  the bytes obtained by decoding the argument as utf-8 and encoding
#           the resulting characters as cp1252.  When any step dies, the
#           value of the failed eval is returned instead (undef in scalar
#           context, the empty list in list context).
sub demangle {
    my ($octets) = @_;

    return eval {
        # Keep the attempt quiet: a failure here is an expected outcome.
        local $SIG{__WARN__} = sub { };

        # CHECK = 1 makes decode die on malformed utf-8.
        my $chars = decode("utf-8", $octets, 1);

        # The fallback is invoked only for characters cp1252 cannot
        # represent: those up to 255 pass through as the same byte value,
        # anything larger aborts the whole attempt.
        encode("cp-1252", $chars, sub {
            my ($codepoint) = @_;
            die $codepoint if $codepoint > 255;
            return chr($codepoint);
        });
    };
}
, , ASCII, utf-8, utf-8. , , .
:
, , , "". , cp1252-to-utf8, , , . , , utf8 , :
$ perl -CO -MEncode -e '$a=decode("utf-8",
"bob\xC3\xAF\xC2\xBF\xC2\xBDs");
for $c (split(//,$a)) {printf "%x ",ord($c);}' | fmt
:
62 6f 62 ef bf bd 73
, ef bf bd unicode cp1252. , Unicode cp1252 :
62 6f 62 ef bf bd 73
, . utf-8, , , :
$ perl -CO -MEncode -e '$a=decode("utf-8",
"bob\xC3\xAF\xC2\xBF\xC2\xBDs");
$a=encode("cp-1252", $a, sub { chr(shift) } );
$a=decode("utf-8", $a, 1);
for $c (split(//,$a)) {printf "%x ",ord($c);}' | fmt
62 6f 62 fffd 73
utf-8, utf-8, 0xFFFD, " ". , , * -to-utf8 , , "". .
, , utf8 ( , ) , 0xFFFD. - :
# Report whether a byte string is well-formed utf-8.
#
# Argument: the octets to check.
# Returns:  true when decode() accepts the octets as "utf-8"; false when
#           decode() dies on them or the argument is undef.
#
# The argument is copied before decoding.  With a true, non-reference CHECK
# value that lacks LEAVE_SRC, Encode::decode() overwrites the scalar it is
# handed, and $_[0] is an alias of the caller's variable, so decoding $_[0]
# directly leaves the caller's string emptied after a successful check.
sub is_valid_utf8 {
    my ($octets) = @_;

    # CHECK = 1 (Encode::FB_CROAK) turns malformed input into an exception,
    # which eval converts into undef.
    my $decoded = eval { decode("utf-8", $octets, 1) };

    return defined $decoded;
}