, - , , Python (, , " " - ?), Unicode. , Unicode, , Python , Unicode. , , superuser.com, .
, , , Python , :
>>> with open("countryCity2.json", "r", encoding="utf-16") as f:
... x = f.read()
...
>>> print(x)
["Xinhua","Ürümqi"]
. . Ürümqi - , . , mojibake , UTF-8, - - Unicode. , 1:1 , UTF-8:
>>> print(x.encode("iso-8859-1").decode("utf-8"))
["Xinhua","Ürümqi"]
" Ürümqi" " Xinhua". , UTF-8, :
>>> "Ürümqi".encode("iso-8859-1").decode("utf-8")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xdc in position 0:
invalid continuation byte
, .
, , , - :
# Read each UTF-16 input file and, where the text turns out to be UTF-8 bytes
# that were mis-decoded as ISO 8859-1 (mojibake), undo that mis-decoding
# before handing the text to process_file.
for fname in input_files:
    with open(fname, "r", encoding="utf-16") as handle:
        contents = handle.read()

    try:
        # Python's iso-8859-1 codec maps U+0000..U+00FF one-to-one onto the
        # bytes 0x00..0xFF, so this recovers the raw bytes behind the text.
        latin1_bytes = contents.encode("iso-8859-1")
        repaired = latin1_bytes.decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Either a character lies outside U+00FF or the bytes are not valid
        # UTF-8: the text was not mojibake, so keep it exactly as read.
        repaired = contents

    contents = repaired
    process_file(fname, contents)
ISO 8859.1 , - , , Python iso-8859-1 codec - U + 0000..U + 00FF 0x00..0xFF. ( , IANA ISO_8859-1:1987 ECMA-94: 1985, 0x00..0x1F 0x7F..0x9F undefined.)
>>> "".join(chr(c) for c in range(256)).encode('iso-8859-1') == bytes(range(256))
True
, , , Unicode, .encode('iso-8859-1').
. - Python 3.