Python and Turkish capitalization

I could not find a good description of how to deal with this problem on Windows, so I am asking here.

Turkish has two letter pairs, ı (uppercase I) and i (uppercase İ), that Python does not handle correctly.

>>> [char for char in 'Mayıs']
['M', 'a', 'y', 'i', 's']

>>> 'ı'.upper().lower()
'i'

How it should be, given the correct locale:

>>> [char for char in 'Mayıs']
['M', 'a', 'y', 'ı', 's']

>>> 'ı'.upper().lower()
'ı'

and

>>> 'i'.upper()
'İ'

>>> 'ı'.upper()
'I'

I tried locale.setlocale(locale.LC_ALL, 'Turkish_Turkey.1254') and even 'ı'.encode('cp857'), but that didn't help.

How can I make Python handle these two letters correctly?

+4
source share
2 answers

You must use PyICU

>>> from icu import UnicodeString, Locale
>>> tr = Locale("TR")
>>> s = UnicodeString("i")
>>> print(unicode(s.toUpper(tr)))
İ
>>> s = UnicodeString("I")
>>> print(unicode(s.toLower(tr)))
ı
>>>
+6
source

Alternatively, you can define your own functions for Turkish upper- and lowercasing:

import re

# Only the dotted/dotless i pair needs help: Python's locale-independent
# str.upper() already maps ç, ş, ü, ğ and ö to their capitals. "ı" -> "I" is
# also what str.upper() does; it is listed to make the pairing explicit.
_TR_UPPER_TABLE = str.maketrans({"i": "İ", "ı": "I"})


def tr_upper(self: str) -> str:
    """Return the text uppercased according to Turkish casing rules.

    Turkish distinguishes dotted and dotless i: "i" uppercases to "İ"
    (U+0130) and "ı" (U+0131) uppercases to "I". Plain ``str.upper()``
    turns "i" into "I", so that letter is translated first and the
    default conversion handles everything else.

    Args:
        self: The string to convert. The parameter keeps its historical
            name so existing keyword callers continue to work; this is a
            plain function, not a method.

    Returns:
        A new uppercased string.
    """
    # One translate pass replaces the special letters; "İ" is left unchanged
    # by the following upper() call, which handles all remaining characters.
    return self.translate(_TR_UPPER_TABLE).upper()


# Only the dotted/dotless I pair needs help: Python's locale-independent
# str.lower() already maps Ç, Ş, Ü, Ğ and Ö to their small letters.
_TR_LOWER_TABLE = str.maketrans({"İ": "i", "I": "ı"})


def tr_lower(self: str) -> str:
    """Return the text lowercased according to Turkish casing rules.

    Turkish distinguishes dotted and dotless i: "İ" (U+0130) lowercases to
    "i" and "I" lowercases to "ı" (U+0131). Plain ``str.lower()`` turns
    "I" into "i" and "İ" into "i" followed by a combining dot, so both
    letters are translated first and the default conversion handles
    everything else.

    Args:
        self: The string to convert. The parameter keeps its historical
            name so existing keyword callers continue to work; this is a
            plain function, not a method.

    Returns:
        A new lowercased string.
    """
    # A decomposed "İ" is written as "I" + COMBINING DOT ABOVE (U+0307).
    # Fold that pair to a plain "i" first; otherwise the "I" would become a
    # dotless "ı" with a stray combining dot on top.
    text = self.replace("I\u0307", "i")
    # One translate pass for the two special capitals, then the default
    # lower() for all remaining characters.
    return text.translate(_TR_LOWER_TABLE).lower()

The regular upper() method behaves like this:

>>>print("ulvido".upper())
ULVIDO

With the custom function:

>>>print(tr_upper("ulvido"))
ULVİDO

If you need these functions often, put them in a .py file in your project, for example: trtextstyle.py.

If trtextstyle.py is in the same package as your code, import the functions like this:

from .trtextstyle import tr_upper, tr_lower

, .

+1

All Articles