Approximate RegEx in python with TRE: weird Unicode behavior

Question

Approximate RegEx in python with TRE: weird Unicode behavior

I am trying to use the TRE library in Python to match misspelled input.
It is important that it handles utf-8 encoded strings well.

Example: The German capital is called Berlin, but judging by the pronunciation it would sound the same if people wrote “Bärlin”.

It still works, but if the non-ASCII character is in the first or second position of the detected string, neither the range nor the detected string itself is correct.

# -*- coding: utf-8 -*-
import tre

def apro_match(word, list):
    fz = tre.Fuzzyness(maxerr=3)
    pt = tre.compile(word)
    for i in l:
        m = pt.search(i,fz)
        if m:
            print m.groups()[0],' ', m[0]

if __name__ == '__main__':
    string1 = u'Berlín'.encode('utf-8')
    string2 = u'Bärlin'.encode('utf-8')    
    string3 = u'B\xe4rlin'.encode('utf-8')
    string4 = u'Berlän'.encode('utf-8')
    string5 = u'London, Paris, Bärlin'.encode('utf-8')
    string6 = u'äerlin'.encode('utf-8')
    string7 = u'Beälin'.encode('utf-8')

    l = ['Moskau', string1, string2, string3, string4, string5, string6, string7]

    print '\n'*2
    print "apro_match('Berlin', l)"
    print "="*20
    apro_match('Berlin', l)
    print '\n'*2

    print "apro_match('.*Berlin', l)"
    print "="*20
    apro_match('.*Berlin', l)

Output

apro_match('Berlin', l)
====================
(0, 7)   Berlín
(1, 7)   ärlin
(1, 7)   ärlin
(0, 7)   Berlän
(16, 22)   ärlin
(1, 7)   ?erlin
(0, 7)   Beälin



apro_match('.*Berlin', l)
====================
(0, 7)   Berlín
(0, 7)   Bärlin
(0, 7)   Bärlin
(0, 7)   Berlän
(0, 22)   London, Paris, Bärlin
(0, 7)   äerlin
(0, 7)   Beälin

Note that for the regular expression '.*Berlin' it works fine, but for the regular expression 'Berlin' the strings

u'Bärlin'.encode('utf-8')    
u'B\xe4rlin'.encode('utf-8')
u'äerlin'.encode('utf-8')

do not work, whereas

u'Berlín'.encode('utf-8')
u'Berlän'.encode('utf-8')
u'London, Paris, Bärlin'.encode('utf-8')
u'Beälin'.encode('utf-8')

work as expected.

Is there something I'm doing wrong with the encoding? Do you know any trick?

+5

python regex fuzzy-comparison tre-library

vikingosegundo 04 . '11 18:10

3

TRE . - !

Python, utf8 , . , , . , TRE - , .

AFAIK TRE , (0.8.0) , (, "2004" "2004 $" 2, - 1).

, Python !

+2

j-a 05 '12 18:20

, , - , , , , "" ( , ) . , TRE UTF-8 ( , )?

, (, , ) . , char wchar. , Python?

, wchar ++, Python , Python ↔ Python str ( UTF-16LE) ↔ ++ wchar - ?

, "" 6- (0, 7), ( 6) ( ?, UTF-8), , (char) - - .

, , latin1 cp1252 .

:

3 - , 2.

, string5 "", , , string2 string3 "".

; , , ""!

, "" ASCII; :

Berlxn Berlxyn
Bxrlin Bxyrlin
xerlin xyerlin
Bexlin Bexylin
xBerlin xyBerlin
Bxerlin Bxyerlin
Berlinx Berlinxy
erlin Brlin Berli

-ASCII- x and y` .

". * " , "" .

-1

John Machin 24 . '11 5:00

jfs · Accepted Answer · 2011-09-24T06:38:01+0000

You could use the new regex library, which supports Unicode 6.0 and fuzzy matching:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from itertools import ifilter, imap
import regex as re

def apro_match(word_re, lines, fuzzy='e<=1'):
    search = re.compile(ur'('+word_re+'){'+fuzzy+'}').search
    for m in ifilter(None, imap(search, lines)):
        print m.span(), m[0]

def main():
    lst = u'Moskau Berlín Bärlin B\xe4rlin Berlän'.split()
    lst += [u'London, Paris, Bärlin']
    lst += u'äerlin Beälin'.split()
    print
    print "apro_match('Berlin', lst)"
    print "="*25
    apro_match('Berlin', lst)
    print 
    print "apro_match('.*Berlin', lst)"
    print "="*27
    apro_match('.*Berlin', lst)

# Call main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

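'e<=1' means that at most one error of any kind is permitted. There are three types of errors:

Insertion, indicated by "i"
Deletion, indicated by "d"
Substitution, indicated by "s"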

apro_match('Berlin', lst)
=========================
(0, 6) Berlín
(0, 6) Bärlin
(0, 6) Bärlin
(0, 6) Berlän
(15, 21) Bärlin
(0, 6) äerlin
(0, 6) Beälin

apro_match('.*Berlin', lst)
===========================
(0, 6) Berlín
(0, 6) Bärlin
(0, 6) Bärlin
(0, 6) Berlän
(0, 21) London, Paris, Bärlin
(0, 6) äerlin
(0, 6) Beälin

Approximate RegEx in python with TRE: weird Unicode behavior

More articles: