Collapsing whitespace in a string
I have a line that looks something like this:
"stuff . // : /// more-stuff .. .. ...$%$% stuff -> DD"
and I want to remove all punctuation, convert all letters to uppercase, and collapse all whitespace so that it looks like this:
"STUFF MORE STUFF STUFF DD"
Is this possible with a single regex, or do I need to combine two or more? This is what I have so far:
def normalize(string):
    """Upper-case ``string`` and turn every run of non-word characters into one space.

    Non-word characters are whatever the regex class ``\\W`` matches
    (punctuation and whitespace; the underscore counts as a word character
    and is kept). Leading or trailing runs become a single space; they are
    not stripped.

    Args:
        string: The text to normalize.

    Returns:
        The upper-cased text with each run of non-word characters replaced
        by exactly one space.
    """
    import re

    string = string.upper()
    rex = re.compile(r'\W')
    rex_s = re.compile(r'\s{2,}')
    # First pass: every non-word character becomes a space, which leaves
    # long runs of spaces wherever punctuation used to be.
    result = rex.sub(' ', string)
    # Second pass: squeeze each run of two or more spaces down to one.
    # This has to use rex_s with a ' ' replacement; reusing rex with ''
    # would delete every space and glue the words together.
    result = rex_s.sub(' ', result)
    return result
The only thing that doesn't work is the whitespace collapsing. Any ideas?
result = rex.sub(' ', string) # this produces a string with tons of whitespace padding
result = rex.sub('', result) # this reduces all those spaces
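`rex_s` is compiled but never used: the second call reuses `rex` with an empty replacement, so it deletes every space instead of collapsing the runs. Use `rex_s` for the second pass and replace with a single space: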
result = rex.sub(' ', string) # this produces a string with tons of whitespace padding
result = rex_s.sub(' ', result) # this reduces all those spaces
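Here is a Python 3 approach that does not need regular expressions. First, to collapse runs of the same repeated whitespace character: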
def collapse_whitespace_characters(raw_text):
    """Drop immediate repeats of the same whitespace character.

    A whitespace character is removed only when the character directly
    before it is that very same character, so ``'  '`` becomes ``' '``
    while ``' \\t'`` is left alone. Non-whitespace characters are always
    kept, repeated or not.

    Args:
        raw_text: The string to process.

    Returns:
        ``raw_text`` itself when it has at most one character, otherwise a
        new string with the repeated whitespace characters removed.
    """
    # Nothing can repeat in an empty or one-character string.
    if len(raw_text) <= 1:
        return raw_text

    kept = [raw_text[0]]
    # Walk the text as (previous, current) neighbour pairs.
    for previous, current in zip(raw_text, raw_text[1:]):
        if current.isspace() and current == previous:
            continue
        kept.append(current)
    return ''.join(kept)
Or, to collapse runs of any mix of whitespace characters:
def collapse_whitespace(raw_text):
    """Reduce every run of whitespace to its first character.

    A whitespace character is kept only when the character directly before
    it is not whitespace; which whitespace characters make up the run does
    not matter. Non-whitespace characters are always kept.

    Args:
        raw_text: The string to process.

    Returns:
        ``raw_text`` itself when it has at most one character, otherwise a
        new string in which each whitespace run is a single character.
    """
    # Too short to contain a run.
    if len(raw_text) <= 1:
        return raw_text

    head, tail = raw_text[0], raw_text[1:]
    # Pair each character with its left neighbour and discard it when both
    # are whitespace, i.e. when it continues an existing run.
    survivors = [
        char
        for before, char in zip(raw_text, tail)
        if not (before.isspace() and char.isspace())
    ]
    return head + ''.join(survivors)
>>> collapse_whitespace_characters('we like    spaces  and\t\t TABS\t\tAND WHATEVER\xa0\xa0IS')
'we like spaces and\t TABS\tAND WHATEVER\xa0IS'
>>> collapse_whitespace('we like    spaces  and\t\t TABS\t\tAND WHATEVER\xa0\xa0IS')
'we like spaces and\tTABS\tAND WHATEVER\xa0IS'
The same idea works for punctuation:
def collapse_punctuation(raw_text):
    """Drop immediate repeats of the same non-alphanumeric character.

    Alphanumeric characters are always kept. Any other character
    (punctuation, whitespace, symbols) is removed when it is identical to
    the character directly before it, so ``'...'`` becomes ``'.'`` while
    ``'.,'`` is left alone.

    Args:
        raw_text: The string to process.

    Returns:
        ``raw_text`` itself when it has at most one character, otherwise a
        new string with the repeated non-alphanumeric characters removed.
    """
    # Nothing can repeat in an empty or one-character string.
    if len(raw_text) <= 1:
        return raw_text

    pieces = [raw_text[0]]
    # Compare each character with its left neighbour.
    pieces.extend(
        char
        for before, char in zip(raw_text, raw_text[1:])
        if char.isalnum() or char != before
    )
    return ''.join(pieces)
To answer the question itself:
# Sample input from the question.
orig = 'stuff . // : /// more-stuff .. .. ...$%$% stuff -> DD'
# Upper-case the alphanumerics, blank out everything else, then squeeze the
# resulting runs of spaces.
collapse_whitespace(''.join(c.upper() if c.isalnum() else ' ' for c in orig))
As already mentioned, the regular-expression version would be something like:
# Raw string: in a plain literal '\W' is an invalid escape sequence, which
# modern Python warns about. One pass replaces each run of non-word
# characters with a single space.
re.sub(r'\W+', ' ', orig).upper()