Use the `tokenize` library to search for `token.OP` tokens where the second element is `;`, and replace those tokens with a `token.NEWLINE` token.
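As a quick illustration of what you are looking for, here is a small sketch (the snippet source is made up for the demo) showing how a semicolon appears in the token stream, as an `OP` token whose string value is `';'`:

```python
import io
import tokenize

# Tokenize a one-line snippet; the semicolon shows up as an OP token
# whose string value is ';', with its (row, column) start position.
src = io.StringIO('a = 1; b = 2\n')
for tok in tokenize.generate_tokens(src.readline):
    if tok[0] == tokenize.OP and tok[1] == ';':
        print(tok[1], tok[2])  # → ; (1, 5)
```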
You will also need to adjust the token offsets and produce a matching indent; after each inserted NEWLINE you have to adjust the line numbers (incrementing them by an offset that grows with every NEWLINE you add), and the "next" line (the remainder of the current line) needs its column indices adjusted to match the current indentation level:
```python
import tokenize

TokenInfo = getattr(tokenize, 'TokenInfo', lambda *a: a)  # Python 3 compat

def semicolon_to_newline(tokens):
    line_offset = 0
    last_indent = None
    col_offset = None  # None or an integer
    for ttype, tstr, (slno, scol), (elno, ecol), line in tokens:
        slno, elno = slno + line_offset, elno + line_offset
        if ttype in (tokenize.INDENT, tokenize.DEDENT):
            last_indent = ecol  # block is indented to this column
        elif ttype == tokenize.OP and tstr == ';':
            # swap out semicolon with a newline
            ttype = tokenize.NEWLINE
            tstr = '\n'
            line_offset += 1
            if col_offset is not None:
                scol, ecol = scol - col_offset, ecol - col_offset
            col_offset = 0  # next tokens should start at the current indent
        elif col_offset is not None:
            if not col_offset:
                # adjust column by starting column of next token
                col_offset = scol - last_indent
            scol, ecol = scol - col_offset, ecol - col_offset
            if ttype == tokenize.NEWLINE:
                col_offset = None
        yield TokenInfo(
            ttype, tstr, (slno, scol), (elno, ecol), line)

with open(sourcefile, 'r') as source, open(destination, 'w') as dest:
    generator = tokenize.generate_tokens(source.readline)
    dest.write(tokenize.untokenize(semicolon_to_newline(generator)))
```
Note that I am not bothering to correct the `line` value; it is informative only, and the data that was read from the file is not actually used when un-tokenizing.
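A quick sanity check of that claim (the sample source here is made up): with full five-element tuples, `tokenize.untokenize()` rebuilds the output from the token strings and their start/end positions, so blanking out the trailing `line` field leaves the round-trip intact.

```python
import io
import tokenize

# Blank out the `line` field of every token; untokenize() should still
# reproduce the original source exactly from the positions and strings.
src = 'a = 1\nb = 2\n'
toks = [tok[:4] + ('',) for tok in
        tokenize.generate_tokens(io.StringIO(src).readline)]
print(tokenize.untokenize(toks) == src)  # → True
```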
Demo:
```python
>>> from io import StringIO
>>> source = StringIO('''\
... def main():
...     a = "a;b"; return a
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
def main():
    a = "a;b"
    return a
```
And a slightly harder case:
```python
>>> source = StringIO('''\
... class Foo(object):
...     def bar(self):
...         a = 10; b = 11; c = 12
...         if self.spam:
...             x = 12; return x
...         x = 15; return y
...
...     def baz(self):
...         return self.bar;
...         # note, nothing after the semicolon
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
class Foo(object):
    def bar(self):
        a = 10
        b = 11
        c = 12
        if self.spam:
            x = 12
            return x
        x = 15
        return y

    def baz(self):
        return self.bar
```
* The Python 3 version of `tokenize` produces more informative `TokenInfo` named tuples, which have an additional `exact_type` attribute that can be used instead of performing a textual match: `tok.exact_type == tokenize.SEMI`. However, I kept the code above compatible with both Python 2 and 3.
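If Python 2 support is not a concern, the semicolon test can be written with `exact_type`; a minimal sketch (with a made-up input snippet):

```python
import io
import tokenize

# On Python 3, TokenInfo named tuples carry an exact_type attribute,
# so a semicolon can be detected without comparing token strings.
src = io.StringIO('x = 1; y = 2\n')
for tok in tokenize.generate_tokens(src.readline):
    if tok.exact_type == tokenize.SEMI:
        print(tok.string, tok.start)  # → ; (1, 5)
```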