Replace semicolon with new line in python code

I would like to parse Python code that uses semicolons (`;`) to separate statements, and produce code that replaces them with newline characters (`\n`). For example, from

 def main():
     a = "a;b"; return a

I would like to create

 def main():
     a = "a;b"
     return a

Any clues?

+6
source share
2 answers

Use the tokenize library to search for token.OP tokens whose string value is ';'.* Replace each of these tokens with a token.NEWLINE token.

You will also need to adjust the token offsets and generate the matching indentation: after each inserted NEWLINE the line numbers must be shifted (incremented by an offset that grows by one for every NEWLINE added), and the "next" line (the remainder of the current line) must have its column indexes shifted to match the current indentation level:

 import tokenize TokenInfo = getattr(tokenize, 'TokenInfo', lambda *a: a) # Python 3 compat def semicolon_to_newline(tokens): line_offset = 0 last_indent = None col_offset = None # None or an integer for ttype, tstr, (slno, scol), (elno, ecol), line in tokens: slno, elno = slno + line_offset, elno + line_offset if ttype in (tokenize.INDENT, tokenize.DEDENT): last_indent = ecol # block is indented to this column elif ttype == tokenize.OP and tstr == ';': # swap out semicolon with a newline ttype = tokenize.NEWLINE tstr = '\n' line_offset += 1 if col_offset is not None: scol, ecol = scol - col_offset, ecol - col_offset col_offset = 0 # next tokens should start at the current indent elif col_offset is not None: if not col_offset: # adjust column by starting column of next token col_offset = scol - last_indent scol, ecol = scol - col_offset, ecol - col_offset if ttype == tokenize.NEWLINE: col_offset = None yield TokenInfo( ttype, tstr, (slno, scol), (elno, ecol), line) with open(sourcefile, 'r') as source, open(destination, 'w') as dest: generator = tokenize.generate_tokens(source.readline) dest.write(tokenize.untokenize(semicolon_to_newline(generator))) 

Note that I am not correcting the value of line; it is informational only — the data that was read from the file is not actually used by untokenize when full position information is supplied.

Demo:

 >>> from io import StringIO
 >>> source = StringIO('''\
 ... def main():
 ...     a = "a;b"; return a
 ... ''')
 >>> generator = tokenize.generate_tokens(source.readline)
 >>> result = tokenize.untokenize(semicolon_to_newline(generator))
 >>> print(result)
 def main():
     a = "a;b"
     return a

and a little harder:

 >>> source = StringIO('''\
 ... class Foo(object):
 ...     def bar(self):
 ...         a = 10; b = 11; c = 12
 ...         if self.spam:
 ...             x = 12; return x
 ...         x = 15; return y
 ...
 ...     def baz(self):
 ...         return self.bar;
 ...         # note, nothing after the semicolon
 ... ''')
 >>> generator = tokenize.generate_tokens(source.readline)
 >>> result = tokenize.untokenize(semicolon_to_newline(generator))
 >>> print(result)
 class Foo(object):
     def bar(self):
         a = 10
         b = 11
         c = 12
         if self.spam:
             x = 12
             return x
         x = 15
         return y

     def baz(self):
         return self.bar

         # note, nothing after the semicolon

 >>> print(result.replace(' ', '.'))
 class.Foo(object):
 ....def.bar(self):
 ........a.=.10
 ........b.=.11
 ........c.=.12
 ........if.self.spam:
 ............x.=.12
 ............return.x
 ........x.=.15
 ........return.y
 ....def.baz(self):
 ........return.self.bar
 ........
 ........#.note,.nothing.after.the.semicolon

* The Python 3 version of tokenize displays more informative TokenInfo tuple names that have the optional exact_type attribute, which can be used instead of performing textual matching: tok.exact_type == tokenize.SEMI , however, I have preserved compatibility with Python 2 and 3.

+4
source

Here's a pyparsing solution - see comments in the code below:

from pyparsing import Literal, restOfLine, quotedString, pythonStyleComment, line

# Match a ';' and the remainder of its line, but skip semicolons that occur
# inside quoted strings or Python-style comments.
SEMI = Literal(';')
patt = SEMI + restOfLine
patt.ignore(quotedString)
patt.ignore(pythonStyleComment)

def split_at(s, locs):
    """ break up s into pieces, given list of break locations """
    current = 0
    ret = []
    for loc in locs:
        ret.append(s[current:loc].lstrip())
        current = loc+1
    ret.append(s[current:].lstrip())
    return ret

def split_on_semicolon(s,l,tokens):
    """ parse time callback, when finding first unquoted ';' on a line """
    current_line = line(l,s)
    line_body = current_line.lstrip()
    indent = current_line.index(line_body)
    indent = current_line[:indent]  # the line's leading whitespace
    # may be more than one ';' on this line, find them all
    # (the second token contains everything after the ';')
    remainder = tokens[1]
    if remainder.strip():
        # locations of any further unquoted semicolons in the remainder
        all_semis = [s for _,s,_ in SEMI.scanString(remainder)]
        # break line into pieces
        pieces = split_at(remainder, all_semis)
        # rejoin pieces, with leading indents
        return '\n'+'\n'.join(indent+piece for piece in pieces)
    else:
        # nothing after the ';' - just drop it
        return ''

patt.addParseAction(split_on_semicolon)

# NOTE(review): the sample below was reconstructed from the flattened page
# text and the printed output further down - confirm against the original.
sample = """
def main():
    this_semi_does_nothing();
    neither_does_this_but_there_are_spaces_afterward() ;
    a = "a;b"; return a # this is a comment; it has a semicolon!

def b():
    if False:
        z=1000;b("; in quotes");
        c=200;return z
    return ';'

class Foo(object):
    def bar(self):
        '''a docstring; with a semicolon'''
        a = 10; b = 11; c = 12 # this comment; has several; semicolons
        if self.spam:
            x = 12; return x # so; does; this; one
        x = 15;;; y += x; return y

    def baz(self):
        return self.bar
"""
print(patt.transformString(sample))

gives:

 def main():
     this_semi_does_nothing()
     neither_does_this_but_there_are_spaces_afterward()
     a = "a;b"
     return a # this is a comment; it has a semicolon!

 def b():
     if False:
         z=1000
         b("; in quotes")
         c=200
         return z
     return ';'

 class Foo(object):
     def bar(self):
         '''a docstring; with a semicolon'''
         a = 10
         b = 11
         c = 12 # this comment; has several; semicolons
         if self.spam:
             x = 12
             return x # so; does; this; one
         x = 15
         y += x
         return y

     def baz(self):
         return self.bar
+1
source

All Articles