Parsing python grammar with boost :: spirit - problem

I am trying to write a python parser using boost :: spirit library. Here is the code:

template <typename Iterator> class Parser : public qi::grammar<Iterator, space_type> { public: Parser() : Parser::base_type(small_stmt) { NEWLINE = lit("<NEWLINE>"); INDENT = lit("<INDENT>"); DEDENT = lit("<DEDENT>"); ENDMARKER = lit("<EOT>"); NAME = identifier.alias(); NUMBER = integer|longinteger|floatnumber|imagnumber; STRING = stringliteral.alias(); identifier = (alpha | '_') >> *(alpha | digit | '_'); stringliteral = -stringprefix >> (shortstring | longstring); stringprefix = lit("r") | lit("u") | lit("ur") | lit("R") | lit("U") | lit("UR") | lit("Ur") | lit("uR") | lit("b") | lit("B") | lit("br") | lit("Br") | lit("bR") | lit("BR"); shortstring = "'" >> *(shortstringitem - "'") >> "'" | "\"" >> *(shortstringitem - "\"") >> "\""; longstring = "'''" >> *longstringitem >> "'''" | "\"\"\"" >> *longstringitem >> "\"\"\""; shortstringitem = shortstringchar | escapeseq; longstringitem = longstringchar | escapeseq; shortstringchar = char_ - "\\" - "\n"; longstringchar = char_ - "\\"; escapeseq = '\\' >> char_; longinteger = integer >> (lit("l") | lit("L")); integer = decimalinteger | octinteger | hexinteger | bininteger; decimalinteger = nonzerodigit >> *digit | lit("0"); octinteger = lit("0") >> (lit("o") | lit("O")) >> +octdigit | lit("0") >> +octdigit; hexinteger = lit("0") >> (lit("x") | lit("X")) >> +hexdigit; bininteger = lit("0") >> (lit("b") | lit("B")) >> +bindigit; nonzerodigit = char_('1', '9'); octdigit = char_('0', '7'); bindigit = lit("0") | lit("1"); hexdigit = digit | char_('a', 'f') | char_('A', 'F'); floatnumber = pointfloat | exponentfloat; pointfloat = -intpart >> fraction | intpart >> "."; exponentfloat = (intpart | pointfloat) >> exponent; intpart = +digit; fraction = "." >> +digit; exponent = (lit("e") | lit("E")) >> -(lit("+") | lit("-")) >> +digit; imagnumber = (floatnumber | intpart) >> (lit("j") | lit("J")); single_input = NEWLINE|simple_stmt|compound_stmt >> NEWLINE; file_input = *(NEWLINE|stmt) >> ENDMARKER; eval_input = testlist >> *NEWLINE >> ENDMARKER; decorator = lit("@") >> dotted_name >> -( lit("(") >> -(arglist) >> lit(")") ) >> NEWLINE; decorators = +decorator; decorated = decorators >> (classdef|funcdef); funcdef = lit("def") >> NAME >> parameters >> lit(":") >> suite; parameters = lit("(") >> -(varargslist) >> lit(")"); varargslist = (*(fpdef >> -(lit("=") >> test) >> lit(",")) >> (lit("*") >> NAME >> -(lit(",") >> lit("**") >> NAME)|lit("**") >> NAME)|fpdef >> -(lit("=") >> test) >> *(lit(",") >> fpdef >> -(lit("=") >> test)) >> -(lit(","))); fpdef = NAME|lit("(") >> fplist >> lit(")"); fplist = fpdef >> *(lit(",") >> fpdef) >> -(lit(",")); stmt = simple_stmt|compound_stmt; simple_stmt = small_stmt >> *(lit(";") >> small_stmt) >> -(lit(";")) >> NEWLINE; small_stmt = (expr_stmt|print_stmt|del_stmt|pass_stmt|flow_stmt|import_stmt|global_stmt|exec_stmt|assert_stmt); expr_stmt = testlist >> (augassign >> (yield_expr|testlist)|*(lit("=") >> (yield_expr|testlist))); augassign = (lit("+=")|lit("-=")|lit("*=")|lit("/=")|lit("%=")|lit("&=")|lit("|=")|lit("^=")|lit("<<=")|lit(">>=")|lit("**=")|lit("//=")); print_stmt = lit("print") >> ( -( test >> *(lit(",") >> test) >> -(lit(",")) )|lit(">>") >> test >> -( +(lit(",") >> test) >> -(lit(",")) ) ); del_stmt = lit("del") >> exprlist; pass_stmt = lit("pass"); flow_stmt = break_stmt|continue_stmt|return_stmt|raise_stmt|yield_stmt; break_stmt = lit("break"); continue_stmt = lit("continue"); return_stmt = lit("return") >> -(testlist); yield_stmt = yield_expr.alias(); raise_stmt = lit("raise") >> -(test >> -(lit(",") >> test >> -(lit(",") >> test))); import_stmt = import_name|import_from; import_name = lit("import") >> dotted_as_names; import_from = (lit("from") >> (*lit(".") >> dotted_name|+lit(".")) >> lit("import") >> (lit("*")|lit("(") >> import_as_names >> lit(")")|import_as_names)); import_as_name = NAME >> -(lit("as") >> NAME); dotted_as_name = dotted_name >> -(lit("as") >> NAME); import_as_names = import_as_name >> *(lit(",") >> import_as_name) >> -(lit(",")); dotted_as_names = dotted_as_name >> *(lit(",") >> dotted_as_name); dotted_name = NAME >> *(lit(".") >> NAME); global_stmt = lit("global") >> NAME >> *(lit(",") >> NAME); exec_stmt = lit("exec") >> expr >> -(lit("in") >> test >> -(lit(",") >> test)); assert_stmt = lit("assert") >> test >> -(lit(",") >> test); compound_stmt = if_stmt|while_stmt|for_stmt|try_stmt|with_stmt|funcdef|classdef|decorated; if_stmt = lit("if") >> test >> lit(":") >> suite >> *(lit("elif") >> test >> lit(":") >> suite) >> -(lit("else") >> lit(":") >> suite); while_stmt = lit("while") >> test >> lit(":") >> suite >> -(lit("else") >> lit(":") >> suite); for_stmt = lit("for") >> exprlist >> lit("in") >> testlist >> lit(":") >> suite >> -(lit("else") >> lit(":") >> suite); try_stmt = (lit("try") >> lit(":") >> suite >> (+(except_clause >> lit(":") >> suite) >> -(lit("else") >> lit(":") >> suite) >> -(lit("finally") >> lit(":") >> suite)|lit("finally") >> lit(":") >> suite)); with_stmt = lit("with") >> with_item >> *(lit(",") >> with_item) >> lit(":") >> suite; with_item = test >> -(lit("as") >> expr); except_clause = lit("except") >> -(test >> -((lit("as")|lit(",")) >> test)); suite = simple_stmt|NEWLINE >> INDENT >> +stmt >> DEDENT; testlist_safe = old_test >> -(+(lit(",") >> old_test) >> -(lit(","))); old_test = or_test|old_lambdef; old_lambdef = lit("lambda") >> -(varargslist) >> lit(":") >> old_test; test = or_test >> -(lit("if") >> or_test >> lit("else") >> test)|lambdef; or_test = and_test >> *(lit("or") >> and_test); and_test = not_test >> *(lit("and") >> not_test); not_test = lit("not") >> not_test|comparison; comparison = expr >> *(comp_op >> expr); comp_op = lit("<")|lit(">")|lit("==")|lit(">=")|lit("<=")|lit("<>")|lit("!=")|lit("in")|lit("not in")|lit("is")|lit("is not"); expr = xor_expr >> *(lit("|") >> xor_expr); xor_expr = and_expr >> *(lit("^") >> and_expr); and_expr = shift_expr >> *(lit("&") >> shift_expr); shift_expr = arith_expr >> *((lit("<<")|lit(">>")) >> arith_expr); arith_expr = term >> *((lit("+")|lit("-")) >> term); term = factor >> *((lit("*")|lit("/")|lit("%")|lit("//")) >> factor); factor = (lit("+")|lit("-")|lit("~")) >> factor|power; power = atom >> *trailer >> -(lit("**") >> factor); atom = (lit("(") >> -(yield_expr|testlist_comp) >> lit(")")|lit("-(") >> -(listmaker) >> lit(")")|lit("{") >> -(dictorsetmaker) >> lit("}")|lit("`") >> testlist1 >> lit("`")|NAME|NUMBER|+STRING); listmaker = test >> ( list_for|*(lit(",") >> test) >> -(lit(",")) ); testlist_comp = test >> ( comp_for|*(lit(",") >> test) >> -(lit(",")) ); lambdef = lit("lambda") >> -(varargslist) >> lit(":") >> test; trailer = lit("(") >> -(arglist) >> lit(")")|lit("[") >> subscriptlist >> lit("]")|lit(".") >> NAME; subscriptlist = subscript >> *(lit(",") >> subscript) >> -(lit(",")); subscript = lit(".") >> lit(".") >> lit(".")|test|-(test) >> lit(":") >> -(test) >> -(sliceop); sliceop = lit(":") >> -(test); exprlist = expr >> *(lit(",") >> expr) >> -(lit(",")); testlist = test >> *(lit(",") >> test) >> -(lit(",")); dictorsetmaker = ( (test >> lit(":") >> test >> (comp_for|*(lit(",") >> test >> lit(":") >> test) >> -(lit(","))))|(test >> (comp_for|*(lit(",") >> test) >> -(lit(",")))) ); classdef = lit("class") >> NAME >> -(lit("(") >> -(testlist) >> lit(")")) >> lit(":") >> suite; arglist = *(argument >> lit(",")) >> (argument >> -(lit(","))|lit("*") >> test >> *(lit(",") >> argument) >> -(lit(",") >> lit("**") >> test)|lit("**") >> test); argument = test >> -(comp_for)|test >> lit("=") >> test; list_iter = list_for|list_if; list_for = lit("for") >> exprlist >> lit("in") >> testlist_safe >> -(list_iter); list_if = lit("if") >> old_test >> -(list_iter); comp_iter = comp_for|comp_if; comp_for = lit("for") >> exprlist >> lit("in") >> or_test >> -(comp_iter); comp_if = lit("if") >> old_test >> -(comp_iter); testlist1 = test >> *(lit(",") >> test); encoding_decl = NAME.alias(); yield_expr = lit("yield") >> -(testlist); } // LEXEMS qi::rule<Iterator, space_type> NEWLINE; qi::rule<Iterator, space_type> INDENT; qi::rule<Iterator, space_type> DEDENT; qi::rule<Iterator, space_type> ENDMARKER; qi::rule<Iterator, space_type> NAME; qi::rule<Iterator, space_type> NUMBER; qi::rule<Iterator, space_type> STRING; // IDENTIFIER qi::rule<Iterator, space_type> identifier; // STRING LITERAL qi::rule<Iterator, space_type> stringliteral; qi::rule<Iterator, space_type> stringprefix; qi::rule<Iterator, space_type> shortstring; qi::rule<Iterator, space_type> longstring; qi::rule<Iterator, space_type> shortstringitem; qi::rule<Iterator, space_type> longstringitem; qi::rule<Iterator, space_type> shortstringchar; qi::rule<Iterator, space_type> longstringchar; qi::rule<Iterator, space_type> escapeseq; // INTEGER LITERAL qi::rule<Iterator, space_type> longinteger; qi::rule<Iterator, space_type> integer; qi::rule<Iterator, space_type> decimalinteger; qi::rule<Iterator, space_type> octinteger; qi::rule<Iterator, space_type> hexinteger; qi::rule<Iterator, space_type> bininteger; qi::rule<Iterator, space_type> nonzerodigit; qi::rule<Iterator, space_type> octdigit; qi::rule<Iterator, space_type> bindigit; qi::rule<Iterator, space_type> hexdigit; // FLOAT LITERAL qi::rule<Iterator, space_type> floatnumber; qi::rule<Iterator, space_type> pointfloat; qi::rule<Iterator, space_type> exponentfloat; qi::rule<Iterator, space_type> intpart; qi::rule<Iterator, space_type> fraction; qi::rule<Iterator, space_type> exponent; //IMAGINARY LITERAL qi::rule<Iterator, space_type> imagnumber; // PYTHON GRAMMAR qi::rule<Iterator, space_type> single_input; qi::rule<Iterator, space_type> file_input; qi::rule<Iterator, space_type> eval_input; qi::rule<Iterator, space_type> decorator; qi::rule<Iterator, space_type> decorators; qi::rule<Iterator, space_type> decorated; qi::rule<Iterator, space_type> funcdef; qi::rule<Iterator, space_type> parameters; qi::rule<Iterator, space_type> varargslist; qi::rule<Iterator, space_type> fpdef; qi::rule<Iterator, space_type> fplist; qi::rule<Iterator, space_type> stmt; qi::rule<Iterator, space_type> simple_stmt; qi::rule<Iterator, space_type> small_stmt; qi::rule<Iterator, space_type> expr_stmt; qi::rule<Iterator, space_type> augassign; qi::rule<Iterator, space_type> print_stmt; qi::rule<Iterator, space_type> del_stmt; qi::rule<Iterator, space_type> pass_stmt; qi::rule<Iterator, space_type> flow_stmt; qi::rule<Iterator, space_type> break_stmt; qi::rule<Iterator, space_type> continue_stmt; qi::rule<Iterator, space_type> return_stmt; qi::rule<Iterator, space_type> yield_stmt; qi::rule<Iterator, space_type> raise_stmt; qi::rule<Iterator, space_type> import_stmt; qi::rule<Iterator, space_type> import_name; qi::rule<Iterator, space_type> import_from; qi::rule<Iterator, space_type> import_as_name; qi::rule<Iterator, space_type> dotted_as_name; qi::rule<Iterator, space_type> import_as_names; qi::rule<Iterator, space_type> dotted_as_names; qi::rule<Iterator, space_type> dotted_name; qi::rule<Iterator, space_type> global_stmt; qi::rule<Iterator, space_type> exec_stmt; qi::rule<Iterator, space_type> assert_stmt; qi::rule<Iterator, space_type> compound_stmt; qi::rule<Iterator, space_type> if_stmt; qi::rule<Iterator, space_type> while_stmt; qi::rule<Iterator, space_type> for_stmt; qi::rule<Iterator, space_type> try_stmt; qi::rule<Iterator, space_type> with_stmt; qi::rule<Iterator, space_type> with_item; qi::rule<Iterator, space_type> except_clause; qi::rule<Iterator, space_type> suite; qi::rule<Iterator, space_type> testlist_safe; qi::rule<Iterator, space_type> old_test; qi::rule<Iterator, space_type> old_lambdef; qi::rule<Iterator, space_type> test; qi::rule<Iterator, space_type> or_test; qi::rule<Iterator, space_type> and_test; qi::rule<Iterator, space_type> not_test; qi::rule<Iterator, space_type> comparison; qi::rule<Iterator, space_type> comp_op; qi::rule<Iterator, space_type> expr; qi::rule<Iterator, space_type> xor_expr; qi::rule<Iterator, space_type> and_expr; qi::rule<Iterator, space_type> shift_expr; qi::rule<Iterator, space_type> arith_expr; qi::rule<Iterator, space_type> term; qi::rule<Iterator, space_type> factor; qi::rule<Iterator, space_type> power; qi::rule<Iterator, space_type> atom; qi::rule<Iterator, space_type> listmaker; qi::rule<Iterator, space_type> testlist_comp; qi::rule<Iterator, space_type> lambdef; qi::rule<Iterator, space_type> trailer; qi::rule<Iterator, space_type> subscriptlist; qi::rule<Iterator, space_type> subscript; qi::rule<Iterator, space_type> sliceop; qi::rule<Iterator, space_type> exprlist; qi::rule<Iterator, space_type> testlist; qi::rule<Iterator, space_type> dictorsetmaker; qi::rule<Iterator, space_type> classdef; qi::rule<Iterator, space_type> arglist; qi::rule<Iterator, space_type> argument; qi::rule<Iterator, space_type> list_iter; qi::rule<Iterator, space_type> list_for; qi::rule<Iterator, space_type> list_if; qi::rule<Iterator, space_type> comp_iter; qi::rule<Iterator, space_type> comp_for; qi::rule<Iterator, space_type> comp_if; qi::rule<Iterator, space_type> testlist1; qi::rule<Iterator, space_type> encoding_decl; qi::rule<Iterator, space_type> yield_expr; }; 

The problem is that when I try to parse a simple file:

 pass 

after going through any lexer module:

 pass <NEWLINE> <EOT> 

parsing failed and stopped at the first character. When I tried to parse this file using the pass_stmt rule, everything will be fine (except that we still have and have left, but the word "is consumed"). When I tried to parse it with the rule one level up - small_stmt - the parser stops at

 > <EOT> 

consuming

 pass <NEWLINE 

One level up - simple_stmt gives the same result as file_input . The parser stops with the first character.

Before adding the grammar defined in the PYTHON GRAMMAR section (obtained from http://docs.python.org/reference/grammar.html ), everything works correctly. Parser recognizes identifiers, literals, numbers, etc.

Does anyone have any idea what might be wrong here?

+7
source share
1 answer

I suggest you enable debugging as described here . This will give you an idea of ​​what is actually happening. As a rule, I propose to build grammars step by step and not try to implement everything in one big jump.

The code you mentioned above is very difficult to understand for non-united readers, as it is very large and not commented on. Writing grammars is very similar to writing "normal" code. Encapsulation is the key to success. Try to build smaller grammars by covering the self-contained parts and combining these sub-grammars as needed. See here for some best practices.

+5
source

All Articles