Playing with the grammar: part 2, reserved words as identifiers

Tue Jun 12 11:53:49 EDT 2001

Someone was lamenting that they couldn't do ``foo.from = 17'' or somesuch,
and curious as I am, I set out to figure a way that this could safely be
allowed.  To understand what I was doing, I classified all Python reserved
words into four groups:

--------------------------------------------------------------------------
Keywords (only appear as the first word in a statement):
	def print del pass break continue return raise import from global
	exec assert if elif else while for try except finally class

Guidewords (the grammar can be parsed without them):
	as import

Infix operators (sortof):
	or and is in for if

Prefix operators (sortof):
	not lambda
--------------------------------------------------------------------------

Guidewords are not a problem; they are not recognized in the grammar, but
in a later stage in the parser, so they can already be used as identifiers.

For keywords, I initially figured it would be fairly straightforward to
provide some feedback from the lexer, letting the parser know when to
recognize keywords or not, based on whether it was the first token on a line
or not.  After inspecting the source, I found this easier said than done,
but I also found a different approach.

The suggestion was made by someone on c.l.p, to simply catch where
a reserved word causes a syntax error, and retry it as an identifier.
This solves the problem for both keywords and infix operators, and
halfway for the prefix operators.  That means ``(from) = 17'' and
``foo.not = 0'' work, but ``(not) = 0'' still breaks.

To also allow the prefix operators to be used as all kinds of identifiers,
the parser has to look ahead at the next token before deciding how
to classify the word; in fact, for ``lambda'', it might have to look ahead
an arbitrarily long distance. Consider the use of ``lambda'' in a dictionary
key: we can't tell for sure if interpreting it as a keyword will cause
a syntax error or not until we know if there are an even or odd number
of ``:''s before the closing ``}''.  I only added one token of lookahead,
defaulting ambiguous cases to be interpreted as keywords; it's always
possible to force them to identifiers by wrapping parens around them.

In addition to these changes, I also decided to not do the retry if
the word is the first word of a statement, as I found such usage
slightly confusing to read.

The bottom line: apply this patch, and you can use all of Python's
``reserved words'' as identifiers; in most cases right away,
in all other cases by wrapping parens around them.

	/Paul

--------------------------------------------------------------------------
diff -r -c Python-2.1/Parser/parser.c Python-2.1-UNRESERVED/Parser/parser.c
*** Python-2.1/Parser/parser.c	Mon Oct  2 06:21:59 2000
--- Python-2.1-UNRESERVED/Parser/parser.c	Fri Jun  8 12:56:39 2001
***************
*** 131,141 ****
  /* PARSER PROPER */

  static int
! classify(grammar *g, int type, char *str)
  {
  	register int n = g->g_ll.ll_nlabels;

! 	if (type == NAME) {
  		register char *s = str;
  		register label *l = g->g_ll.ll_label;
  		register int i;
--- 131,141 ----
  /* PARSER PROPER */

  static int
! classify(grammar *g, int type, char *str, int use_keywords)
  {
  	register int n = g->g_ll.ll_nlabels;

! 	if (use_keywords && type == NAME) {
  		register char *s = str;
  		register label *l = g->g_ll.ll_label;
  		register int i;
***************
*** 165,183 ****
  }

  int
! PyParser_AddToken(register parser_state *ps, register int type, char *str,
! 	          int lineno, int *expected_ret)
  {
  	register int ilabel;
  	int err;

  	D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));

  	/* Find out which label this token is */
! 	ilabel = classify(ps->p_grammar, type, str);
  	if (ilabel < 0)
  		return E_SYNTAX;
! 	
  	/* Loop until the token is shifted or an error occurred */
  	for (;;) {
  		/* Fetch the current dfa and state */
--- 165,186 ----
  }

  int
! PyParser_AddToken(register parser_state *ps, int atbol, int supressed,
!                   register int type, char *str, int lineno, int *expected_ret)
  {
  	register int ilabel;
+ 	int can_retry;
  	int err;

  	D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));

+ 	can_retry = (type == NAME) && !atbol && !supressed;
+   retry:
  	/* Find out which label this token is */
! 	ilabel = classify(ps->p_grammar, type, str, atbol || can_retry);
  	if (ilabel < 0)
  		return E_SYNTAX;
! 
  	/* Loop until the token is shifted or an error occurred */
  	for (;;) {
  		/* Fetch the current dfa and state */
***************
*** 191,196 ****
--- 194,200 ----
  		if (s->s_lower <= ilabel && ilabel < s->s_upper) {
  			register int x = s->s_accel[ilabel - s->s_lower];
  			if (x != -1) {
+ 				can_retry = 0;
  				if (x & (1<<7)) {
  					/* Push non-terminal */
  					int nt = (x >> 8) + NT_OFFSET;
***************
*** 230,235 ****
--- 234,240 ----
  		}

  		if (s->s_accept) {
+ 			can_retry = 0;
  			/* Pop this dfa and try again */
  			s_pop(&ps->p_stack);
  			D(printf(" Pop ...\n"));
***************
*** 238,243 ****
--- 243,254 ----
  				return E_SYNTAX;
  			}
  			continue;
+ 		}
+ 
+ 		if (can_retry) {
+ 			can_retry = 0;
+ 			D(printf("Retry as identfier.\n"));
+ 			goto retry;
  		}

  		/* Stuck, report syntax error */
diff -r -c Python-2.1/Parser/parser.h Python-2.1-UNRESERVED/Parser/parser.h
*** Python-2.1/Parser/parser.h	Fri Sep  1 19:29:28 2000
--- Python-2.1-UNRESERVED/Parser/parser.h	Fri Jun  8 09:04:18 2001
***************
*** 29,36 ****

  parser_state *PyParser_New(grammar *g, int start);
  void PyParser_Delete(parser_state *ps);
! int PyParser_AddToken(parser_state *ps, int type, char *str, int lineno,
!                       int *expected_ret);
  void PyGrammar_AddAccelerators(grammar *g);

  #ifdef __cplusplus
--- 29,36 ----

  parser_state *PyParser_New(grammar *g, int start);
  void PyParser_Delete(parser_state *ps);
! int PyParser_AddToken(parser_state *ps, int atbol, int supressed,
!                       int type, char *str, int lineno, int *expected_ret);
  void PyGrammar_AddAccelerators(grammar *g);

  #ifdef __cplusplus
diff -r -c Python-2.1/Parser/parsetok.c Python-2.1-UNRESERVED/Parser/parsetok.c
*** Python-2.1/Parser/parsetok.c	Fri Sep  1 19:29:28 2000
--- Python-2.1-UNRESERVED/Parser/parsetok.c	Fri Jun  8 14:34:37 2001
***************
*** 84,89 ****
--- 84,92 ----
  	parser_state *ps;
  	node *n;
  	int started = 0;
+ 	int atbol = 1;
+ 	int lookahead = -1;
+ 	char *ah, *bh;

  	if ((ps = PyParser_New(g, start)) == NULL) {
  		fprintf(stderr, "no mem for new parser\n");
***************
*** 96,103 ****
  		int type;
  		size_t len;
  		char *str;

! 		type = PyTokenizer_Get(tok, &a, &b);
  		if (type == ERRORTOKEN) {
  			err_ret->error = tok->done;
  			break;
--- 99,115 ----
  		int type;
  		size_t len;
  		char *str;
+ 		int supressed;

! 		if (lookahead != -1) {
! 			a = ah;
! 			b = bh;
! 			type = lookahead;
! 			lookahead = -1;
! 		}
! 		else
! 			type = PyTokenizer_Get(tok, &a, &b);
! 		
  		if (type == ERRORTOKEN) {
  			err_ret->error = tok->done;
  			break;
***************
*** 118,130 ****
  		if (len > 0)
  			strncpy(str, a, len);
  		str[len] = '\0';
! 		if ((err_ret->error =
! 		     PyParser_AddToken(ps, (int)type, str, tok->lineno,
! 				       &(err_ret->expected))) != E_OK) {
  			if (err_ret->error != E_DONE)
  				PyMem_DEL(str);
  			break;
  		}
  	}

  	if (err_ret->error == E_DONE) {
--- 130,176 ----
  		if (len > 0)
  			strncpy(str, a, len);
  		str[len] = '\0';
! 		
! 		supressed = 0;
! 		if (type == NAME) {
! 			int *tokens = NULL, *tp;
! 			if (len == 3 && strcmp(str, "not") == 0) {
! 				static int next[] = {
! 					NAME, NUMBER, STRING,
! 					LPAR, LBRACE, LSQB, BACKQUOTE,
! 					TILDE, MINUS, PLUS,
! 					-1
! 				};
! 				tokens = next;
! 			}
! 			if (len == 6 && strcmp(str, "lambda") == 0) {
! 				static int next[] = {
! 					NAME, COLON, LPAR, STAR, DOUBLESTAR,
! 					-1
! 				};
! 				tokens = next;
! 			}
! 			if (tokens) {
! 				lookahead = PyTokenizer_Get(tok, &ah, &bh);
! 				for (tp = tokens; *tp != -1; tp++) {
! 				supressed = 1;
! 					if (lookahead == *tp) {
! 						supressed = 0;
! 						break;
! 					}
! 				}
! 			}
! 		}
! 		err_ret->error = PyParser_AddToken(ps, atbol, supressed,
!                                                    type, str, tok->lineno,
!                                                    &(err_ret->expected));
! 		if (err_ret->error != E_OK) {
  			if (err_ret->error != E_DONE)
  				PyMem_DEL(str);
  			break;
  		}
+ 		if (type != INDENT && type != DEDENT)
+ 			atbol = (type == NEWLINE || type == SEMI);
  	}

  	if (err_ret->error == E_DONE) {
diff -r -c Python-2.1/Python/compile.c Python-2.1-UNRESERVED/Python/compile.c
*** Python-2.1/Python/compile.c	Sat Apr 14 13:51:48 2001
--- Python-2.1-UNRESERVED/Python/compile.c	Thu Jun  7 14:00:39 2001
***************
*** 2813,2825 ****
  static void
  com_if_stmt(struct compiling *c, node *n)
  {
! 	int i;
  	int anchor = 0;
  	REQ(n, if_stmt);
! 	/*'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite] */
  	for (i = 0; i+3 < NCH(n); i+=4) {
! 		int a = 0;
! 		node *ch = CHILD(n, i+1);
  		if (is_constant_false(c, ch))
  			continue;
  		if (i > 0)
--- 2812,2830 ----
  static void
  com_if_stmt(struct compiling *c, node *n)
  {
! 	int i, a;
  	int anchor = 0;
+ 	node *ch;
  	REQ(n, if_stmt);
! 	/*'if' test ':' suite (('elif' | 'else' 'if') test ':' suite)* ['else' ':' suite] */
  	for (i = 0; i+3 < NCH(n); i+=4) {
! 		a = 0;
! 		if (TYPE(CHILD(n, i+3)) == COLON) {
! 			i++;
! 			if (i+3 >= NCH(n))
! 				break;
! 		}
! 		ch = CHILD(n, i+1);
  		if (is_constant_false(c, ch))
  			continue;
  		if (i > 0)