From 0bcf4d5e531611c766301da5f87f67d85f4d8e8c Mon Sep 17 00:00:00 2001 From: David Blacka Date: Sun, 13 Jul 2008 22:50:03 -0400 Subject: [PATCH] Add unit test for the query parser, fix some bugs in said query parser, and upgrade to PLY-2.5 (from ply-1.x something) --- rwhoisd/QueryParser.py | 70 +- rwhoisd/lex.py | 1109 +++++++++++++--------- rwhoisd/parsetab.py | 37 - rwhoisd/yacc.py | 1937 ++++++++++++++++++++++++++++++--------- test/TestQueryParser.py | 69 ++ test/test_queries | 24 + test/test_queries_bad | 8 + 7 files changed, 2278 insertions(+), 976 deletions(-) delete mode 100644 rwhoisd/parsetab.py create mode 100644 test/TestQueryParser.py create mode 100644 test/test_queries create mode 100644 test/test_queries_bad diff --git a/rwhoisd/QueryParser.py b/rwhoisd/QueryParser.py index 2e2ce6e..7b9ed84 100644 --- a/rwhoisd/QueryParser.py +++ b/rwhoisd/QueryParser.py @@ -17,6 +17,12 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 # USA +# This module uses PLY (Python Lex-Yacc). See +# http://www.dabeaz.com/ply/ for more info. + +# This module is actually the grammar definition. The lexer finds +# variables and functions starting with 't_', as well as a list called +# 'tokens'. The parser (i.e., yacc) finds things starting with 'p_'. # queryparser.db must be set to a DB class instance. db = None @@ -50,6 +56,8 @@ def t_firstvalue(t): if db.is_objectclass(t.value): t.type = 'CLASS' + elif db.is_attribute(t.value): + t.type = 'ATTR' else: t.type = 'VALUE' return t @@ -57,13 +65,14 @@ def t_firstvalue(t): def t_VALUE(t): r'\*?[^\s"\'=!*]+\*{0,2}' - if t.value.upper() == 'AND': + v = t.value.upper() + if v == 'AND': t.type = 'AND' - t.value = t.value.upper() + t.value = v return t - if t.value.upper() == 'OR': + if v == 'OR': t.type = 'OR' - t.value = t.value.upper() + t.value = v return t if db.is_attribute(t.value): t.type = 'ATTR' @@ -125,17 +134,18 @@ def p_querystr_attr_value(t): t[0] = (t[1], t[2], t[3]) -def p_querystr_attr(t): - 'querystr : ATTR' +def p_querystr_attr_attr(t): + '''querystr : ATTR EQ ATTR + | ATTR NEQ ATTR''' - t[0] = (None, '=', t[1]) + t[0] = (t[1], t[2], t[3]) def p_querystr_value(t): - 'querystr : value' + '''querystr : ATTR + | value''' t[0] = (None, '=', t[1]) - def p_value(t): 'value : VALUE' @@ -150,7 +160,6 @@ def p_quotedvalue(t): def p_error(t): - # print "Syntax error at '%s:%s'" % (t.type, t.value) raise yacc.YaccError, "Syntax error at %r" % t.value @@ -223,42 +232,5 @@ def parse(p, query): assert db try: return p.parse(query) - except (lex.LexError, yacc.YaccError): - raise Rwhois.RwhoisError, (350, "") - -if __name__ == "__main__": - import sys - import MemDB - - mydb = MemDB.MemDB() - - print "loading schema:", sys.argv[1] - mydb.init_schema(sys.argv[1]) - for data_file in sys.argv[2:]: - print "loading data file:", data_file - mydb.load_data(data_file) - mydb.index_data() - - db = mydb - qp = get_parser() - - for line in sys.stdin.readlines(): - line = line.rstrip('\n') - line = line.strip() - if not line: continue - print 'inputting:', `line` - try: - res = qp.parse(line) - print repr(res) - except (lex.LexError, yacc.YaccError), x: - print "parse error occurred:", x - print "query:", line - - -# lex.input(line) -# while 1: -# tok = lex.token() -# if not tok: break -# print tok - - + except (lex.LexError, yacc.YaccError), e: + raise Rwhois.RwhoisError, (350, "Invalid Query Syntax: " + e.message) diff --git a/rwhoisd/lex.py b/rwhoisd/lex.py index ca9999d..c5beb8c 100644 --- a/rwhoisd/lex.py +++ b/rwhoisd/lex.py @@ -1,275 
+1,259 @@ -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # ply: lex.py # -# Author: David M. Beazley (beazley@cs.uchicago.edu) -# Department of Computer Science -# University of Chicago -# Chicago, IL 60637 -# -# Copyright (C) 2001, David M. Beazley +# Author: David M. Beazley (dave@dabeaz.com) # -# $Header: /home/davidb/src/cvs/python-rwhoisd/rwhoisd/lex.py,v 1.1 2003/04/22 02:19:07 davidb Exp $ +# Copyright (C) 2001-2008, David M. Beazley # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# See the file COPYING for a complete copy of the LGPL. -# -# -# This module automatically constructs a lexical analysis module from regular -# expression rules defined in a user-defined module. The idea is essentially the same -# as that used in John Aycock's Spark framework, but the implementation works -# at the module level rather than requiring the use of classes. -# -# This module tries to provide an interface that is closely modeled after -# the traditional lex interface in Unix. It also differs from Spark -# in that: -# -# - It provides more extensive error checking and reporting if -# the user supplies a set of regular expressions that can't -# be compiled or if there is any other kind of a problem in -# the specification. -# -# - The interface is geared towards LALR(1) and LR(1) parser -# generators. That is tokens are generated one at a time -# rather than being generated in advanced all in one step. -# -# There are a few limitations of this module -# -# - The module interface makes it somewhat awkward to support more -# than one lexer at a time. Although somewhat inelegant from a -# design perspective, this is rarely a practical concern for -# most compiler projects. -# -# - The lexer requires that the entire input text be read into -# a string before scanning. I suppose that most machines have -# enough memory to make this a minor issues, but it makes -# the lexer somewhat difficult to use in interactive sessions -# or with streaming data. # -#----------------------------------------------------------------------------- - -r""" -lex.py - -This module builds lex-like scanners based on regular expression rules. -To use the module, simply write a collection of regular expression rules -and actions like this: - -# lexer.py -import lex - -# Define a list of valid tokens -tokens = ( - 'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS' - ) - -# Define tokens as functions -def t_IDENTIFIER(t): - r' ([a-zA-Z_](\w|_)* ' - return t - -def t_NUMBER(t): - r' \d+ ' - return t - -# Some simple tokens with no actions -t_PLUS = r'\+' -t_MINUS = r'-' - -# Initialize the lexer -lex.lex() - -The tokens list is required and contains a complete list of all valid -token types that the lexer is allowed to produce. 
Token types are -restricted to be valid identifiers. This means that 'MINUS' is a valid -token type whereas '-' is not. - -Rules are defined by writing a function with a name of the form -t_rulename. Each rule must accept a single argument which is -a token object generated by the lexer. This token has the following -attributes: - - t.type = type string of the token. This is initially set to the - name of the rule without the leading t_ - t.value = The value of the lexeme. - t.lineno = The value of the line number where the token was encountered - -For example, the t_NUMBER() rule above might be called with the following: - - t.type = 'NUMBER' - t.value = '42' - t.lineno = 3 - -Each rule returns the token object it would like to supply to the -parser. In most cases, the token t is returned with few, if any -modifications. To discard a token for things like whitespace or -comments, simply return nothing. For instance: - -def t_whitespace(t): - r' \s+ ' - pass - -For faster lexing, you can also define this in terms of the ignore set like this: - -t_ignore = ' \t' - -The characters in this string are ignored by the lexer. Use of this feature can speed -up parsing significantly since scanning will immediately proceed to the next token. - -lex requires that the token returned by each rule has an attribute -t.type. Other than this, rules are free to return any kind of token -object that they wish and may construct a new type of token object -from the attributes of t (provided the new object has the required -type attribute). - -If illegal characters are encountered, the scanner executes the -function t_error(t) where t is a token representing the rest of the -string that hasn't been matched. If this function isn't defined, a -LexError exception is raised. The .text attribute of this exception -object contains the part of the string that wasn't matched. - -The t.skip(n) method can be used to skip ahead n characters in the -input stream. This is usually only used in the error handling rule. -For instance, the following rule would print an error message and -continue: - -def t_error(t): - print "Illegal character in input %s" % t.value[0] - t.skip(1) - -Of course, a nice scanner might wish to skip more than one character -if the input looks very corrupted. - -The lex module defines a t.lineno attribute on each token that can be used -to track the current line number in the input. The value of this -variable is not modified by lex so it is up to your lexer module -to correctly update its value depending on the lexical properties -of the input language. To do this, you might write rules such as -the following: - -def t_newline(t): - r' \n+ ' - t.lineno += t.value.count("\n") - -To initialize your lexer so that it can be used, simply call the lex.lex() -function in your rule file. If there are any errors in your -specification, warning messages or an exception will be generated to -alert you to the problem. - -(dave: this needs to be rewritten) -To use the newly constructed lexer from another module, simply do -this: - - import lex - import lexer - plex.input("position = initial + rate*60") +# See the file COPYING for a complete copy of the LGPL. +# ----------------------------------------------------------------------------- - while 1: - token = plex.token() # Get a token - if not token: break # No more tokens - ... do whatever ... 
+__version__ = "2.5" +__tabversion__ = "2.4" # Version of table file used -Assuming that the module 'lexer' has initialized plex as shown -above, parsing modules can safely import 'plex' without having -to import the rule file or any additional imformation about the -scanner you have defined. -""" +import re, sys, types, copy, os -# ----------------------------------------------------------------------------- +# This regular expression is used to match valid token names +_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') +# _INSTANCETYPE sets the valid set of instance types recognized +# by PLY when lexers are defined by a class. In order to maintain +# backwards compatibility with Python-2.0, we have to check for +# the existence of ObjectType. -__version__ = "1.3" +try: + _INSTANCETYPE = (types.InstanceType, types.ObjectType) +except AttributeError: + _INSTANCETYPE = types.InstanceType + class object: pass # Note: needed if no new-style classes present -import re, types, sys, copy +# Exception thrown when invalid token encountered and no default error +# handler is defined. -# Exception thrown when invalid token encountered and no default class LexError(Exception): def __init__(self,message,s): self.args = (message,) self.text = s -# Token class -class LexToken: +# An object used to issue one-time warning messages for various features + +class LexWarning(object): + def __init__(self): + self.warned = 0 + def __call__(self,msg): + if not self.warned: + sys.stderr.write("ply.lex: Warning: " + msg+"\n") + self.warned = 1 + +_SkipWarning = LexWarning() # Warning for use of t.skip() on tokens + +# Token class. This class is used to represent the tokens produced. +class LexToken(object): def __str__(self): - return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno) + return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos) def __repr__(self): return str(self) def skip(self,n): - try: - self._skipn += n - except AttributeError: - self._skipn = n + self.lexer.skip(n) + _SkipWarning("Calling t.skip() on a token is deprecated. Please use t.lexer.skip()") # ----------------------------------------------------------------------------- # Lexer class # +# This class encapsulates all of the methods and data associated with a lexer. +# # input() - Store a new string in the lexer # token() - Get the next token # ----------------------------------------------------------------------------- class Lexer: def __init__(self): - self.lexre = None # Master regular expression - self.lexdata = None # Actual input data (as a string) - self.lexpos = 0 # Current position in input text - self.lexlen = 0 # Length of the input text - self.lexindexfunc = [ ] # Reverse mapping of groups to functions and types - self.lexerrorf = None # Error rule (if any) - self.lextokens = None # List of valid tokens - self.lexignore = None # Ignored characters - self.lineno = 1 # Current line number - self.debug = 0 # Debugging mode - self.optimize = 0 # Optimized mode - self.token = self.errtoken - - def __copy__(self): - c = Lexer() - c.lexre = self.lexre - c.lexdata = self.lexdata - c.lexpos = self.lexpos - c.lexlen = self.lexlen - c.lenindexfunc = self.lexindexfunc - c.lexerrorf = self.lexerrorf - c.lextokens = self.lextokens - c.lexignore = self.lexignore - c.lineno = self.lineno - c.optimize = self.optimize - c.token = c.realtoken + self.lexre = None # Master regular expression. 
This is a list of + # tuples (re,findex) where re is a compiled + # regular expression and findex is a list + # mapping regex group numbers to rules + self.lexretext = None # Current regular expression strings + self.lexstatere = {} # Dictionary mapping lexer states to master regexs + self.lexstateretext = {} # Dictionary mapping lexer states to regex strings + self.lexstaterenames = {} # Dictionary mapping lexer states to symbol names + self.lexstate = "INITIAL" # Current lexer state + self.lexstatestack = [] # Stack of lexer states + self.lexstateinfo = None # State information + self.lexstateignore = {} # Dictionary of ignored characters for each state + self.lexstateerrorf = {} # Dictionary of error functions for each state + self.lexreflags = 0 # Optional re compile flags + self.lexdata = None # Actual input data (as a string) + self.lexpos = 0 # Current position in input text + self.lexlen = 0 # Length of the input text + self.lexerrorf = None # Error rule (if any) + self.lextokens = None # List of valid tokens + self.lexignore = "" # Ignored characters + self.lexliterals = "" # Literal characters that can be passed through + self.lexmodule = None # Module + self.lineno = 1 # Current line number + self.lexdebug = 0 # Debugging mode + self.lexoptimize = 0 # Optimized mode + + def clone(self,object=None): + c = copy.copy(self) + + # If the object parameter has been supplied, it means we are attaching the + # lexer to a new object. In this case, we have to rebind all methods in + # the lexstatere and lexstateerrorf tables. + + if object: + newtab = { } + for key, ritem in self.lexstatere.items(): + newre = [] + for cre, findex in ritem: + newfindex = [] + for f in findex: + if not f or not f[0]: + newfindex.append(f) + continue + newfindex.append((getattr(object,f[0].__name__),f[1])) + newre.append((cre,newfindex)) + newtab[key] = newre + c.lexstatere = newtab + c.lexstateerrorf = { } + for key, ef in self.lexstateerrorf.items(): + c.lexstateerrorf[key] = getattr(object,ef.__name__) + c.lexmodule = object + return c + + # ------------------------------------------------------------ + # writetab() - Write lexer information to a table file + # ------------------------------------------------------------ + def writetab(self,tabfile,outputdir=""): + if isinstance(tabfile,types.ModuleType): + return + basetabfilename = tabfile.split(".")[-1] + filename = os.path.join(outputdir,basetabfilename)+".py" + tf = open(filename,"w") + tf.write("# %s.py. This file automatically created by PLY (version %s). 
Don't edit!\n" % (tabfile,__version__)) + tf.write("_lextokens = %s\n" % repr(self.lextokens)) + tf.write("_lexreflags = %s\n" % repr(self.lexreflags)) + tf.write("_lexliterals = %s\n" % repr(self.lexliterals)) + tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo)) + + tabre = { } + # Collect all functions in the initial state + initial = self.lexstatere["INITIAL"] + initialfuncs = [] + for part in initial: + for f in part[1]: + if f and f[0]: + initialfuncs.append(f) + + for key, lre in self.lexstatere.items(): + titem = [] + for i in range(len(lre)): + titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i]))) + tabre[key] = titem + + tf.write("_lexstatere = %s\n" % repr(tabre)) + tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore)) + + taberr = { } + for key, ef in self.lexstateerrorf.items(): + if ef: + taberr[key] = ef.__name__ + else: + taberr[key] = None + tf.write("_lexstateerrorf = %s\n" % repr(taberr)) + tf.close() + + # ------------------------------------------------------------ + # readtab() - Read lexer information from a tab file + # ------------------------------------------------------------ + def readtab(self,tabfile,fdict): + if isinstance(tabfile,types.ModuleType): + lextab = tabfile + else: + exec "import %s as lextab" % tabfile + self.lextokens = lextab._lextokens + self.lexreflags = lextab._lexreflags + self.lexliterals = lextab._lexliterals + self.lexstateinfo = lextab._lexstateinfo + self.lexstateignore = lextab._lexstateignore + self.lexstatere = { } + self.lexstateretext = { } + for key,lre in lextab._lexstatere.items(): + titem = [] + txtitem = [] + for i in range(len(lre)): + titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict))) + txtitem.append(lre[i][0]) + self.lexstatere[key] = titem + self.lexstateretext[key] = txtitem + self.lexstateerrorf = { } + for key,ef in lextab._lexstateerrorf.items(): + self.lexstateerrorf[key] = fdict[ef] + self.begin('INITIAL') # ------------------------------------------------------------ # input() - Push a new string into the lexer # ------------------------------------------------------------ def input(self,s): - if not isinstance(s,types.StringType): + # Pull off the first character to see if s looks like a string + c = s[:1] + if not (isinstance(c,types.StringType) or isinstance(c,types.UnicodeType)): raise ValueError, "Expected a string" self.lexdata = s self.lexpos = 0 self.lexlen = len(s) - self.token = self.realtoken - - # Change the token routine to point to realtoken() - global token - if token == self.errtoken: - token = self.token # ------------------------------------------------------------ - # errtoken() - Return error if token is called with no data + # begin() - Changes the lexing state + # ------------------------------------------------------------ + def begin(self,state): + if not self.lexstatere.has_key(state): + raise ValueError, "Undefined state" + self.lexre = self.lexstatere[state] + self.lexretext = self.lexstateretext[state] + self.lexignore = self.lexstateignore.get(state,"") + self.lexerrorf = self.lexstateerrorf.get(state,None) + self.lexstate = state + + # ------------------------------------------------------------ + # push_state() - Changes the lexing state and saves old on stack + # ------------------------------------------------------------ + def push_state(self,state): + self.lexstatestack.append(self.lexstate) + self.begin(state) + + # ------------------------------------------------------------ + # 
pop_state() - Restores the previous state + # ------------------------------------------------------------ + def pop_state(self): + self.begin(self.lexstatestack.pop()) + + # ------------------------------------------------------------ + # current_state() - Returns the current lexing state + # ------------------------------------------------------------ + def current_state(self): + return self.lexstate + + # ------------------------------------------------------------ + # skip() - Skip ahead n characters # ------------------------------------------------------------ - def errtoken(self): - raise RuntimeError, "No input string given with input()" - + def skip(self,n): + self.lexpos += n + # ------------------------------------------------------------ # token() - Return the next token from the Lexer # @@ -277,13 +261,13 @@ class Lexer: # as possible. Don't make changes unless you really know what # you are doing # ------------------------------------------------------------ - def realtoken(self): + def token(self): # Make local copies of frequently referenced attributes lexpos = self.lexpos lexlen = self.lexlen lexignore = self.lexignore lexdata = self.lexdata - + while lexpos < lexlen: # This code provides some short-circuit code for whitespace, tabs, and other ignored characters if lexdata[lexpos] in lexignore: @@ -291,71 +275,102 @@ class Lexer: continue # Look for a regular expression match - m = self.lexre.match(lexdata,lexpos) - if m: - i = m.lastindex - lexpos = m.end() + for lexre,lexindexfunc in self.lexre: + m = lexre.match(lexdata,lexpos) + if not m: continue + + # Create a token for return tok = LexToken() tok.value = m.group() tok.lineno = self.lineno - tok.lexer = self - func,tok.type = self.lexindexfunc[i] + tok.lexpos = lexpos + + i = m.lastindex + func,tok.type = lexindexfunc[i] + if not func: - self.lexpos = lexpos - return tok - + # If no token type was set, it's an ignored token + if tok.type: + self.lexpos = m.end() + return tok + else: + lexpos = m.end() + break + + lexpos = m.end() + + # if func not callable, it means it's an ignored token + if not callable(func): + break + # If token is processed by a function, call it + + tok.lexer = self # Set additional attributes useful in token rules + self.lexmatch = m self.lexpos = lexpos + newtok = func(tok) - self.lineno = tok.lineno # Update line number - + # Every function must return a token, if nothing, we just move to next token - if not newtok: continue - + if not newtok: + lexpos = self.lexpos # This is here in case user has updated lexpos. + lexignore = self.lexignore # This is here in case there was a state change + break + # Verify type of the token. If not in the token map, raise an error - if not self.optimize: + if not self.lexoptimize: if not self.lextokens.has_key(newtok.type): raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( func.func_code.co_filename, func.func_code.co_firstlineno, func.__name__, newtok.type),lexdata[lexpos:]) return newtok + else: + # No match, see if in literals + if lexdata[lexpos] in self.lexliterals: + tok = LexToken() + tok.value = lexdata[lexpos] + tok.lineno = self.lineno + tok.type = tok.value + tok.lexpos = lexpos + self.lexpos = lexpos + 1 + return tok - # No match. Call t_error() if defined. 
- if self.lexerrorf: - tok = LexToken() - tok.value = self.lexdata[lexpos:] - tok.lineno = self.lineno - tok.type = "error" - tok.lexer = self - oldpos = lexpos - newtok = self.lexerrorf(tok) - lexpos += getattr(tok,"_skipn",0) - if oldpos == lexpos: - # Error method didn't change text position at all. This is an error. + # No match. Call t_error() if defined. + if self.lexerrorf: + tok = LexToken() + tok.value = self.lexdata[lexpos:] + tok.lineno = self.lineno + tok.type = "error" + tok.lexer = self + tok.lexpos = lexpos self.lexpos = lexpos - raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) - if not newtok: continue - self.lexpos = lexpos - return newtok + newtok = self.lexerrorf(tok) + if lexpos == self.lexpos: + # Error method didn't change text position at all. This is an error. + raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) + lexpos = self.lexpos + if not newtok: continue + return newtok - self.lexpos = lexpos - raise LexError, ("No match found", lexdata[lexpos:]) + self.lexpos = lexpos + raise LexError, ("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:]) - # No more input data self.lexpos = lexpos + 1 + if self.lexdata is None: + raise RuntimeError, "No input string given with input()" return None - # ----------------------------------------------------------------------------- -# validate_file() +# _validate_file() # # This checks to see if there are duplicated t_rulename() functions or strings # in the parser input file. This is done using a simple regular expression -# match on each line in the filename. +# match on each line in the given file. If the file can't be located or opened, +# a true result is returned by default. # ----------------------------------------------------------------------------- -def validate_file(filename): +def _validate_file(filename): import os.path base,ext = os.path.splitext(filename) if ext != '.py': return 1 # No idea what the file is. Return OK @@ -365,10 +380,11 @@ def validate_file(filename): lines = f.readlines() f.close() except IOError: - return 1 # Oh well + return 1 # Couldn't find the file. Don't worry about it fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') + counthash = { } linen = 1 noerror = 1 @@ -382,52 +398,140 @@ def validate_file(filename): if not prev: counthash[name] = linen else: - print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev) + print >>sys.stderr, "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev) noerror = 0 linen += 1 return noerror # ----------------------------------------------------------------------------- -# _read_lextab(module) +# _funcs_to_names() # -# Reads lexer table from a lextab file instead of using introspection. 
+# Given a list of regular expression functions, this converts it to a list +# suitable for output to a table file # ----------------------------------------------------------------------------- -def _read_lextab(lexer, fdict, module): - exec "import %s as lextab" % module - lexer.lexre = re.compile(lextab._lexre, re.VERBOSE) - lexer.lexindexfunc = lextab._lextab - for i in range(len(lextab._lextab)): - t = lexer.lexindexfunc[i] - if t: - if t[0]: - lexer.lexindexfunc[i] = (fdict[t[0]],t[1]) - lexer.lextokens = lextab._lextokens - lexer.lexignore = lextab._lexignore - if lextab._lexerrorf: - lexer.lexerrorf = fdict[lextab._lexerrorf] +def _funcs_to_names(funclist,namelist): + result = [] + for f,name in zip(funclist,namelist): + if f and f[0]: + result.append((name, f[1])) + else: + result.append(f) + return result + +# ----------------------------------------------------------------------------- +# _names_to_funcs() +# +# Given a list of regular expression function names, this converts it back to +# functions. +# ----------------------------------------------------------------------------- + +def _names_to_funcs(namelist,fdict): + result = [] + for n in namelist: + if n and n[0]: + result.append((fdict[n[0]],n[1])) + else: + result.append(n) + return result + +# ----------------------------------------------------------------------------- +# _form_master_re() +# +# This function takes a list of all of the regex components and attempts to +# form the master regular expression. Given limitations in the Python re +# module, it may be necessary to break the master regex into separate expressions. +# ----------------------------------------------------------------------------- + +def _form_master_re(relist,reflags,ldict,toknames): + if not relist: return [] + regex = "|".join(relist) + try: + lexre = re.compile(regex,re.VERBOSE | reflags) + + # Build the index to function map for the matching engine + lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1) + lexindexnames = lexindexfunc[:] + + for f,i in lexre.groupindex.items(): + handle = ldict.get(f,None) + if type(handle) in (types.FunctionType, types.MethodType): + lexindexfunc[i] = (handle,toknames[f]) + lexindexnames[i] = f + elif handle is not None: + lexindexnames[i] = f + if f.find("ignore_") > 0: + lexindexfunc[i] = (None,None) + else: + lexindexfunc[i] = (None, toknames[f]) + return [(lexre,lexindexfunc)],[regex],[lexindexnames] + except Exception,e: + m = int(len(relist)/2) + if m == 0: m = 1 + llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames) + rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames) + return llist+rlist, lre+rre, lnames+rnames + +# ----------------------------------------------------------------------------- +# def _statetoken(s,names) +# +# Given a declaration name s of the form "t_" and a dictionary whose keys are +# state names, this function returns a tuple (states,tokenname) where states +# is a tuple of state names and tokenname is the name of the token. 
For example, +# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') +# ----------------------------------------------------------------------------- + +def _statetoken(s,names): + nonstate = 1 + parts = s.split("_") + for i in range(1,len(parts)): + if not names.has_key(parts[i]) and parts[i] != 'ANY': break + if i > 1: + states = tuple(parts[1:i]) + else: + states = ('INITIAL',) + + if 'ANY' in states: + states = tuple(names.keys()) + + tokenname = "_".join(parts[i:]) + return (states,tokenname) + # ----------------------------------------------------------------------------- # lex(module) # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- -def lex(module=None,debug=0,optimize=0,lextab="lextab"): +def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir=""): + global lexer ldict = None - regex = "" + stateinfo = { 'INITIAL' : 'inclusive'} error = 0 files = { } - lexer = Lexer() - lexer.debug = debug - lexer.optimize = optimize + lexobj = Lexer() + lexobj.lexdebug = debug + lexobj.lexoptimize = optimize global token,input - + + if nowarn: warn = 0 + else: warn = 1 + + if object: module = object + if module: - if not isinstance(module, types.ModuleType): - raise ValueError,"Expected a module" - - ldict = module.__dict__ - + # User supplied a module object. + if isinstance(module, types.ModuleType): + ldict = module.__dict__ + elif isinstance(module, _INSTANCETYPE): + _items = [(k,getattr(module,k)) for k in dir(module)] + ldict = { } + for (i,v) in _items: + ldict[i] = v + else: + raise ValueError,"Expected a module or instance" + lexobj.lexmodule = module + else: # No module given. We might be able to get information from the caller. try: @@ -435,217 +539,311 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab"): except RuntimeError: e,b,t = sys.exc_info() f = t.tb_frame - f = f.f_back # Walk out to our calling function - ldict = f.f_globals # Grab its globals dictionary + f = f.f_back # Walk out to our calling function + if f.f_globals is f.f_locals: # Collect global and local variations from caller + ldict = f.f_globals + else: + ldict = f.f_globals.copy() + ldict.update(f.f_locals) if optimize and lextab: try: - _read_lextab(lexer,ldict, lextab) - if not lexer.lexignore: lexer.lexignore = "" - token = lexer.token - input = lexer.input - return lexer - + lexobj.readtab(lextab,ldict) + token = lexobj.token + input = lexobj.input + lexer = lexobj + return lexobj + except ImportError: pass - - # Get the tokens map + + # Get the tokens, states, and literals variables (if any) + tokens = ldict.get("tokens",None) + states = ldict.get("states",None) + literals = ldict.get("literals","") + if not tokens: raise SyntaxError,"lex: module does not define 'tokens'" + if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): raise SyntaxError,"lex: tokens must be a list or tuple." # Build a dictionary of valid token names - lexer.lextokens = { } + lexobj.lextokens = { } if not optimize: - - # Utility function for verifying tokens - def is_identifier(s): - for c in s: - if not (c.isalnum() or c == '_'): return 0 - return 1 - for n in tokens: - if not is_identifier(n): - print "lex: Bad token name '%s'" % n + if not _is_identifier.match(n): + print >>sys.stderr, "lex: Bad token name '%s'" % n error = 1 - if lexer.lextokens.has_key(n): - print "lex: Warning. Token '%s' multiply defined." 
% n - lexer.lextokens[n] = None + if warn and lexobj.lextokens.has_key(n): + print >>sys.stderr, "lex: Warning. Token '%s' multiply defined." % n + lexobj.lextokens[n] = None else: - for n in tokens: lexer.lextokens[n] = None - + for n in tokens: lexobj.lextokens[n] = None if debug: - print "lex: tokens = '%s'" % lexer.lextokens.keys() + print "lex: tokens = '%s'" % lexobj.lextokens.keys() + + try: + for c in literals: + if not (isinstance(c,types.StringType) or isinstance(c,types.UnicodeType)) or len(c) > 1: + print >>sys.stderr, "lex: Invalid literal %s. Must be a single character" % repr(c) + error = 1 + continue - # Get a list of symbols with the t_ prefix - tsymbols = [f for f in ldict.keys() if f[:2] == 't_'] + except TypeError: + print >>sys.stderr, "lex: Invalid literals specification. literals must be a sequence of characters." + error = 1 + + lexobj.lexliterals = literals + + # Build statemap + if states: + if not (isinstance(states,types.TupleType) or isinstance(states,types.ListType)): + print >>sys.stderr, "lex: states must be defined as a tuple or list." + error = 1 + else: + for s in states: + if not isinstance(s,types.TupleType) or len(s) != 2: + print >>sys.stderr, "lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s) + error = 1 + continue + name, statetype = s + if not isinstance(name,types.StringType): + print >>sys.stderr, "lex: state name %s must be a string" % repr(name) + error = 1 + continue + if not (statetype == 'inclusive' or statetype == 'exclusive'): + print >>sys.stderr, "lex: state type for state %s must be 'inclusive' or 'exclusive'" % name + error = 1 + continue + if stateinfo.has_key(name): + print >>sys.stderr, "lex: state '%s' already defined." % name + error = 1 + continue + stateinfo[name] = statetype + + # Get a list of symbols with the t_ or s_ prefix + tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ] # Now build up a list of functions and a list of strings - fsymbols = [ ] - ssymbols = [ ] + + funcsym = { } # Symbols defined as functions + strsym = { } # Symbols defined as strings + toknames = { } # Mapping of symbols to token names + + for s in stateinfo.keys(): + funcsym[s] = [] + strsym[s] = [] + + ignore = { } # Ignore strings by state + errorf = { } # Error functions by state + + if len(tsymbols) == 0: + raise SyntaxError,"lex: no rules of the form t_rulename are defined." + for f in tsymbols: - if isinstance(ldict[f],types.FunctionType): - fsymbols.append(ldict[f]) - elif isinstance(ldict[f],types.StringType): - ssymbols.append((f,ldict[f])) + t = ldict[f] + states, tokname = _statetoken(f,stateinfo) + toknames[f] = tokname + + if callable(t): + for s in states: funcsym[s].append((f,t)) + elif (isinstance(t, types.StringType) or isinstance(t,types.UnicodeType)): + for s in states: strsym[s].append((f,t)) else: - print "lex: %s not defined as a function or string" % f + print >>sys.stderr, "lex: %s not defined as a function or string" % f error = 1 - + # Sort the functions by line number - fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) + for f in funcsym.values(): + f.sort(lambda x,y: cmp(x[1].func_code.co_firstlineno,y[1].func_code.co_firstlineno)) # Sort the strings by regular expression length - ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) - - # Check for non-empty symbols - if len(fsymbols) == 0 and len(ssymbols) == 0: - raise SyntaxError,"lex: no rules of the form t_rulename are defined." 
+ for s in strsym.values(): + s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) - # Add all of the rules defined with actions first - for f in fsymbols: - - line = f.func_code.co_firstlineno - file = f.func_code.co_filename - files[file] = None + regexs = { } - if not optimize: - if f.func_code.co_argcount > 1: - print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) - error = 1 - continue + # Build the master regular expressions + for state in stateinfo.keys(): + regex_list = [] - if f.func_code.co_argcount < 1: - print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) - error = 1 - continue + # Add rules defined by functions first + for fname, f in funcsym[state]: + line = f.func_code.co_firstlineno + file = f.func_code.co_filename + files[file] = None + tokname = toknames[fname] - if f.__name__ == 't_ignore': - print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) - error = 1 + ismethod = isinstance(f, types.MethodType) + + if not optimize: + nargs = f.func_code.co_argcount + if ismethod: + reqargs = 2 + else: + reqargs = 1 + if nargs > reqargs: + print >>sys.stderr, "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) + error = 1 + continue + + if nargs < reqargs: + print >>sys.stderr, "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) + error = 1 + continue + + if tokname == 'ignore': + print >>sys.stderr, "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) + error = 1 + continue + + if tokname == 'error': + errorf[state] = f continue - - if f.__name__ == 't_error': - lexer.lexerrorf = f - continue - if f.__doc__: + if f.__doc__: + if not optimize: + try: + c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | reflags) + if c.match(""): + print >>sys.stderr, "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__) + error = 1 + continue + except re.error,e: + print >>sys.stderr, "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) + if '#' in f.__doc__: + print >>sys.stderr, "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__) + error = 1 + continue + + if debug: + print "lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state) + + # Okay. The regular expression seemed okay. Let's append it to the master regular + # expression we're building + + regex_list.append("(?P<%s>%s)" % (fname,f.__doc__)) + else: + print >>sys.stderr, "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) + + # Now add all of the simple rules + for name,r in strsym[state]: + tokname = toknames[name] + + if tokname == 'ignore': + if "\\" in r: + print >>sys.stderr, "lex: Warning. %s contains a literal backslash '\\'" % name + ignore[state] = r + continue + if not optimize: + if tokname == 'error': + raise SyntaxError,"lex: Rule '%s' must be defined as a function" % name + error = 1 + continue + + if not lexobj.lextokens.has_key(tokname) and tokname.find("ignore_") < 0: + print >>sys.stderr, "lex: Rule '%s' defined for an unspecified token %s." % (name,tokname) + error = 1 + continue try: - c = re.compile(f.__doc__, re.VERBOSE) + c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags) + if (c.match("")): + print >>sys.stderr, "lex: Regular expression for rule '%s' matches empty string." % name + error = 1 + continue except re.error,e: - print "%s:%d: Invalid regular expression for rule '%s'. 
%s" % (file,line,f.__name__,e) + print >>sys.stderr, "lex: Invalid regular expression for rule '%s'. %s" % (name,e) + if '#' in r: + print >>sys.stderr, "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name + error = 1 continue - if debug: - print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__) + print "lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state) - # Okay. The regular expression seemed okay. Let's append it to the master regular - # expression we're building - - if (regex): regex += "|" - regex += "(?P<%s>%s)" % (f.__name__,f.__doc__) - else: - print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) + regex_list.append("(?P<%s>%s)" % (name,r)) - # Now add all of the simple rules - for name,r in ssymbols: + if not regex_list: + print >>sys.stderr, "lex: No rules defined for state '%s'" % state + error = 1 + + regexs[state] = regex_list - if name == 't_ignore': - lexer.lexignore = r - continue - - if not optimize: - if name == 't_error': - raise SyntaxError,"lex: Rule 't_error' must be defined as a function" - error = 1 - continue - - if not lexer.lextokens.has_key(name[2:]): - print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:]) - error = 1 - continue - try: - c = re.compile(r,re.VERBOSE) - except re.error,e: - print "lex: Invalid regular expression for rule '%s'. %s" % (name,e) - error = 1 - continue - if debug: - print "lex: Adding rule %s -> '%s'" % (name,r) - - if regex: regex += "|" - regex += "(?P<%s>%s)" % (name,r) if not optimize: for f in files.keys(): - if not validate_file(f): + if not _validate_file(f): error = 1 - try: - if debug: - print "lex: regex = '%s'" % regex - lexer.lexre = re.compile(regex, re.VERBOSE) - # Build the index to function map for the matching engine - lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1) - for f,i in lexer.lexre.groupindex.items(): - handle = ldict[f] - if isinstance(handle,types.FunctionType): - lexer.lexindexfunc[i] = (handle,handle.__name__[2:]) - else: - # If rule was specified as a string, we build an anonymous - # callback function to carry out the action - lexer.lexindexfunc[i] = (None,f[2:]) - - # If a lextab was specified, we create a file containing the precomputed - # regular expression and index table - - if lextab and optimize: - lt = open(lextab+".py","w") - lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab) - lt.write("_lexre = %s\n" % repr(regex)) - lt.write("_lextab = [\n"); - for i in range(0,len(lexer.lexindexfunc)): - t = lexer.lexindexfunc[i] - if t: - if t[0]: - lt.write(" ('%s',%s),\n"% (t[0].__name__, repr(t[1]))) - else: - lt.write(" (None,%s),\n" % repr(t[1])) - else: - lt.write(" None,\n") - - lt.write("]\n"); - lt.write("_lextokens = %s\n" % repr(lexer.lextokens)) - lt.write("_lexignore = %s\n" % repr(lexer.lexignore)) - if (lexer.lexerrorf): - lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__)) - else: - lt.write("_lexerrorf = None\n") - lt.close() - - except re.error,e: - print "lex: Fatal error. Unable to compile regular expression rules. %s" % e - error = 1 if error: raise SyntaxError,"lex: Unable to build lexer." - if not lexer.lexerrorf: - print "lex: Warning. no t_error rule is defined." - if not lexer.lexignore: lexer.lexignore = "" - + # From this point forward, we're reasonably confident that we can build the lexer. + # No more errors will be generated, but there might be some warning messages. 
+ + # Build the master regular expressions + + for state in regexs.keys(): + lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,toknames) + lexobj.lexstatere[state] = lexre + lexobj.lexstateretext[state] = re_text + lexobj.lexstaterenames[state] = re_names + if debug: + for i in range(len(re_text)): + print "lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i]) + + # For inclusive states, we need to add the INITIAL state + for state,type in stateinfo.items(): + if state != "INITIAL" and type == 'inclusive': + lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL']) + lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL']) + lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL']) + + lexobj.lexstateinfo = stateinfo + lexobj.lexre = lexobj.lexstatere["INITIAL"] + lexobj.lexretext = lexobj.lexstateretext["INITIAL"] + + # Set up ignore variables + lexobj.lexstateignore = ignore + lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","") + + # Set up error functions + lexobj.lexstateerrorf = errorf + lexobj.lexerrorf = errorf.get("INITIAL",None) + if warn and not lexobj.lexerrorf: + print >>sys.stderr, "lex: Warning. no t_error rule is defined." + + # Check state information for ignore and error rules + for s,stype in stateinfo.items(): + if stype == 'exclusive': + if warn and not errorf.has_key(s): + print >>sys.stderr, "lex: Warning. no error rule is defined for exclusive state '%s'" % s + if warn and not ignore.has_key(s) and lexobj.lexignore: + print >>sys.stderr, "lex: Warning. no ignore rule is defined for exclusive state '%s'" % s + elif stype == 'inclusive': + if not errorf.has_key(s): + errorf[s] = errorf.get("INITIAL",None) + if not ignore.has_key(s): + ignore[s] = ignore.get("INITIAL","") + + # Create global versions of the token() and input() functions - token = lexer.token - input = lexer.input - - return lexer + token = lexobj.token + input = lexobj.input + lexer = lexobj + + # If in optimize mode, we write the lextab + if lextab and optimize: + lexobj.writetab(lextab,outputdir) + + return lexobj # ----------------------------------------------------------------------------- -# run() +# runmain() # # This runs the lexer as a main program # ----------------------------------------------------------------------------- @@ -670,12 +868,29 @@ def runmain(lexer=None,data=None): _token = lexer.token else: _token = token - + while 1: tok = _token() if not tok: break - print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno) - - + print "(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos) + + +# ----------------------------------------------------------------------------- +# @TOKEN(regex) +# +# This decorator function can be used to set the regex expression on a function +# when its docstring might need to be set in an alternative way +# ----------------------------------------------------------------------------- + +def TOKEN(r): + def set_doc(f): + if callable(r): + f.__doc__ = r.__doc__ + else: + f.__doc__ = r + return f + return set_doc +# Alternative spelling of the TOKEN decorator +Token = TOKEN diff --git a/rwhoisd/parsetab.py b/rwhoisd/parsetab.py deleted file mode 100644 index 31d6031..0000000 --- a/rwhoisd/parsetab.py +++ /dev/null @@ -1,37 +0,0 @@ - -# parsetab.py -# This file is automatically generated. Do not edit. 
- -_lr_method = 'SLR' - -_lr_signature = '\xd4p\x02\xe6^\xeb\x1b\xa7\xf4\r\xb2\xf7!\xf32\x1d' - -_lr_action_items = {'AND':([13,1,6,2,15,17,16,14,3,5,4,],[11,-11,-5,-8,-7,-4,-3,-6,-10,11,-9,]),'QUOTEDVALUE':([8,10,12,0,9,11,],[1,1,1,1,1,1,]),'ATTR':([12,0,8,11,],[2,2,2,2,]),'CLASS':([0,],[8,]),'NEQ':([2,],[10,]),'EQ':([2,],[9,]),'OR':([1,15,13,6,4,2,14,5,17,16,3,],[-11,-7,12,-5,-9,-8,-6,12,-4,-3,-10,]),'VALUE':([11,9,12,10,8,0,],[3,3,3,3,3,3,]),'$':([17,14,2,5,3,7,13,16,15,1,4,6,],[-4,-6,-8,-2,-10,0,-1,-3,-7,-11,-9,-5,]),} - -_lr_action = { } -for _k, _v in _lr_action_items.items(): - for _x,_y in zip(_v[0],_v[1]): - _lr_action[(_x,_k)] = _y -del _lr_action_items - -_lr_goto_items = {'querystr':([0,11,12,8,],[6,16,17,6,]),'total':([0,],[7,]),'value':([12,11,10,8,9,0,],[4,4,15,4,14,4,]),'query':([8,0,],[13,5,]),} - -_lr_goto = { } -for _k, _v in _lr_goto_items.items(): - for _x,_y in zip(_v[0],_v[1]): - _lr_goto[(_x,_k)] = _y -del _lr_goto_items -_lr_productions = [ - ("S'",1,None,None,None), - ('total',2,'p_total_class_query','QueryParser.py',74), - ('total',1,'p_total_query','QueryParser.py',80), - ('query',3,'p_query_oper_querystr','QueryParser.py',86), - ('query',3,'p_query_oper_querystr','QueryParser.py',87), - ('query',1,'p_query_querystr','QueryParser.py',97), - ('querystr',3,'p_querystr_attr_value','QueryParser.py',104), - ('querystr',3,'p_querystr_attr_value','QueryParser.py',105), - ('querystr',1,'p_querystr_attr','QueryParser.py',110), - ('querystr',1,'p_querystr_value','QueryParser.py',115), - ('value',1,'p_value','QueryParser.py',121), - ('value',1,'p_quotedvalue','QueryParser.py',128), -] diff --git a/rwhoisd/yacc.py b/rwhoisd/yacc.py index b79d95f..b8f4772 100644 --- a/rwhoisd/yacc.py +++ b/rwhoisd/yacc.py @@ -1,39 +1,32 @@ #----------------------------------------------------------------------------- # ply: yacc.py # -# Author: David M. Beazley (beazley@cs.uchicago.edu) -# Department of Computer Science -# University of Chicago -# Chicago, IL 60637 +# Author(s): David M. Beazley (dave@dabeaz.com) # -# Copyright (C) 2001, David M. Beazley -# -# $Header: /home/davidb/src/cvs/python-rwhoisd/rwhoisd/yacc.py,v 1.1 2003/04/22 02:19:07 davidb Exp $ +# Copyright (C) 2001-2008, David M. Beazley # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# +# # See the file COPYING for a complete copy of the LGPL. # # # This implements an LR parser that is constructed from grammar rules defined -# as Python functions. Roughly speaking, this module is a cross between -# John Aycock's Spark system and the GNU bison utility. -# -# Disclaimer: This is a work in progress. SLR parsing seems to work fairly -# well and there is extensive error checking. LALR(1) is in progress. The -# rest of this file is a bit of a mess. Please pardon the dust. +# as Python functions. 
The grammer is specified by supplying the BNF inside +# Python documentation strings. The inspiration for this technique was borrowed +# from John Aycock's Spark parsing system. PLY might be viewed as cross between +# Spark and the GNU bison utility. # # The current implementation is only somewhat object-oriented. The # LR parser itself is defined in terms of an object (which allows multiple @@ -41,9 +34,24 @@ # construction are defined in terms of global variables. Users shouldn't # notice unless they are trying to define multiple parsers at the same # time using threads (in which case they should have their head examined). -#----------------------------------------------------------------------------- +# +# This implementation supports both SLR and LALR(1) parsing. LALR(1) +# support was originally implemented by Elias Ioup (ezioup@alumni.uchicago.edu), +# using the algorithm found in Aho, Sethi, and Ullman "Compilers: Principles, +# Techniques, and Tools" (The Dragon Book). LALR(1) has since been replaced +# by the more efficient DeRemer and Pennello algorithm. +# +# :::::::: WARNING ::::::: +# +# Construction of LR parsing tables is fairly complicated and expensive. +# To make this module run fast, a *LOT* of work has been put into +# optimization---often at the expensive of readability and what might +# consider to be good Python "coding style." Modify the code at your +# own risk! +# ---------------------------------------------------------------------------- -__version__ = "1.3" +__version__ = "2.5" +__tabversion__ = "2.4" # Table version #----------------------------------------------------------------------------- # === User configurable parameters === @@ -56,15 +64,32 @@ yaccdebug = 1 # Debugging mode. If set, yacc generates a debug_file = 'parser.out' # Default name of the debugging file tab_module = 'parsetab' # Default name of the table module -default_lr = 'SLR' # Default LR table generation method +default_lr = 'LALR' # Default LR table generation method error_count = 3 # Number of symbols that must be shifted to leave recovery mode +yaccdevel = 0 # Set to True if developing yacc. This turns off optimized + # implementations of certain functions. + import re, types, sys, cStringIO, md5, os.path # Exception raised for yacc-related errors class YaccError(Exception): pass +# Exception raised for errors raised in production rules +class SyntaxError(Exception): pass + + +# Available instance types. This is used when parsers are defined by a class. +# it's a little funky because I want to preserve backwards compatibility +# with Python 2.0 where types.ObjectType is undefined. + +try: + _INSTANCETYPE = (types.InstanceType, types.ObjectType) +except AttributeError: + _INSTANCETYPE = types.InstanceType + class object: pass # Note: needed if no new-style classes present + #----------------------------------------------------------------------------- # === LR Parsing Engine === # @@ -79,6 +104,8 @@ class YaccError(Exception): pass # .value = Symbol value # .lineno = Starting line number # .endlineno = Ending line number (optional, set automatically) +# .lexpos = Starting lex position +# .endlexpos = Ending lex position (optional, set automatically) class YaccSymbol: def __str__(self): return self.type @@ -90,19 +117,28 @@ class YaccSymbol: # The lineno() method returns the line number of a given # item (or 0 if not defined). The linespan() method returns # a tuple of (startline,endline) representing the range of lines -# for a symbol. +# for a symbol. 
The lexspan() method returns a tuple (lexpos,endlexpos) +# representing the range of positional information for a symbol. -class YaccSlice: - def __init__(self,s): +class YaccProduction: + def __init__(self,s,stack=None): self.slice = s - self.pbstack = [] - + self.stack = stack + self.lexer = None + self.parser= None def __getitem__(self,n): - return self.slice[n].value + if n >= 0: return self.slice[n].value + else: return self.stack[n].value def __setitem__(self,n,v): self.slice[n].value = v + def __getslice__(self,i,j): + return [s.value for s in self.slice[i:j]] + + def __len__(self): + return len(self.slice) + def lineno(self,n): return getattr(self.slice[n],"lineno",0) @@ -111,18 +147,22 @@ class YaccSlice: endline = getattr(self.slice[n],"endlineno",startline) return startline,endline - def pushback(self,n): - if n <= 0: - raise ValueError, "Expected a positive value" - if n > (len(self.slice)-1): - raise ValueError, "Can't push %d tokens. Only %d are available." % (n,len(self.slice)-1) - for i in range(0,n): - self.pbstack.append(self.slice[-i-1]) + def lexpos(self,n): + return getattr(self.slice[n],"lexpos",0) + + def lexspan(self,n): + startpos = getattr(self.slice[n],"lexpos",0) + endpos = getattr(self.slice[n],"endlexpos",startpos) + return startpos,endpos + + def error(self): + raise SyntaxError + # The LR Parsing engine. This is defined as a class so that multiple parsers # can exist in the same process. A user never instantiates this directly. # Instead, the global yacc() function should be used to create a suitable Parser -# object. +# object. class Parser: def __init__(self,magic=None): @@ -131,7 +171,7 @@ class Parser: # object directly. if magic != "xyzzy": - raise YaccError, "Can't instantiate Parser. Use yacc() instead." + raise YaccError, "Can't directly instantiate Parser. Use yacc() instead." # Reset internal state self.productions = None # List of productions @@ -141,57 +181,669 @@ class Parser: self.require = { } # Attribute require table self.method = "Unknown LR" # Table construction method used - def errok(self): - self.errorcount = 0 + def errok(self): + self.errorok = 1 + + def restart(self): + del self.statestack[:] + del self.symstack[:] + sym = YaccSymbol() + sym.type = '$end' + self.symstack.append(sym) + self.statestack.append(0) + + def parse(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): + if debug or yaccdevel: + return self.parsedebug(input,lexer,debug,tracking,tokenfunc) + elif tracking: + return self.parseopt(input,lexer,debug,tracking,tokenfunc) + else: + return self.parseopt_notrack(input,lexer,debug,tracking,tokenfunc) + + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parsedebug(). + # + # This is the debugging enabled version of parse(). All changes made to the + # parsing engine should be made here. For the non-debugging version, + # copy this code to a method parseopt() and delete all of the sections + # enclosed in: + # + # #--! DEBUG + # statements + # #--! DEBUG + # + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): + lookahead = None # Current lookahead symbol + lookaheadstack = [ ] # Stack of lookahead symbols + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) + prod = self.productions # Local reference to production list (to avoid lookup on self.) 
+ pslice = YaccProduction(None) # Production object passed to grammar rules + errorcount = 0 # Used during error recovery + endsym = "$end" # End symbol + # If no lexer was given, we will try to use the lex module + if not lexer: + import lex + lexer = lex.lexer + + # Set up the lexer and parser objects on pslice + pslice.lexer = lexer + pslice.parser = self + + # If input was supplied, pass to lexer + if input is not None: + lexer.input(input) + + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc + + # Set up the state and symbol stacks + + statestack = [ ] # Stack of parsing states + self.statestack = statestack + symstack = [ ] # Stack of grammar symbols + self.symstack = symstack + + pslice.stack = symstack # Put in the production + errtoken = None # Err token + + # The start state is assumed to be (0,$end) + + statestack.append(0) + sym = YaccSymbol() + sym.type = endsym + symstack.append(sym) + state = 0 + while 1: + # Get the next symbol on the input. If a lookahead symbol + # is already set, we just use that. Otherwise, we'll pull + # the next token off of the lookaheadstack or from the lexer + + # --! DEBUG + if debug > 1: + print 'state', state + # --! DEBUG + + if not lookahead: + if not lookaheadstack: + lookahead = get_token() # Get the next token + else: + lookahead = lookaheadstack.pop() + if not lookahead: + lookahead = YaccSymbol() + lookahead.type = endsym + + # --! DEBUG + if debug: + errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() + # --! DEBUG + + # Check the action table + ltype = lookahead.type + t = actions[state].get(ltype) + + # --! DEBUG + if debug > 1: + print 'action', t + # --! DEBUG + + if t is not None: + if t > 0: + # shift a symbol on the stack + if ltype is endsym: + # Error, end of input + sys.stderr.write("yacc: Parse error. EOF\n") + return + statestack.append(t) + state = t + + # --! DEBUG + if debug > 1: + sys.stderr.write("%-60s shift state %s\n" % (errorlead, t)) + # --! DEBUG + + symstack.append(lookahead) + lookahead = None + + # Decrease error count on successful shift + if errorcount: errorcount -=1 + continue + + if t < 0: + # reduce a symbol on the stack, emit a production + p = prod[-t] + pname = p.name + plen = p.len + + # Get production function + sym = YaccSymbol() + sym.type = pname # Production name + sym.value = None + + # --! DEBUG + if debug > 1: + sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t)) + # --! DEBUG + + if plen: + targ = symstack[-plen-1:] + targ[0] = sym + + # --! TRACKING + if tracking: + t1 = targ[1] + sym.lineno = t1.lineno + sym.lexpos = t1.lexpos + t1 = targ[-1] + sym.endlineno = getattr(t1,"endlineno",t1.lineno) + sym.endlexpos = getattr(t1,"endlexpos",t1.lexpos) + + # --! TRACKING + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + p.func(pslice) + del symstack[-plen:] + del statestack[-plen:] + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. 
Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + else: + + # --! TRACKING + if tracking: + sym.lineno = lexer.lineno + sym.lexpos = lexer.lexpos + # --! TRACKING + + targ = [ sym ] + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + p.func(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + if t == 0: + n = symstack[-1] + return getattr(n,"value",None) + + if t == None: + + # --! DEBUG + if debug: + sys.stderr.write(errorlead + "\n") + # --! DEBUG + + # We have some kind of parsing error here. To handle + # this, we are going to push the current token onto + # the tokenstack and replace it with an 'error' token. + # If there are any synchronization rules, they may + # catch it. + # + # In addition to pushing the error token, we call call + # the user defined p_error() function if this is the + # first syntax error. This function is only called if + # errorcount == 0. + if errorcount == 0 or self.errorok: + errorcount = error_count + self.errorok = 0 + errtoken = lookahead + if errtoken.type is endsym: + errtoken = None # End of file! + if self.errorfunc: + global errok,token,restart + errok = self.errok # Set some special functions available in error recovery + token = get_token + restart = self.restart + tok = self.errorfunc(errtoken) + del errok, token, restart # Delete special functions + + if self.errorok: + # User must have done some kind of panic + # mode recovery on their own. The + # returned token is the next lookahead + lookahead = tok + errtoken = None + continue + else: + if errtoken: + if hasattr(errtoken,"lineno"): lineno = lookahead.lineno + else: lineno = 0 + if lineno: + sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) + else: + sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) + else: + sys.stderr.write("yacc: Parse error in input. EOF\n") + return + + else: + errorcount = error_count + + # case 1: the statestack only has 1 entry on it. If we're in this state, the + # entire parse has been rolled back and we're completely hosed. The token is + # discarded and we just keep going. + + if len(statestack) <= 1 and lookahead.type is not endsym: + lookahead = None + errtoken = None + state = 0 + # Nuke the pushback stack + del lookaheadstack[:] + continue + + # case 2: the statestack has a couple of entries on it, but we're + # at the end of the file. nuke the top entry and generate an error token + + # Start nuking entries on the stack + if lookahead.type is endsym: + # Whoa. We're really hosed here. Bail out + return + + if lookahead.type != 'error': + sym = symstack[-1] + if sym.type == 'error': + # Hmmm. 
Error is on top of stack, we'll just nuke input + # symbol and continue + lookahead = None + continue + t = YaccSymbol() + t.type = 'error' + if hasattr(lookahead,"lineno"): + t.lineno = lookahead.lineno + t.value = lookahead + lookaheadstack.append(lookahead) + lookahead = t + else: + symstack.pop() + statestack.pop() + state = statestack[-1] # Potential bug fix + + continue + + # Call an error function here + raise RuntimeError, "yacc: internal parser error!!!\n" + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parseopt(). + # + # Optimized version of parse() method. DO NOT EDIT THIS CODE DIRECTLY. + # Edit the debug version above, then copy any modifications to the method + # below while removing #--! DEBUG sections. + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + + def parseopt(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): + lookahead = None # Current lookahead symbol + lookaheadstack = [ ] # Stack of lookahead symbols + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) + prod = self.productions # Local reference to production list (to avoid lookup on self.) + pslice = YaccProduction(None) # Production object passed to grammar rules + errorcount = 0 # Used during error recovery + + # If no lexer was given, we will try to use the lex module + if not lexer: + import lex + lexer = lex.lexer + + # Set up the lexer and parser objects on pslice + pslice.lexer = lexer + pslice.parser = self + + # If input was supplied, pass to lexer + if input is not None: + lexer.input(input) + + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc + + # Set up the state and symbol stacks + + statestack = [ ] # Stack of parsing states + self.statestack = statestack + symstack = [ ] # Stack of grammar symbols + self.symstack = symstack + + pslice.stack = symstack # Put in the production + errtoken = None # Err token + + # The start state is assumed to be (0,$end) + + statestack.append(0) + sym = YaccSymbol() + sym.type = '$end' + symstack.append(sym) + state = 0 + while 1: + # Get the next symbol on the input. If a lookahead symbol + # is already set, we just use that. Otherwise, we'll pull + # the next token off of the lookaheadstack or from the lexer + + if not lookahead: + if not lookaheadstack: + lookahead = get_token() # Get the next token + else: + lookahead = lookaheadstack.pop() + if not lookahead: + lookahead = YaccSymbol() + lookahead.type = '$end' + + # Check the action table + ltype = lookahead.type + t = actions[state].get(ltype) + + if t is not None: + if t > 0: + # shift a symbol on the stack + if ltype == '$end': + # Error, end of input + sys.stderr.write("yacc: Parse error. EOF\n") + return + statestack.append(t) + state = t + + symstack.append(lookahead) + lookahead = None + + # Decrease error count on successful shift + if errorcount: errorcount -=1 + continue + + if t < 0: + # reduce a symbol on the stack, emit a production + p = prod[-t] + pname = p.name + plen = p.len + + # Get production function + sym = YaccSymbol() + sym.type = pname # Production name + sym.value = None + + if plen: + targ = symstack[-plen-1:] + targ[0] = sym + + # --! 
TRACKING + if tracking: + t1 = targ[1] + sym.lineno = t1.lineno + sym.lexpos = t1.lexpos + t1 = targ[-1] + sym.endlineno = getattr(t1,"endlineno",t1.lineno) + sym.endlexpos = getattr(t1,"endlexpos",t1.lexpos) + + # --! TRACKING + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + p.func(pslice) + del symstack[-plen:] + del statestack[-plen:] + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + else: + + # --! TRACKING + if tracking: + sym.lineno = lexer.lineno + sym.lexpos = lexer.lexpos + # --! TRACKING + + targ = [ sym ] + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + p.func(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + if t == 0: + n = symstack[-1] + return getattr(n,"value",None) + + if t == None: + + # We have some kind of parsing error here. To handle + # this, we are going to push the current token onto + # the tokenstack and replace it with an 'error' token. + # If there are any synchronization rules, they may + # catch it. + # + # In addition to pushing the error token, we call call + # the user defined p_error() function if this is the + # first syntax error. This function is only called if + # errorcount == 0. + if errorcount == 0 or self.errorok: + errorcount = error_count + self.errorok = 0 + errtoken = lookahead + if errtoken.type == '$end': + errtoken = None # End of file! + if self.errorfunc: + global errok,token,restart + errok = self.errok # Set some special functions available in error recovery + token = get_token + restart = self.restart + tok = self.errorfunc(errtoken) + del errok, token, restart # Delete special functions + + if self.errorok: + # User must have done some kind of panic + # mode recovery on their own. The + # returned token is the next lookahead + lookahead = tok + errtoken = None + continue + else: + if errtoken: + if hasattr(errtoken,"lineno"): lineno = lookahead.lineno + else: lineno = 0 + if lineno: + sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) + else: + sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) + else: + sys.stderr.write("yacc: Parse error in input. EOF\n") + return + + else: + errorcount = error_count + + # case 1: the statestack only has 1 entry on it. If we're in this state, the + # entire parse has been rolled back and we're completely hosed. 
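
During recovery the engine temporarily exports errok(), token() and restart() as module globals (the "global errok,token,restart" dance above), which is what a user-supplied p_error() hook gets to use. A rough sketch of a panic-mode handler built on that protocol; the SEMI token is hypothetical:

import sys

def p_error(tok):
    # errok() and token() are only bound while the engine is inside this callback.
    import yacc
    if tok is None:
        sys.stderr.write("parse error: unexpected end of input\n")
        return
    sys.stderr.write("parse error near %r\n" % (tok.value,))
    # Panic mode: discard tokens up to a statement separator, then resume.
    while 1:
        nxt = yacc.token()
        if nxt is None or nxt.type == 'SEMI':
            break
    yacc.errok()
    return nxt            # handed back to the engine as the new lookahead
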
The token is + # discarded and we just keep going. + + if len(statestack) <= 1 and lookahead.type != '$end': + lookahead = None + errtoken = None + state = 0 + # Nuke the pushback stack + del lookaheadstack[:] + continue + + # case 2: the statestack has a couple of entries on it, but we're + # at the end of the file. nuke the top entry and generate an error token + + # Start nuking entries on the stack + if lookahead.type == '$end': + # Whoa. We're really hosed here. Bail out + return + + if lookahead.type != 'error': + sym = symstack[-1] + if sym.type == 'error': + # Hmmm. Error is on top of stack, we'll just nuke input + # symbol and continue + lookahead = None + continue + t = YaccSymbol() + t.type = 'error' + if hasattr(lookahead,"lineno"): + t.lineno = lookahead.lineno + t.value = lookahead + lookaheadstack.append(lookahead) + lookahead = t + else: + symstack.pop() + statestack.pop() + state = statestack[-1] # Potential bug fix + + continue + + # Call an error function here + raise RuntimeError, "yacc: internal parser error!!!\n" + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parseopt_notrack(). + # + # Optimized version of parseopt() with line number tracking removed. + # DO NOT EDIT THIS CODE DIRECTLY. Copy the optimized version and remove + # code in the #--! TRACKING sections + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - def restart(self): - del self.statestack[:] - del self.symstack[:] - sym = YaccSymbol() - sym.type = '$' - self.symstack.append(sym) - self.statestack.append(0) - - def parse(self,input=None,lexer=None,debug=0): + def parseopt_notrack(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): lookahead = None # Current lookahead symbol lookaheadstack = [ ] # Stack of lookahead symbols - actions = self.action # Local reference to action table - goto = self.goto # Local reference to goto table - prod = self.productions # Local reference to production list - pslice = YaccSlice(None) # Slice object passed to grammar rules - pslice.parser = self # Parser object - self.errorcount = 0 # Used during error recovery + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) + prod = self.productions # Local reference to production list (to avoid lookup on self.) + pslice = YaccProduction(None) # Production object passed to grammar rules + errorcount = 0 # Used during error recovery # If no lexer was given, we will try to use the lex module if not lexer: - import lex as lexer + import lex + lexer = lex.lexer + # Set up the lexer and parser objects on pslice pslice.lexer = lexer - + pslice.parser = self + # If input was supplied, pass to lexer - if input: + if input is not None: lexer.input(input) - # Tokenize function - get_token = lexer.token + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc + + # Set up the state and symbol stacks statestack = [ ] # Stack of parsing states self.statestack = statestack symstack = [ ] # Stack of grammar symbols self.symstack = symstack + pslice.stack = symstack # Put in the production errtoken = None # Err token - # The start state is assumed to be (0,$) + # The start state is assumed to be (0,$end) + statestack.append(0) sym = YaccSymbol() - sym.type = '$' + sym.type = '$end' symstack.append(sym) - + state = 0 while 1: # Get the next symbol on the input. 
If a lookahead symbol # is already set, we just use that. Otherwise, we'll pull # the next token off of the lookaheadstack or from the lexer + if not lookahead: if not lookaheadstack: lookahead = get_token() # Get the next token @@ -199,32 +851,29 @@ class Parser: lookahead = lookaheadstack.pop() if not lookahead: lookahead = YaccSymbol() - lookahead.type = '$' - if debug: - print "%-20s : %s" % (lookahead, [xx.type for xx in symstack]) + lookahead.type = '$end' # Check the action table - s = statestack[-1] ltype = lookahead.type - t = actions.get((s,ltype),None) + t = actions[state].get(ltype) if t is not None: if t > 0: # shift a symbol on the stack - if ltype == '$': + if ltype == '$end': # Error, end of input - print "yacc: Parse error. EOF" + sys.stderr.write("yacc: Parse error. EOF\n") return statestack.append(t) + state = t + symstack.append(lookahead) lookahead = None # Decrease error count on successful shift - if self.errorcount > 0: - self.errorcount -= 1 - + if errorcount: errorcount -=1 continue - + if t < 0: # reduce a symbol on the stack, emit a production p = prod[-t] @@ -239,73 +888,86 @@ class Parser: if plen: targ = symstack[-plen-1:] targ[0] = sym + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + try: - sym.lineno = targ[1].lineno - sym.endlineno = getattr(targ[-1],"endlineno",targ[-1].lineno) - except AttributeError: - sym.lineno = 0 - del symstack[-plen:] - del statestack[-plen:] + # Call the grammar rule with our special slice object + p.func(pslice) + del symstack[-plen:] + del statestack[-plen:] + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + else: - sym.lineno = 0 + targ = [ sym ] - pslice.slice = targ - pslice.pbstack = [] - # Call the grammar rule with our special slice object - p.func(pslice) - - # Validate attributes of the resulting value attribute -# if require: -# try: -# t0 = targ[0] -# r = Requires.get(t0.type,None) -# t0d = t0.__dict__ -# if r: -# for field in r: -# tn = t0 -# for fname in field: -# try: -# tf = tn.__dict__ -# tn = tf.get(fname) -# except StandardError: -# tn = None -# if not tn: -# print "%s:%d: Rule %s doesn't set required attribute '%s'" % \ -# (p.file,p.line,p.name,".".join(field)) -# except TypeError,LookupError: -# print "Bad requires directive " % r -# pass - - - # If there was a pushback, put that on the stack - if pslice.pbstack: - lookaheadstack.append(lookahead) - for _t in pslice.pbstack: - lookaheadstack.append(_t) - lookahead = None - symstack.append(sym) - statestack.append(goto[statestack[-1],pname]) - continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + p.func(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. 
Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! if t == 0: n = symstack[-1] return getattr(n,"value",None) if t == None: - # We have some kind of parsing error here. To handle this, - # we are going to push the current token onto the tokenstack - # and replace it with an 'error' token. If there are any synchronization - # rules, they may catch it. - # - # In addition to pushing the error token, we call call the user defined p_error() - # function if this is the first syntax error. This function is only called - # if errorcount == 0. - if not self.errorcount: - self.errorcount = error_count + # We have some kind of parsing error here. To handle + # this, we are going to push the current token onto + # the tokenstack and replace it with an 'error' token. + # If there are any synchronization rules, they may + # catch it. + # + # In addition to pushing the error token, we call call + # the user defined p_error() function if this is the + # first syntax error. This function is only called if + # errorcount == 0. + if errorcount == 0 or self.errorok: + errorcount = error_count + self.errorok = 0 errtoken = lookahead - if errtoken.type == '$': + if errtoken.type == '$end': errtoken = None # End of file! if self.errorfunc: global errok,token,restart @@ -314,10 +976,11 @@ class Parser: restart = self.restart tok = self.errorfunc(errtoken) del errok, token, restart # Delete special functions - - if not self.errorcount: - # User must have done some kind of panic mode recovery on their own. The returned token - # is the next lookahead + + if self.errorok: + # User must have done some kind of panic + # mode recovery on their own. The + # returned token is the next lookahead lookahead = tok errtoken = None continue @@ -326,23 +989,24 @@ class Parser: if hasattr(errtoken,"lineno"): lineno = lookahead.lineno else: lineno = 0 if lineno: - print "yacc: Syntax error at line %d, token=%s" % (lineno, errtoken.type) + sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) else: - print "yacc: Syntax error, token=%s" % errtoken.type + sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) else: - print "yacc: Parse error in input. EOF" + sys.stderr.write("yacc: Parse error in input. EOF\n") return else: - self.errorcount = error_count - + errorcount = error_count + # case 1: the statestack only has 1 entry on it. If we're in this state, the # entire parse has been rolled back and we're completely hosed. The token is # discarded and we just keep going. - if len(statestack) <= 1 and lookahead.type != '$': + if len(statestack) <= 1 and lookahead.type != '$end': lookahead = None errtoken = None + state = 0 # Nuke the pushback stack del lookaheadstack[:] continue @@ -351,9 +1015,9 @@ class Parser: # at the end of the file. nuke the top entry and generate an error token # Start nuking entries on the stack - if lookahead.type == '$': + if lookahead.type == '$end': # Whoa. We're really hosed here. 
Bail out - return + return if lookahead.type != 'error': sym = symstack[-1] @@ -372,12 +1036,14 @@ class Parser: else: symstack.pop() statestack.pop() + state = statestack[-1] # Potential bug fix continue # Call an error function here raise RuntimeError, "yacc: internal parser error!!!\n" + # ----------------------------------------------------------------------------- # === Parser Construction === # @@ -388,7 +1054,7 @@ class Parser: # is completely self contained--meaning that it is safe to repeatedly # call yacc() with different grammars in the same application. # ----------------------------------------------------------------------------- - + # ----------------------------------------------------------------------------- # validate_file() # @@ -424,24 +1090,24 @@ def validate_file(filename): if not prev: counthash[name] = linen else: - print "%s:%d: Function %s redefined. Previously defined on line %d" % (filename,linen,name,prev) + sys.stderr.write("%s:%d: Function %s redefined. Previously defined on line %d\n" % (filename,linen,name,prev)) noerror = 0 linen += 1 return noerror # This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. def validate_dict(d): - for n,v in d.items(): - if n[0:2] == 'p_' and isinstance(v,types.FunctionType): continue + for n,v in d.items(): + if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue if n[0:2] == 't_': continue if n[0:2] == 'p_': - print "yacc: Warning. '%s' not defined as a function" % n - if isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: + sys.stderr.write("yacc: Warning. '%s' not defined as a function\n" % n) + if 1 and isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: try: doc = v.__doc__.split(" ") if doc[1] == ':': - print "%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix." % (v.func_code.co_filename, v.func_code.co_firstlineno,n) + sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (v.func_code.co_filename, v.func_code.co_firstlineno,n)) except StandardError: pass @@ -454,17 +1120,17 @@ def validate_dict(d): # Initialize all of the global variables used during grammar construction def initialize_vars(): - global Productions, Prodnames, Prodmap, Terminals - global Nonterminals, First, Follow, Precedence, LRitems + global Productions, Prodnames, Prodmap, Terminals + global Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems global Errorfunc, Signature, Requires Productions = [None] # A list of all of the productions. The first # entry is always reserved for the purpose of # building an augmented grammar - + Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all # productions of that nonterminal. - + Prodmap = { } # A dictionary that is only used to detect duplicate # productions. @@ -475,12 +1141,16 @@ def initialize_vars(): # of rule numbers where they are used. First = { } # A dictionary of precomputed FIRST(x) symbols - + Follow = { } # A dictionary of precomputed FOLLOW(x) symbols Precedence = { } # Precedence rules for each terminal. Contains tuples of the # form ('right',level) or ('nonassoc', level) or ('left',level) + UsedPrecedence = { } # Precedence rules that were actually used by the grammer. + # This is only used to provide error checking and to generate + # a warning about unused precedence rules. + LRitems = [ ] # A list of all LR items for the grammar. These are the # productions with the "dot" like E -> E . 
PLUS E @@ -490,6 +1160,8 @@ def initialize_vars(): # and other information. Used to determined when a # parsing table needs to be regenerated. + Signature.update(__tabversion__) + Requires = { } # Requires list # File objects used when creating the parser.out debugging file @@ -515,8 +1187,9 @@ def initialize_vars(): # func - Action function # prec - Precedence level # lr_next - Next LR item. Example, if we are ' E -> E . PLUS E' -# then lr_next refers to 'E -> E PLUS . E' +# then lr_next refers to 'E -> E PLUS . E' # lr_index - LR item index (location of the ".") in the prod list. +# lookaheads - LALR lookahead symbols for this item # len - Length of the production (number of symbols on right hand side) # ----------------------------------------------------------------------------- @@ -526,8 +1199,12 @@ class Production: setattr(self,k,v) self.lr_index = -1 self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure + self.lr1_added = 0 # Flag indicating whether or not added to LR1 self.usyms = [ ] - + self.lookaheads = { } + self.lk_added = { } + self.setnumbers = [ ] + def __str__(self): if self.prod: s = "%s -> %s" % (self.name," ".join(self.prod)) @@ -546,6 +1223,8 @@ class Production: p.prod = list(self.prod) p.number = self.number p.lr_index = n + p.lookaheads = { } + p.setnumbers = self.setnumbers p.prod.insert(n,".") p.prod = tuple(p.prod) p.len = len(p.prod) @@ -566,11 +1245,8 @@ class Production: class MiniProduction: pass -# Utility function -def is_identifier(s): - for c in s: - if not (c.isalnum() or c == '_'): return 0 - return 1 +# regex matching identifiers +_is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$') # ----------------------------------------------------------------------------- # add_production() @@ -586,33 +1262,46 @@ def is_identifier(s): # | productionn # name2 ::= production1 # | production2 -# ... +# ... # ----------------------------------------------------------------------------- def add_production(f,file,line,prodname,syms): - + if Terminals.has_key(prodname): - print "%s:%d: Illegal rule name '%s'. Already defined as a token." % (file,line,prodname) + sys.stderr.write("%s:%d: Illegal rule name '%s'. Already defined as a token.\n" % (file,line,prodname)) return -1 if prodname == 'error': - print "%s:%d: Illegal rule name '%s'. error is a reserved word." % (file,line,prodname) + sys.stderr.write("%s:%d: Illegal rule name '%s'. error is a reserved word.\n" % (file,line,prodname)) return -1 - - if not is_identifier(prodname): - print "%s:%d: Illegal rule name '%s'" % (file,line,prodname) + + if not _is_identifier.match(prodname): + sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) return -1 - for s in syms: - if not is_identifier(s) and s != '%prec': - print "%s:%d: Illegal name '%s' in rule '%s'" % (file,line,s, prodname) + for x in range(len(syms)): + s = syms[x] + if s[0] in "'\"": + try: + c = eval(s) + if (len(c) > 1): + sys.stderr.write("%s:%d: Literal token %s in rule '%s' may only be a single character\n" % (file,line,s, prodname)) + return -1 + if not Terminals.has_key(c): + Terminals[c] = [] + syms[x] = c + continue + except SyntaxError: + pass + if not _is_identifier.match(s) and s != '%prec': + sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) return -1 # See if the rule is already in the rulemap map = "%s -> %s" % (prodname,syms) if Prodmap.has_key(map): m = Prodmap[map] - print "%s:%d: Duplicate rule %s." 
% (file,line, m) - print "%s:%d: Previous definition at %s:%d" % (file,line, m.file, m.line) + sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m)) + sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line)) return -1 p = Production() @@ -623,12 +1312,12 @@ def add_production(f,file,line,prodname,syms): p.func = f p.number = len(Productions) - + Productions.append(p) Prodmap[map] = p if not Nonterminals.has_key(prodname): Nonterminals[prodname] = [ ] - + # Add all terminals to Terminals i = 0 while i < len(p.prod): @@ -637,15 +1326,16 @@ def add_production(f,file,line,prodname,syms): try: precname = p.prod[i+1] except IndexError: - print "%s:%d: Syntax error. Nothing follows %%prec." % (p.file,p.line) + sys.stderr.write("%s:%d: Syntax error. Nothing follows %%prec.\n" % (p.file,p.line)) return -1 prec = Precedence.get(precname,None) if not prec: - print "%s:%d: Nothing known about the precedence of '%s'" % (p.file,p.line,precname) + sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname)) return -1 else: p.prec = prec + UsedPrecedence[precname] = 1 del p.prod[i] del p.prod[i] continue @@ -663,7 +1353,7 @@ def add_production(f,file,line,prodname,syms): if not hasattr(p,"prec"): p.prec = ('right',0) - + # Set final length of productions p.len = len(p.prod) p.prod = tuple(p.prod) @@ -673,7 +1363,7 @@ def add_production(f,file,line,prodname,syms): for s in p.prod: if s not in p.usyms: p.usyms.append(s) - + # Add to the global productions list try: Prodnames[p.name].append(p) @@ -688,15 +1378,20 @@ def add_function(f): line = f.func_code.co_firstlineno file = f.func_code.co_filename error = 0 - - if f.func_code.co_argcount > 1: - print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) + + if isinstance(f,types.MethodType): + reqdargs = 2 + else: + reqdargs = 1 + + if f.func_code.co_argcount > reqdargs: + sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__)) return -1 - if f.func_code.co_argcount < 1: - print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) + if f.func_code.co_argcount < reqdargs: + sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__)) return -1 - + if f.__doc__: # Split the doc string into lines pstrings = f.__doc__.splitlines() @@ -710,7 +1405,7 @@ def add_function(f): if p[0] == '|': # This is a continuation of a previous rule if not lastp: - print "%s:%d: Misplaced '|'." % (file,dline) + sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline)) return -1 prodname = lastp if len(p) > 1: @@ -726,15 +1421,19 @@ def add_function(f): else: syms = [ ] if assign != ':' and assign != '::=': - print "%s:%d: Syntax error. Expected ':'" % (file,dline) + sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline)) return -1 + + e = add_production(f,file,dline,prodname,syms) error += e + + except StandardError: - print "%s:%d: Syntax error in rule '%s'" % (file,dline,ps) + sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps)) error -= 1 else: - print "%s:%d: No documentation string specified in function '%s'" % (file,line,f.__name__) + sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__)) return error @@ -754,7 +1453,7 @@ def compute_reachable(): for s in Nonterminals.keys(): if not Reachable[s]: - print "yacc: Symbol '%s' is unreachable." 
% s + sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s) def mark_reachable_from(s, Reachable): ''' @@ -785,7 +1484,7 @@ def compute_terminates(): for t in Terminals.keys(): Terminates[t] = 1 - Terminates['$'] = 1 + Terminates['$end'] = 1 # Nonterminals: @@ -831,7 +1530,7 @@ def compute_terminates(): # so it would be overkill to say that it's also non-terminating. pass else: - print "yacc: Infinite recursion detected for symbol '%s'." % s + sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s) some_error = 1 return some_error @@ -848,17 +1547,17 @@ def verify_productions(cycle_check=1): for s in p.prod: if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': - print "%s:%d: Symbol '%s' used, but not defined as a token or a rule." % (p.file,p.line,s) + sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s)) error = 1 continue - unused_tok = 0 + unused_tok = 0 # Now verify all of the tokens if yaccdebug: _vf.write("Unused terminals:\n\n") for s,v in Terminals.items(): if s != 'error' and not v: - print "yacc: Warning. Token '%s' defined, but not used." % s + sys.stderr.write("yacc: Warning. Token '%s' defined, but not used.\n" % s) if yaccdebug: _vf.write(" %s\n"% s) unused_tok += 1 @@ -867,25 +1566,25 @@ def verify_productions(cycle_check=1): _vf.write("\nGrammar\n\n") for i in range(1,len(Productions)): _vf.write("Rule %-5d %s\n" % (i, Productions[i])) - + unused_prod = 0 # Verify the use of all productions for s,v in Nonterminals.items(): if not v: p = Prodnames[s][0] - print "%s:%d: Warning. Rule '%s' defined, but not used." % (p.file,p.line, s) + sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s)) unused_prod += 1 - + if unused_tok == 1: - print "yacc: Warning. There is 1 unused token." + sys.stderr.write("yacc: Warning. There is 1 unused token.\n") if unused_tok > 1: - print "yacc: Warning. There are %d unused tokens." % unused_tok + sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok) if unused_prod == 1: - print "yacc: Warning. There is 1 unused rule." + sys.stderr.write("yacc: Warning. There is 1 unused rule.\n") if unused_prod > 1: - print "yacc: Warning. There are %d unused rules." % unused_prod + sys.stderr.write("yacc: Warning. There are %d unused rules.\n" % unused_prod) if yaccdebug: _vf.write("\nTerminals, with rules where they appear\n\n") @@ -917,7 +1616,7 @@ def verify_productions(cycle_check=1): # # Creates the list # -# [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] +# [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] # ----------------------------------------------------------------------------- def build_lritems(): @@ -954,20 +1653,35 @@ def add_precedence(plist): prec = p[0] terms = p[1:] if prec != 'left' and prec != 'right' and prec != 'nonassoc': - print "yacc: Invalid precedence '%s'" % prec + sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec) return -1 for t in terms: if Precedence.has_key(t): - print "yacc: Precedence already specified for terminal '%s'" % t + sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t) error += 1 continue Precedence[t] = (prec,plevel) except: - print "yacc: Invalid precedence table." 
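
For orientation: the rule-registration code above (add_function()/add_production()) pulls the grammar out of p_* docstrings, splitting alternatives on '|' and, new in this version, accepting quoted single characters as literal terminals. A hedged sketch of a rule written in that style (NAME is a hypothetical token; the quoted characters are assumed to be declared as lexer literals):

def p_statement(p):
    '''statement : expression ';'
                 | NAME '=' expression ';' '''
    # add_function() splits this docstring on '|' and hands each alternative
    # to add_production(); the quoted ';' and '=' become one-character terminals.
    if len(p) == 3:
        p[0] = ('expr', p[1])
    else:
        p[0] = ('assign', p[1], p[3])
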
+ sys.stderr.write("yacc: Invalid precedence table.\n") error += 1 return error +# ----------------------------------------------------------------------------- +# check_precedence() +# +# Checks the use of the Precedence tables. This makes sure all of the symbols +# are terminals or were used with %prec +# ----------------------------------------------------------------------------- + +def check_precedence(): + error = 0 + for precname in Precedence.keys(): + if not (Terminals.has_key(precname) or UsedPrecedence.has_key(precname)): + sys.stderr.write("yacc: Precedence rule '%s' defined for unknown symbol '%s'\n" % (Precedence[precname][0],precname)) + error += 1 + return error + # ----------------------------------------------------------------------------- # augment_grammar() # @@ -1026,15 +1740,15 @@ def first(beta): # that might follow it. Dragon book, p. 189. def compute_follow(start=None): - # Add '$' to the follow list of the start symbol + # Add '$end' to the follow list of the start symbol for k in Nonterminals.keys(): Follow[k] = [ ] if not start: start = Productions[1].name - - Follow[start] = [ '$' ] - + + Follow[start] = [ '$end' ] + while 1: didadd = 0 for p in Productions[1:]: @@ -1075,7 +1789,7 @@ def compute_first1(): for t in Terminals.keys(): First[t] = [t] - First['$'] = ['$'] + First['$end'] = ['$end'] First['#'] = ['#'] # what's this for? # Nonterminals: @@ -1112,12 +1826,14 @@ def compute_first1(): # Global variables for the LR parsing engine def lr_init_vars(): global _lr_action, _lr_goto, _lr_method - global _lr_goto_cache - + global _lr_goto_cache, _lr0_cidhash + _lr_action = { } # Action table _lr_goto = { } # Goto table _lr_method = "Unknown" # LR method used _lr_goto_cache = { } + _lr0_cidhash = { } + # Compute the LR(0) closure operation on I, where I is a set of LR(0) items. # prodlist is a list of productions. 
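
compute_first1() and compute_follow() above are the standard Dragon-book fixed points over the module's grammar tables. For intuition only, here is a tiny standalone FIRST-set computation on a toy grammar with no empty productions; it does not use this module's data structures:

# Toy grammar:  E -> E '+' T | T      T -> '(' E ')' | 'id'
toy_grammar = {
    'E': [['E', '+', 'T'], ['T']],
    'T': [['(', 'E', ')'], ['id']],
}
toy_terminals = ['+', '(', ')', 'id']

def toy_first_sets(grammar, terminals):
    first = dict((t, set([t])) for t in terminals)
    for nt in grammar:
        first[nt] = set()
    changed = 1
    while changed:
        changed = 0
        for nt, prods in grammar.items():
            for prod in prods:
                # With no empty productions, FIRST of a production is just
                # FIRST of its leading symbol.
                added = first[prod[0]] - first[nt]
                if added:
                    first[nt] |= added
                    changed = 1
    return first

print(toy_first_sets(toy_grammar, toy_terminals))
# FIRST(E) == FIRST(T) == set(['(', 'id'])
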
@@ -1126,11 +1842,11 @@ _add_count = 0 # Counter used to detect cycles def lr0_closure(I): global _add_count - + _add_count += 1 prodlist = Productions - - # Add everything in I to J + + # Add everything in I to J J = I[:] didadd = 1 while didadd: @@ -1142,7 +1858,7 @@ def lr0_closure(I): J.append(x.lr_next) x.lr0_added = _add_count didadd = 1 - + return J # Compute the LR(0) goto function goto(I,X) where I is a set @@ -1159,7 +1875,7 @@ def lr0_goto(I,x): # Now we generate the goto set in a way that guarantees uniqueness # of the result - + s = _lr_goto_cache.get(x,None) if not s: s = { } @@ -1175,30 +1891,21 @@ def lr0_goto(I,x): s[id(n)] = s1 gs.append(n) s = s1 - g = s.get('$',None) + g = s.get('$end',None) if not g: if gs: g = lr0_closure(gs) - s['$'] = g + s['$end'] = g else: - s['$'] = gs + s['$end'] = gs _lr_goto_cache[(id(I),x)] = g return g -# Compute the kernel of a set of LR(0) items -def lr0_kernel(I): - KI = [ ] - for p in I: - if p.name == "S'" or p.lr_index > 0 or p.len == 0: - KI.append(p) - - return KI - _lr0_cidhash = { } # Compute the LR(0) sets of item function def lr0_items(): - + C = [ lr0_closure([Productions[0].lr_next]) ] i = 0 for I in C: @@ -1221,42 +1928,392 @@ def lr0_items(): g = lr0_goto(I,x) if not g: continue if _lr0_cidhash.has_key(id(g)): continue - _lr0_cidhash[id(g)] = len(C) + _lr0_cidhash[id(g)] = len(C) C.append(g) - + return C # ----------------------------------------------------------------------------- -# slr_parse_table() +# ==== LALR(1) Parsing ==== +# +# LALR(1) parsing is almost exactly the same as SLR except that instead of +# relying upon Follow() sets when performing reductions, a more selective +# lookahead set that incorporates the state of the LR(0) machine is utilized. +# Thus, we mainly just have to focus on calculating the lookahead sets. +# +# The method used here is due to DeRemer and Pennelo (1982). +# +# DeRemer, F. L., and T. J. Pennelo: "Efficient Computation of LALR(1) +# Lookahead Sets", ACM Transactions on Programming Languages and Systems, +# Vol. 4, No. 4, Oct. 1982, pp. 615-649 +# +# Further details can also be found in: +# +# J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing", +# McGraw-Hill Book Company, (1985). +# +# Note: This implementation is a complete replacement of the LALR(1) +# implementation in PLY-1.x releases. That version was based on +# a less efficient algorithm and it had bugs in its implementation. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# compute_nullable_nonterminals() +# +# Creates a dictionary containing all of the non-terminals that might produce +# an empty production. +# ----------------------------------------------------------------------------- + +def compute_nullable_nonterminals(): + nullable = {} + num_nullable = 0 + while 1: + for p in Productions[1:]: + if p.len == 0: + nullable[p.name] = 1 + continue + for t in p.prod: + if not nullable.has_key(t): break + else: + nullable[p.name] = 1 + if len(nullable) == num_nullable: break + num_nullable = len(nullable) + return nullable + +# ----------------------------------------------------------------------------- +# find_nonterminal_trans(C) +# +# Given a set of LR(0) items, this functions finds all of the non-terminal +# transitions. These are transitions in which a dot appears immediately before +# a non-terminal. 
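
compute_nullable_nonterminals() above is another small fixed point. An equivalent standalone sketch on a toy grammar, where an empty production list entry denotes epsilon:

def toy_nullable(grammar):
    nullable = set()
    changed = 1
    while changed:
        changed = 0
        for nt, prods in grammar.items():
            if nt in nullable:
                continue
            for prod in prods:
                # An empty production, or one made only of nullable symbols,
                # makes the nonterminal nullable.
                if all(s in nullable for s in prod):
                    nullable.add(nt)
                    changed = 1
                    break
    return nullable

print(toy_nullable({'A': [['B', 'C']], 'B': [[]], 'C': [['x'], []]}))
# -> set(['A', 'B', 'C'])
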
Returns a list of tuples of the form (state,N) where state +# is the state number and N is the nonterminal symbol. +# +# The input C is the set of LR(0) items. +# ----------------------------------------------------------------------------- + +def find_nonterminal_transitions(C): + trans = [] + for state in range(len(C)): + for p in C[state]: + if p.lr_index < p.len - 1: + t = (state,p.prod[p.lr_index+1]) + if Nonterminals.has_key(t[1]): + if t not in trans: trans.append(t) + state = state + 1 + return trans + +# ----------------------------------------------------------------------------- +# dr_relation() +# +# Computes the DR(p,A) relationships for non-terminal transitions. The input +# is a tuple (state,N) where state is a number and N is a nonterminal symbol. +# +# Returns a list of terminals. +# ----------------------------------------------------------------------------- + +def dr_relation(C,trans,nullable): + dr_set = { } + state,N = trans + terms = [] + + g = lr0_goto(C[state],N) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index+1] + if Terminals.has_key(a): + if a not in terms: terms.append(a) + + # This extra bit is to handle the start state + if state == 0 and N == Productions[0].prod[0]: + terms.append('$end') + + return terms + +# ----------------------------------------------------------------------------- +# reads_relation() +# +# Computes the READS() relation (p,A) READS (t,C). +# ----------------------------------------------------------------------------- + +def reads_relation(C, trans, empty): + # Look for empty transitions + rel = [] + state, N = trans + + g = lr0_goto(C[state],N) + j = _lr0_cidhash.get(id(g),-1) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index + 1] + if empty.has_key(a): + rel.append((j,a)) + + return rel + +# ----------------------------------------------------------------------------- +# compute_lookback_includes() +# +# Determines the lookback and includes relations +# +# LOOKBACK: +# +# This relation is determined by running the LR(0) state machine forward. +# For example, starting with a production "N : . A B C", we run it forward +# to obtain "N : A B C ." We then build a relationship between this final +# state and the starting state. These relationships are stored in a dictionary +# lookdict. +# +# INCLUDES: +# +# Computes the INCLUDE() relation (p,A) INCLUDES (p',B). +# +# This relation is used to determine non-terminal transitions that occur +# inside of other non-terminal transition states. (p,A) INCLUDES (p', B) +# if the following holds: +# +# B -> LAT, where T -> epsilon and p' -L-> p +# +# L is essentially a prefix (which may be empty), T is a suffix that must be +# able to derive an empty string. State p' must lead to state p with the string L. +# +# ----------------------------------------------------------------------------- + +def compute_lookback_includes(C,trans,nullable): + + lookdict = {} # Dictionary of lookback relations + includedict = {} # Dictionary of include relations + + # Make a dictionary of non-terminal transitions + dtrans = {} + for t in trans: + dtrans[t] = 1 + + # Loop over all transitions and compute lookbacks and includes + for state,N in trans: + lookb = [] + includes = [] + for p in C[state]: + if p.name != N: continue + + # Okay, we have a name match. We now follow the production all the way + # through the state machine until we get the . 
on the right hand side + + lr_index = p.lr_index + j = state + while lr_index < p.len - 1: + lr_index = lr_index + 1 + t = p.prod[lr_index] + + # Check to see if this symbol and state are a non-terminal transition + if dtrans.has_key((j,t)): + # Yes. Okay, there is some chance that this is an includes relation + # the only way to know for certain is whether the rest of the + # production derives empty + + li = lr_index + 1 + while li < p.len: + if Terminals.has_key(p.prod[li]): break # No forget it + if not nullable.has_key(p.prod[li]): break + li = li + 1 + else: + # Appears to be a relation between (j,t) and (state,N) + includes.append((j,t)) + + g = lr0_goto(C[j],t) # Go to next set + j = _lr0_cidhash.get(id(g),-1) # Go to next state + + # When we get here, j is the final state, now we have to locate the production + for r in C[j]: + if r.name != p.name: continue + if r.len != p.len: continue + i = 0 + # This look is comparing a production ". A B C" with "A B C ." + while i < r.lr_index: + if r.prod[i] != p.prod[i+1]: break + i = i + 1 + else: + lookb.append((j,r)) + for i in includes: + if not includedict.has_key(i): includedict[i] = [] + includedict[i].append((state,N)) + lookdict[(state,N)] = lookb + + return lookdict,includedict + +# ----------------------------------------------------------------------------- +# digraph() +# traverse() +# +# The following two functions are used to compute set valued functions +# of the form: +# +# F(x) = F'(x) U U{F(y) | x R y} +# +# This is used to compute the values of Read() sets as well as FOLLOW sets +# in LALR(1) generation. +# +# Inputs: X - An input set +# R - A relation +# FP - Set-valued function +# ------------------------------------------------------------------------------ + +def digraph(X,R,FP): + N = { } + for x in X: + N[x] = 0 + stack = [] + F = { } + for x in X: + if N[x] == 0: traverse(x,N,stack,F,X,R,FP) + return F + +def traverse(x,N,stack,F,X,R,FP): + stack.append(x) + d = len(stack) + N[x] = d + F[x] = FP(x) # F(X) <- F'(x) + + rel = R(x) # Get y's related to x + for y in rel: + if N[y] == 0: + traverse(y,N,stack,F,X,R,FP) + N[x] = min(N[x],N[y]) + for a in F.get(y,[]): + if a not in F[x]: F[x].append(a) + if N[x] == d: + N[stack[-1]] = sys.maxint + F[stack[-1]] = F[x] + element = stack.pop() + while element != x: + N[stack[-1]] = sys.maxint + F[stack[-1]] = F[x] + element = stack.pop() + +# ----------------------------------------------------------------------------- +# compute_read_sets() +# +# Given a set of LR(0) items, this function computes the read sets. 
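
digraph()/traverse() above compute the set-valued function F(x) = F'(x) U U{ F(y) | x R y } using the SCC-aware traversal from the DeRemer-Pennello paper. A naive standalone version of the same fixed point, for intuition only (it converges to the same result on small inputs, just less efficiently):

def naive_digraph(X, R, FP):
    # Repeatedly propagate F(y) into F(x) for every edge x R y until stable.
    F = dict((x, set(FP(x))) for x in X)
    changed = 1
    while changed:
        changed = 0
        for x in X:
            for y in R(x):
                if not F[y] <= F[x]:
                    F[x] |= F[y]
                    changed = 1
    return F

# Toy relation 1 R 2, 2 R 3 with F'(x) = {x}:
edges = {1: [2], 2: [3], 3: []}
print(naive_digraph([1, 2, 3], lambda x: edges[x], lambda x: [x]))
# -> {1: set([1, 2, 3]), 2: set([2, 3]), 3: set([3])}
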
+# +# Inputs: C = Set of LR(0) items +# ntrans = Set of nonterminal transitions +# nullable = Set of empty transitions +# +# Returns a set containing the read sets +# ----------------------------------------------------------------------------- + +def compute_read_sets(C, ntrans, nullable): + FP = lambda x: dr_relation(C,x,nullable) + R = lambda x: reads_relation(C,x,nullable) + F = digraph(ntrans,R,FP) + return F + +# ----------------------------------------------------------------------------- +# compute_follow_sets() +# +# Given a set of LR(0) items, a set of non-terminal transitions, a readset, +# and an include set, this function computes the follow sets +# +# Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)} +# +# Inputs: +# ntrans = Set of nonterminal transitions +# readsets = Readset (previously computed) +# inclsets = Include sets (previously computed) +# +# Returns a set containing the follow sets +# ----------------------------------------------------------------------------- + +def compute_follow_sets(ntrans,readsets,inclsets): + FP = lambda x: readsets[x] + R = lambda x: inclsets.get(x,[]) + F = digraph(ntrans,R,FP) + return F + +# ----------------------------------------------------------------------------- +# add_lookaheads() +# +# Attaches the lookahead symbols to grammar rules. +# +# Inputs: lookbacks - Set of lookback relations +# followset - Computed follow set +# +# This function directly attaches the lookaheads to productions contained +# in the lookbacks set +# ----------------------------------------------------------------------------- + +def add_lookaheads(lookbacks,followset): + for trans,lb in lookbacks.items(): + # Loop over productions in lookback + for state,p in lb: + if not p.lookaheads.has_key(state): + p.lookaheads[state] = [] + f = followset.get(trans,[]) + for a in f: + if a not in p.lookaheads[state]: p.lookaheads[state].append(a) + +# ----------------------------------------------------------------------------- +# add_lalr_lookaheads() +# +# This function does all of the work of adding lookahead information for use +# with LALR parsing +# ----------------------------------------------------------------------------- + +def add_lalr_lookaheads(C): + # Determine all of the nullable nonterminals + nullable = compute_nullable_nonterminals() + + # Find all non-terminal transitions + trans = find_nonterminal_transitions(C) + + # Compute read sets + readsets = compute_read_sets(C,trans,nullable) + + # Compute lookback/includes relations + lookd, included = compute_lookback_includes(C,trans,nullable) + + # Compute LALR FOLLOW sets + followsets = compute_follow_sets(trans,readsets,included) + + # Add all of the lookaheads + add_lookaheads(lookd,followsets) + +# ----------------------------------------------------------------------------- +# lr_parse_table() # -# This function constructs an SLR table. +# This function constructs the parse tables for SLR or LALR # ----------------------------------------------------------------------------- -def slr_parse_table(): +def lr_parse_table(method): global _lr_method goto = _lr_goto # Goto array action = _lr_action # Action array actionp = { } # Action production array (temporary) - _lr_method = "SLR" - + _lr_method = method + n_srconflict = 0 n_rrconflict = 0 - print "yacc: Generating SLR parsing table..." 
if yaccdebug: - _vf.write("\n\nParsing method: SLR\n\n") - + sys.stderr.write("yacc: Generating %s parsing table...\n" % method) + _vf.write("\n\nParsing method: %s\n\n" % method) + # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items # This determines the number of states - + C = lr0_items() + if method == 'LALR': + add_lalr_lookaheads(C) + + # Build the parser table, state by state st = 0 for I in C: # Loop over each production in I actlist = [ ] # List of actions - + st_action = { } + st_actionp = { } + st_goto = { } if yaccdebug: _vf.write("\nstate %d\n\n" % st) for p in I: @@ -1265,57 +2322,61 @@ def slr_parse_table(): for p in I: try: - if p.prod[-1] == ".": + if p.len == p.lr_index + 1: if p.name == "S'": # Start symbol. Accept! - action[st,"$"] = 0 - actionp[st,"$"] = p + st_action["$end"] = 0 + st_actionp["$end"] = p else: # We are at the end of a production. Reduce! - for a in Follow[p.name]: + if method == 'LALR': + laheads = p.lookaheads[st] + else: + laheads = Follow[p.name] + for a in laheads: actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) - r = action.get((st,a),None) + r = st_action.get(a,None) if r is not None: # Whoa. Have a shift/reduce or reduce/reduce conflict if r > 0: # Need to decide on shift or reduce here # By default we favor shifting. Need to add # some precedence rules here. - sprec,slevel = Productions[actionp[st,a].number].prec + sprec,slevel = Productions[st_actionp[a].number].prec rprec,rlevel = Precedence.get(a,('right',0)) if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): - # We really need to reduce here. - action[st,a] = -p.number - actionp[st,a] = p + # We really need to reduce here. + st_action[a] = -p.number + st_actionp[a] = p if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) n_srconflict += 1 elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None + st_action[a] = None else: # Hmmm. Guess we'll keep the shift - if not slevel and not rlevel: + if not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - n_srconflict +=1 + n_srconflict +=1 elif r < 0: # Reduce/reduce conflict. In this case, we favor the rule # that was defined first in the grammar file oldp = Productions[-r] pp = Productions[p.number] if oldp.line > pp.line: - action[st,a] = -p.number - actionp[st,a] = p - # print "Reduce/reduce conflict in state %d" % st + st_action[a] = -p.number + st_actionp[a] = p + # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st) n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a])) - _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a])) + _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, st_actionp[a].number, st_actionp[a])) + _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,st_actionp[a].number, st_actionp[a])) else: - print "Unknown conflict in state %d" % st + sys.stderr.write("Unknown conflict in state %d\n" % st) else: - action[st,a] = -p.number - actionp[st,a] = p + st_action[a] = -p.number + st_actionp[a] = p else: i = p.lr_index a = p.prod[i+1] # Get symbol right after the "." 
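
The shift/reduce arbitration above keys off the Precedence table built from the user's precedence declaration, plus any %prec overrides recorded in UsedPrecedence. A minimal sketch of the grammar-side declaration that drives it (token names are hypothetical):

precedence = (
    ('left',  'PLUS', 'MINUS'),      # lowest precedence
    ('left',  'TIMES', 'DIVIDE'),
    ('right', 'UMINUS'),             # highest precedence
)

def p_expression_uminus(p):
    "expression : MINUS expression %prec UMINUS"
    # %prec borrows UMINUS's precedence for this rule, so unary minus
    # binds tighter than binary MINUS when the tables are generated.
    p[0] = -p[2]
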
@@ -1325,57 +2386,62 @@ def slr_parse_table(): if j >= 0: # We are in a shift state actlist.append((a,p,"shift and go to state %d" % j)) - r = action.get((st,a),None) + r = st_action.get(a,None) if r is not None: # Whoa have a shift/reduce or shift/shift conflict if r > 0: if r != j: - print "Shift/shift conflict in state %d" % st + sys.stderr.write("Shift/shift conflict in state %d\n" % st) elif r < 0: # Do a precedence check. # - if precedence of reduce rule is higher, we reduce. # - if precedence of reduce is same and left assoc, we reduce. # - otherwise we shift - rprec,rlevel = Productions[actionp[st,a].number].prec + rprec,rlevel = Productions[st_actionp[a].number].prec sprec,slevel = Precedence.get(a,('right',0)) - if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): + if (slevel > rlevel) or ((slevel == rlevel) and (rprec == 'right')): # We decide to shift here... highest precedence to shift - action[st,a] = j - actionp[st,a] = p - if not slevel and not rlevel: + st_action[a] = j + st_actionp[a] = p + if not rlevel: n_srconflict += 1 _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: + st_action[a] = None + else: # Hmmm. Guess we'll keep the reduce if not slevel and not rlevel: n_srconflict +=1 _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - + else: - print "Unknown conflict in state %d" % st + sys.stderr.write("Unknown conflict in state %d\n" % st) else: - action[st,a] = j - actionp[st,a] = p - + st_action[a] = j + st_actionp[a] = p + except StandardError,e: - raise YaccError, "Hosed in slr_parse_table", e + print sys.exc_info() + raise YaccError, "Hosed in lr_parse_table" # Print the actions associated with each terminal if yaccdebug: + _actprint = { } for a,p,m in actlist: - if action.has_key((st,a)): - if p is actionp[st,a]: + if st_action.has_key(a): + if p is st_actionp[a]: _vf.write(" %-15s %s\n" % (a,m)) + _actprint[(a,m)] = 1 _vf.write("\n") for a,p,m in actlist: - if action.has_key((st,a)): - if p is not actionp[st,a]: - _vf.write(" ! %-15s [ %s ]\n" % (a,m)) - + if st_action.has_key(a): + if p is not st_actionp[a]: + if not _actprint.has_key((a,m)): + _vf.write(" ! %-15s [ %s ]\n" % (a,m)) + _actprint[(a,m)] = 1 + # Construct the goto table for this state if yaccdebug: _vf.write("\n") @@ -1386,85 +2452,28 @@ def slr_parse_table(): nkeys[s] = None for n in nkeys.keys(): g = lr0_goto(I,n) - j = _lr0_cidhash.get(id(g),-1) + j = _lr0_cidhash.get(id(g),-1) if j >= 0: - goto[st,n] = j + st_goto[n] = j if yaccdebug: - _vf.write(" %-15s shift and go to state %d\n" % (n,j)) - - st += 1 - - if n_srconflict == 1: - print "yacc: %d shift/reduce conflict" % n_srconflict - if n_srconflict > 1: - print "yacc: %d shift/reduce conflicts" % n_srconflict - if n_rrconflict == 1: - print "yacc: %d reduce/reduce conflict" % n_rrconflict - if n_rrconflict > 1: - print "yacc: %d reduce/reduce conflicts" % n_rrconflict - - -# ----------------------------------------------------------------------------- -# ==== LALR(1) Parsing ==== -# **** UNFINISHED! 6/16/01 -# ----------------------------------------------------------------------------- - - -# Compute the lr1_closure of a set I. 
I is a list of tuples (p,a) where -# p is a LR0 item and a is a terminal - -_lr1_add_count = 0 + _vf.write(" %-30s shift and go to state %d\n" % (n,j)) -def lr1_closure(I): - global _lr1_add_count + action[st] = st_action + actionp[st] = st_actionp + goto[st] = st_goto - _lr1_add_count += 1 - - J = I[:] - - # Loop over items (p,a) in I. - ji = 0 - while ji < len(J): - p,a = J[ji] - # p = [ A -> alpha . B beta] - - # For each production B -> gamma - for B in p.lr1_after: - f = tuple(p.lr1_beta + (a,)) - - # For each terminal b in first(Beta a) - for b in first(f): - # Check if (B -> . gamma, b) is in J - # Only way this can happen is if the add count mismatches - pn = B.lr_next - if pn.lr_added.get(b,0) == _lr1_add_count: continue - pn.lr_added[b] = _lr1_add_count - J.append((pn,b)) - ji += 1 - - return J - -def lalr_parse_table(): + st += 1 - # Compute some lr1 information about all of the productions - for p in LRitems: - try: - after = p.prod[p.lr_index + 1] - p.lr1_after = Prodnames[after] - p.lr1_beta = p.prod[p.lr_index + 2:] - except LookupError: - p.lr1_after = [ ] - p.lr1_beta = [ ] - p.lr_added = { } - - # Compute the LR(0) items - C = lr0_items() - CK = [] - for I in C: - CK.append(lr0_kernel(I)) + if yaccdebug: + if n_srconflict == 1: + sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) + if n_srconflict > 1: + sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) + if n_rrconflict == 1: + sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) + if n_rrconflict > 1: + sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) - print CK - # ----------------------------------------------------------------------------- # ==== LR Utility functions ==== # ----------------------------------------------------------------------------- @@ -1475,8 +2484,13 @@ def lalr_parse_table(): # This function writes the LR parsing tables to a file # ----------------------------------------------------------------------------- -def lr_write_tables(modulename=tab_module): - filename = modulename + ".py" +def lr_write_tables(modulename=tab_module,outputdir=''): + if isinstance(modulename, types.ModuleType): + print >>sys.stderr, "Warning module %s is inconsistent with the grammar (ignored)" % modulename + return + + basemodulename = modulename.split(".")[-1] + filename = os.path.join(outputdir,basemodulename) + ".py" try: f = open(filename,"w") @@ -1491,18 +2505,19 @@ _lr_signature = %s # Change smaller to 0 to go back to original tables smaller = 1 - + # Factor out names to try and make smaller if smaller: items = { } - - for k,v in _lr_action.items(): - i = items.get(k[1]) - if not i: - i = ([],[]) - items[k[1]] = i - i[0].append(k[0]) - i[1].append(v) + + for s,nd in _lr_action.items(): + for name,v in nd.items(): + i = items.get(name) + if not i: + i = ([],[]) + items[name] = i + i[0].append(s) + i[1].append(v) f.write("\n_lr_action_items = {") for k,v in items.items(): @@ -1512,7 +2527,7 @@ _lr_signature = %s f.write("],[") for i in v[1]: f.write("%r," % i) - + f.write("]),") f.write("}\n") @@ -1520,10 +2535,11 @@ _lr_signature = %s _lr_action = { } for _k, _v in _lr_action_items.items(): for _x,_y in zip(_v[0],_v[1]): - _lr_action[(_x,_k)] = _y + if not _lr_action.has_key(_x): _lr_action[_x] = { } + _lr_action[_x][_k] = _y del _lr_action_items """) - + else: f.write("\n_lr_action = { "); for k,v in _lr_action.items(): @@ -1533,14 +2549,15 @@ del _lr_action_items if smaller: # Factor out names to try and make smaller items = { } - - for k,v 
in _lr_goto.items(): - i = items.get(k[1]) - if not i: - i = ([],[]) - items[k[1]] = i - i[0].append(k[0]) - i[1].append(v) + + for s,nd in _lr_goto.items(): + for name,v in nd.items(): + i = items.get(name) + if not i: + i = ([],[]) + items[name] = i + i[0].append(s) + i[1].append(v) f.write("\n_lr_goto_items = {") for k,v in items.items(): @@ -1550,7 +2567,7 @@ del _lr_action_items f.write("],[") for i in v[1]: f.write("%r," % i) - + f.write("]),") f.write("}\n") @@ -1558,13 +2575,14 @@ del _lr_action_items _lr_goto = { } for _k, _v in _lr_goto_items.items(): for _x,_y in zip(_v[0],_v[1]): - _lr_goto[(_x,_k)] = _y + if not _lr_goto.has_key(_x): _lr_goto[_x] = { } + _lr_goto[_x][_k] = _y del _lr_goto_items """) else: f.write("\n_lr_goto = { "); for k,v in _lr_goto.items(): - f.write("(%r,%r):%r," % (k[0],k[1],v)) + f.write("(%r,%r):%r," % (k[0],k[1],v)) f.write("}\n"); # Write production table @@ -1578,18 +2596,22 @@ del _lr_goto_items else: f.write(" None,\n") f.write("]\n") + f.close() except IOError,e: - print "Unable to create '%s'" % filename - print e + print >>sys.stderr, "Unable to create '%s'" % filename + print >>sys.stderr, e return def lr_read_tables(module=tab_module,optimize=0): global _lr_action, _lr_goto, _lr_productions, _lr_method try: - exec "import %s as parsetab" % module - + if isinstance(module,types.ModuleType): + parsetab = module + else: + exec "import %s as parsetab" % module + if (optimize) or (Signature.digest() == parsetab._lr_signature): _lr_action = parsetab._lr_action _lr_goto = parsetab._lr_goto @@ -1598,49 +2620,87 @@ def lr_read_tables(module=tab_module,optimize=0): return 1 else: return 0 - + except (ImportError,AttributeError): return 0 + # ----------------------------------------------------------------------------- # yacc(module) # # Build the parser module # ----------------------------------------------------------------------------- -def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0): +def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file,outputdir=''): global yaccdebug yaccdebug = debug - + initialize_vars() files = { } error = 0 - # Add starting symbol to signature - if start: - Signature.update(start) - - # Try to figure out what module we are working with + + # Add parsing method to signature + Signature.update(method) + + # If a "module" parameter was supplied, extract its dictionary. + # Note: a module may in fact be an instance as well. + if module: # User supplied a module object. - if not isinstance(module, types.ModuleType): + if isinstance(module, types.ModuleType): + ldict = module.__dict__ + elif isinstance(module, _INSTANCETYPE): + _items = [(k,getattr(module,k)) for k in dir(module)] + ldict = { } + for i in _items: + ldict[i[0]] = i[1] + else: raise ValueError,"Expected a module" - ldict = module.__dict__ - else: # No module given. We might be able to get information from the caller. 
# Throw an exception and unwind the traceback to get the globals - + try: raise RuntimeError except RuntimeError: e,b,t = sys.exc_info() f = t.tb_frame f = f.f_back # Walk out to our calling function - ldict = f.f_globals # Grab its globals dictionary + if f.f_globals is f.f_locals: # Collect global and local variations from caller + ldict = f.f_globals + else: + ldict = f.f_globals.copy() + ldict.update(f.f_locals) + + # Add starting symbol to signature + if not start: + start = ldict.get("start",None) + if start: + Signature.update(start) + + # Look for error handler + ef = ldict.get('p_error',None) + if ef: + if isinstance(ef,types.FunctionType): + ismethod = 0 + elif isinstance(ef, types.MethodType): + ismethod = 1 + else: + raise YaccError,"'p_error' defined, but is not a function or method." + eline = ef.func_code.co_firstlineno + efile = ef.func_code.co_filename + files[efile] = None + + if (ef.func_code.co_argcount != 1+ismethod): + raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline) + global Errorfunc + Errorfunc = ef + else: + print >>sys.stderr, "yacc: Warning. no p_error() function is defined." - # If running in optimized mode. We're going to + # If running in optimized mode. We're going to read tables instead if (optimize and lr_read_tables(tabmodule,1)): # Read parse table @@ -1657,11 +2717,14 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, if p[2]: m.func = ldict[p[2]] Productions.append(m) - + else: # Get the tokens map - tokens = ldict.get("tokens",None) - + if (module and isinstance(module,_INSTANCETYPE)): + tokens = getattr(module,"tokens",None) + else: + tokens = ldict.get("tokens",None) + if not tokens: raise YaccError,"module does not define a list 'tokens'" if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): @@ -1680,20 +2743,20 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, v1 = [x.split(".") for x in v] Requires[r] = v1 except StandardError: - print "Invalid specification for rule '%s' in require. Expected a list of strings" % r + print >>sys.stderr, "Invalid specification for rule '%s' in require. Expected a list of strings" % r + - # Build the dictionary of terminals. We a record a 0 in the # dictionary to track whether or not a terminal is actually # used in the grammar if 'error' in tokens: - print "yacc: Illegal token 'error'. Is a reserved word." + print >>sys.stderr, "yacc: Illegal token 'error'. Is a reserved word." raise YaccError,"Illegal token name" for n in tokens: if Terminals.has_key(n): - print "yacc: Warning. Token '%s' multiply defined." % n + print >>sys.stderr, "yacc: Warning. Token '%s' multiply defined." % n Terminals[n] = [ ] Terminals['error'] = [ ] @@ -1710,31 +2773,15 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, if not Precedence.has_key(n): Precedence[n] = ('right',0) # Default, right associative, 0 precedence - # Look for error handler - ef = ldict.get('p_error',None) - if ef: - if not isinstance(ef,types.FunctionType): - raise YaccError,"'p_error' defined, but is not a function." - eline = ef.func_code.co_firstlineno - efile = ef.func_code.co_filename - files[efile] = None - - if (ef.func_code.co_argcount != 1): - raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline) - global Errorfunc - Errorfunc = ef - else: - print "yacc: Warning. no p_error() function is defined." 
- # Get the list of built-in functions with p_ prefix symbols = [ldict[f] for f in ldict.keys() - if (isinstance(ldict[f],types.FunctionType) and ldict[f].__name__[:2] == 'p_' + if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_' and ldict[f].__name__ != 'p_error')] # Check for non-empty symbols if len(symbols) == 0: raise YaccError,"no rules of the form p_rulename are defined." - + # Sort the symbols by line number symbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) @@ -1749,7 +2796,7 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, for f in symbols: if f.__doc__: Signature.update(f.__doc__) - + lr_init_vars() if error: @@ -1767,39 +2814,41 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, if start and not Prodnames.has_key(start): raise YaccError,"Bad starting symbol '%s'" % start - - augment_grammar(start) + + augment_grammar(start) error = verify_productions(cycle_check=check_recursion) otherfunc = [ldict[f] for f in ldict.keys() - if (isinstance(ldict[f],types.FunctionType) and ldict[f].__name__[:2] != 'p_')] + if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')] + + # Check precedence rules + if check_precedence(): + error = 1 if error: raise YaccError,"Unable to construct parser." - + build_lritems() compute_first1() compute_follow(start) - - if method == 'SLR': - slr_parse_table() - elif method == 'LALR1': - lalr_parse_table() - return + + if method in ['SLR','LALR']: + lr_parse_table(method) else: raise YaccError, "Unknown parsing method '%s'" % method - - lr_write_tables(tabmodule) - + + if write_tables: + lr_write_tables(tabmodule,outputdir) + if yaccdebug: try: - f = open(debug_file,"w") + f = open(os.path.join(outputdir,debugfile),"w") f.write(_vfc.getvalue()) f.write("\n\n") f.write(_vf.getvalue()) f.close() except IOError,e: - print "yacc: can't create '%s'" % debug_file,e - + print >>sys.stderr, "yacc: can't create '%s'" % debugfile,e + # Made it here. Create a parser object and set up its internal state. # Set global parse() method to bound method of parser object. @@ -1814,6 +2863,9 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, global parse parse = p.parse + global parser + parser = p + # Clean up all of the globals we created if (not optimize): yacc_cleanup() @@ -1826,19 +2878,18 @@ def yacc_cleanup(): global _lr_action, _lr_goto, _lr_method, _lr_goto_cache del _lr_action, _lr_goto, _lr_method, _lr_goto_cache - global Productions, Prodnames, Prodmap, Terminals - global Nonterminals, First, Follow, Precedence, LRitems + global Productions, Prodnames, Prodmap, Terminals + global Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems global Errorfunc, Signature, Requires del Productions, Prodnames, Prodmap, Terminals - del Nonterminals, First, Follow, Precedence, LRitems + del Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems del Errorfunc, Signature, Requires global _vf, _vfc del _vf, _vfc - - + + # Stub that raises an error if parsing is attempted without first calling yacc() def parse(*args,**kwargs): raise YaccError, "yacc: No parser built with yacc()" - diff --git a/test/TestQueryParser.py b/test/TestQueryParser.py new file mode 100644 index 0000000..eb311fa --- /dev/null +++ b/test/TestQueryParser.py @@ -0,0 +1,69 @@ +# This file is part of python-rwhoisd +# +# Copyright (C) 2008 David E. 
Blacka +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +# USA + + +import path +import QueryParser, MemDB, Rwhois +import unittest, types + +class TestParse(unittest.TestCase): + def setUp(self): + db = MemDB.MemDB() + db.init_schema("test_schema") + db.load_data("test_data") + db.index_data() + QueryParser.db = db + self.qp = QueryParser.get_parser() + + def testParseKnown(self): + queries = [ ('domain foo.com', """clause 0: + (None, '=', 'foo.com') + ('class-name', '=', 'domain') +"""), + ('a or b or domain-name = c', """clause 0: + (None, '=', 'a') +clause 1: + (None, '=', 'b') +clause 2: + ('domain-name', '=', 'c') +""") ] + + for q, s in queries: + res = QueryParser.parse(self.qp, q) + assert(isinstance(res, QueryParser.Query)) + self.assertEquals(str(res), s) + + def testParseGood(self): + qf = open("test_queries", "r") + for line in qf.xreadlines(): + line = line.strip() + if not line or line.startswith("#"): continue + res = QueryParser.parse(self.qp, line) + assert(isinstance(res, QueryParser.Query)) + + def testParseBad(self): + qf = open("test_queries_bad", "r") + for line in qf.xreadlines(): + line = line.strip() + if not line or line.startswith("#"): continue + self.assertRaises(Rwhois.RwhoisError, QueryParser.parse, self.qp, + line) + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_queries b/test/test_queries new file mode 100644 index 0000000..5489f9b --- /dev/null +++ b/test/test_queries @@ -0,0 +1,24 @@ +#10.131.252.227 +network 10.131.252/24 +network 24.210.66.0/24 +domain a.com +domain domain-Name = "a.com" +domain-name = a.com +contact doe +contact public and type = "I" +contact public and type = O or doe and type = I +contact Pub* +contact pub* +contact "Pub*" +network 11/8 +www.fddi.a.com +network class-name = bar and foo OR ip-address != foo and "1 2 3 4" and First-Name = " Bob**" +Bob +network 10.0.0.0/8 +domain domain-name="example*" and domain-name=*ple.net and host-name="f*" +domain organization="foo" +one or two or domain-name = 3 +domain-name +# these are tricky because the value is also an attribute +domain-name != first-name +first-name = domain-name \ No newline at end of file diff --git a/test/test_queries_bad b/test/test_queries_bad new file mode 100644 index 0000000..1854557 --- /dev/null +++ b/test/test_queries_bad @@ -0,0 +1,8 @@ +# no attribute 'foo' +network foo=bar +# no class 'foo' +foo bar +# no operator < +network foo < 127.0.0.1 +# just an invalid char += -- 2.36.6
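
For readers comparing the regenerated parse tables: the upgraded yacc.py keeps its action and goto tables as nested dictionaries keyed first by state and then by symbol, and the compacted parsetab format written by lr_write_tables() factors those tables by symbol name before rebuilding them at import time. A minimal, self-contained sketch of that round trip follows; it is illustrative only, and the table entries below are made up rather than taken from this grammar.

# Sketch of the compacted table layout emitted by lr_write_tables() and
# rebuilt by the generated parsetab module.  The data here is hypothetical.
_lr_action_items = {
    'VALUE': ([0, 2], [3, -4]),   # symbol -> (list of states, list of actions)
    '$end':  ([1], [0]),          # 0 means "accept"
}

_lr_action = {}
for _k, _v in _lr_action_items.items():        # _k is the symbol name
    for _x, _y in zip(_v[0], _v[1]):           # _x is the state, _y the action
        if _x not in _lr_action:
            _lr_action[_x] = {}
        _lr_action[_x][_k] = _y                # nested: action[state][symbol]
del _lr_action_items

assert _lr_action[0]['VALUE'] == 3             # positive entries are shifts
assert _lr_action[2]['VALUE'] == -4            # negative entries are reductions by rule number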
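
The new TestQueryParser module is a standard unittest test case, and its __main__ block already calls unittest.main(), so it can also be run directly. A hypothetical driver, assuming it is invoked from the test/ directory so that the path helper module and the test_schema / test_data fixtures referenced by setUp() can be found:

# Hypothetical runner, not part of the patch.
import unittest
import TestQueryParser

suite = unittest.TestLoader().loadTestsFromModule(TestQueryParser)
unittest.TextTestRunner(verbosity=2).run(suite)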