1 #-----------------------------------------------------------------------------
4 # Author: David M. Beazley (beazley@cs.uchicago.edu)
5 # Department of Computer Science
6 # University of Chicago
9 # Copyright (C) 2001, David M. Beazley
11 # $Header: /home/davidb/src/cvs/python-rwhoisd/rwhoisd/lex.py,v 1.1.1.1 2003/04/22 02:19:07 davidb Exp $
13 # This library is free software; you can redistribute it and/or
14 # modify it under the terms of the GNU Lesser General Public
15 # License as published by the Free Software Foundation; either
16 # version 2.1 of the License, or (at your option) any later version.
18 # This library is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # Lesser General Public License for more details.
23 # You should have received a copy of the GNU Lesser General Public
24 # License along with this library; if not, write to the Free Software
25 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 # See the file COPYING for a complete copy of the LGPL.
30 # This module automatically constructs a lexical analysis module from regular
31 # expression rules defined in a user-defined module. The idea is essentially the same
32 # as that used in John Aycock's Spark framework, but the implementation works
33 # at the module level rather than requiring the use of classes.
35 # This module tries to provide an interface that is closely modeled after
# the traditional lex interface in Unix. It also differs from Spark
# in the following ways:
#
# - It provides more extensive error checking and reporting if
#   the user supplies a set of regular expressions that can't
#   be compiled, or if there is any other kind of problem in
#   the specification.
#
# - The interface is geared towards LALR(1) and LR(1) parser
#   generators. That is, tokens are generated one at a time
#   rather than being generated all at once in advance.
# There are a few limitations of this module:
50 # - The module interface makes it somewhat awkward to support more
51 # than one lexer at a time. Although somewhat inelegant from a
52 # design perspective, this is rarely a practical concern for
53 # most compiler projects.
55 # - The lexer requires that the entire input text be read into
56 # a string before scanning. I suppose that most machines have
# enough memory to make this a minor issue, but it makes
58 # the lexer somewhat difficult to use in interactive sessions
59 # or with streaming data.
61 #-----------------------------------------------------------------------------
66 This module builds lex-like scanners based on regular expression rules.
67 To use the module, simply write a collection of regular expression rules
68 and actions like this:
    import lex

    # Define a list of valid tokens
    tokens = (
        'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS'
        )

    # Define tokens as functions
    def t_IDENTIFIER(t):
        r' ([a-zA-Z_](\w|_)*) '
        return t

    def t_NUMBER(t):
        r' \d+ '
        return t

    # Some simple tokens with no actions
    t_PLUS  = r'\+'
    t_MINUS = r'-'

    # Initialize the lexer
    lex.lex()
94 The tokens list is required and contains a complete list of all valid
95 token types that the lexer is allowed to produce. Token types are
96 restricted to be valid identifiers. This means that 'MINUS' is a valid
97 token type whereas '-' is not.
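For instance (an illustrative comparison, not part of the example above):

    tokens = ('PLUS', 'MINUS')    # OK: both names are valid identifiers
    tokens = ('+', '-')           # Bad: lex will complain about these names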
99 Rules are defined by writing a function with a name of the form
100 t_rulename. Each rule must accept a single argument which is
a token object generated by the lexer. This token has the following
attributes:

    t.type   = The type of the token (a string). This is initially set
               to the name of the rule with the leading 't_' removed.
    t.value  = The text of the matched lexeme.
    t.lineno = The line number at which the token was encountered.
109 For example, the t_NUMBER() rule above might be called with the following:
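(illustrative values; the actual value and line number depend on the
input being scanned)

    t.type   = 'NUMBER'
    t.value  = '42'
    t.lineno = 1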
115 Each rule returns the token object it would like to supply to the
116 parser. In most cases, the token t is returned with few, if any
117 modifications. To discard a token for things like whitespace or
118 comments, simply return nothing. For instance:
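(a minimal sketch; the rule name and comment syntax shown are only examples)

    def t_COMMENT(t):
        r' \#.* '
        pass       # No return value. Token is discarded.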
124 For faster lexing, you can also define this in terms of the ignore set like this:
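(a typical declaration; here spaces and tabs are ignored)

    t_ignore = ' \t'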
128 The characters in this string are ignored by the lexer. Use of this feature can speed
129 up parsing significantly since scanning will immediately proceed to the next token.
131 lex requires that the token returned by each rule has an attribute
132 t.type. Other than this, rules are free to return any kind of token
133 object that they wish and may construct a new type of token object
from the attributes of t (provided the new object has the required
attributes).
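For example, a rule could package the match in an object of its own
(the MyToken class here is purely illustrative):

    class MyToken:
        pass

    def t_NUMBER(t):
        r' \d+ '
        new = MyToken()
        new.type   = 'NUMBER'        # Required attribute
        new.value  = int(t.value)    # Convert the lexeme to an integer
        new.lineno = t.lineno
        return new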
137 If illegal characters are encountered, the scanner executes the
138 function t_error(t) where t is a token representing the rest of the
139 string that hasn't been matched. If this function isn't defined, a
140 LexError exception is raised. The .text attribute of this exception
141 object contains the part of the string that wasn't matched.
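For example, a driver could catch the exception and report the unmatched
text (a sketch; 'plex' is an initialized lexer and 'data' is the string
being scanned):

    try:
        plex.input(data)
        while 1:
            tok = plex.token()
            if not tok: break
    except lex.LexError, e:
        print "Scanning stopped at: %s" % e.text[:20]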
143 The t.skip(n) method can be used to skip ahead n characters in the
144 input stream. This is usually only used in the error handling rule.
For instance, the following rule would print an error message and
skip ahead one character:

    def t_error(t):
        print "Illegal character in input %s" % t.value[0]
        t.skip(1)
152 Of course, a nice scanner might wish to skip more than one character
153 if the input looks very corrupted.
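For example, an error rule might discard everything up to the next
whitespace character (one possible recovery strategy, shown only as a
sketch):

    def t_error(t):
        bad = t.value.split(None, 1)[0]   # Unmatched text up to the next whitespace
        print "Skipping malformed input: %s" % bad
        t.skip(len(bad))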
155 The lex module defines a t.lineno attribute on each token that can be used
156 to track the current line number in the input. The value of this
attribute is not modified by lex, so it is up to your lexer module
158 to correctly update its value depending on the lexical properties
of the input language. To do this, you might write a rule such as
the following:

    def t_newline(t):
        r' \n+ '
        t.lineno += t.value.count("\n")
166 To initialize your lexer so that it can be used, simply call the lex.lex()
167 function in your rule file. If there are any errors in your
168 specification, warning messages or an exception will be generated to
169 alert you to the problem.
To use the newly constructed lexer from another module, have the module
that defines the rules (call it 'lexer') build the scanner and bind it
to a name such as 'plex'. Other modules can then drive the scanner
directly:

    plex.input("position = initial + rate*60")
    while 1:
        token = plex.token()       # Get a token
        if not token: break        # No more tokens

Assuming that the module 'lexer' has initialized plex in this way,
parsing modules can safely import 'plex' without having to import the
rule file or any additional information about the scanner you have
defined.
190 # -----------------------------------------------------------------------------
import re, types, sys, copy, os
# Exception thrown when an invalid token is encountered and no default
# error handler (t_error rule) is defined
198 class LexError(Exception):
    def __init__(self,message,s):
        self.args = (message,)
        self.text = s              # Unmatched portion of the input

# Token class used to represent scanned tokens
class LexToken:
    def __repr__(self):
        return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno)
    def skip(self,n):
        # Record a request to skip ahead n characters (used by error rules)
        try:
            self._skipn += n
        except AttributeError:
            self._skipn = n
215 # -----------------------------------------------------------------------------
218 # input() - Store a new string in the lexer
219 # token() - Get the next token
220 # -----------------------------------------------------------------------------
224 self.lexre = None # Master regular expression
225 self.lexdata = None # Actual input data (as a string)
226 self.lexpos = 0 # Current position in input text
227 self.lexlen = 0 # Length of the input text
228 self.lexindexfunc = [ ] # Reverse mapping of groups to functions and types
229 self.lexerrorf = None # Error rule (if any)
230 self.lextokens = None # List of valid tokens
231 self.lexignore = None # Ignored characters
232 self.lineno = 1 # Current line number
233 self.debug = 0 # Debugging mode
234 self.optimize = 0 # Optimized mode
235 self.token = self.errtoken
240 c.lexdata = self.lexdata
241 c.lexpos = self.lexpos
242 c.lexlen = self.lexlen
c.lexindexfunc = self.lexindexfunc
244 c.lexerrorf = self.lexerrorf
245 c.lextokens = self.lextokens
246 c.lexignore = self.lexignore
247 c.lineno = self.lineno
248 c.optimize = self.optimize
249 c.token = c.realtoken
251 # ------------------------------------------------------------
252 # input() - Push a new string into the lexer
253 # ------------------------------------------------------------
255 if not isinstance(s,types.StringType):
256 raise ValueError, "Expected a string"
260 self.token = self.realtoken
262 # Change the token routine to point to realtoken()
264 if token == self.errtoken:
267 # ------------------------------------------------------------
268 # errtoken() - Return error if token is called with no data
269 # ------------------------------------------------------------
271 raise RuntimeError, "No input string given with input()"
273 # ------------------------------------------------------------
274 # token() - Return the next token from the Lexer
276 # Note: This function has been carefully implemented to be as fast
# as possible. Don't make changes unless you really know what
# you are doing.
279 # ------------------------------------------------------------
281 # Make local copies of frequently referenced attributes
284 lexignore = self.lexignore
285 lexdata = self.lexdata
287 while lexpos < lexlen:
# Short-circuit scanning for whitespace, tabs, and other ignored characters
289 if lexdata[lexpos] in lexignore:
293 # Look for a regular expression match
294 m = self.lexre.match(lexdata,lexpos)
299 tok.value = m.group()
300 tok.lineno = self.lineno
302 func,tok.type = self.lexindexfunc[i]
307 # If token is processed by a function, call it
310 self.lineno = tok.lineno # Update line number
# Every function must return a token; if nothing is returned, we just move on to the next token
313 if not newtok: continue
315 # Verify type of the token. If not in the token map, raise an error
316 if not self.optimize:
317 if not self.lextokens.has_key(newtok.type):
318 raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
319 func.func_code.co_filename, func.func_code.co_firstlineno,
320 func.__name__, newtok.type),lexdata[lexpos:])
324 # No match. Call t_error() if defined.
327 tok.value = self.lexdata[lexpos:]
328 tok.lineno = self.lineno
332 newtok = self.lexerrorf(tok)
333 lexpos += getattr(tok,"_skipn",0)
335 # Error method didn't change text position at all. This is an error.
337 raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
338 if not newtok: continue
343 raise LexError, ("No match found", lexdata[lexpos:])
346 self.lexpos = lexpos + 1
350 # -----------------------------------------------------------------------------
353 # This checks to see if there are duplicated t_rulename() functions or strings
354 # in the parser input file. This is done using a simple regular expression
# match on each line of the given file.
356 # -----------------------------------------------------------------------------
358 def validate_file(filename):
360 base,ext = os.path.splitext(filename)
361 if ext != '.py': return 1 # No idea what the file is. Return OK
365 lines = f.readlines()
370 fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
371 sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
381 prev = counthash.get(name)
383 counthash[name] = linen
385 print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev)
390 # -----------------------------------------------------------------------------
391 # _read_lextab(module)
393 # Reads lexer table from a lextab file instead of using introspection.
394 # -----------------------------------------------------------------------------
396 def _read_lextab(lexer, fdict, module):
397 exec "import %s as lextab" % module
398 lexer.lexre = re.compile(lextab._lexre, re.VERBOSE)
399 lexer.lexindexfunc = lextab._lextab
400 for i in range(len(lextab._lextab)):
401 t = lexer.lexindexfunc[i]
404 lexer.lexindexfunc[i] = (fdict[t[0]],t[1])
405 lexer.lextokens = lextab._lextokens
406 lexer.lexignore = lextab._lexignore
407 if lextab._lexerrorf:
408 lexer.lexerrorf = fdict[lextab._lexerrorf]
410 # -----------------------------------------------------------------------------
413 # Build all of the regular expression rules from definitions in the supplied module
414 # -----------------------------------------------------------------------------
415 def lex(module=None,debug=0,optimize=0,lextab="lextab"):
422 lexer.optimize = optimize
426 if not isinstance(module, types.ModuleType):
427 raise ValueError,"Expected a module"
429 ldict = module.__dict__
432 # No module given. We might be able to get information from the caller.
436 e,b,t = sys.exc_info()
438 f = f.f_back # Walk out to our calling function
439 ldict = f.f_globals # Grab its globals dictionary
441 if optimize and lextab:
443 _read_lextab(lexer,ldict, lextab)
444 if not lexer.lexignore: lexer.lexignore = ""
453 tokens = ldict.get("tokens",None)
455 raise SyntaxError,"lex: module does not define 'tokens'"
456 if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
457 raise SyntaxError,"lex: tokens must be a list or tuple."
459 # Build a dictionary of valid token names
460 lexer.lextokens = { }
463 # Utility function for verifying tokens
def is_identifier(s):
    for c in s:
        if not (c.isalnum() or c == '_'): return 0
    return 1
470 if not is_identifier(n):
471 print "lex: Bad token name '%s'" % n
473 if lexer.lextokens.has_key(n):
474 print "lex: Warning. Token '%s' multiply defined." % n
475 lexer.lextokens[n] = None
477 for n in tokens: lexer.lextokens[n] = None
481 print "lex: tokens = '%s'" % lexer.lextokens.keys()
483 # Get a list of symbols with the t_ prefix
484 tsymbols = [f for f in ldict.keys() if f[:2] == 't_']
486 # Now build up a list of functions and a list of strings
490 if isinstance(ldict[f],types.FunctionType):
491 fsymbols.append(ldict[f])
492 elif isinstance(ldict[f],types.StringType):
493 ssymbols.append((f,ldict[f]))
495 print "lex: %s not defined as a function or string" % f
498 # Sort the functions by line number
499 fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))
# Sort the strings by decreasing regular expression length (longest patterns are tried first)
502 ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
504 # Check for non-empty symbols
505 if len(fsymbols) == 0 and len(ssymbols) == 0:
506 raise SyntaxError,"lex: no rules of the form t_rulename are defined."
508 # Add all of the rules defined with actions first
511 line = f.func_code.co_firstlineno
512 file = f.func_code.co_filename
516 if f.func_code.co_argcount > 1:
517 print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)
521 if f.func_code.co_argcount < 1:
522 print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)
526 if f.__name__ == 't_ignore':
527 print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
531 if f.__name__ == 't_error':
538 c = re.compile(f.__doc__, re.VERBOSE)
540 print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
545 print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__)
547 # Okay. The regular expression seemed okay. Let's append it to the master regular
548 # expression we're building
550 if (regex): regex += "|"
551 regex += "(?P<%s>%s)" % (f.__name__,f.__doc__)
553 print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)
555 # Now add all of the simple rules
556 for name,r in ssymbols:
558 if name == 't_ignore':
563 if name == 't_error':
564 raise SyntaxError,"lex: Rule 't_error' must be defined as a function"
568 if not lexer.lextokens.has_key(name[2:]):
569 print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:])
573 c = re.compile(r,re.VERBOSE)
575 print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
579 print "lex: Adding rule %s -> '%s'" % (name,r)
581 if regex: regex += "|"
582 regex += "(?P<%s>%s)" % (name,r)
585 for f in files.keys():
586 if not validate_file(f):
590 print "lex: regex = '%s'" % regex
591 lexer.lexre = re.compile(regex, re.VERBOSE)
593 # Build the index to function map for the matching engine
594 lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1)
595 for f,i in lexer.lexre.groupindex.items():
597 if isinstance(handle,types.FunctionType):
598 lexer.lexindexfunc[i] = (handle,handle.__name__[2:])
# If the rule was specified as a string, there is no action function;
# just record the token type
602 lexer.lexindexfunc[i] = (None,f[2:])
604 # If a lextab was specified, we create a file containing the precomputed
605 # regular expression and index table
607 if lextab and optimize:
608 lt = open(lextab+".py","w")
609 lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab)
610 lt.write("_lexre = %s\n" % repr(regex))
lt.write("_lextab = [\n")
612 for i in range(0,len(lexer.lexindexfunc)):
613 t = lexer.lexindexfunc[i]
616 lt.write(" ('%s',%s),\n"% (t[0].__name__, repr(t[1])))
618 lt.write(" (None,%s),\n" % repr(t[1]))
623 lt.write("_lextokens = %s\n" % repr(lexer.lextokens))
624 lt.write("_lexignore = %s\n" % repr(lexer.lexignore))
625 if (lexer.lexerrorf):
626 lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__))
628 lt.write("_lexerrorf = None\n")
632 print "lex: Fatal error. Unable to compile regular expression rules. %s" % e
635 raise SyntaxError,"lex: Unable to build lexer."
636 if not lexer.lexerrorf:
637 print "lex: Warning. no t_error rule is defined."
639 if not lexer.lexignore: lexer.lexignore = ""
641 # Create global versions of the token() and input() functions
647 # -----------------------------------------------------------------------------
650 # This runs the lexer as a main program
651 # -----------------------------------------------------------------------------
653 def runmain(lexer=None,data=None):
656 filename = sys.argv[1]
661 print "Reading from standard input (type EOF to end):"
662 data = sys.stdin.read()
677 print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno)