From a6b5f5371ee8a2510703e0f6f491df9f8065e622 Mon Sep 17 00:00:00 2001 From: davidb Date: Tue, 22 Apr 2003 02:19:07 +0000 Subject: [PATCH] Initial revision --- rwhoisd/Cidr.py | 170 +++ rwhoisd/DirectiveProcessor.py | 191 +++ rwhoisd/MemDB.py | 306 +++++ rwhoisd/MemIndex.py | 397 ++++++ rwhoisd/QueryParser.py | 245 ++++ rwhoisd/QueryProcessor.py | 285 +++++ rwhoisd/Rwhois.py | 161 +++ rwhoisd/RwhoisServer.py | 130 ++ rwhoisd/Session.py | 15 + rwhoisd/__init__.py | 0 rwhoisd/config.py | 30 + rwhoisd/lex.py | 681 ++++++++++ rwhoisd/parsetab.py | 37 + rwhoisd/yacc.py | 1844 ++++++++++++++++++++++++++++ sample_data/example_data | 99 ++ sample_data/example_queries | 13 + sample_data/example_random_queries | 5 + sample_data/example_schema | 16 + setup.py | 17 + 19 files changed, 4642 insertions(+) create mode 100644 rwhoisd/Cidr.py create mode 100644 rwhoisd/DirectiveProcessor.py create mode 100644 rwhoisd/MemDB.py create mode 100644 rwhoisd/MemIndex.py create mode 100644 rwhoisd/QueryParser.py create mode 100644 rwhoisd/QueryProcessor.py create mode 100644 rwhoisd/Rwhois.py create mode 100644 rwhoisd/RwhoisServer.py create mode 100644 rwhoisd/Session.py create mode 100644 rwhoisd/__init__.py create mode 100644 rwhoisd/config.py create mode 100644 rwhoisd/lex.py create mode 100644 rwhoisd/parsetab.py create mode 100644 rwhoisd/yacc.py create mode 100644 sample_data/example_data create mode 100644 sample_data/example_queries create mode 100644 sample_data/example_random_queries create mode 100644 sample_data/example_schema create mode 100644 setup.py diff --git a/rwhoisd/Cidr.py b/rwhoisd/Cidr.py new file mode 100644 index 0000000..57339bc --- /dev/null +++ b/rwhoisd/Cidr.py @@ -0,0 +1,170 @@ +import socket, types, copy, bisect, re + +class Cidr: + """A class representing a CIDRized IPv4 network value. + + Specifically, it is representing contiguous IPv4 network blocks + that can be expressed as a ip-address/network length pair.""" + + # FIXME: we should probably actually make this class immutable and + # add methods that generate copies of this class with different + # netlens or whatever. + + ip4addr_re = re.compile("^\d{1,3}(\.\d{1,3}){0,3}(/\d{1,2})?$") + + def __init__(self, address, netlen = -1): + """This takes either a formatted string in CIDR notation: + (e.g., "127.0.0.1/32"), A tuple consisting of an formatting + string IPv4 address and a numeric network length, or the same + as two arguments.""" + + if not Cidr.ip4addr_re.search(address): + raise ValueError, repr(address) + \ + " is not a valid CIDR representation" + + if netlen < 0: + if type(address) == types.StringType: + if "/" in address: + self.addr, self.netlen = address.split("/", 1) + else: + self.addr, self.netlen = address, 32 + elif type(address) == types.TupleType: + self.addr, self.netlen = address + else: + raise TypeError, "address must be a string or a tuple" + else: + self.addr = address + self.netlen = netlen + + # convert string network lengths to integer + if type(self.netlen) == types.StringType: + self.netlen = int(self.netlen) + + self.calc() + + def __cmp__(self, other): + """One CIDR network block is less than another if the start + address is numerically less or if the block is larger. That + is, supernets will sort before subnets. This ordering allows + for an effienct search for subnets of a given network.""" + + # FIXME: have to convert to longs to overcome signedness problems. + # There is probably a better way to do this. 
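+        # For example, <10.0.0.0/8> compares as less than <10.0.0.0/24>:
+        # the start addresses are equal, so the larger block (the smaller
+        # netlen) sorts first.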
+ res = (self.numaddr & 0xFFFFFFFFL) - (other.numaddr & 0xFFFFFFFFL) + if (res < 0 ): return -1 + if (res > 0): return 1 + res = self.netlen - other.netlen + return res + + def __str__(self): + return self.addr + "/" + str(self.netlen) + + def __repr__(self): + return "<" + str(self) + ">" + + def calc(self): + """This method should be called after any change to the main + internal state: netlen or numaddr.""" + + # make sure the network length is valid + if self.netlen > 32 or self.netlen < 0: + raise TypeError, "network length must be between 0 and 32" + + # convert the string ipv4 address to a 32bit number + self.numaddr = self._convert_ip4str(self.addr) + # calculate our netmask + self.mask = self._mask(self.netlen) + # force the cidr address into correct masked notation + self.numaddr &= self.mask + + # convert the number back to a string to normalize the string + self.addr = self._convert_ip4addr(self.numaddr) + + def is_supernet(self, other): + """returns True if the other Cidr object is a supernet (an + enclosing network block) of this one. A Cidr object is a + supernet of itself.""" + return other.numaddr & self.mask == self.numaddr + + def is_subnet(self, other): + """returns True if the other Cidr object is a subnet (an + enclosednetwork block) of this one. A Cidr object is a + subnet of itself.""" + return self.numaddr & other.mask == other.numaddr + + def netmask(self): + """return the netmask of this Cidr network""" + return self._convert_ip4addr(self.mask) + + def length(self): + """return the length (in number of addresses) of this network block""" + return 1 << (32 - self.netlen); + + def end(self): + """return the last IP address in this network block""" + return self._convert_ip4addr(self.numaddr + self.length() - 1) + + def _convert_ip4str(self, addr): + p = 3; a = 0 + for octet in addr.split(".", 3): + o = int(octet); + if (o & 0xFF != o): + raise SyntaxWarning, "octet " + str(o) + " isn't in range" + a |= o << (p * 8) + p -= 1 + return a + + def _convert_ip4addr(self, numaddr): + res = str((numaddr & 0xFF000000) >> 24 & 0xFF) + "." + \ + str((numaddr & 0x00FF0000) >> 16) + "." + \ + str((numaddr & 0x0000FF00) >> 8) + "." + \ + str(numaddr & 0x000000FF) + return res + + def _mask(self, len): + return 0xFFFFFFFF << (32 - len) + + def clone(self): + # we can get away with a shallow copy (so far) + return copy.copy(self) + + +def valid_cidr(address): + """Returns the converted Cidr object if 'address' is valid + CIDR notation, False if not. For the purposes of this module, + valid CIDR notation consists of 1 to 4 octets separated by '.' 
+ characters, with an optional trailing "/netlen".""" + + if isinstance(address, Cidr): return address + try: + c = Cidr(address) + return c + except: + return False + + +# test driver +if __name__ == "__main__": + a = Cidr("127.00.000.1/24") + b = Cidr("127.0.0.1", 32) + c = Cidr("24.232.119.192", 26) + d = Cidr("24.232.119.0", 24) + e = Cidr(("24.224.0.0", 11)) + f = Cidr("216.168.111.0/27"); + g = Cidr("127.0.0.2/31"); + h = Cidr("127.0.0.16/32") + print f.addr + + try: + bad = Cidr("24.261.119.0", 32) + except Warning, x: + print "warning:", x + + print "cidr:", a, "num addresses:", a.length(), "ending address", \ + a.end(), "netmask", a.netmask() + + clist = [a, b, c, d, e, f, g, h] + print "unsorted list of cidr objects:\n ", clist + + clist.sort() + print "sorted list of cidr object:\n ", clist diff --git a/rwhoisd/DirectiveProcessor.py b/rwhoisd/DirectiveProcessor.py new file mode 100644 index 0000000..94e3233 --- /dev/null +++ b/rwhoisd/DirectiveProcessor.py @@ -0,0 +1,191 @@ +import re +import Rwhois, config + +class DirectiveProcessor: + + rwhois_dir_exp = re.compile(r"V-(\d+\.\d+)", re.I) + + def __init__(self, db): + self.db = db + self.directives = { + "rwhois" : self.rwhois_directive, + "limit" : self.limit_directive, + "holdconnect" : self.hold_directive, + "directive" : self.directive_directive, + "xfer" : self.xfer_directive + } + + def process_directive(self, session, line): + d_args = line.lstrip("-").split() + + if not self.directives.has_key(d_args[0]): + session.wfile.write(Rwhois.error_message(400)) + return + + self.directives[d_args[0]](session, d_args[1:]) + + + def rwhois_directive(self, session, arglist): + if not arglist: + session.wfile.write(Rwhois.error_message(338)) + return + + mo = DirectiveProcessor.rwhois_dir_exp.match(arglist[0]) + if not mo: + session.wfile.write(Rwhois.error_message(338)) + return + + # normally we would make sure that the version given was + # sufficiently great. 
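+        # A minimal sketch of such a check (illustrative only; it reuses
+        # the 338 response already used above for malformed arguments):
+        #
+        #   if float(mo.group(1)) < 1.0:
+        #       session.wfile.write(Rwhois.error_message(338))
+        #       return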
+ + session.wfile.write(config.banner_string) + session.wfile.write("\r\n") + + def limit_directive(self, session, arglist): + try: + limit = int(arglist[0]) + except (IndexError, ValueError): + session.wfile.write(Rwhois.error_message(338)) + return + + if limit > config.max_limit: + limit = config.max_limit + elif limit < config.min_limit: + limit = config.min_limit + session.limit = limit + + session.wfile.write(Rwhois.ok()) + + def hold_directive(self, session, arglist): + if not arglist: + session.wfile.write(Rwhois.error_message(338)) + return + + arg = arglist[0].lower() + if arg == "on": + session.holdconnect = True + elif arg == "off": + session.holdconnect = False + else: + session.wfile.write(Rwhois.error_message(338)) + return + + session.wfile.write(Rwhois.ok()) + + def directive_directive(self, session, arglist): + if not arglist: + reslist = [] + dirs = self.directives.keys() + dirs.sort() + for dir in dirs: + desc = dir.capitalize() + reslist.append("%%directive directive:%s" % dir) + reslist.append("%%directive description:%s directive" % desc) + session.wfile.write("\r\n".join(reslist)) + session.wfile.write("\r\n") + session.wfile.write(Rwhois.ok()) + return + if self.directives.has_key(arglist[0]): + dir = arglist[0] + desc = dir.capitalize() + session.wfile.write("%%directive directive:%s\r\n" % dir) + session.wfile.write("%%directive description:%s directive\r\n" + % desc) + else: + session.wfile.write(Rwhois.error_message(400)) + return + session.wfile.write(Rwhois.ok()) + + + def xfer_directive(self, session, arglist): + if not arglist: + session.wfile.write(Rwhois.error_message(338)) + return + + aa = arglist[0].lower() + + oc = None + attr_list = [] + for arg in arglist[1:]: + if arg.startswith("class="): + oc = arg[6:].lower() + elif arg.startswith("attribute="): + attr = arg[10:].lower() + if attr: attr_list.append(attr) + + # check the constraints + if not self.db.is_autharea(aa): + session.wfile.write(Rwhois.error_message((340, aa))) + return + if oc and not self.db.is_objectclass(oc): + session.wfile.write(Rwhois.error_message((341, oc))) + return + + for attr in attr_list: + if not self.db.is_attribute(attr): + session.wfile.write(Rwhois.error_message((342, attr))) + return + + # now iterate over the entire dataset looking for objects that + # match our criteria. + + objs = self.db.object_iterator() + + for obj in objs: + # Note: in theory, we should leverage QueryProcessors + # filtering code. 
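+            # For now, filter by hand: skip objects outside the requested
+            # auth-area or (when given) outside the requested class.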
+ if obj.get_attr_value("auth-area").lower() != aa: + continue + if oc and obj.get_attr_value("class-name").lower() != oc: + continue + + if attr_list: + session.wfile.write(obj.attrs_to_wire_str(attr_list, "%xfer ")) + else: + session.wfile.write(obj.to_wire_str("%xfer ")) + session.wfile.write("\r\n%xfer \r\n"); + + session.wfile.write(Rwhois.ok()) + + +if __name__ == '__main__': + + import sys + import MemDB + + session = Session.Context() + session.wfile = sys.stdout + + db = MemDB.MemDB() + + db.init_schema(sys.argv[1]) + for data_file in sys.argv[2:]: + db.load_data(data_file) + db.index_data() + + dp = DirectiveProcessor(db) + + directives = [ "-rwhois", + "-rwhois foo bar baz", + "-rwhois V-1.6 noise blah", + "-limit", + "-limit a", + "-limit 20", + "-holdconnect", + "-holdconnect on", + "-holdconnect foo", + "-directive", + "-directive limit", + "-directive foo", + "-xfer", + "-xfer a.com", + "-xfer a.com class=contact", + "-xfer a.com class=domain attribute=domain-name", + "-xfer foo class=bar", + "-xfer foo class=bar attribute=baz attribute=boo", + "-foo baz bar" ] + + for dir in directives: + print "trying %r:" % dir + print dp.process_directive(session, dir) + diff --git a/rwhoisd/MemDB.py b/rwhoisd/MemDB.py new file mode 100644 index 0000000..5d2b6e7 --- /dev/null +++ b/rwhoisd/MemDB.py @@ -0,0 +1,306 @@ +import bisect, types +import MemIndex, Cidr +from Rwhois import rwhoisobject + +class MemDB: + + def __init__(self): + + # a dictonary holding the various attribute indexes. The keys + # are lowercase attribute names, values are MemIndex or + # CidrMemIndex objects. + self.indexes = {} + + # a dictonary holding the actual rwhoisobjects. keys are + # string IDs, values are rwhoisobject instances. + self.main_index = {} + + # dictonary holding all of the seen attributes. keys are + # lowercase attribute names, value is a character indicating + # the index type (if indexed), or None if not indexed. Index + # type characters a 'N' for normal string index, 'C' for CIDR + # index. + self.attrs = {} + + # Lists containing attribute names that have indexes by type. + # This exists so unconstrained searches can just iterate over + # them. + self.normal_indexes = [] + self.cidr_indexes = [] + + # dictonary holding all of the seen class names. keys are + # lowercase classnames, value is always None. + self.classes = {} + + # dictionary holding all of the seen auth-areas. keys are + # lowercase authority area names, value is always None. + self.authareas = {} + + def init_schema(self, schema_file): + """Initialize the schema from a schema file. Currently the + schema file is a list of 'attribute_name = index_type' pairs, + one per line. index_type is one of N or C, where N means a + normal string index, and C means a CIDR index. + + It should be noted that this database implementation + implements a global namespace for attributes, which isn't + really correct according to RFC 2167. 
RFC 2167 dictates that + different authority area are actually autonomous and thus have + separate schemas.""" + + # initialize base schema + + self.attrs['id'] = "N" + self.attrs['auth-area'] = None + self.attrs['class-name'] = None + self.attrs['updated'] = None + self.attrs['referred-auth-area'] = "R" + + sf = open(schema_file, "r") + + for line in sf.xreadlines(): + line = line.strip() + if not line or line.startswith("#"): continue + + attr, it = line.split("=") + self.attrs[attr.strip().lower()] = it.strip()[0].upper() + + for attr, index_type in self.attrs.items(): + if index_type == "N": + # normal index + self.indexes[attr] = MemIndex.MemIndex() + self.normal_indexes.append(attr) + elif index_type == "A": + # "all" index -- both a normal and a cidr index + self.indexes[attr] = MemIndex.ComboMemIndex() + self.normal_indexes.append(attr) + self.cidr_indexes.append(attr) + elif index_type == "R": + # referral index, an all index that must be searched + # explictly by attribute + self.indexes[attr] = MemIndex.ComboMemIndex() + elif index_type == "C": + # a cidr index + self.indexes[attr] = MemIndex.CidrMemIndex() + self.cidr_indexes.append(attr) + return + + def add_object(self, obj): + """Add an rwhoisobject to the raw indexes, including the + master index.""" + + # add the object to the main index + id = obj.getid() + if not id: return + id = id.lower() + + self.main_index[id] = obj + + for a,v in obj.items(): + # note the attribute. + index_type = self.attrs.setdefault(a, None) + v = v.lower() + # make sure that we note the auth-area and class + if a == 'auth-area': + self.authareas.setdefault(v, None) + elif a == 'class-name': + self.classes.setdefault(v, None) + + if index_type: + index = self.indexes[a] + index.add(v, id) + + def load_data(self, data_file): + """Load data from rwhoisd-style TXT files (i.e., attr:value, + records separated with a "---" bare line).""" + + df = open(data_file, "r") + obj = rwhoisobject() + + for line in df.xreadlines(): + line = line.strip() + if line.startswith("#"): continue + if not line or line.startswith("---"): + # we've reached the end of an object, so index it. + self.add_object(obj) + # reset obj + obj = rwhoisobject() + continue + + a, v = line.split(":", 1) + obj.add_attr(a, v.lstrip()) + + self.add_object(obj) + return + + def index_data(self): + """Prepare the indexes for searching. Currently, this isn't + strictly necessary (the indexes will prepare themselves when + necessary), but it should elminate a penalty on initial + searches""" + + for i in self.indexes.values(): + i.prepare() + return + + def is_attribute(self, attr): + return self.attrs.has_key(attr.lower()) + + def is_indexed_attr(self, attr): + if self.is_attribute(attr): + return self.attrs[attr.lower()] + return False + + def is_objectclass(self, objectclass): + return self.classes.has_key(objectclass.lower()) + + def is_autharea(self, aa): + return self.authareas.has_key(aa.lower()) + + def fetch_objects(self, id_list): + return [ self.main_index[x] for x in id_list + if self.main_index.has_key(x) ] + + def search_attr(self, attr, value, max = 0): + + """Search for a value in a particular attribute's index. If + the attribute is cidr indexed, an attempt to convert value + into a Cidr object will be made. 
Returns a list of object ids + (or an empty list if nothing was found)""" + + attr = attr.lower() + index_type = self.attrs.get(attr) + index = self.indexes.get(attr) + if not index: return [] + + super_prefix_match = False + if value.endswith("**"): + super_prefix_match = True + + prefix_match = False + if value.endswith("*"): + value = value.rstrip("*") + prefix_match = True + + if index_type == 'C' and not isinstance(value, Cidr.Cidr): + value = Cidr.valid_cidr(value) + else: + value = value.strip().lower() + + if index_type == 'C' and super_prefix_match: + return index.find_subnets(value, max) + + res = index.find(value, prefix_match, max) + return IndexResult(res) + + def search_normal(self, value, max = 0): + """Search for a value in the 'normal' (string keyed) indexes. + Returns a list of object ids, or an empty list if nothing was + found.""" + + res = IndexResult() + + for attr in self.normal_indexes: + res.extend(self.search_attr(attr, value, max)) + if max: + if len(res) >= max: + res.truncate(max) + return res + return res + + def search_cidr(self, value, max = 0): + """Search for a value in the cidr indexes. Returns a list of + object ids, or an empty list if nothing was found.""" + + res = IndexResult() + for attr in self.cidr_indexes: + res.extend(self.search_attr(attr, value, max)) + if max: + if len(res) >= max: + res.truncate(max) + return res + return res + + def search_referral(self, value, max = 0): + """Given a heirarchal value, search for referrals. Returns a + list of object ids or an empty list.""" + + return self.search_attr("referred-auth-area", value, max) + + def object_iterator(self): + return self.main_index.itervalues() + +class IndexResult: + def __init__(self, list=None): + if not list: list = [] + self.data = list + self._dict = dict(zip(self.data, self.data)) + + def extend(self, list): + if isinstance(list, type(self)): + list = list.list() + new_els = [ x for x in list if not self._dict.has_key(x) ] + self.data.extend(new_els) + self._dict.update(dict(zip(new_els, new_els))) + + def list(self): + return self.data + + def truncate(self, n=0): + to_del = self.data[n:] + for i in to_del: del self._dict[i] + self.data = self.data[:n] + + +# test driver +if __name__ == "__main__": + import sys + db = MemDB() + + print "loading schema:", sys.argv[1] + db.init_schema(sys.argv[1]) + for data_file in sys.argv[2:]: + print "loading data file:", data_file + db.load_data(data_file) + db.index_data() + + print "Schema: authority areas" + for a in db.authareas.keys(): + print " %s" % a + print "Schema: classes" + for c in db.classes.keys(): + print " %s" % c + print "Schema: attributes" + for a in db.attrs.keys(): + print " %s" % a + + print "Is 'Network' a class?", db.is_objectclass("Network") + +# for k, v in db.main_index.items(): +# print "main_index[", k, "]:", v + + print "searching for a.com" + res = db.search_attr("domain-name", "a.com") + print res.list() + print [ str(x) for x in db.fetch_objects(res.list()) ] + + print "searching for doe" + res = db.search_normal("doe") + print res.list() + print [ str(x) for x in db.fetch_objects(res.list()) ] + + print "searching for 10.0.0.2" + res = db.search_cidr("10.0.0.2") + print res.list() + print [ str(x) for x in db.fetch_objects(res.list()) ] + + print "searching for fddi.a.com" + res = db.search_normal("fddi.a.com") + print res.list() + + print "searching referral index for fddi.a.com" + res = db.search_attr("referred-auth-area", "fddi.a.com") + print res.list() + print [ str(x) for x in 
db.fetch_objects(res.list()) ] + + diff --git a/rwhoisd/MemIndex.py b/rwhoisd/MemIndex.py new file mode 100644 index 0000000..24a64e2 --- /dev/null +++ b/rwhoisd/MemIndex.py @@ -0,0 +1,397 @@ +import bisect, types +import Cidr + +class MemIndex: + """This class implements a simple in-memory key-value map. This + index supports efficient prefix matching (as well as pretty + efficient exact matching). Internally, it is implemented as a + sorted list supporting binary searches.""" + + # NOTE: This implementation is probably far from as efficient as + # it could be. Ideally, we should avoid the use of custom + # comparison functions, so that list.sort() will use built-in + # comparitors. This would mean storing raw key tuples in index as + # opposed to element objects. Also, it would mean avoiding the + # use of objects (like Cidr) as keys in favor of a primitive type. + # In the Cidr case, we would either have to use longs or strings, + # as Python doesn't seem to have an unsigned 32-bit integer type. + + def __init__(self): + self.index = [] + self.sorted = False + + def add(self, key, value=None): + """Add a key-value pair to the map. If the map is already in + the prepared state, this operation will preserve it, so don't + use this if many elements are to be added at once. The 'key' + argument may be a 2 element tuple, in which case 'value' is + ignored.""" + + if isinstance(key, types.TupleType): + el = element(key[0], key[1]) + else: + el = element(key, value) + + if self.sorted: + i = bisect.bisect_left(self.index, el) + while i < len(self.index): + if self.index[i].total_equals(el): + break + if self.index[i] != el: + self.index.insert(i, el) + break + i += 1 + else: + self.index.append(el) + else: + self.index.append(el) + + def addlist(self, list): + """Add the entire list of elements to the map. The elements + of 'list' may be 2 element tuples or actual 'element' objects. + Use this method to add many elements at once.""" + + self.sorted = False + for i in list: + if isinstance(i, types.TupleType): + self.index.append(element(i[0], i[1])) + elif isinstance(i, element): + self.index.append(i) + + def prepare(self): + """Put the map in a prepared state, if necessary.""" + + n = len(self.index) + if not n: return + if not self.sorted: + self.index.sort() + # unique the index + last = self.index[0] + lasti = i = 1 + while i < n: + if not self.index[i].total_equals(last): + self.index[lasti] = last = self.index[i] + lasti += 1 + i += 1 + self.index[lasti:] + self.sorted = True + + def _find(self, key): + """Return the (search_element, index) tuple. Used internally + only.""" + + self.prepare() + search_el = element(key, None) + i = bisect.bisect_left(self.index, search_el) + if i > len(self.index) or i < 0: + print "warning: bisect.bisect_left returned something " + \ + "unexpected:", i, len(self.index) + return (search_el, i) + + def find(self, key, prefix_match=False, max=0): + """Return a list of values whose keys string match 'key'. 
If + prefix_match is True, then keys will match if 'key' is a + prefix of the element key.""" + + search_el, i = self._find(key) + res = [] + while i < len(self.index): + if max and len(res) == max: break + if search_el.equals(self.index[i], prefix_match): + res.append(self.index[i].value) + i += 1 + else: + break + return res + +class CidrMemIndex(MemIndex): + """This is an in-memory map that has been extended to support CIDR + searching semantics.""" + + # NOTE: this structure lends to fairly efficient exact searches + # (O[log2N]), effience subnet searches (also O[log2N]), but not + # terribly efficient supernet searches (O[32log2N]), because we + # have to potentially do 32 exact matches. If we want efficient + # supernet searches, we will probably have to use some sort of + # general (i.e., not binary) search tree datastructure, as there + # is no sorted ordering that will efficiently give supernets that + # I can think of. + + def add(self, key, value=None): + if isinstance(key, types.TupleType): + MemIndex.add(self, (Cidr.valid_cidr(key[0]), key[1]), value) + else: + MemIndex.add(self, Cidr.valid_cidr(key), value) + return + + def addlist(self, list): + + # make sure the keys are Cidr objects + for i in list: + if isinstance(i, types.TupleType): + i = (Cidr.valid_cidr(el[0]), el[1]) + elif isinstance(el, element): + i.key = Cidr.valid_cidr(i.key) + + MemIndex.addlist(self, list) + return + + def find_exact(self, key, max = 0): + + key = Cidr.valid_cidr(key) + search_el, i = self._find(key) + res = [] + while i < len(self.index) and self.index[i].key == key: + res.append(self.index[i].value) + if max and len(res) == max: break + i += 1 + return res + + def find_subnets(self, key, max = 0): + """Return all values that are subnets of 'key', including any + that match 'key' itself.""" + + key = Cidr.valid_cidr(key) + search_el, i = self._find(key) + + res = [] + while i < len(self.index) and self.index[i].key.is_subnet(key): + if max and len(res) == max: break + res.append(self.index[i].value) + i += 1 + return res + + def find_supernets(self, key, max = 0): + """Return all values that are supernets of 'key', including + any that match 'key' itself.""" + + key = Cidr.valid_cidr(key) + k = key.clone() + res = [] + while k.netlen >= 0: + k.calc() + res += self.find_exact(k, max) + if max and len(res) >= max: + return res[:max] + k.netlen -= 1 + + + return res + + def find(self, key, prefix_match=0, max=0): + """Return either the exact match of 'key', or the closest + supernet of 'key'. If prefix_match is True, then find all + supernets of 'key'""" + + key = Cidr.valid_cidr(key) + if prefix_match == 0: + res = self.find_exact(key, max) + + if not res: + # now do a modified supernet search that stops after + # the first proper supernet, but gets all values + # matching that supernet key + k = key.clone() + k.netlen -= 1 + while not res and k.netlen >= 0: + k.calc() + res = self.find_exact(k, max) + k.netlen -= 1 + return res + + # for now, a prefix match means all supernets + return self.find_supernets(key, max) + +class ComboMemIndex: + """This is an in-memory map that contains both a normal string + index and a CIDR index. Valid CIDR values we be applied against + the CIDR index. Other values will be applied against the normal + index.""" + + def __init__(self): + self.normal_index = MemIndex() + self.cidr_index = CidrMemIndex() + + def add(self, key, value = None): + """Add a key,value pair to the correct map. 
See MemIndex for + the behavior of this method""" + + if isinstance(key, types.TupleType): + k = key[0] + else: + k = key + c = Cidr.valid_cidr(key) + if c: + self.cidr_index.add(key, value) + else: + self.normal_index.add(key, value) + return + + def addlist(self, list): + """Add a list of elements or key, value tuples to the + appropriate maps.""" + + cidr_list = [] + normal_list = [] + + for i in list: + if isinstance(i, element): + k, v = i.key, i.value + elif isinstance(i, types.TupleType): + k, v = i[:2] + + c = Cidr.valid_cidr(k) + if c: + cidr_list.append((c, v)) + else: + normal_list.append((k, v)) + + if cidr_list: + self.cidr_index.addlist(cidr_list) + if normal_list: + self.normal_index.addlist(normal_list) + return + + def prepare(self): + """Prepare the internally held maps for searching.""" + + self.cidr_index.prepare() + self.normal_index.prepare() + + def find(self, key, prefix_match=False, max=0): + """Return a list of values whose keys match 'key'.""" + + c = Cidr.valid_cidr(key) + if c: + return self.cidr_index.find(c, prefix_match, max) + return self.normal_index.find(key, prefix_match, max) + + def find_exact(self, key, max = 0): + """Return a list of values whose keys match 'key'. if 'key' + is not a CIDR value, then this is the same as find().""" + + c = Cidr.valid_cidr(key) + if c: + return self.cidr_index.find_exact(c, max) + return self.normal_index.find(key, False, max) + + def find_subnets(self, key, max = 0): + """If 'key' is a CIDR value (either a Cidr object or a valid + CIDR string representation, do a find_subnets on the internal + CidrMemIndex, otherwise return None.""" + + c = Cidr.valid_cidr(key) + if c: return self.cidr_index.find_subnets(key, max) + return None + + def find_supernets(self, key, max = 0): + """If 'key' is a CIDR value (either a Cidr object or a valid + CIDR string representation, do a find_supernets on the internal + CidrMemIndex, otherwise return None.""" + + c = Cidr.valid_cidr(key) + if c: return self.cidr_index.find_supernets(key, max) + return None + +class element: + """This is the base element class. 
It basically exists to + simplify sorting.""" + + def __init__(self, key, value): + self.key = key + self.value = value + + def __cmp__(self, other): + """Compare only on the key.""" + + if not type(self.key) == type(other.key): + print "other is incompatible type?", repr(other.key), other.key + if self.key < other.key: + return -1 + if self.key == other.key: + return 0 + return 1 + + def __str__(self): + return "<" + str(self.key) + ", " + str(self.value) + ">" + + def __repr__(self): + return "element" + str(self) + + def __hash__(self): + return self.key.__hash__() + + def equals(self, other, prefix_match=0): + if prefix_match: + return self.key == other.key[:len(self.key)] + return self.key == other.key + + def total_equals(self, other): + if not isinstance(other, type(self)): return False + return self.key == other.key and self.value == other.value + +if __name__ == "__main__": + + source = [ ("foo", "foo-id"), ("bar", "bar-id"), ("baz", "baz-id"), + ("foobar", "foo-id-2"), ("barnone", "bar-id-2"), + ("zygnax", "z-id") ] + + mi = MemIndex() + mi.addlist(source) + + print "finding foobar:" + res = mi.find("foobar") + print res + + print "finding foo*:" + res = mi.find("foo", 1) + print res + + print "finding baz:" + res = mi.find("baz") + print res + + print "adding bork" + mi.add("bork", "bork-id") + + print "finding b*:" + res = mi.find("b", 1) + print res + + ci = CidrMemIndex() + + ci.add(Cidr.Cidr("127.0.0.1/24"), "net-local-1"); + ci.add(Cidr.Cidr("127.0.0.1/32"), "net-local-2"); + ci.add(Cidr.Cidr("216.168.224.0", 22), "net-vrsn-1") + ci.add(Cidr.Cidr("216.168.252.1", 32), "net-vrsn-2") + ci.add(Cidr.Cidr("24.36.191.0/24"), "net-foo-c") + ci.add(Cidr.Cidr("24.36.191.32/27"), "net-foo-sub-c") + ci.add(Cidr.Cidr("24.36/16"), "net-foo-b") + + print "finding exactly 127.0.0.0/24" + res = ci.find(Cidr.Cidr("127.0.0.0/24")) + print res + + print "finding exactly 127.0.0.16/32" + res = ci.find(Cidr.Cidr("127.0.0.16/32")) + print res + + print "finding supernets of 127.0.0.16/32" + res = ci.find_supernets(Cidr.Cidr("127.0.0.16/32")) + print res + + print "finding supernets of 24.36.191.32/27" + res = ci.find(Cidr.Cidr("24.36.191.32/27"), 1) + print res + + print "finding supernets of 24.36.191.33/27" + res = ci.find_supernets(Cidr.Cidr("24.36.191.33/27")) + print res + + print "finding supernets of 24.36.191.64/27" + res = ci.find_supernets(Cidr.Cidr("24.36.191.64/27")) + print res + + print "finding subnets of 127.0/16" + res = ci.find_subnets(Cidr.Cidr("127.0/16")) + print res diff --git a/rwhoisd/QueryParser.py b/rwhoisd/QueryParser.py new file mode 100644 index 0000000..32fb47f --- /dev/null +++ b/rwhoisd/QueryParser.py @@ -0,0 +1,245 @@ + +# queryparser.db must be set to a DB class instance. +db = None + +# Define the Lexer for the RWhois query language + +tokens = ( + 'VALUE', + 'QUOTEDVALUE', + 'CLASS', + 'ATTR', + 'AND', + 'OR', + 'EQ', + 'NEQ' + ) + +# whitespace +t_ignore = ' \t' +# equality +t_EQ = r'=' +# inequality +t_NEQ = r'!=' + +# for now, quoted values must have the wildcards inside the quotes. +# I kind of wonder if anyone would ever notice this. 
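+# For example, '"*smith*"' is matched as a single quoted token, while in
+# '"smith"*' the trailing wildcard falls outside the quoted token.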
+t_QUOTEDVALUE = r'["\']\*?[^"*\n]+\*{0,2}["\']' + +def t_firstvalue(t): + r'^\*?[^\s"\'=*]+\*{0,2}' + + if db.is_objectclass(t.value): + t.type = 'CLASS' + else: + t.type = 'VALUE' + return t + +def t_VALUE(t): + r'\*?[^\s"\'=!*]+\*{0,2}' + + if t.value.upper() == 'AND': + t.type = 'AND' + t.value = t.value.upper() + return t + if t.value.upper() == 'OR': + t.type = 'OR' + t.value = t.value.upper() + return t + if db.is_attribute(t.value): + t.type = 'ATTR' + else: + t.type = 'VALUE' + return t + + +def t_error(t): + pass + # print "Illegal character '%r'" % t.value[0] + # t.type = 'ERR' + # t.skip(1) + +# initalize the lexer +import lex +lex.lex() + +# Define the parser for the query language + +# 'value' productions are simple strings +# 'querystr' productions are tuples (either 1 or 3 values) +# 'query' productions are Query objects +# 'total' productions are Query objects + +def p_total_class_query(t): + 'total : CLASS query' + + t[0] = t[2] + t[0].set_class(t[1]) + +def p_total_query(t): + 'total : query' + + t[0] = t[1] + + +def p_query_oper_querystr(t): + '''query : query AND querystr + | query OR querystr''' + + t[0] = t[1] + if t[2] == 'OR': + t[0].cur_clause = [ t[3] ] + t[0].clauses.append(t[0].cur_clause) + else: + t[0].cur_clause.append(t[3]) + +def p_query_querystr(t): + 'query : querystr' + + t[0] = Query() + t[0].cur_clause = [ t[1] ] + t[0].clauses.append(t[0].cur_clause) + +def p_querystr_attr_value(t): + '''querystr : ATTR EQ value + | ATTR NEQ value''' + + t[0] = (t[1], t[2], t[3]) + +def p_querystr_attr(t): + 'querystr : ATTR' + + t[0] = (None, '=', t[1]) + +def p_querystr_value(t): + 'querystr : value' + + t[0] = (None, '=', t[1]) + + +def p_value(t): + 'value : VALUE' + + t[1] = t[1].strip() + if t[1]: + t[0] = t[1] + +def p_quotedvalue(t): + 'value : QUOTEDVALUE' + + t[0] = t[1].strip('"') + + +def p_error(t): + # print "Syntax error at '%s:%s'" % (t.type, t.value) + raise yacc.YaccError, "Syntax error at %r" % t.value + + +import types +class Query: + """A representation of a parsed RWhois query.""" + + def __init__(self): + self.clauses = [] + self.cur_clause = None + self.objectclass = None + self.prepared = False + + def __str__(self): + self._prepare() + res = '' + for i in range(len(self.clauses)): + cl = self.clauses[i] + res += "clause %d:\n" % i + for item in cl: + res += " " + repr(item) + "\n" + return res + + def __repr__(self): + return "" + + def _prepare(self): + """Prepare the query for use. For now, this means propagating + an objectclass restriction to all query clauses.""" + + if self.prepared: return + if self.objectclass: + for c in self.clauses: + c.append(("class-name", "=", self.objectclass)) + + def clauses(self): + """Return the query clauses. This is a list of AND clauses, + which are, in turn, lists of query terms. Query terms are 3 + element tuples: (attr, op, value).""" + + return self.clauses + + def set_class(self, objectclass): + """Set the query-wide objectclass restriction.""" + + # note: we don't allow the code to set this more than once, + # because we would have to code the removal of the previous + # class restriction from the query clauses, and it just isn't + # worth it. Queries are built, used and thrown away. + assert not self.prepared + self.objectclass = objectclass + return + + +import yacc +import Rwhois + +def get_parser(): + """Return a parser instances. 
Parser objects should not be shared + amongst threads.""" + + return yacc.yacc() + +def parse(p, query): + """Parse a query, raising a RwhoisError in case of parse failure. + Returns a Query object.""" + + # before using any parser objects, the database backend must be + # set (and it shared by all parsers). + assert db + try: + return p.parse(query) + except (lex.LexError, yacc.YaccError): + raise Rwhois.RwhoisError, (350, "") + +if __name__ == "__main__": + import sys + import MemDB + + mydb = MemDB.MemDB() + + print "loading schema:", sys.argv[1] + mydb.init_schema(sys.argv[1]) + for data_file in sys.argv[2:]: + print "loading data file:", data_file + mydb.load_data(data_file) + mydb.index_data() + + db = mydb + qp = get_parser() + + for line in sys.stdin.readlines(): + line = line.rstrip('\n') + line = line.strip() + if not line: continue + print 'inputting:', `line` + try: + res = qp.parse(line) + print repr(res) + except (lex.LexError, yacc.YaccError), x: + print "parse error occurred:", x + print "query:", line + + +# lex.input(line) +# while 1: +# tok = lex.token() +# if not tok: break +# print tok + + diff --git a/rwhoisd/QueryProcessor.py b/rwhoisd/QueryProcessor.py new file mode 100644 index 0000000..4e9c8d2 --- /dev/null +++ b/rwhoisd/QueryProcessor.py @@ -0,0 +1,285 @@ +import sys +import Cidr, Rwhois, QueryParser + +class QueryProcessor: + + def __init__(self, db): + self.db = db + + def _filter_obj_term(self, obj, term): + """Given a rwhoisobject and a query term (a 3 element tuple: + attr, operator, value), determine if the object satisfies the + term. Returns True if the object matches the term, False if + not.""" + + attr, op, searchval = term + res = False + + # filter by named attribute + if attr: + vals = obj.get_attr(attr) + if not vals: + res = False + else: + res = match_values(searchval, vals) + if op == "!=": return not res + return res + # filter by general term + else: + for val in obj.values(): + if match_value(searchval, val): + return True + return False + + def _filter_obj(self, obj, terms): + """Given a rwhoisobject and a list of query terms (i.e., a + whole AND clause), return True if the object satisfies the + terms.""" + + for term in terms: + if not self._filter_obj_term(obj, term): return False + return True + + def _filter_results(self, reslist, terms): + """Given list of result objects (not simply the ids returned + from the search) and a list of query terms (i.e., a query + clause), remove elements that do not satisfy the terms. + Returns a list of objects that satisfy the filters.""" + + if not terms: return reslist + return [ x for x in reslist if self._filter_obj(x, terms) ] + + def process_query_clause(self, clause, max=0): + """Process a query clause (a grouping of terms ANDed + together). This is where the indexed searches actually get + done. The technique used here is to search on one index and + use the rest of the clause to filter the results. Returns a + QueryResult object""" + + # the technique is to do an index search on the first (or + # maybe best) indexed term (bare terms are always considered + # indexed), and filter those results with the remaining terms. + + # Note: this could be better if we found the "optimal" query + # term. One approach may be to create a cost function and + # search for the minimum cost term. + + # Note: another approach might be to actually do indexed + # searches on all applicable terms (bare or using an indexed + # attribute) and find the intersection of the results. 
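+        # For example, given the clause [("domain-name", "=", "a.com"),
+        # (None, "=", "*doe*")], and assuming domain-name is an indexed
+        # attribute, the index search runs on the first term and the
+        # remaining term is applied as a filter over the fetched objects.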
+ + # FIXME: need to put in the referral chasing logic here, I + # think. + + st = None + sti = 0 + + # find the first searchable term: + for term, i in zip(clause, xrange(sys.maxint)): + attr, op, value = term + if op == "!=": continue + if not attr or self.db.is_indexed_attr(attr): + st, sti = term, i + break + if not st: + raise Rwhois.RwhoisError, (351, "No indexed terms in query clause") + + # remove the search term from the clause, what remains is the + # filter. + del clause[sti] + + # if we have an attribute name, search on that. + if st[0]: + res = self.db.search_attr(st[0], st[2], max) + else: + if Cidr.valid_cidr(st[2].strip("*")): + res = self.db.search_cidr(st[2], max) + else: + res = self.db.search_normal(st[2], max) + + objs = self._filter_results(self.db.fetch_objects(res.list()), clause) + + return QueryResult(objs) + + def process_full_query(self, query, max=0): + """Given a parsed query object, process it by unioning the + results of the various ORed together clauses""" + + # shortcut for the very common single clause case: + if len(query.clauses) == 1: + res = self.process_query_clause(query.clauses[0]) + return res + + res = QueryResult() + for clause in query.clauses: + res.extend(self.process_query_clause(clause)) + if max and len(res) >= max: + res.truncate(max) + break + + return res + + def process_query(self, session, queryline): + """Given a session config and a query line, parse the query, + perform any searches, return any referrals.""" + + if not session.queryparser: + session.queryparser = QueryParser.get_parser() + + # parse the query + try: + query = QueryParser.parse(session.queryparser, queryline) + except Rwhois.RwhoisError, x: + session.wfile.write(Rwhois.error_message(x)) + return + + max = session.limit + if max: max += 1 + + query_result = self.process_full_query(query, max) + + objects = query_result.objects() + referrals = query_result.referrals() + + if not objects and not referrals: + session.wfile.write(Rwhois.error_message(230)) + # session.wfile.write("\r\n") + return + + for obj in objects: + session.wfile.write(obj.to_wire_str()) + session.wfile.write("\r\n") + + if referrals: + session.wfile.write("\r\n".join(referrals)) + session.wfile.write("\r\n") + + if session.limit and len(objects) > session.limit: + session.wfile.write(330) + else: + session.wfile.write(Rwhois.ok()) + +class QueryResult: + + def __init__(self, objs=[], referrals=[]): + self.data = objs + self.ids = [ x.getid() for x in objs ] + self._dict = dict(zip(self.ids, self.ids)) + self.refs = referrals + + def extend(self, list): + if isinstance(list, type(self)): + list = list.objects() + new_objs = [ x for x in list if not self._dict.has_key(x.getid()) ] + new_ids = [ x.getid() for x in new_objs ] + self.data.extend(new_objs) + self.ids.extend(new_ids) + self._dict.update(dict(zip(new_ids, new_ids))) + + def add_referrals(self, referrals): + self.refs.extend(referrals) + + def objects(self): + return self.data + + def referrals(self): + return self.refs + + def ids(self): + return self.ids + + def truncate(self, n=0): + to_del = self.ids[n:] + for i in to_del: del self._dict[i] + self.ids = self.ids[:n] + self.data = self.data[:n] + + +def match_value(searchval, val): + """Determine if a search value matches a data value. If both + matching terms are valid CIDR objects, then they are matched + according the CIDR wildcard rules (i.e., a single trailing * is a + supernet search, ** is a subnet search). 
If the search value is + not wildcarded, then they are just tested for numeric equality. + Otherwise, the terms are compared using string semantics + (substring, prefix, suffix, and exact match.""" + + if match_cidr(searchval, val): return True + + # normalize the values for comparison. + searchval = searchval.lower() + val = val.lower() + + # the substring case + if searchval.startswith("*") and searchval.endswith("*"): + sv = searchval.strip("*"); + if val.find(sv) >= 0: + return True + else: + return False + # the suffix case + elif searchval.startswith("*"): + sv = searchval.lstrip("*") + return val.endswith(sv) + # the prefix case + elif searchval.endswith("*"): + sv = searchval.rstrip("*") + return val.startswith(sv) + # the exact match case + else: + return searchval == val + +def match_values(searchval, val_list): + + for val in val_list: + if match_value(searchval, val): return True + return False + +def match_cidr(searchval, val): + """If both terms are valid CIDR values (minus any trailing + wildcards of the search value), compare according the CIDR + wildcard rules: subnet, supernet, and exact match. If both terms + are not CIDR address, return False.""" + + + sv = Cidr.valid_cidr(searchval.rstrip("*")) + rv = Cidr.valid_cidr(val) + + if not sv or not rv: return False + + if (searchval.endswith("**")): + return rv.is_subnet(sv) + elif (searchval.endswith("*")): + return rv.is_supernet(sv) + else: + return rv == sv + + +if __name__ == '__main__': + + import MemDB, Session + + db = MemDB.MemDB() + + print "loading schema:", sys.argv[1] + db.init_schema(sys.argv[1]) + for data_file in sys.argv[2:]: + print "loading data file:", data_file + db.load_data(data_file) + db.index_data() + + QueryParser.db = db + processor = QueryProcessor(db) + + session = Session.Context() + session.wfile = sys.stdout + + while 1: + line = sys.stdin.readline().strip(); + if not line: break + if line.startswith("#"): continue + + print "parsing: '%s'" % line + processor.process_query(session, line) + session.wfile.write("\r\n"); + session.wfile.flush() diff --git a/rwhoisd/Rwhois.py b/rwhoisd/Rwhois.py new file mode 100644 index 0000000..54c37d1 --- /dev/null +++ b/rwhoisd/Rwhois.py @@ -0,0 +1,161 @@ +# This modules contains classes that are fairly general to RWhois +# server operation. + +class RwhoisError(Exception): + pass + +# The RWhois error codes. Most of these won't ever be used. +error_codes = { 120 : "Registration Deferred", + 130 : "Object Not Authoritative", + 230 : "No Objects Found", + 320 : "Invalid Attribute", + 321 : "Invalid Attribute Syntax", + 322 : "Required Attribute Missing", + 323 : "Object Reference Not Found", + 324 : "Primary Key Not Unique", + 325 : "Failed to Update Stale Object", + 330 : "Exceeded Response Limit", + 331 : "Invalid Limit", + 332 : "Nothing To Transfer", + 333 : "Not Master for Authority Area", + 336 : "Object Not Found", + 338 : "Invalid Directive Syntax", + 340 : "Invalid Authority Area", + 341 : "Invalid Class", + 342 : "Invalid Host/Port", + 350 : "Invalid Query Syntax", + 351 : "Query Too Complex", + 352 : "Invalid Security Method", + 353 : "Authentication Failed", + 354 : "Encryption Failed", + 360 : "Corrupt Data. 
Keyadd Failed", + 400 : "Directive Not Available", + 401 : "Not Authorized For Directive", + 402 : "Unidentified Error", + 420 : "Registration Not Authorized", + 436 : "Invalid Display Format", + 500 : "Memory Allocation Problem", + 501 : "Service Not Available", + 502 : "Unrecoverable Error", + 503 : "Idle Time Exceeded", + 560 : "" + } + +def error_message(value): + try: + code, msg = value + code = int(code) + except (TypeError, ValueError): + try: + code = int(value) + msg = None + except ValueError: + msg = value + code = 402 + if msg: + return "%%error %d %s: %s\r\n" % \ + (code, error_codes.get(code, 402), msg) + else: + return "%%error %d %s\r\n" % (code, error_codes.get(code, 402)) + +def ok(): + return "%ok\r\n" + +class rwhoisobject: + """This is the standard class for RWhois data objects.""" + + def __init__(self): + self.data = {} + self.attr_order = [] + + def get_attr(self, attr, default=None): + """This returns a list of values associated with a particular + attribute. The default value, if supplied, must be a single + (non-sequence) value.""" + + if default: + return self.data.get(attr.strip().lower(), [default]) + return self.data.get(attr.strip().lower(), []) + + def get_attr_value(self, attr, default=None): + """This returns a single value associated with the attribute. + If the attribute has multiple values, the first is + returned.""" + + return self.data.get(attr.strip().lower(), [default])[0] + + def getid(self): + """Return the RWhois ID of this object.""" + + return self.get_attr_value("id") + + def add_attr(self, attr, value): + """Adds an attribute to the object.""" + + attr = attr.strip().lower() + if self.data.has_key(attr): self.data[attr].append(value) + else: + self.attr_order.append(attr) + self.data.setdefault(attr, []).append(value) + + def add_attrs(self, attr_list): + """Adds a list of (attribute, value) tuples to the object.""" + for attr, value in attr_list: + self.add_attr(attr, value) + + def items(self): + """Returns the list of (attribute, value) tuples (actually 2 + elements lists). Attributes with multiple values produce + multiple tuples. 
The items are returned in the same order + they were added to the object.""" + + return [ [x, y] for x in self.attr_order for y in self.data[x] ] + + def values(self): + """Return the list of values in this object.""" + + return [ x for y in self.data.values() for x in y ] + + def __str__(self): + """A convenient string representation of this object""" + return '\n'.join([':'.join(x) for x in self.items()]) + + def __repr__(self): + return "" + + def attrs_to_wire_str(self, attrs, prefix=None): + """Return specific attributes in a response formatted string + (classname:attr:value)""" + + cn = self.get_attr_value("class-name", "unknown-class") + items = [ [cn, x, y] for x in attrs for y in self.data[x] ] + + if prefix: + res = '\r\n'.join([ prefix + ':'.join(x) for x in items ]) + else: + res = '\r\n'.join([ ':'.join(x) for x in items ]) + + return res; + + def to_wire_str(self, prefix=None): + """Return the response formatted string (classname:attr:value)""" + + return self.attrs_to_wire_str(self.attr_order, prefix) + + + +## A basic test driver +if __name__ == '__main__': + + obj = rwhoisobject() + obj.add_attr('id', '001') + obj.add_attr("class-name", 'contact') + obj.add_attr("class-name", "foo") + obj.add_attr('name', 'Aiden Quinn') + obj.add_attr('email', 'aquin@yahoo.com') + obj.add_attr('org-name', 'YoYoDyne Inc.') + obj.add_attr('email', 'aq@aol.net') + obj.add_attr('First-Name', 'Aiden ') + + print "obj:\n", obj + print "wire:\n", obj.to_wire_str() diff --git a/rwhoisd/RwhoisServer.py b/rwhoisd/RwhoisServer.py new file mode 100644 index 0000000..4b91d5f --- /dev/null +++ b/rwhoisd/RwhoisServer.py @@ -0,0 +1,130 @@ +#! /usr/bin/python + +import sys, socket, SocketServer + +import config, Session +import QueryParser, QueryProcessor, DirectiveProcessor, Rwhois + + +# server-wide variables + +query_processor = None +directive_processor = None + +class RwhoisTCPServer(SocketServer.ThreadingTCPServer): + def __init__(self, server_address, RequestHandlerClass): + self.allow_reuse_address = True + SocketServer.TCPServer.__init__(self, server_address, + RequestHandlerClass) + + def verify_request(self, request, client_address): + # implement access control here + return True + +class RwhoisHandler(SocketServer.StreamRequestHandler): + + def readline(self): + try: + return self.rfile.readline().strip()[:1024] + except KeyboardInterrupt: + self.quit_flag = True + return + + def handle(self): + + self.quit_flag = False + + # output a banner + self.wfile.write(config.banner_string); + self.wfile.write("\r\n"); + + # get a session. 
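+        # Each connection gets its own Session.Context, which holds the
+        # per-connection state (rfile/wfile, response limit, holdconnect
+        # flag, and query parser).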
+ session = Session.Context() + session.rfile = self.rfile + session.wfile = self.wfile + + # first line + line = self.readline() + + while not self.rfile.closed: + if not line: + line = self.readline() + continue + + if line.startswith("-"): + self.handle_directive(session, line) + else: + self.handle_query(session, line) + if not session.holdconnect: + self.quit_flag = True + + self.wfile.flush() + + # check to see if we were asked to quit + if self.quit_flag: break + + # next line of input + line = self.readline() + + print "done with", self.client_address + + def handle_directive(self, session, line): + if (line.startswith("-quit")): + self.quit_flag = True + self.wfile.write(Rwhois.ok()) + return + directive_processor.process_directive(session, line) + + def handle_query(self, session, line): + query_processor.process_query(session, line) + + +def usage(pname): + print """usage: %s schema_file data_file [data_file ...]""" % pname + sys.exit(64) + +def init(argv): + import MemDB + + if len(argv) < 3: usage(argv[0]) + schema_file = argv[1] + data_files = argv[2:] + + db = MemDB.MemDB() + + db.init_schema(schema_file) + for df in data_files: + db.load_data(df) + db.index_data() + + QueryParser.db = db + + global query_processor, directive_processor + + query_processor = QueryProcessor.QueryProcessor(db) + directive_processor = DirectiveProcessor.DirectiveProcessor(db) + +def serve(): + # initialize the TCP server + server = RwhoisTCPServer((config.server_address, config.port), + RwhoisHandler) + + # and handle incoming connections + try: + if not config.server_address: + print "listening on port %d" % config.port + else: + print "listening on %s port %d" % \ + (config.server_address, config.port) + server.serve_forever() + except (KeyboardInterrupt, SystemExit): + print "interrupted. exiting." + + print "finished serving" + sys.exit(0) + +if __name__ == "__main__": + + init(sys.argv) + serve() + diff --git a/rwhoisd/Session.py b/rwhoisd/Session.py new file mode 100644 index 0000000..24924af --- /dev/null +++ b/rwhoisd/Session.py @@ -0,0 +1,15 @@ +import config + +class Context: + """This class is used to hold session specific variables.""" + def __init__(self): + # each session gets its own query parser. + self.queryparser = None + + # these should be set by the handler + rfile = None + wfile = None + + # set some default values. + self.limit = config.default_limit + self.holdconnect = False diff --git a/rwhoisd/__init__.py b/rwhoisd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rwhoisd/config.py b/rwhoisd/config.py new file mode 100644 index 0000000..0f2f60d --- /dev/null +++ b/rwhoisd/config.py @@ -0,0 +1,30 @@ +"""Server global variables should be set here.""" + +import socket + +##### Editable Configuration Options + +# the port to listen on. +port = 4321 +# the interface address to bind to. "" means INADDR_ANY. +server_address = "" + +# the hostname to advertise in the banner. +server_hostname = socket.getfqdn() + +# setting this here sets a default session response limit. 0 +# means no response limit. This should be between min_limit and +# max_limit defined below. +default_limit = 0 +# set this to the maximum value that the limit can be set to +max_limit = 256 +# set this to the minimum value that the limit can be set to +# if this is zero, you are allowing clients to disable query limits. 
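+# (e.g., with min_limit = 0 a client may send "-limit 0" to turn the
+# response limit off entirely)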
+min_limit = 0 + + +#### END Editable Configuration Options + +version = "0.1" +banner_string = "%%rwhois V-1.5 %s (pyrwhoisd %s)" % (server_hostname, version) + diff --git a/rwhoisd/lex.py b/rwhoisd/lex.py new file mode 100644 index 0000000..ca9999d --- /dev/null +++ b/rwhoisd/lex.py @@ -0,0 +1,681 @@ +#----------------------------------------------------------------------------- +# ply: lex.py +# +# Author: David M. Beazley (beazley@cs.uchicago.edu) +# Department of Computer Science +# University of Chicago +# Chicago, IL 60637 +# +# Copyright (C) 2001, David M. Beazley +# +# $Header: /home/davidb/src/cvs/python-rwhoisd/rwhoisd/lex.py,v 1.1 2003/04/22 02:19:07 davidb Exp $ +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# See the file COPYING for a complete copy of the LGPL. +# +# +# This module automatically constructs a lexical analysis module from regular +# expression rules defined in a user-defined module. The idea is essentially the same +# as that used in John Aycock's Spark framework, but the implementation works +# at the module level rather than requiring the use of classes. +# +# This module tries to provide an interface that is closely modeled after +# the traditional lex interface in Unix. It also differs from Spark +# in that: +# +# - It provides more extensive error checking and reporting if +# the user supplies a set of regular expressions that can't +# be compiled or if there is any other kind of a problem in +# the specification. +# +# - The interface is geared towards LALR(1) and LR(1) parser +# generators. That is tokens are generated one at a time +# rather than being generated in advanced all in one step. +# +# There are a few limitations of this module +# +# - The module interface makes it somewhat awkward to support more +# than one lexer at a time. Although somewhat inelegant from a +# design perspective, this is rarely a practical concern for +# most compiler projects. +# +# - The lexer requires that the entire input text be read into +# a string before scanning. I suppose that most machines have +# enough memory to make this a minor issues, but it makes +# the lexer somewhat difficult to use in interactive sessions +# or with streaming data. +# +#----------------------------------------------------------------------------- + +r""" +lex.py + +This module builds lex-like scanners based on regular expression rules. 
+To use the module, simply write a collection of regular expression rules +and actions like this: + +# lexer.py +import lex + +# Define a list of valid tokens +tokens = ( + 'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS' + ) + +# Define tokens as functions +def t_IDENTIFIER(t): + r' ([a-zA-Z_](\w|_)* ' + return t + +def t_NUMBER(t): + r' \d+ ' + return t + +# Some simple tokens with no actions +t_PLUS = r'\+' +t_MINUS = r'-' + +# Initialize the lexer +lex.lex() + +The tokens list is required and contains a complete list of all valid +token types that the lexer is allowed to produce. Token types are +restricted to be valid identifiers. This means that 'MINUS' is a valid +token type whereas '-' is not. + +Rules are defined by writing a function with a name of the form +t_rulename. Each rule must accept a single argument which is +a token object generated by the lexer. This token has the following +attributes: + + t.type = type string of the token. This is initially set to the + name of the rule without the leading t_ + t.value = The value of the lexeme. + t.lineno = The value of the line number where the token was encountered + +For example, the t_NUMBER() rule above might be called with the following: + + t.type = 'NUMBER' + t.value = '42' + t.lineno = 3 + +Each rule returns the token object it would like to supply to the +parser. In most cases, the token t is returned with few, if any +modifications. To discard a token for things like whitespace or +comments, simply return nothing. For instance: + +def t_whitespace(t): + r' \s+ ' + pass + +For faster lexing, you can also define this in terms of the ignore set like this: + +t_ignore = ' \t' + +The characters in this string are ignored by the lexer. Use of this feature can speed +up parsing significantly since scanning will immediately proceed to the next token. + +lex requires that the token returned by each rule has an attribute +t.type. Other than this, rules are free to return any kind of token +object that they wish and may construct a new type of token object +from the attributes of t (provided the new object has the required +type attribute). + +If illegal characters are encountered, the scanner executes the +function t_error(t) where t is a token representing the rest of the +string that hasn't been matched. If this function isn't defined, a +LexError exception is raised. The .text attribute of this exception +object contains the part of the string that wasn't matched. + +The t.skip(n) method can be used to skip ahead n characters in the +input stream. This is usually only used in the error handling rule. +For instance, the following rule would print an error message and +continue: + +def t_error(t): + print "Illegal character in input %s" % t.value[0] + t.skip(1) + +Of course, a nice scanner might wish to skip more than one character +if the input looks very corrupted. + +The lex module defines a t.lineno attribute on each token that can be used +to track the current line number in the input. The value of this +variable is not modified by lex so it is up to your lexer module +to correctly update its value depending on the lexical properties +of the input language. To do this, you might write rules such as +the following: + +def t_newline(t): + r' \n+ ' + t.lineno += t.value.count("\n") + +To initialize your lexer so that it can be used, simply call the lex.lex() +function in your rule file. If there are any errors in your +specification, warning messages or an exception will be generated to +alert you to the problem. 
+ +(dave: this needs to be rewritten) +To use the newly constructed lexer from another module, simply do +this: + + import lex + import lexer + plex.input("position = initial + rate*60") + + while 1: + token = plex.token() # Get a token + if not token: break # No more tokens + ... do whatever ... + +Assuming that the module 'lexer' has initialized plex as shown +above, parsing modules can safely import 'plex' without having +to import the rule file or any additional imformation about the +scanner you have defined. +""" + +# ----------------------------------------------------------------------------- + + +__version__ = "1.3" + +import re, types, sys, copy + +# Exception thrown when invalid token encountered and no default +class LexError(Exception): + def __init__(self,message,s): + self.args = (message,) + self.text = s + +# Token class +class LexToken: + def __str__(self): + return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno) + def __repr__(self): + return str(self) + def skip(self,n): + try: + self._skipn += n + except AttributeError: + self._skipn = n + +# ----------------------------------------------------------------------------- +# Lexer class +# +# input() - Store a new string in the lexer +# token() - Get the next token +# ----------------------------------------------------------------------------- + +class Lexer: + def __init__(self): + self.lexre = None # Master regular expression + self.lexdata = None # Actual input data (as a string) + self.lexpos = 0 # Current position in input text + self.lexlen = 0 # Length of the input text + self.lexindexfunc = [ ] # Reverse mapping of groups to functions and types + self.lexerrorf = None # Error rule (if any) + self.lextokens = None # List of valid tokens + self.lexignore = None # Ignored characters + self.lineno = 1 # Current line number + self.debug = 0 # Debugging mode + self.optimize = 0 # Optimized mode + self.token = self.errtoken + + def __copy__(self): + c = Lexer() + c.lexre = self.lexre + c.lexdata = self.lexdata + c.lexpos = self.lexpos + c.lexlen = self.lexlen + c.lenindexfunc = self.lexindexfunc + c.lexerrorf = self.lexerrorf + c.lextokens = self.lextokens + c.lexignore = self.lexignore + c.lineno = self.lineno + c.optimize = self.optimize + c.token = c.realtoken + + # ------------------------------------------------------------ + # input() - Push a new string into the lexer + # ------------------------------------------------------------ + def input(self,s): + if not isinstance(s,types.StringType): + raise ValueError, "Expected a string" + self.lexdata = s + self.lexpos = 0 + self.lexlen = len(s) + self.token = self.realtoken + + # Change the token routine to point to realtoken() + global token + if token == self.errtoken: + token = self.token + + # ------------------------------------------------------------ + # errtoken() - Return error if token is called with no data + # ------------------------------------------------------------ + def errtoken(self): + raise RuntimeError, "No input string given with input()" + + # ------------------------------------------------------------ + # token() - Return the next token from the Lexer + # + # Note: This function has been carefully implemented to be as fast + # as possible. 
Don't make changes unless you really know what + # you are doing + # ------------------------------------------------------------ + def realtoken(self): + # Make local copies of frequently referenced attributes + lexpos = self.lexpos + lexlen = self.lexlen + lexignore = self.lexignore + lexdata = self.lexdata + + while lexpos < lexlen: + # This code provides some short-circuit code for whitespace, tabs, and other ignored characters + if lexdata[lexpos] in lexignore: + lexpos += 1 + continue + + # Look for a regular expression match + m = self.lexre.match(lexdata,lexpos) + if m: + i = m.lastindex + lexpos = m.end() + tok = LexToken() + tok.value = m.group() + tok.lineno = self.lineno + tok.lexer = self + func,tok.type = self.lexindexfunc[i] + if not func: + self.lexpos = lexpos + return tok + + # If token is processed by a function, call it + self.lexpos = lexpos + newtok = func(tok) + self.lineno = tok.lineno # Update line number + + # Every function must return a token, if nothing, we just move to next token + if not newtok: continue + + # Verify type of the token. If not in the token map, raise an error + if not self.optimize: + if not self.lextokens.has_key(newtok.type): + raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( + func.func_code.co_filename, func.func_code.co_firstlineno, + func.__name__, newtok.type),lexdata[lexpos:]) + + return newtok + + # No match. Call t_error() if defined. + if self.lexerrorf: + tok = LexToken() + tok.value = self.lexdata[lexpos:] + tok.lineno = self.lineno + tok.type = "error" + tok.lexer = self + oldpos = lexpos + newtok = self.lexerrorf(tok) + lexpos += getattr(tok,"_skipn",0) + if oldpos == lexpos: + # Error method didn't change text position at all. This is an error. + self.lexpos = lexpos + raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) + if not newtok: continue + self.lexpos = lexpos + return newtok + + self.lexpos = lexpos + raise LexError, ("No match found", lexdata[lexpos:]) + + # No more input data + self.lexpos = lexpos + 1 + return None + + +# ----------------------------------------------------------------------------- +# validate_file() +# +# This checks to see if there are duplicated t_rulename() functions or strings +# in the parser input file. This is done using a simple regular expression +# match on each line in the filename. +# ----------------------------------------------------------------------------- + +def validate_file(filename): + import os.path + base,ext = os.path.splitext(filename) + if ext != '.py': return 1 # No idea what the file is. Return OK + + try: + f = open(filename) + lines = f.readlines() + f.close() + except IOError: + return 1 # Oh well + + fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') + sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') + counthash = { } + linen = 1 + noerror = 1 + for l in lines: + m = fre.match(l) + if not m: + m = sre.match(l) + if m: + name = m.group(1) + prev = counthash.get(name) + if not prev: + counthash[name] = linen + else: + print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev) + noerror = 0 + linen += 1 + return noerror + +# ----------------------------------------------------------------------------- +# _read_lextab(module) +# +# Reads lexer table from a lextab file instead of using introspection. 
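+# The named module is expected to provide _lexre (the master pattern),
+# _lextab (the group-index to rule mapping), _lextokens, _lexignore and
+# _lexerrorf -- exactly what lex() writes out when called with optimize=1
+# and a lextab module name.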
+# ----------------------------------------------------------------------------- + +def _read_lextab(lexer, fdict, module): + exec "import %s as lextab" % module + lexer.lexre = re.compile(lextab._lexre, re.VERBOSE) + lexer.lexindexfunc = lextab._lextab + for i in range(len(lextab._lextab)): + t = lexer.lexindexfunc[i] + if t: + if t[0]: + lexer.lexindexfunc[i] = (fdict[t[0]],t[1]) + lexer.lextokens = lextab._lextokens + lexer.lexignore = lextab._lexignore + if lextab._lexerrorf: + lexer.lexerrorf = fdict[lextab._lexerrorf] + +# ----------------------------------------------------------------------------- +# lex(module) +# +# Build all of the regular expression rules from definitions in the supplied module +# ----------------------------------------------------------------------------- +def lex(module=None,debug=0,optimize=0,lextab="lextab"): + ldict = None + regex = "" + error = 0 + files = { } + lexer = Lexer() + lexer.debug = debug + lexer.optimize = optimize + global token,input + + if module: + if not isinstance(module, types.ModuleType): + raise ValueError,"Expected a module" + + ldict = module.__dict__ + + else: + # No module given. We might be able to get information from the caller. + try: + raise RuntimeError + except RuntimeError: + e,b,t = sys.exc_info() + f = t.tb_frame + f = f.f_back # Walk out to our calling function + ldict = f.f_globals # Grab its globals dictionary + + if optimize and lextab: + try: + _read_lextab(lexer,ldict, lextab) + if not lexer.lexignore: lexer.lexignore = "" + token = lexer.token + input = lexer.input + return lexer + + except ImportError: + pass + + # Get the tokens map + tokens = ldict.get("tokens",None) + if not tokens: + raise SyntaxError,"lex: module does not define 'tokens'" + if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): + raise SyntaxError,"lex: tokens must be a list or tuple." + + # Build a dictionary of valid token names + lexer.lextokens = { } + if not optimize: + + # Utility function for verifying tokens + def is_identifier(s): + for c in s: + if not (c.isalnum() or c == '_'): return 0 + return 1 + + for n in tokens: + if not is_identifier(n): + print "lex: Bad token name '%s'" % n + error = 1 + if lexer.lextokens.has_key(n): + print "lex: Warning. Token '%s' multiply defined." % n + lexer.lextokens[n] = None + else: + for n in tokens: lexer.lextokens[n] = None + + + if debug: + print "lex: tokens = '%s'" % lexer.lextokens.keys() + + # Get a list of symbols with the t_ prefix + tsymbols = [f for f in ldict.keys() if f[:2] == 't_'] + + # Now build up a list of functions and a list of strings + fsymbols = [ ] + ssymbols = [ ] + for f in tsymbols: + if isinstance(ldict[f],types.FunctionType): + fsymbols.append(ldict[f]) + elif isinstance(ldict[f],types.StringType): + ssymbols.append((f,ldict[f])) + else: + print "lex: %s not defined as a function or string" % f + error = 1 + + # Sort the functions by line number + fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) + + # Sort the strings by regular expression length + ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) + + # Check for non-empty symbols + if len(fsymbols) == 0 and len(ssymbols) == 0: + raise SyntaxError,"lex: no rules of the form t_rulename are defined." 
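+
+    # From here on, every rule is folded into one master regular expression
+    # with one named group per rule, (?P<t_rulename>pattern), compiled with
+    # re.VERBOSE.  Function rules keep their source-line order; string rules
+    # are tried after them, longest pattern first, so longer literals win.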
+ + # Add all of the rules defined with actions first + for f in fsymbols: + + line = f.func_code.co_firstlineno + file = f.func_code.co_filename + files[file] = None + + if not optimize: + if f.func_code.co_argcount > 1: + print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) + error = 1 + continue + + if f.func_code.co_argcount < 1: + print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) + error = 1 + continue + + if f.__name__ == 't_ignore': + print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) + error = 1 + continue + + if f.__name__ == 't_error': + lexer.lexerrorf = f + continue + + if f.__doc__: + if not optimize: + try: + c = re.compile(f.__doc__, re.VERBOSE) + except re.error,e: + print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) + error = 1 + continue + + if debug: + print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__) + + # Okay. The regular expression seemed okay. Let's append it to the master regular + # expression we're building + + if (regex): regex += "|" + regex += "(?P<%s>%s)" % (f.__name__,f.__doc__) + else: + print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) + + # Now add all of the simple rules + for name,r in ssymbols: + + if name == 't_ignore': + lexer.lexignore = r + continue + + if not optimize: + if name == 't_error': + raise SyntaxError,"lex: Rule 't_error' must be defined as a function" + error = 1 + continue + + if not lexer.lextokens.has_key(name[2:]): + print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:]) + error = 1 + continue + try: + c = re.compile(r,re.VERBOSE) + except re.error,e: + print "lex: Invalid regular expression for rule '%s'. %s" % (name,e) + error = 1 + continue + if debug: + print "lex: Adding rule %s -> '%s'" % (name,r) + + if regex: regex += "|" + regex += "(?P<%s>%s)" % (name,r) + + if not optimize: + for f in files.keys(): + if not validate_file(f): + error = 1 + try: + if debug: + print "lex: regex = '%s'" % regex + lexer.lexre = re.compile(regex, re.VERBOSE) + + # Build the index to function map for the matching engine + lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1) + for f,i in lexer.lexre.groupindex.items(): + handle = ldict[f] + if isinstance(handle,types.FunctionType): + lexer.lexindexfunc[i] = (handle,handle.__name__[2:]) + else: + # If rule was specified as a string, we build an anonymous + # callback function to carry out the action + lexer.lexindexfunc[i] = (None,f[2:]) + + # If a lextab was specified, we create a file containing the precomputed + # regular expression and index table + + if lextab and optimize: + lt = open(lextab+".py","w") + lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab) + lt.write("_lexre = %s\n" % repr(regex)) + lt.write("_lextab = [\n"); + for i in range(0,len(lexer.lexindexfunc)): + t = lexer.lexindexfunc[i] + if t: + if t[0]: + lt.write(" ('%s',%s),\n"% (t[0].__name__, repr(t[1]))) + else: + lt.write(" (None,%s),\n" % repr(t[1])) + else: + lt.write(" None,\n") + + lt.write("]\n"); + lt.write("_lextokens = %s\n" % repr(lexer.lextokens)) + lt.write("_lexignore = %s\n" % repr(lexer.lexignore)) + if (lexer.lexerrorf): + lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__)) + else: + lt.write("_lexerrorf = None\n") + lt.close() + + except re.error,e: + print "lex: Fatal error. Unable to compile regular expression rules. 
%s" % e + error = 1 + if error: + raise SyntaxError,"lex: Unable to build lexer." + if not lexer.lexerrorf: + print "lex: Warning. no t_error rule is defined." + + if not lexer.lexignore: lexer.lexignore = "" + + # Create global versions of the token() and input() functions + token = lexer.token + input = lexer.input + + return lexer + +# ----------------------------------------------------------------------------- +# run() +# +# This runs the lexer as a main program +# ----------------------------------------------------------------------------- + +def runmain(lexer=None,data=None): + if not data: + try: + filename = sys.argv[1] + f = open(filename) + data = f.read() + f.close() + except IndexError: + print "Reading from standard input (type EOF to end):" + data = sys.stdin.read() + + if lexer: + _input = lexer.input + else: + _input = input + _input(data) + if lexer: + _token = lexer.token + else: + _token = token + + while 1: + tok = _token() + if not tok: break + print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno) + + + + diff --git a/rwhoisd/parsetab.py b/rwhoisd/parsetab.py new file mode 100644 index 0000000..31d6031 --- /dev/null +++ b/rwhoisd/parsetab.py @@ -0,0 +1,37 @@ + +# parsetab.py +# This file is automatically generated. Do not edit. + +_lr_method = 'SLR' + +_lr_signature = '\xd4p\x02\xe6^\xeb\x1b\xa7\xf4\r\xb2\xf7!\xf32\x1d' + +_lr_action_items = {'AND':([13,1,6,2,15,17,16,14,3,5,4,],[11,-11,-5,-8,-7,-4,-3,-6,-10,11,-9,]),'QUOTEDVALUE':([8,10,12,0,9,11,],[1,1,1,1,1,1,]),'ATTR':([12,0,8,11,],[2,2,2,2,]),'CLASS':([0,],[8,]),'NEQ':([2,],[10,]),'EQ':([2,],[9,]),'OR':([1,15,13,6,4,2,14,5,17,16,3,],[-11,-7,12,-5,-9,-8,-6,12,-4,-3,-10,]),'VALUE':([11,9,12,10,8,0,],[3,3,3,3,3,3,]),'$':([17,14,2,5,3,7,13,16,15,1,4,6,],[-4,-6,-8,-2,-10,0,-1,-3,-7,-11,-9,-5,]),} + +_lr_action = { } +for _k, _v in _lr_action_items.items(): + for _x,_y in zip(_v[0],_v[1]): + _lr_action[(_x,_k)] = _y +del _lr_action_items + +_lr_goto_items = {'querystr':([0,11,12,8,],[6,16,17,6,]),'total':([0,],[7,]),'value':([12,11,10,8,9,0,],[4,4,15,4,14,4,]),'query':([8,0,],[13,5,]),} + +_lr_goto = { } +for _k, _v in _lr_goto_items.items(): + for _x,_y in zip(_v[0],_v[1]): + _lr_goto[(_x,_k)] = _y +del _lr_goto_items +_lr_productions = [ + ("S'",1,None,None,None), + ('total',2,'p_total_class_query','QueryParser.py',74), + ('total',1,'p_total_query','QueryParser.py',80), + ('query',3,'p_query_oper_querystr','QueryParser.py',86), + ('query',3,'p_query_oper_querystr','QueryParser.py',87), + ('query',1,'p_query_querystr','QueryParser.py',97), + ('querystr',3,'p_querystr_attr_value','QueryParser.py',104), + ('querystr',3,'p_querystr_attr_value','QueryParser.py',105), + ('querystr',1,'p_querystr_attr','QueryParser.py',110), + ('querystr',1,'p_querystr_value','QueryParser.py',115), + ('value',1,'p_value','QueryParser.py',121), + ('value',1,'p_quotedvalue','QueryParser.py',128), +] diff --git a/rwhoisd/yacc.py b/rwhoisd/yacc.py new file mode 100644 index 0000000..b79d95f --- /dev/null +++ b/rwhoisd/yacc.py @@ -0,0 +1,1844 @@ +#----------------------------------------------------------------------------- +# ply: yacc.py +# +# Author: David M. Beazley (beazley@cs.uchicago.edu) +# Department of Computer Science +# University of Chicago +# Chicago, IL 60637 +# +# Copyright (C) 2001, David M. 
Beazley +# +# $Header: /home/davidb/src/cvs/python-rwhoisd/rwhoisd/yacc.py,v 1.1 2003/04/22 02:19:07 davidb Exp $ +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# See the file COPYING for a complete copy of the LGPL. +# +# +# This implements an LR parser that is constructed from grammar rules defined +# as Python functions. Roughly speaking, this module is a cross between +# John Aycock's Spark system and the GNU bison utility. +# +# Disclaimer: This is a work in progress. SLR parsing seems to work fairly +# well and there is extensive error checking. LALR(1) is in progress. The +# rest of this file is a bit of a mess. Please pardon the dust. +# +# The current implementation is only somewhat object-oriented. The +# LR parser itself is defined in terms of an object (which allows multiple +# parsers to co-exist). However, most of the variables used during table +# construction are defined in terms of global variables. Users shouldn't +# notice unless they are trying to define multiple parsers at the same +# time using threads (in which case they should have their head examined). +#----------------------------------------------------------------------------- + +__version__ = "1.3" + +#----------------------------------------------------------------------------- +# === User configurable parameters === +# +# Change these to modify the default behavior of yacc (if you wish) +#----------------------------------------------------------------------------- + +yaccdebug = 1 # Debugging mode. If set, yacc generates a + # a 'parser.out' file in the current directory + +debug_file = 'parser.out' # Default name of the debugging file +tab_module = 'parsetab' # Default name of the table module +default_lr = 'SLR' # Default LR table generation method + +error_count = 3 # Number of symbols that must be shifted to leave recovery mode + +import re, types, sys, cStringIO, md5, os.path + +# Exception raised for yacc-related errors +class YaccError(Exception): pass + +#----------------------------------------------------------------------------- +# === LR Parsing Engine === +# +# The following classes are used for the LR parser itself. These are not +# used during table construction and are independent of the actual LR +# table generation algorithm +#----------------------------------------------------------------------------- + +# This class is used to hold non-terminal grammar symbols during parsing. +# It normally has the following attributes set: +# .type = Grammar symbol type +# .value = Symbol value +# .lineno = Starting line number +# .endlineno = Ending line number (optional, set automatically) + +class YaccSymbol: + def __str__(self): return self.type + def __repr__(self): return str(self) + +# This class is a wrapper around the objects actually passed to each +# grammar rule. 
Index lookup and assignment actually assign the +# .value attribute of the underlying YaccSymbol object. +# The lineno() method returns the line number of a given +# item (or 0 if not defined). The linespan() method returns +# a tuple of (startline,endline) representing the range of lines +# for a symbol. + +class YaccSlice: + def __init__(self,s): + self.slice = s + self.pbstack = [] + + def __getitem__(self,n): + return self.slice[n].value + + def __setitem__(self,n,v): + self.slice[n].value = v + + def lineno(self,n): + return getattr(self.slice[n],"lineno",0) + + def linespan(self,n): + startline = getattr(self.slice[n],"lineno",0) + endline = getattr(self.slice[n],"endlineno",startline) + return startline,endline + + def pushback(self,n): + if n <= 0: + raise ValueError, "Expected a positive value" + if n > (len(self.slice)-1): + raise ValueError, "Can't push %d tokens. Only %d are available." % (n,len(self.slice)-1) + for i in range(0,n): + self.pbstack.append(self.slice[-i-1]) + +# The LR Parsing engine. This is defined as a class so that multiple parsers +# can exist in the same process. A user never instantiates this directly. +# Instead, the global yacc() function should be used to create a suitable Parser +# object. + +class Parser: + def __init__(self,magic=None): + + # This is a hack to keep users from trying to instantiate a Parser + # object directly. + + if magic != "xyzzy": + raise YaccError, "Can't instantiate Parser. Use yacc() instead." + + # Reset internal state + self.productions = None # List of productions + self.errorfunc = None # Error handling function + self.action = { } # LR Action table + self.goto = { } # LR goto table + self.require = { } # Attribute require table + self.method = "Unknown LR" # Table construction method used + + def errok(self): + self.errorcount = 0 + + def restart(self): + del self.statestack[:] + del self.symstack[:] + sym = YaccSymbol() + sym.type = '$' + self.symstack.append(sym) + self.statestack.append(0) + + def parse(self,input=None,lexer=None,debug=0): + lookahead = None # Current lookahead symbol + lookaheadstack = [ ] # Stack of lookahead symbols + actions = self.action # Local reference to action table + goto = self.goto # Local reference to goto table + prod = self.productions # Local reference to production list + pslice = YaccSlice(None) # Slice object passed to grammar rules + pslice.parser = self # Parser object + self.errorcount = 0 # Used during error recovery + + # If no lexer was given, we will try to use the lex module + if not lexer: + import lex as lexer + + pslice.lexer = lexer + + # If input was supplied, pass to lexer + if input: + lexer.input(input) + + # Tokenize function + get_token = lexer.token + + statestack = [ ] # Stack of parsing states + self.statestack = statestack + symstack = [ ] # Stack of grammar symbols + self.symstack = symstack + + errtoken = None # Err token + + # The start state is assumed to be (0,$) + statestack.append(0) + sym = YaccSymbol() + sym.type = '$' + symstack.append(sym) + + while 1: + # Get the next symbol on the input. If a lookahead symbol + # is already set, we just use that. 
Otherwise, we'll pull + # the next token off of the lookaheadstack or from the lexer + if not lookahead: + if not lookaheadstack: + lookahead = get_token() # Get the next token + else: + lookahead = lookaheadstack.pop() + if not lookahead: + lookahead = YaccSymbol() + lookahead.type = '$' + if debug: + print "%-20s : %s" % (lookahead, [xx.type for xx in symstack]) + + # Check the action table + s = statestack[-1] + ltype = lookahead.type + t = actions.get((s,ltype),None) + + if t is not None: + if t > 0: + # shift a symbol on the stack + if ltype == '$': + # Error, end of input + print "yacc: Parse error. EOF" + return + statestack.append(t) + symstack.append(lookahead) + lookahead = None + + # Decrease error count on successful shift + if self.errorcount > 0: + self.errorcount -= 1 + + continue + + if t < 0: + # reduce a symbol on the stack, emit a production + p = prod[-t] + pname = p.name + plen = p.len + + # Get production function + sym = YaccSymbol() + sym.type = pname # Production name + sym.value = None + + if plen: + targ = symstack[-plen-1:] + targ[0] = sym + try: + sym.lineno = targ[1].lineno + sym.endlineno = getattr(targ[-1],"endlineno",targ[-1].lineno) + except AttributeError: + sym.lineno = 0 + del symstack[-plen:] + del statestack[-plen:] + else: + sym.lineno = 0 + targ = [ sym ] + pslice.slice = targ + pslice.pbstack = [] + # Call the grammar rule with our special slice object + p.func(pslice) + + # Validate attributes of the resulting value attribute +# if require: +# try: +# t0 = targ[0] +# r = Requires.get(t0.type,None) +# t0d = t0.__dict__ +# if r: +# for field in r: +# tn = t0 +# for fname in field: +# try: +# tf = tn.__dict__ +# tn = tf.get(fname) +# except StandardError: +# tn = None +# if not tn: +# print "%s:%d: Rule %s doesn't set required attribute '%s'" % \ +# (p.file,p.line,p.name,".".join(field)) +# except TypeError,LookupError: +# print "Bad requires directive " % r +# pass + + + # If there was a pushback, put that on the stack + if pslice.pbstack: + lookaheadstack.append(lookahead) + for _t in pslice.pbstack: + lookaheadstack.append(_t) + lookahead = None + + symstack.append(sym) + statestack.append(goto[statestack[-1],pname]) + continue + + if t == 0: + n = symstack[-1] + return getattr(n,"value",None) + + if t == None: + # We have some kind of parsing error here. To handle this, + # we are going to push the current token onto the tokenstack + # and replace it with an 'error' token. If there are any synchronization + # rules, they may catch it. + # + # In addition to pushing the error token, we call call the user defined p_error() + # function if this is the first syntax error. This function is only called + # if errorcount == 0. + + if not self.errorcount: + self.errorcount = error_count + errtoken = lookahead + if errtoken.type == '$': + errtoken = None # End of file! + if self.errorfunc: + global errok,token,restart + errok = self.errok # Set some special functions available in error recovery + token = get_token + restart = self.restart + tok = self.errorfunc(errtoken) + del errok, token, restart # Delete special functions + + if not self.errorcount: + # User must have done some kind of panic mode recovery on their own. 
The returned token + # is the next lookahead + lookahead = tok + errtoken = None + continue + else: + if errtoken: + if hasattr(errtoken,"lineno"): lineno = lookahead.lineno + else: lineno = 0 + if lineno: + print "yacc: Syntax error at line %d, token=%s" % (lineno, errtoken.type) + else: + print "yacc: Syntax error, token=%s" % errtoken.type + else: + print "yacc: Parse error in input. EOF" + return + + else: + self.errorcount = error_count + + # case 1: the statestack only has 1 entry on it. If we're in this state, the + # entire parse has been rolled back and we're completely hosed. The token is + # discarded and we just keep going. + + if len(statestack) <= 1 and lookahead.type != '$': + lookahead = None + errtoken = None + # Nuke the pushback stack + del lookaheadstack[:] + continue + + # case 2: the statestack has a couple of entries on it, but we're + # at the end of the file. nuke the top entry and generate an error token + + # Start nuking entries on the stack + if lookahead.type == '$': + # Whoa. We're really hosed here. Bail out + return + + if lookahead.type != 'error': + sym = symstack[-1] + if sym.type == 'error': + # Hmmm. Error is on top of stack, we'll just nuke input + # symbol and continue + lookahead = None + continue + t = YaccSymbol() + t.type = 'error' + if hasattr(lookahead,"lineno"): + t.lineno = lookahead.lineno + t.value = lookahead + lookaheadstack.append(lookahead) + lookahead = t + else: + symstack.pop() + statestack.pop() + + continue + + # Call an error function here + raise RuntimeError, "yacc: internal parser error!!!\n" + +# ----------------------------------------------------------------------------- +# === Parser Construction === +# +# The following functions and variables are used to implement the yacc() function +# itself. This is pretty hairy stuff involving lots of error checking, +# construction of LR items, kernels, and so forth. Although a lot of +# this work is done using global variables, the resulting Parser object +# is completely self contained--meaning that it is safe to repeatedly +# call yacc() with different grammars in the same application. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# validate_file() +# +# This function checks to see if there are duplicated p_rulename() functions +# in the parser module file. Without this function, it is really easy for +# users to make mistakes by cutting and pasting code fragments (and it's a real +# bugger to try and figure out why the resulting parser doesn't work). Therefore, +# we just do a little regular expression pattern matching of def statements +# to try and detect duplicates. +# ----------------------------------------------------------------------------- + +def validate_file(filename): + base,ext = os.path.splitext(filename) + if ext != '.py': return 1 # No idea. Assume it's okay. + + try: + f = open(filename) + lines = f.readlines() + f.close() + except IOError: + return 1 # Oh well + + # Match def p_funcname( + fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') + counthash = { } + linen = 1 + noerror = 1 + for l in lines: + m = fre.match(l) + if m: + name = m.group(1) + prev = counthash.get(name) + if not prev: + counthash[name] = linen + else: + print "%s:%d: Function %s redefined. 
Previously defined on line %d" % (filename,linen,name,prev) + noerror = 0 + linen += 1 + return noerror + +# This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. +def validate_dict(d): + for n,v in d.items(): + if n[0:2] == 'p_' and isinstance(v,types.FunctionType): continue + if n[0:2] == 't_': continue + + if n[0:2] == 'p_': + print "yacc: Warning. '%s' not defined as a function" % n + if isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: + try: + doc = v.__doc__.split(" ") + if doc[1] == ':': + print "%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix." % (v.func_code.co_filename, v.func_code.co_firstlineno,n) + except StandardError: + pass + +# ----------------------------------------------------------------------------- +# === GRAMMAR FUNCTIONS === +# +# The following global variables and functions are used to store, manipulate, +# and verify the grammar rules specified by the user. +# ----------------------------------------------------------------------------- + +# Initialize all of the global variables used during grammar construction +def initialize_vars(): + global Productions, Prodnames, Prodmap, Terminals + global Nonterminals, First, Follow, Precedence, LRitems + global Errorfunc, Signature, Requires + + Productions = [None] # A list of all of the productions. The first + # entry is always reserved for the purpose of + # building an augmented grammar + + Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all + # productions of that nonterminal. + + Prodmap = { } # A dictionary that is only used to detect duplicate + # productions. + + Terminals = { } # A dictionary mapping the names of terminal symbols to a + # list of the rules where they are used. + + Nonterminals = { } # A dictionary mapping names of nonterminals to a list + # of rule numbers where they are used. + + First = { } # A dictionary of precomputed FIRST(x) symbols + + Follow = { } # A dictionary of precomputed FOLLOW(x) symbols + + Precedence = { } # Precedence rules for each terminal. Contains tuples of the + # form ('right',level) or ('nonassoc', level) or ('left',level) + + LRitems = [ ] # A list of all LR items for the grammar. These are the + # productions with the "dot" like E -> E . PLUS E + + Errorfunc = None # User defined error handler + + Signature = md5.new() # Digital signature of the grammar rules, precedence + # and other information. Used to determined when a + # parsing table needs to be regenerated. + + Requires = { } # Requires list + + # File objects used when creating the parser.out debugging file + global _vf, _vfc + _vf = cStringIO.StringIO() + _vfc = cStringIO.StringIO() + +# ----------------------------------------------------------------------------- +# class Production: +# +# This class stores the raw information about a single production or grammar rule. +# It has a few required attributes: +# +# name - Name of the production (nonterminal) +# prod - A list of symbols making up its production +# number - Production number. +# +# In addition, a few additional attributes are used to help with debugging or +# optimization of table generation. +# +# file - File where production action is defined. +# lineno - Line number where action is defined +# func - Action function +# prec - Precedence level +# lr_next - Next LR item. Example, if we are ' E -> E . PLUS E' +# then lr_next refers to 'E -> E PLUS . E' +# lr_index - LR item index (location of the ".") in the prod list. 
+# len - Length of the production (number of symbols on right hand side) +# ----------------------------------------------------------------------------- + +class Production: + def __init__(self,**kw): + for k,v in kw.items(): + setattr(self,k,v) + self.lr_index = -1 + self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure + self.usyms = [ ] + + def __str__(self): + if self.prod: + s = "%s -> %s" % (self.name," ".join(self.prod)) + else: + s = "%s -> " % self.name + return s + + def __repr__(self): + return str(self) + + # Compute lr_items from the production + def lr_item(self,n): + if n > len(self.prod): return None + p = Production() + p.name = self.name + p.prod = list(self.prod) + p.number = self.number + p.lr_index = n + p.prod.insert(n,".") + p.prod = tuple(p.prod) + p.len = len(p.prod) + p.usyms = self.usyms + + # Precompute list of productions immediately following + try: + p.lrafter = Prodnames[p.prod[n+1]] + except (IndexError,KeyError),e: + p.lrafter = [] + try: + p.lrbefore = p.prod[n-1] + except IndexError: + p.lrbefore = None + + return p + +class MiniProduction: + pass + +# Utility function +def is_identifier(s): + for c in s: + if not (c.isalnum() or c == '_'): return 0 + return 1 + +# ----------------------------------------------------------------------------- +# add_production() +# +# Given an action function, this function assembles a production rule. +# The production rule is assumed to be found in the function's docstring. +# This rule has the general syntax: +# +# name1 ::= production1 +# | production2 +# | production3 +# ... +# | productionn +# name2 ::= production1 +# | production2 +# ... +# ----------------------------------------------------------------------------- + +def add_production(f,file,line,prodname,syms): + + if Terminals.has_key(prodname): + print "%s:%d: Illegal rule name '%s'. Already defined as a token." % (file,line,prodname) + return -1 + if prodname == 'error': + print "%s:%d: Illegal rule name '%s'. error is a reserved word." % (file,line,prodname) + return -1 + + if not is_identifier(prodname): + print "%s:%d: Illegal rule name '%s'" % (file,line,prodname) + return -1 + + for s in syms: + if not is_identifier(s) and s != '%prec': + print "%s:%d: Illegal name '%s' in rule '%s'" % (file,line,s, prodname) + return -1 + + # See if the rule is already in the rulemap + map = "%s -> %s" % (prodname,syms) + if Prodmap.has_key(map): + m = Prodmap[map] + print "%s:%d: Duplicate rule %s." % (file,line, m) + print "%s:%d: Previous definition at %s:%d" % (file,line, m.file, m.line) + return -1 + + p = Production() + p.name = prodname + p.prod = syms + p.file = file + p.line = line + p.func = f + p.number = len(Productions) + + + Productions.append(p) + Prodmap[map] = p + if not Nonterminals.has_key(prodname): + Nonterminals[prodname] = [ ] + + # Add all terminals to Terminals + i = 0 + while i < len(p.prod): + t = p.prod[i] + if t == '%prec': + try: + precname = p.prod[i+1] + except IndexError: + print "%s:%d: Syntax error. Nothing follows %%prec." % (p.file,p.line) + return -1 + + prec = Precedence.get(precname,None) + if not prec: + print "%s:%d: Nothing known about the precedence of '%s'" % (p.file,p.line,precname) + return -1 + else: + p.prec = prec + del p.prod[i] + del p.prod[i] + continue + + if Terminals.has_key(t): + Terminals[t].append(p.number) + # Is a terminal. 
We'll assign a precedence to p based on this + if not hasattr(p,"prec"): + p.prec = Precedence.get(t,('right',0)) + else: + if not Nonterminals.has_key(t): + Nonterminals[t] = [ ] + Nonterminals[t].append(p.number) + i += 1 + + if not hasattr(p,"prec"): + p.prec = ('right',0) + + # Set final length of productions + p.len = len(p.prod) + p.prod = tuple(p.prod) + + # Calculate unique syms in the production + p.usyms = [ ] + for s in p.prod: + if s not in p.usyms: + p.usyms.append(s) + + # Add to the global productions list + try: + Prodnames[p.name].append(p) + except KeyError: + Prodnames[p.name] = [ p ] + return 0 + +# Given a raw rule function, this function rips out its doc string +# and adds rules to the grammar + +def add_function(f): + line = f.func_code.co_firstlineno + file = f.func_code.co_filename + error = 0 + + if f.func_code.co_argcount > 1: + print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) + return -1 + + if f.func_code.co_argcount < 1: + print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) + return -1 + + if f.__doc__: + # Split the doc string into lines + pstrings = f.__doc__.splitlines() + lastp = None + dline = line + for ps in pstrings: + dline += 1 + p = ps.split() + if not p: continue + try: + if p[0] == '|': + # This is a continuation of a previous rule + if not lastp: + print "%s:%d: Misplaced '|'." % (file,dline) + return -1 + prodname = lastp + if len(p) > 1: + syms = p[1:] + else: + syms = [ ] + else: + prodname = p[0] + lastp = prodname + assign = p[1] + if len(p) > 2: + syms = p[2:] + else: + syms = [ ] + if assign != ':' and assign != '::=': + print "%s:%d: Syntax error. Expected ':'" % (file,dline) + return -1 + e = add_production(f,file,dline,prodname,syms) + error += e + except StandardError: + print "%s:%d: Syntax error in rule '%s'" % (file,dline,ps) + error -= 1 + else: + print "%s:%d: No documentation string specified in function '%s'" % (file,line,f.__name__) + return error + + +# Cycle checking code (Michael Dyck) + +def compute_reachable(): + ''' + Find each symbol that can be reached from the start symbol. + Print a warning for any nonterminals that can't be reached. + (Unused terminals have already had their warning.) + ''' + Reachable = { } + for s in Terminals.keys() + Nonterminals.keys(): + Reachable[s] = 0 + + mark_reachable_from( Productions[0].prod[0], Reachable ) + + for s in Nonterminals.keys(): + if not Reachable[s]: + print "yacc: Symbol '%s' is unreachable." % s + +def mark_reachable_from(s, Reachable): + ''' + Mark all symbols that are reachable from symbol s. + ''' + if Reachable[s]: + # We've already reached symbol s. + return + Reachable[s] = 1 + for p in Prodnames.get(s,[]): + for r in p.prod: + mark_reachable_from(r, Reachable) + +# ----------------------------------------------------------------------------- +# compute_terminates() +# +# This function looks at the various parsing rules and tries to detect +# infinite recursion cycles (grammar rules where there is no possible way +# to derive a string of only terminals). +# ----------------------------------------------------------------------------- +def compute_terminates(): + ''' + Raise an error for any symbols that don't terminate. 
+ ''' + Terminates = {} + + # Terminals: + for t in Terminals.keys(): + Terminates[t] = 1 + + Terminates['$'] = 1 + + # Nonterminals: + + # Initialize to false: + for n in Nonterminals.keys(): + Terminates[n] = 0 + + # Then propagate termination until no change: + while 1: + some_change = 0 + for (n,pl) in Prodnames.items(): + # Nonterminal n terminates iff any of its productions terminates. + for p in pl: + # Production p terminates iff all of its rhs symbols terminate. + for s in p.prod: + if not Terminates[s]: + # The symbol s does not terminate, + # so production p does not terminate. + p_terminates = 0 + break + else: + # didn't break from the loop, + # so every symbol s terminates + # so production p terminates. + p_terminates = 1 + + if p_terminates: + # symbol n terminates! + if not Terminates[n]: + Terminates[n] = 1 + some_change = 1 + # Don't need to consider any more productions for this n. + break + + if not some_change: + break + + some_error = 0 + for (s,terminates) in Terminates.items(): + if not terminates: + if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': + # s is used-but-not-defined, and we've already warned of that, + # so it would be overkill to say that it's also non-terminating. + pass + else: + print "yacc: Infinite recursion detected for symbol '%s'." % s + some_error = 1 + + return some_error + +# ----------------------------------------------------------------------------- +# verify_productions() +# +# This function examines all of the supplied rules to see if they seem valid. +# ----------------------------------------------------------------------------- +def verify_productions(cycle_check=1): + error = 0 + for p in Productions: + if not p: continue + + for s in p.prod: + if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': + print "%s:%d: Symbol '%s' used, but not defined as a token or a rule." % (p.file,p.line,s) + error = 1 + continue + + unused_tok = 0 + # Now verify all of the tokens + if yaccdebug: + _vf.write("Unused terminals:\n\n") + for s,v in Terminals.items(): + if s != 'error' and not v: + print "yacc: Warning. Token '%s' defined, but not used." % s + if yaccdebug: _vf.write(" %s\n"% s) + unused_tok += 1 + + # Print out all of the productions + if yaccdebug: + _vf.write("\nGrammar\n\n") + for i in range(1,len(Productions)): + _vf.write("Rule %-5d %s\n" % (i, Productions[i])) + + unused_prod = 0 + # Verify the use of all productions + for s,v in Nonterminals.items(): + if not v: + p = Prodnames[s][0] + print "%s:%d: Warning. Rule '%s' defined, but not used." % (p.file,p.line, s) + unused_prod += 1 + + + if unused_tok == 1: + print "yacc: Warning. There is 1 unused token." + if unused_tok > 1: + print "yacc: Warning. There are %d unused tokens." % unused_tok + + if unused_prod == 1: + print "yacc: Warning. There is 1 unused rule." + if unused_prod > 1: + print "yacc: Warning. There are %d unused rules." 
% unused_prod + + if yaccdebug: + _vf.write("\nTerminals, with rules where they appear\n\n") + ks = Terminals.keys() + ks.sort() + for k in ks: + _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]]))) + _vf.write("\nNonterminals, with rules where they appear\n\n") + ks = Nonterminals.keys() + ks.sort() + for k in ks: + _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]]))) + + if (cycle_check): + compute_reachable() + error += compute_terminates() +# error += check_cycles() + return error + +# ----------------------------------------------------------------------------- +# build_lritems() +# +# This function walks the list of productions and builds a complete set of the +# LR items. The LR items are stored in two ways: First, they are uniquely +# numbered and placed in the list _lritems. Second, a linked list of LR items +# is built for each production. For example: +# +# E -> E PLUS E +# +# Creates the list +# +# [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] +# ----------------------------------------------------------------------------- + +def build_lritems(): + for p in Productions: + lastlri = p + lri = p.lr_item(0) + i = 0 + while 1: + lri = p.lr_item(i) + lastlri.lr_next = lri + if not lri: break + lri.lr_num = len(LRitems) + LRitems.append(lri) + lastlri = lri + i += 1 + + # In order for the rest of the parser generator to work, we need to + # guarantee that no more lritems are generated. Therefore, we nuke + # the p.lr_item method. (Only used in debugging) + # Production.lr_item = None + +# ----------------------------------------------------------------------------- +# add_precedence() +# +# Given a list of precedence rules, add to the precedence table. +# ----------------------------------------------------------------------------- + +def add_precedence(plist): + plevel = 0 + error = 0 + for p in plist: + plevel += 1 + try: + prec = p[0] + terms = p[1:] + if prec != 'left' and prec != 'right' and prec != 'nonassoc': + print "yacc: Invalid precedence '%s'" % prec + return -1 + for t in terms: + if Precedence.has_key(t): + print "yacc: Precedence already specified for terminal '%s'" % t + error += 1 + continue + Precedence[t] = (prec,plevel) + except: + print "yacc: Invalid precedence table." + error += 1 + + return error + +# ----------------------------------------------------------------------------- +# augment_grammar() +# +# Compute the augmented grammar. This is just a rule S' -> start where start +# is the starting symbol. +# ----------------------------------------------------------------------------- + +def augment_grammar(start=None): + if not start: + start = Productions[1].name + Productions[0] = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None) + Productions[0].usyms = [ start ] + Nonterminals[start].append(0) + + +# ------------------------------------------------------------------------- +# first() +# +# Compute the value of FIRST1(beta) where beta is a tuple of symbols. +# +# During execution of compute_first1, the result may be incomplete. +# Afterward (e.g., when called from compute_follow()), it will be complete. +# ------------------------------------------------------------------------- +def first(beta): + + # We are computing First(x1,x2,x3,...,xn) + result = [ ] + for x in beta: + x_produces_empty = 0 + + # Add all the non- symbols of First[x] to the result. 
+ for f in First[x]: + if f == '': + x_produces_empty = 1 + else: + if f not in result: result.append(f) + + if x_produces_empty: + # We have to consider the next x in beta, + # i.e. stay in the loop. + pass + else: + # We don't have to consider any further symbols in beta. + break + else: + # There was no 'break' from the loop, + # so x_produces_empty was true for all x in beta, + # so beta produces empty as well. + result.append('') + + return result + + +# FOLLOW(x) +# Given a non-terminal. This function computes the set of all symbols +# that might follow it. Dragon book, p. 189. + +def compute_follow(start=None): + # Add '$' to the follow list of the start symbol + for k in Nonterminals.keys(): + Follow[k] = [ ] + + if not start: + start = Productions[1].name + + Follow[start] = [ '$' ] + + while 1: + didadd = 0 + for p in Productions[1:]: + # Here is the production set + for i in range(len(p.prod)): + B = p.prod[i] + if Nonterminals.has_key(B): + # Okay. We got a non-terminal in a production + fst = first(p.prod[i+1:]) + hasempty = 0 + for f in fst: + if f != '' and f not in Follow[B]: + Follow[B].append(f) + didadd = 1 + if f == '': + hasempty = 1 + if hasempty or i == (len(p.prod)-1): + # Add elements of follow(a) to follow(b) + for f in Follow[p.name]: + if f not in Follow[B]: + Follow[B].append(f) + didadd = 1 + if not didadd: break + + if 0 and yaccdebug: + _vf.write('\nFollow:\n') + for k in Nonterminals.keys(): + _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]]))) + +# ------------------------------------------------------------------------- +# compute_first1() +# +# Compute the value of FIRST1(X) for all symbols +# ------------------------------------------------------------------------- +def compute_first1(): + + # Terminals: + for t in Terminals.keys(): + First[t] = [t] + + First['$'] = ['$'] + First['#'] = ['#'] # what's this for? + + # Nonterminals: + + # Initialize to the empty set: + for n in Nonterminals.keys(): + First[n] = [] + + # Then propagate symbols until no change: + while 1: + some_change = 0 + for n in Nonterminals.keys(): + for p in Prodnames[n]: + for f in first(p.prod): + if f not in First[n]: + First[n].append( f ) + some_change = 1 + if not some_change: + break + + if 0 and yaccdebug: + _vf.write('\nFirst:\n') + for k in Nonterminals.keys(): + _vf.write("%-20s : %s\n" % + (k, " ".join([str(s) for s in First[k]]))) + +# ----------------------------------------------------------------------------- +# === SLR Generation === +# +# The following functions are used to construct SLR (Simple LR) parsing tables +# as described on p.221-229 of the dragon book. +# ----------------------------------------------------------------------------- + +# Global variables for the LR parsing engine +def lr_init_vars(): + global _lr_action, _lr_goto, _lr_method + global _lr_goto_cache + + _lr_action = { } # Action table + _lr_goto = { } # Goto table + _lr_method = "Unknown" # LR method used + _lr_goto_cache = { } + +# Compute the LR(0) closure operation on I, where I is a set of LR(0) items. +# prodlist is a list of productions. 
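+#
+# For example (with a made-up grammar), closing the single item
+# E -> . E PLUS E also pulls in E -> . NUMBER and every other production
+# for E with the dot at the start, while E -> E . PLUS E adds nothing,
+# because the symbol after the dot is a terminal.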
+ +_add_count = 0 # Counter used to detect cycles + +def lr0_closure(I): + global _add_count + + _add_count += 1 + prodlist = Productions + + # Add everything in I to J + J = I[:] + didadd = 1 + while didadd: + didadd = 0 + for j in J: + for x in j.lrafter: + if x.lr0_added == _add_count: continue + # Add B --> .G to J + J.append(x.lr_next) + x.lr0_added = _add_count + didadd = 1 + + return J + +# Compute the LR(0) goto function goto(I,X) where I is a set +# of LR(0) items and X is a grammar symbol. This function is written +# in a way that guarantees uniqueness of the generated goto sets +# (i.e. the same goto set will never be returned as two different Python +# objects). With uniqueness, we can later do fast set comparisons using +# id(obj) instead of element-wise comparison. + +def lr0_goto(I,x): + # First we look for a previously cached entry + g = _lr_goto_cache.get((id(I),x),None) + if g: return g + + # Now we generate the goto set in a way that guarantees uniqueness + # of the result + + s = _lr_goto_cache.get(x,None) + if not s: + s = { } + _lr_goto_cache[x] = s + + gs = [ ] + for p in I: + n = p.lr_next + if n and n.lrbefore == x: + s1 = s.get(id(n),None) + if not s1: + s1 = { } + s[id(n)] = s1 + gs.append(n) + s = s1 + g = s.get('$',None) + if not g: + if gs: + g = lr0_closure(gs) + s['$'] = g + else: + s['$'] = gs + _lr_goto_cache[(id(I),x)] = g + return g + +# Compute the kernel of a set of LR(0) items +def lr0_kernel(I): + KI = [ ] + for p in I: + if p.name == "S'" or p.lr_index > 0 or p.len == 0: + KI.append(p) + + return KI + +_lr0_cidhash = { } + +# Compute the LR(0) sets of item function +def lr0_items(): + + C = [ lr0_closure([Productions[0].lr_next]) ] + i = 0 + for I in C: + _lr0_cidhash[id(I)] = i + i += 1 + + # Loop over the items in C and each grammar symbols + i = 0 + while i < len(C): + I = C[i] + i += 1 + + # Collect all of the symbols that could possibly be in the goto(I,X) sets + asyms = { } + for ii in I: + for s in ii.usyms: + asyms[s] = None + + for x in asyms.keys(): + g = lr0_goto(I,x) + if not g: continue + if _lr0_cidhash.has_key(id(g)): continue + _lr0_cidhash[id(g)] = len(C) + C.append(g) + + return C + +# ----------------------------------------------------------------------------- +# slr_parse_table() +# +# This function constructs an SLR table. +# ----------------------------------------------------------------------------- +def slr_parse_table(): + global _lr_method + goto = _lr_goto # Goto array + action = _lr_action # Action array + actionp = { } # Action production array (temporary) + + _lr_method = "SLR" + + n_srconflict = 0 + n_rrconflict = 0 + + print "yacc: Generating SLR parsing table..." + if yaccdebug: + _vf.write("\n\nParsing method: SLR\n\n") + + # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items + # This determines the number of states + + C = lr0_items() + + # Build the parser table, state by state + st = 0 + for I in C: + # Loop over each production in I + actlist = [ ] # List of actions + + if yaccdebug: + _vf.write("\nstate %d\n\n" % st) + for p in I: + _vf.write(" (%d) %s\n" % (p.number, str(p))) + _vf.write("\n") + + for p in I: + try: + if p.prod[-1] == ".": + if p.name == "S'": + # Start symbol. Accept! + action[st,"$"] = 0 + actionp[st,"$"] = p + else: + # We are at the end of a production. Reduce! + for a in Follow[p.name]: + actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) + r = action.get((st,a),None) + if r is not None: + # Whoa. 
Have a shift/reduce or reduce/reduce conflict + if r > 0: + # Need to decide on shift or reduce here + # By default we favor shifting. Need to add + # some precedence rules here. + sprec,slevel = Productions[actionp[st,a].number].prec + rprec,rlevel = Precedence.get(a,('right',0)) + if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): + # We really need to reduce here. + action[st,a] = -p.number + actionp[st,a] = p + if not slevel and not rlevel: + _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) + _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) + n_srconflict += 1 + elif (slevel == rlevel) and (rprec == 'nonassoc'): + action[st,a] = None + else: + # Hmmm. Guess we'll keep the shift + if not slevel and not rlevel: + _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) + _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) + n_srconflict +=1 + elif r < 0: + # Reduce/reduce conflict. In this case, we favor the rule + # that was defined first in the grammar file + oldp = Productions[-r] + pp = Productions[p.number] + if oldp.line > pp.line: + action[st,a] = -p.number + actionp[st,a] = p + # print "Reduce/reduce conflict in state %d" % st + n_rrconflict += 1 + _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a])) + _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a])) + else: + print "Unknown conflict in state %d" % st + else: + action[st,a] = -p.number + actionp[st,a] = p + else: + i = p.lr_index + a = p.prod[i+1] # Get symbol right after the "." + if Terminals.has_key(a): + g = lr0_goto(I,a) + j = _lr0_cidhash.get(id(g),-1) + if j >= 0: + # We are in a shift state + actlist.append((a,p,"shift and go to state %d" % j)) + r = action.get((st,a),None) + if r is not None: + # Whoa have a shift/reduce or shift/shift conflict + if r > 0: + if r != j: + print "Shift/shift conflict in state %d" % st + elif r < 0: + # Do a precedence check. + # - if precedence of reduce rule is higher, we reduce. + # - if precedence of reduce is same and left assoc, we reduce. + # - otherwise we shift + rprec,rlevel = Productions[actionp[st,a].number].prec + sprec,slevel = Precedence.get(a,('right',0)) + if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): + # We decide to shift here... highest precedence to shift + action[st,a] = j + actionp[st,a] = p + if not slevel and not rlevel: + n_srconflict += 1 + _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) + _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) + elif (slevel == rlevel) and (rprec == 'nonassoc'): + action[st,a] = None + else: + # Hmmm. Guess we'll keep the reduce + if not slevel and not rlevel: + n_srconflict +=1 + _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) + _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) + + else: + print "Unknown conflict in state %d" % st + else: + action[st,a] = j + actionp[st,a] = p + + except StandardError,e: + raise YaccError, "Hosed in slr_parse_table", e + + # Print the actions associated with each terminal + if yaccdebug: + for a,p,m in actlist: + if action.has_key((st,a)): + if p is actionp[st,a]: + _vf.write(" %-15s %s\n" % (a,m)) + _vf.write("\n") + for a,p,m in actlist: + if action.has_key((st,a)): + if p is not actionp[st,a]: + _vf.write(" ! 
%-15s [ %s ]\n" % (a,m)) + + # Construct the goto table for this state + if yaccdebug: + _vf.write("\n") + nkeys = { } + for ii in I: + for s in ii.usyms: + if Nonterminals.has_key(s): + nkeys[s] = None + for n in nkeys.keys(): + g = lr0_goto(I,n) + j = _lr0_cidhash.get(id(g),-1) + if j >= 0: + goto[st,n] = j + if yaccdebug: + _vf.write(" %-15s shift and go to state %d\n" % (n,j)) + + st += 1 + + if n_srconflict == 1: + print "yacc: %d shift/reduce conflict" % n_srconflict + if n_srconflict > 1: + print "yacc: %d shift/reduce conflicts" % n_srconflict + if n_rrconflict == 1: + print "yacc: %d reduce/reduce conflict" % n_rrconflict + if n_rrconflict > 1: + print "yacc: %d reduce/reduce conflicts" % n_rrconflict + + +# ----------------------------------------------------------------------------- +# ==== LALR(1) Parsing ==== +# **** UNFINISHED! 6/16/01 +# ----------------------------------------------------------------------------- + + +# Compute the lr1_closure of a set I. I is a list of tuples (p,a) where +# p is a LR0 item and a is a terminal + +_lr1_add_count = 0 + +def lr1_closure(I): + global _lr1_add_count + + _lr1_add_count += 1 + + J = I[:] + + # Loop over items (p,a) in I. + ji = 0 + while ji < len(J): + p,a = J[ji] + # p = [ A -> alpha . B beta] + + # For each production B -> gamma + for B in p.lr1_after: + f = tuple(p.lr1_beta + (a,)) + + # For each terminal b in first(Beta a) + for b in first(f): + # Check if (B -> . gamma, b) is in J + # Only way this can happen is if the add count mismatches + pn = B.lr_next + if pn.lr_added.get(b,0) == _lr1_add_count: continue + pn.lr_added[b] = _lr1_add_count + J.append((pn,b)) + ji += 1 + + return J + +def lalr_parse_table(): + + # Compute some lr1 information about all of the productions + for p in LRitems: + try: + after = p.prod[p.lr_index + 1] + p.lr1_after = Prodnames[after] + p.lr1_beta = p.prod[p.lr_index + 2:] + except LookupError: + p.lr1_after = [ ] + p.lr1_beta = [ ] + p.lr_added = { } + + # Compute the LR(0) items + C = lr0_items() + CK = [] + for I in C: + CK.append(lr0_kernel(I)) + + print CK + +# ----------------------------------------------------------------------------- +# ==== LR Utility functions ==== +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# _lr_write_tables() +# +# This function writes the LR parsing tables to a file +# ----------------------------------------------------------------------------- + +def lr_write_tables(modulename=tab_module): + filename = modulename + ".py" + try: + f = open(filename,"w") + + f.write(""" +# %s +# This file is automatically generated. Do not edit. 
+ +_lr_method = %s + +_lr_signature = %s +""" % (filename, repr(_lr_method), repr(Signature.digest()))) + + # Change smaller to 0 to go back to original tables + smaller = 1 + + # Factor out names to try and make smaller + if smaller: + items = { } + + for k,v in _lr_action.items(): + i = items.get(k[1]) + if not i: + i = ([],[]) + items[k[1]] = i + i[0].append(k[0]) + i[1].append(v) + + f.write("\n_lr_action_items = {") + for k,v in items.items(): + f.write("%r:([" % k) + for i in v[0]: + f.write("%r," % i) + f.write("],[") + for i in v[1]: + f.write("%r," % i) + + f.write("]),") + f.write("}\n") + + f.write(""" +_lr_action = { } +for _k, _v in _lr_action_items.items(): + for _x,_y in zip(_v[0],_v[1]): + _lr_action[(_x,_k)] = _y +del _lr_action_items +""") + + else: + f.write("\n_lr_action = { "); + for k,v in _lr_action.items(): + f.write("(%r,%r):%r," % (k[0],k[1],v)) + f.write("}\n"); + + if smaller: + # Factor out names to try and make smaller + items = { } + + for k,v in _lr_goto.items(): + i = items.get(k[1]) + if not i: + i = ([],[]) + items[k[1]] = i + i[0].append(k[0]) + i[1].append(v) + + f.write("\n_lr_goto_items = {") + for k,v in items.items(): + f.write("%r:([" % k) + for i in v[0]: + f.write("%r," % i) + f.write("],[") + for i in v[1]: + f.write("%r," % i) + + f.write("]),") + f.write("}\n") + + f.write(""" +_lr_goto = { } +for _k, _v in _lr_goto_items.items(): + for _x,_y in zip(_v[0],_v[1]): + _lr_goto[(_x,_k)] = _y +del _lr_goto_items +""") + else: + f.write("\n_lr_goto = { "); + for k,v in _lr_goto.items(): + f.write("(%r,%r):%r," % (k[0],k[1],v)) + f.write("}\n"); + + # Write production table + f.write("_lr_productions = [\n") + for p in Productions: + if p: + if (p.func): + f.write(" (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line)) + else: + f.write(" (%r,%d,None,None,None),\n" % (p.name, p.len)) + else: + f.write(" None,\n") + f.write("]\n") + f.close() + + except IOError,e: + print "Unable to create '%s'" % filename + print e + return + +def lr_read_tables(module=tab_module,optimize=0): + global _lr_action, _lr_goto, _lr_productions, _lr_method + try: + exec "import %s as parsetab" % module + + if (optimize) or (Signature.digest() == parsetab._lr_signature): + _lr_action = parsetab._lr_action + _lr_goto = parsetab._lr_goto + _lr_productions = parsetab._lr_productions + _lr_method = parsetab._lr_method + return 1 + else: + return 0 + + except (ImportError,AttributeError): + return 0 + +# ----------------------------------------------------------------------------- +# yacc(module) +# +# Build the parser module +# ----------------------------------------------------------------------------- + +def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0): + global yaccdebug + yaccdebug = debug + + initialize_vars() + files = { } + error = 0 + + # Add starting symbol to signature + if start: + Signature.update(start) + + # Try to figure out what module we are working with + if module: + # User supplied a module object. + if not isinstance(module, types.ModuleType): + raise ValueError,"Expected a module" + + ldict = module.__dict__ + + else: + # No module given. We might be able to get information from the caller. 
+ # Throw an exception and unwind the traceback to get the globals + + try: + raise RuntimeError + except RuntimeError: + e,b,t = sys.exc_info() + f = t.tb_frame + f = f.f_back # Walk out to our calling function + ldict = f.f_globals # Grab its globals dictionary + + # If running in optimized mode. We're going to + + if (optimize and lr_read_tables(tabmodule,1)): + # Read parse table + del Productions[:] + for p in _lr_productions: + if not p: + Productions.append(None) + else: + m = MiniProduction() + m.name = p[0] + m.len = p[1] + m.file = p[3] + m.line = p[4] + if p[2]: + m.func = ldict[p[2]] + Productions.append(m) + + else: + # Get the tokens map + tokens = ldict.get("tokens",None) + + if not tokens: + raise YaccError,"module does not define a list 'tokens'" + if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): + raise YaccError,"tokens must be a list or tuple." + + # Check to see if a requires dictionary is defined. + requires = ldict.get("require",None) + if requires: + if not (isinstance(requires,types.DictType)): + raise YaccError,"require must be a dictionary." + + for r,v in requires.items(): + try: + if not (isinstance(v,types.ListType)): + raise TypeError + v1 = [x.split(".") for x in v] + Requires[r] = v1 + except StandardError: + print "Invalid specification for rule '%s' in require. Expected a list of strings" % r + + + # Build the dictionary of terminals. We a record a 0 in the + # dictionary to track whether or not a terminal is actually + # used in the grammar + + if 'error' in tokens: + print "yacc: Illegal token 'error'. Is a reserved word." + raise YaccError,"Illegal token name" + + for n in tokens: + if Terminals.has_key(n): + print "yacc: Warning. Token '%s' multiply defined." % n + Terminals[n] = [ ] + + Terminals['error'] = [ ] + + # Get the precedence map (if any) + prec = ldict.get("precedence",None) + if prec: + if not (isinstance(prec,types.ListType) or isinstance(prec,types.TupleType)): + raise YaccError,"precedence must be a list or tuple." + add_precedence(prec) + Signature.update(repr(prec)) + + for n in tokens: + if not Precedence.has_key(n): + Precedence[n] = ('right',0) # Default, right associative, 0 precedence + + # Look for error handler + ef = ldict.get('p_error',None) + if ef: + if not isinstance(ef,types.FunctionType): + raise YaccError,"'p_error' defined, but is not a function." + eline = ef.func_code.co_firstlineno + efile = ef.func_code.co_filename + files[efile] = None + + if (ef.func_code.co_argcount != 1): + raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline) + global Errorfunc + Errorfunc = ef + else: + print "yacc: Warning. no p_error() function is defined." + + # Get the list of built-in functions with p_ prefix + symbols = [ldict[f] for f in ldict.keys() + if (isinstance(ldict[f],types.FunctionType) and ldict[f].__name__[:2] == 'p_' + and ldict[f].__name__ != 'p_error')] + + # Check for non-empty symbols + if len(symbols) == 0: + raise YaccError,"no rules of the form p_rulename are defined." + + # Sort the symbols by line number + symbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) + + # Add all of the symbols to the grammar + for f in symbols: + if (add_function(f)) < 0: + error += 1 + else: + files[f.func_code.co_filename] = None + + # Make a signature of the docstrings + for f in symbols: + if f.__doc__: + Signature.update(f.__doc__) + + lr_init_vars() + + if error: + raise YaccError,"Unable to construct parser." 
+ + if not lr_read_tables(tabmodule): + + # Validate files + for filename in files.keys(): + if not validate_file(filename): + error = 1 + + # Validate dictionary + validate_dict(ldict) + + if start and not Prodnames.has_key(start): + raise YaccError,"Bad starting symbol '%s'" % start + + augment_grammar(start) + error = verify_productions(cycle_check=check_recursion) + otherfunc = [ldict[f] for f in ldict.keys() + if (isinstance(ldict[f],types.FunctionType) and ldict[f].__name__[:2] != 'p_')] + + if error: + raise YaccError,"Unable to construct parser." + + build_lritems() + compute_first1() + compute_follow(start) + + if method == 'SLR': + slr_parse_table() + elif method == 'LALR1': + lalr_parse_table() + return + else: + raise YaccError, "Unknown parsing method '%s'" % method + + lr_write_tables(tabmodule) + + if yaccdebug: + try: + f = open(debug_file,"w") + f.write(_vfc.getvalue()) + f.write("\n\n") + f.write(_vf.getvalue()) + f.close() + except IOError,e: + print "yacc: can't create '%s'" % debug_file,e + + # Made it here. Create a parser object and set up its internal state. + # Set global parse() method to bound method of parser object. + + p = Parser("xyzzy") + p.productions = Productions + p.errorfunc = Errorfunc + p.action = _lr_action + p.goto = _lr_goto + p.method = _lr_method + p.require = Requires + + global parse + parse = p.parse + + # Clean up all of the globals we created + if (not optimize): + yacc_cleanup() + return p + +# yacc_cleanup function. Delete all of the global variables +# used during table construction + +def yacc_cleanup(): + global _lr_action, _lr_goto, _lr_method, _lr_goto_cache + del _lr_action, _lr_goto, _lr_method, _lr_goto_cache + + global Productions, Prodnames, Prodmap, Terminals + global Nonterminals, First, Follow, Precedence, LRitems + global Errorfunc, Signature, Requires + + del Productions, Prodnames, Prodmap, Terminals + del Nonterminals, First, Follow, Precedence, LRitems + del Errorfunc, Signature, Requires + + global _vf, _vfc + del _vf, _vfc + + +# Stub that raises an error if parsing is attempted without first calling yacc() +def parse(*args,**kwargs): + raise YaccError, "yacc: No parser built with yacc()" + diff --git a/sample_data/example_data b/sample_data/example_data new file mode 100644 index 0000000..ee51e24 --- /dev/null +++ b/sample_data/example_data @@ -0,0 +1,99 @@ +## Data file: +## This is much like a rwhoisd-1.5.x series data file. It consists +## of "attribute: value" pairs, separated by lines of "---" or (for +## this file format) blank lines. Unlike the rwhoisd file format, +## the Class-Name attribute is required. + +ID: 666.10.0.0.0/8 +Class-Name: network +Auth-Area: 10.0.0.0/8 +Network-Name: A-NET +IP-Network: 10.0.0.0/8 +Organization:777.a.com +Tech-Contact:222.a.com +Admin-Contact:222.a.com +Created: 19961022 +Updated: 19961023 +Updated-By: hostmaster@a.com + +ID:333.a.com +Auth-Area:a.com +Class-Name: domain +Guardian:444.a.com +Domain-Name: a.com +Primary-Server:5551.a.com +Secondary-Server:5552.a.com +Organization:777.a.com +Admin-Contact:222.a.com +Tech-Contact:222.a.com +Billing-Contact:222.a.com +Created:19961022 +Updated:19961023 +Updated-By:hostmaster@a.com + +ID:222.a.com +Auth-Area: a.com +Class-Name: contact +Name:Public, John Q. 
+Email:johnq@a.com +Type:I +First-Name:John +Last-Name:Public +Phone:(847)-391-7926 +Fax:(847)-338-0340 +Organization:777.a.com +See-Also:http://www.a.com/~johnq +Created:11961022 +Updated:11961023 +Updated-By:hostmaster@a.com + +ID:223.a.com +Auth-Area:a.com +Class-Name: contact +Name:Doe, Jane +Email:janed@a.com +Type:I +First-Name:Jane +Last-Name:Doe +Last-Name:Doe +Last-Name:Doe +Phone:(847)-391-7943 +Fax:(847)-338-0340 +Organization:777.a.com +Created:11961025 +Updated:11961025 +Updated-By:hostmaster@a.com + +ID: 5552.a.com +Auth-Area: a.com +Class-Name: host +Host-Name: ns2.a.com +IP-Address: 10.0.0.2 +Created: 19961022 +Updated: 19961023 +Updated-By: hostmaster@a.com + +ID: 777.a.com +Auth-Area: a.com +Class-Name: organization +Org-Name: A Communications, Inc. +Street-Address: #600 - 1380 Burrars St. +City: Vaner +State: CM +Postal-Code: V6Z 2H3 +Country-Code: NL +Phone: (401) 555-6721 +Created: 19961022 +Updated: 19961023 +Updated-By: hostmaster@a.com + +ID:888.a.com +Auth-Area: a.com +Class-Name: referral +Guardian:444.a.com +Referral:rwhois://rwhois.second.a.com:4321/Auth-Area=fddi.a.com +Organization:777.a.com +Referred-Auth-Area:fddi.a.com +Created:19961022 +Updated:19961023 +Updated-By:hostmaster@a.com diff --git a/sample_data/example_queries b/sample_data/example_queries new file mode 100644 index 0000000..eb56971 --- /dev/null +++ b/sample_data/example_queries @@ -0,0 +1,13 @@ +#10.131.252.227 +network 10.131.252/24 +network 24.210.66.0/24 +domain a.com +domain domain-Name = a.com +contact doe +contact public and type = "I" +contact public and type = O or doe and type = I +contact Pub* +contact pub* +contact "Pub*" +network 11/8 +network foo=bar \ No newline at end of file diff --git a/sample_data/example_random_queries b/sample_data/example_random_queries new file mode 100644 index 0000000..2fc02ae --- /dev/null +++ b/sample_data/example_random_queries @@ -0,0 +1,5 @@ +network class-name = bar and foo OR ip-address != foo and "1 2 3 4" and First-Name = " Bob**" +Bob +network 10.0.0.0/8 +domain domain-name="example*" and domain-name=*ple.net and host-name="f*" +domain organization="foo" diff --git a/sample_data/example_schema b/sample_data/example_schema new file mode 100644 index 0000000..d14beca --- /dev/null +++ b/sample_data/example_schema @@ -0,0 +1,16 @@ +## Schema description file: +## consists of = pairs, one per line. +## index_type is one of N, C, or A. +## N = normal, string valued index. +## C = cidr index (for ip addresses and netblocks) +## A = all, both a normal and a cidr index + +domain-name = N +email = N +host-name = N +ip-address = C +ip-network = C +last-name = N +name = N +network-name = N +org-name = N diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f5db8f7 --- /dev/null +++ b/setup.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +from distutils.core import setup +import os + +setup(name="python-rwhoisd", + version="0.1", + description="A lightweight RWhois server written in Python", + author="David Blacka", + author_email="davidb@verisignlabs.com", + url="http://www.rwhois.net/", + packages=['rwhoisd'], + scripts=os.listdir('bin'), + data_files=[('sample_data', + ['sample_data/example_schema', + 'sample_data/example_data'])] + ) -- 2.36.6
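
Note on the sample files: example_schema and example_data document their own layouts in their leading "##" comments (one "attribute = index_type" pair per line with index types N, C, or A; and "Attribute: value" records separated by blank lines or "---" lines, Class-Name required, repeated attributes allowed). As a rough, self-contained illustration of those two layouts only, not the server's actual loader (the real one is rwhoisd/MemDB.py, added elsewhere in this patch), a minimal reader in the same Python 2 style might look like the sketch below; the names read_schema and read_records are invented for this sketch.

    # Hypothetical sketch: stand-alone readers for the sample_data
    # file formats described in the "##" comments above.  This is not
    # the project's loader; it only illustrates the documented layouts.

    import re

    def read_schema(filename):
        """Return a dict mapping attribute name -> index type (N, C, or A)."""
        schema = {}
        for line in open(filename):
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue                        # skip blanks and '##' comments
            attr, itype = [x.strip() for x in line.split("=", 1)]
            schema[attr.lower()] = itype
        return schema

    def read_records(filename):
        """Yield records as ordered lists of (attribute, value) pairs.

        Records are separated by blank lines or lines of dashes; repeated
        attributes (e.g. multiple Last-Name lines) are kept in order."""
        record = []
        for line in open(filename):
            line = line.rstrip("\n")
            if line.startswith("#"):
                continue
            if not line.strip() or re.match(r"^-{3,}\s*$", line):
                if record:
                    yield record
                    record = []
                continue
            if ":" not in line:
                continue
            attr, value = line.split(":", 1)    # values may contain ':' (URLs)
            record.append((attr.strip(), value.strip()))
        if record:
            yield record

    if __name__ == "__main__":
        schema = read_schema("sample_data/example_schema")
        for rec in read_records("sample_data/example_data"):
            attrs = dict([(a.lower(), v) for a, v in rec])
            print attrs.get("class-name"), attrs.get("id")

Run against sample_data/, the sketch prints one "class-name ID" line per record, which is a quick way to sanity-check hand edits to example_data before feeding it to the server.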