# This file is part of python-rwhoisd
#
# Copyright (C) 2003, David E. Blacka
#
# $Id: QueryProcessor.py,v 1.3 2003/04/28 16:44:56 davidb Exp $
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA

import sys
import re

import Cidr
import Rwhois
import QueryParser


class QueryProcessor:
    """Executes parsed queries against a database object.

    The database object (``db``) is expected to provide the methods used
    below: is_indexed_attr(), search_attr(), search_cidr(),
    search_normal(), search_referral(), fetch_objects() and
    get_authareas().
    """

    def __init__(self, db):
        self.db = db

    def _filter_obj_term(self, obj, term):
        """Given a rwhoisobject and a query term (a 3 element tuple:
        attr, operator, value), determine if the object satisfies the
        term.  Returns True if the object matches the term, False if
        not."""

        attr, op, searchval = term

        # filter by named attribute
        if attr:
            vals = obj.get_attr(attr)
            # an object without the attribute never matches positively,
            # which means it always satisfies a "!=" term.
            res = bool(vals) and match_values(searchval, vals)
            if op == "!=":
                return not res
            return res

        # filter by general term: any value of the object may match.
        # The operator is not consulted for bare terms.
        for val in obj.values():
            if match_value(searchval, val):
                return True
        return False

    def _filter_obj(self, obj, terms):
        """Given a rwhoisobject and a list of query terms (i.e., a
        whole AND clause), return True if the object satisfies every
        term."""

        for term in terms:
            if not self._filter_obj_term(obj, term):
                return False
        return True

    def _filter_results(self, reslist, terms):
        """Given list of result objects (not simply the ids returned
        from the search) and a list of query terms (i.e., a query
        clause), remove elements that do not satisfy the terms.
        Returns a list of objects that satisfy the filters.  When
        there are no terms, reslist itself is returned."""

        if not terms:
            return reslist
        return [x for x in reslist if self._filter_obj(x, terms)]

    def process_query_clause(self, clause, max=0):
        """Process a query clause (a grouping of terms ANDed
        together).  This is where the indexed searches actually get
        done.  The technique used here is to search on one index and
        use the rest of the clause to filter the results.

        The clause passed in is not modified.

        Returns a QueryResult object.  Raises Rwhois.RwhoisError (351)
        if the clause contains no searchable term."""

        # the technique is to do an index search on the first (or
        # maybe best) indexed term (bare terms are always considered
        # indexed), and filter those results with the remaining terms.

        # Note: this could be better if we found the "optimal" query
        # term.  One approach may be to create a cost function and
        # search for the minimum cost term.

        # Note: another approach might be to actually do indexed
        # searches on all applicable terms (bare or using an indexed
        # attribute) and find the intersection of the results.

        st = None
        sti = 0

        # find the first searchable term ("!=" terms can only filter):
        for i, term in enumerate(clause):
            attr, op, value = term
            if op == "!=":
                continue
            if not attr or self.db.is_indexed_attr(attr):
                st, sti = term, i
                break

        if st is None:
            raise Rwhois.RwhoisError(351, "No indexed terms in query clause")

        # everything except the search term is the filter.  Build a new
        # list rather than deleting from the caller's clause.
        filter_terms = clause[:sti] + clause[sti + 1:]

        search_attr, _search_op, search_value = st

        # if we have an attribute name, search on that.
        if search_attr:
            res = self.db.search_attr(search_attr, search_value, max)
        elif Cidr.valid_cidr(search_value.strip("*")):
            res = self.db.search_cidr(search_value, max)
        else:
            res = self.db.search_normal(search_value, max)

        objs = self._filter_results(self.db.fetch_objects(res.list()),
                                    filter_terms)

        queryres = QueryResult(objs)

        # look for referrals, using the complete clause
        refs = self.process_referral_search(clause)
        queryres.add_referrals(refs)

        return queryres

    def _is_in_autharea(self, value):
        """Returns True if value could be considered to be contained
        within an authority area.  That is, is a subnet of a
        network-type authority area or a subdomain of a domainname
        type authority area.

        value is either a Cidr.Cidr instance or a string."""

        aas = self.db.get_authareas()

        if isinstance(value, Cidr.Cidr):
            for aa in aas:
                cv = Cidr.valid_cidr(aa)
                if cv and cv.is_supernet(value):
                    return True
        else:
            for aa in aas:
                if is_domainname(aa) and is_subdomain(aa, value):
                    return True
        return False

    def _referral_search_cidr(self, cv, value):
        """Return the IndexResult of a referral search for value, or
        None if the value doesn't qualify for a Cidr referral
        search.  cv is the result of Cidr.valid_cidr(value)."""

        if not cv:
            return None
        if not self._is_in_autharea(cv):
            return None
        return self.db.search_referral(value)

    def _referral_search_domain(self, value):
        """Return the IndexResult of a referral search for value, or
        None if the value doesn't qualify for a domain referral
        search."""

        if not is_domainname(value):
            return None
        if not self._is_in_autharea(value):
            return None

        # try the full name first, then strip the leftmost label and
        # retry until something is found or the name is exhausted.
        dn = value
        res = None
        while dn:
            res = self.db.search_referral(dn)
            if res.list():
                break
            dn = reduce_domain(dn)

        return res

    def _referral_search_term(self, value):
        """Return the IndexResult of a referral search for value, or
        None if the value didn't qualify for a referral search."""

        cv = Cidr.valid_cidr(value)
        if cv:
            return self._referral_search_cidr(cv, value)
        elif is_domainname(value):
            return self._referral_search_domain(value)
        return None

    def process_referral_search(self, clause):
        """Given a query clause, attempt to search for referrals
        associated with the terms.  Return a list of referral strings
        that matched terms in the clause (if any).  The only terms
        that actually get searched are the ones that look
        'hierarchical'.  For now, the attribute part of the term is
        essentially ignored, so a search for something like
        'name=127.0.0.1' might conceivably generate a referral, when
        perhaps it shouldn't."""

        # first check to see if the search is explicitly for a referral
        for term in clause:
            if ((term[0] == "class-name" and term[1] == "="
                 and term[2] == "referral")
                    or term[0] == "referred-auth-area"):
                # in which case, we return nothing
                return []

        referrals = []

        # look for hierarchical-looking terms.
        for attr, op, value in clause:
            if op == "!=":
                continue
            res = self._referral_search_term(value)
            if not res or not res.list():
                continue

            ref_objs = self.db.fetch_objects(res.list())
            # "or []" tolerates objects for which get_attr() yields
            # nothing for the referral attribute.
            ref_strs = [x for y in ref_objs
                        for x in (y.get_attr("referral") or [])]
            referrals.extend(ref_strs)

        return referrals

    def process_full_query(self, query, max=0):
        """Given a parsed query object, process it by unioning the
        results of the various ORed together clauses.

        max, when non-zero, caps the number of objects in the
        result.  Returns a QueryResult."""

        # shortcut for the very common single clause case:
        if len(query.clauses) == 1:
            res = self.process_query_clause(query.clauses[0], max)
            return res

        # otherwise, union the results from all the clauses
        res = QueryResult()
        for clause in query.clauses:
            clause_res = self.process_query_clause(clause, max)
            res.extend(clause_res)
            # carry the clause's referrals along too (as the single
            # clause path does), skipping ones already collected.
            known_refs = res.referrals()
            res.add_referrals([r for r in clause_res.referrals()
                               if r not in known_refs])
            if max and len(res.objects()) >= max:
                res.truncate(max)
                break

        return res

    def process_query(self, session, queryline):
        """Given a session config and a query line, parse the query,
        perform any searches, return any referrals.

        All output (objects, referrals, and the final status or error
        message) is written to session.wfile; nothing is returned."""

        if not session.queryparser:
            session.queryparser = QueryParser.get_parser()

        # parse the query
        try:
            query = QueryParser.parse(session.queryparser, queryline)
        except Rwhois.RwhoisError as x:
            session.wfile.write(Rwhois.error_message(x))
            return

        # ask for one object more than the limit so that exceeding the
        # limit can be detected below.
        max = session.limit
        if max:
            max += 1

        query_result = self.process_full_query(query, max)

        objects = query_result.objects()
        referrals = query_result.referrals()

        if not objects and not referrals:
            session.wfile.write(Rwhois.error_message(230))
            return

        limit_exceeded = False
        if session.limit and len(objects) > session.limit:
            del objects[session.limit:]
            limit_exceeded = True

        for obj in objects:
            session.wfile.write(obj.to_wire_str())
            session.wfile.write("\r\n")

        if referrals:
            if objects:
                session.wfile.write("\r\n")
            session.wfile.write("\r\n".join(referrals))
            session.wfile.write("\r\n")

        if limit_exceeded:
            session.wfile.write(Rwhois.error_message(330))
        else:
            session.wfile.write(Rwhois.ok())


class QueryResult:
    """An ordered collection of result objects, de-duplicated by
    their getid() value when extended, plus a list of referral
    strings."""

    def __init__(self, objs=None, referrals=None):
        # Fresh lists are created for every instance: the containers
        # are mutated by extend() and add_referrals(), so they must
        # not be shared with a default value or with the caller.
        if objs is None:
            objs = []
        if referrals is None:
            referrals = []
        self.data = list(objs)
        self.ids = [x.getid() for x in self.data]
        self._dict = dict(zip(self.ids, self.ids))
        self.refs = list(referrals)

    def extend(self, list):
        """Append the objects in list (a sequence of objects or
        another QueryResult) whose ids are not already present.
        Referrals of another QueryResult are not copied."""
        if isinstance(list, QueryResult):
            list = list.objects()
        new_objs = [x for x in list if x.getid() not in self._dict]
        new_ids = [x.getid() for x in new_objs]
        self.data.extend(new_objs)
        self.ids.extend(new_ids)
        self._dict.update(dict(zip(new_ids, new_ids)))

    def add_referrals(self, referrals):
        """Append the given referral strings."""
        self.refs.extend(referrals)

    def objects(self):
        """Return the (live) list of result objects."""
        return self.data

    def referrals(self):
        """Return the (live) list of referral strings."""
        return self.refs

    def ids(self):
        # NOTE: the instance attribute 'ids' assigned in __init__
        # shadows this method on every instance, so instance.ids is the
        # list of ids itself.
        return self.ids

    def truncate(self, n=0):
        """Keep only the first n objects (and their ids)."""
        self.ids = self.ids[:n]
        self.data = self.data[:n]
        # rebuild the id index from what is left.
        self._dict = dict(zip(self.ids, self.ids))


def match_value(searchval, val):
    """Determine if a search value matches a data value.  If both
    matching terms are valid CIDR objects, then they are matched
    according the CIDR wildcard rules (i.e., a single trailing * is a
    supernet search, ** is a subnet search).  If the search value is
    not wildcarded, then they are just tested for numeric equality.
    Otherwise, the terms are compared case-insensitively using string
    semantics (substring, prefix, suffix, and exact match)."""

    if match_cidr(searchval, val):
        return True

    # normalize the values for comparison.
    searchval = searchval.lower()
    val = val.lower()

    leading = searchval.startswith("*")
    trailing = searchval.endswith("*")

    # the substring case
    if leading and trailing:
        return searchval.strip("*") in val
    # the suffix case
    if leading:
        return val.endswith(searchval.lstrip("*"))
    # the prefix case
    if trailing:
        return val.startswith(searchval.rstrip("*"))
    # the exact match case
    return searchval == val


def match_values(searchval, val_list):
    """Return True if searchval matches (per match_value) at least
    one of the values in val_list."""

    for val in val_list:
        if match_value(searchval, val):
            return True
    return False


def match_cidr(searchval, val):
    """If both terms are valid CIDR values (minus any trailing
    wildcards of the search value), compare according the CIDR
    wildcard rules: a search value ending in ** tests
    val.is_subnet(search), one ending in a single * tests
    val.is_supernet(search), otherwise the two are tested for
    equality.  If both terms are not CIDR address, return False."""

    sv = Cidr.valid_cidr(searchval.rstrip("*"))
    rv = Cidr.valid_cidr(val)

    if not sv or not rv:
        return False

    if searchval.endswith("**"):
        return rv.is_subnet(sv)
    elif searchval.endswith("*"):
        return rv.is_supernet(sv)
    else:
        return rv == sv


# this forms a pretty basic heuristic to see if a value looks like a
# domain name.
# Raw string so that "\." reaches the regex engine as written.  The
# pattern is applied with match(), so only the start of the value is
# anchored; anything after the matched prefix is not examined.
domain_regex = re.compile(r"[a-z0-9-]+\.[a-z0-9-.]+", re.I)


def is_domainname(value):
    """Return True if the start of value looks like a dotted name
    (letters, digits and hyphens, with at least one dot), False
    otherwise."""
    return domain_regex.match(value) is not None


def is_subdomain(domain, subdomain):
    """Return True if subdomain equals domain or lies beneath it.

    The comparison is case-insensitive and works on whole
    dot-separated labels, so "ample.com" is not considered a parent
    of "example.com"."""
    dlist = domain.lower().split('.')
    sdlist = subdomain.lower().split('.')

    # a name with fewer labels can never be inside a longer one.
    if len(dlist) > len(sdlist):
        return False

    # the trailing labels of subdomain must be exactly domain's labels.
    return sdlist[-len(dlist):] == dlist


def reduce_domain(domain):
    """Return domain with its leftmost label removed.  A name without
    any dot reduces to the empty string."""
    return '.'.join(domain.split('.')[1:])


def is_heirarchical(value):
    """Return True if value is a valid CIDR (per Cidr.valid_cidr) or
    looks like a domain name, False otherwise."""
    if Cidr.valid_cidr(value):
        return True
    if is_domainname(value):
        return True
    return False


if __name__ == '__main__':
    # Simple interactive driver.
    # usage: <schema file> [<data file> ...]; queries are read from
    # stdin, one per line.  An empty line ends the session; lines
    # starting with "#" are skipped.

    import MemDB, Session

    db = MemDB.MemDB()

    sys.stdout.write("loading schema: %s\n" % sys.argv[1])
    db.init_schema(sys.argv[1])
    for data_file in sys.argv[2:]:
        sys.stdout.write("loading data file: %s\n" % data_file)
        db.load_data(data_file)
    db.index_data()

    QueryParser.db = db
    processor = QueryProcessor(db)

    session = Session.Context()
    session.wfile = sys.stdout

    while 1:
        line = sys.stdin.readline().strip()
        if not line:
            break
        if line.startswith("#"):
            continue

        sys.stdout.write("parsing: '%s'\n" % line)
        processor.process_query(session, line)
        session.wfile.write("\r\n")
        session.wfile.flush()