Using Regular Expressions to Parse SQL

Paul McGuire ptmcg at austin.rr.com
Tue Feb 5 10:26:58 EST 2008


On Feb 5, 8:31 am, c... at aol.com wrote:
> Hello again -
>
> I do not seem to be able to get a handle on non-greedy pattern
> matching.
>

Regexps wont cut it when you have to parse nested ()'s in a logical
expression.

Here is a pyparsing solution.  For your specific application, you will
need to sift through WHERE clauses too, you should be able to use this
as a starting point for creating a WHERE clause parser.

-- Paul


from pyparsing import *

# define terminal items
identifier = Word(alphas,alphanums+'_')
column_name = Combine( identifier + ZeroOrMore('.' + identifier) )
integer = Word(nums)
value = integer | sglQuotedString

relationalOperator = oneOf("< = > >= <= != <>")
simple_comparison = Group(column_name + relationalOperator +
                            (column_name | value))

AND_cl = CaselessLiteral("AND")
OR_cl = CaselessLiteral("OR")
NOT_cl = CaselessLiteral("NOT")

logical_expression = operatorPrecedence( simple_comparison,
    [
    (NOT_cl, 1, opAssoc.RIGHT),
    (OR_cl, 2, opAssoc.LEFT),
    (AND_cl, 2, opAssoc.LEFT),
    ])

# syntax for FROM ... join expression
FROM_cl = CaselessLiteral("FROM")
ON_cl = CaselessLiteral("ON")
INNER_cl = CaselessLiteral("INNER")
JOIN_cl = CaselessLiteral("JOIN")

join_expr = identifier('table1') + INNER_cl + JOIN_cl + \
            identifier('table2') + ON_cl + \
            logical_expression('join_condition')
from_clause = FROM_cl + '(' + join_expr + ')'


test = """ FROM (qry_Scores_Lookup1 INNER JOIN CSS_Rpt1 ON
            (qry_Scores_Lookup1.desc = CSS_Rpt1.desc) AND
            (qry_Scores_Lookup1.lastcdu = CSS_Rpt1.lastcdu))"""

# parsing the FROM clause normally
join_fields = from_clause.parseString(test)
print "\nParsed tokens, plus any named fields"
print join_fields.dump()
print "\nAccessing some named fields directly"
print "table1 =", join_fields.table1
print "join_condition =", join_fields.join_condition

# create parse action to tally up column names as they are found
# during the parsing process
def tally_names(tokens):
    column_names.append(tokens[0])
column_name.setParseAction(tally_names)

# parse the FROM clause, but just let the parse action do the work
print "\nExtracting the column names"
column_names = []
from_clause.parseString(test)
print column_names


Prints:

Parsed tokens, plus any named fields
['FROM', '(', 'qry_Scores_Lookup1', 'INNER', 'JOIN', 'CSS_Rpt1', 'ON',
[['qry_Scores_Lookup1.desc', '=', 'CSS_Rpt1.desc'], 'AND',
['qry_Scores_Lookup1.lastcdu', '=', 'CSS_Rpt1.lastcdu']], ')']
- join_condition: [['qry_Scores_Lookup1.desc', '=', 'CSS_Rpt1.desc'],
'AND', ['qry_Scores_Lookup1.lastcdu', '=', 'CSS_Rpt1.lastcdu']]
- table1: qry_Scores_Lookup1
- table2: CSS_Rpt1

Accessing some named fields directly
table1 = qry_Scores_Lookup1
join_condition = [['qry_Scores_Lookup1.desc', '=', 'CSS_Rpt1.desc'],
'AND', ['qry_Scores_Lookup1.lastcdu', '=', 'CSS_Rpt1.lastcdu']]

Extracting the column names
['qry_Scores_Lookup1.desc', 'CSS_Rpt1.desc',
'qry_Scores_Lookup1.lastcdu', 'CSS_Rpt1.lastcdu']





More information about the Python-list mailing list