SqlRegExp.h

Go to the documentation of this file.
00001 /*
00002 // $Id: //open/dev/fennel/calculator/SqlRegExp.h#2 $
00003 // Fennel is a library of data storage and processing components.
00004 // Copyright (C) 2005-2009 The Eigenbase Project
00005 // Copyright (C) 2004-2009 SQLstream, Inc.
00006 // Copyright (C) 2009-2009 LucidEra, Inc.
00007 //
00008 // This program is free software; you can redistribute it and/or modify it
00009 // under the terms of the GNU General Public License as published by the Free
00010 // Software Foundation; either version 2 of the License, or (at your option)
00011 // any later version approved by The Eigenbase Project.
00012 //
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public License for more details.
00017 //
00018 // You should have received a copy of the GNU General Public License
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00021 //
00022 // SqlRegExp
00023 //
00024 // An ASCII & UCS2 string library that adheres to the SQL99 and/or
00025 // SQL2003 standard definitions, and implements LIKE and SIMILAR.
00026 */
00027 #ifndef Fennel_SqlRegExp_Included
00028 #define Fennel_SqlRegExp_Included
00029 
00030 #include <string>
00031 #include <boost/regex.hpp>
00032 
00033 #ifdef HAVE_ICU
00034 #include <unicode/ustring.h>
00035 #endif
00036 
00037 FENNEL_BEGIN_NAMESPACE
00038 
00039 #if !(defined LITTLEENDIAN || defined BIGENDIAN)
00040 #error "endian not defined"
00041 #endif
00042 
00055 
00056 
00057 
00058 
00059 
00060 
00061 
00062 
00063 
/**
 * Converts a SQL LIKE pattern into the text of an equivalent regular
 * expression.
 *
 * Only the single-byte instantiation (CodeUnitBytes == 1 and
 * MaxCodeUnitsPerCodePoint == 1) is implemented; every other
 * instantiation throws std::logic_error.
 *
 * @param pattern LIKE pattern; read for patternLenBytes bytes
 * @param patternLenBytes length of pattern in bytes. When 0, expPat is set
 *        to the placeholder "UNUSED" and the escape arguments are not
 *        examined.
 * @param escape escape character, or null when there is none
 * @param escapeLenBytes length of escape in bytes: 1 when an escape
 *        character is supplied, 0 together with a null escape when not
 * @param expPat receives the regular expression text
 *
 * @throw char const * "22019" (Invalid Escape Character) when the escape
 *        arguments are neither (null, 0) nor (non-null, 1)
 * @throw char const * "22025" (Invalid Escape Sequence) when the escape
 *        character is not followed by '_', '%' or itself
 * @throw std::logic_error for encodings that are not implemented
 */
template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
void
SqlLikePrep(
    char const * const pattern,
    int patternLenBytes,
    char const * const escape,  // may be null
    int escapeLenBytes,
    std::string& expPat)
{
    if (CodeUnitBytes != MaxCodeUnitsPerCodePoint) {
        throw std::logic_error("no UTF8/16/32");
    }
    if (CodeUnitBytes == 2) {
        // TODO: Add UCS2 here
        // Convert pattern to ICU regex pattern
        throw std::logic_error("no UCS2");
    }
    if (CodeUnitBytes != 1) {
        throw std::logic_error("no such encoding");
    }

    // ASCII

    if (patternLenBytes == 0) {
        // SQL99 Part 2 Section 8.5 General Rule 3.d.i.
        // LIKE always matches if matchValueLenBytes == 0
        // if != 0, then I believe this cannot match anything
        // Must still assign a valid regex here.
        expPat.assign("UNUSED");
        return;
    }

    bool escapeIsRegexpSpecial = false;
    std::string special("_%.|*?+(){}[]^$\\");
    char escapeChar = 0;
    if (escapeLenBytes == 1 && escape) {
        escapeChar = *escape;
        if (special.find(escapeChar) != std::string::npos &&
            escapeChar != '_' &&
            escapeChar != '%') {
            // escape char is a special char to regex (not
            // sql, just regex) and must be escaped if it
            // makes it through to the pattern. (e.g.: escape
            // = '*', pattern = '**' then regex should be fed
            // '\*'
            escapeIsRegexpSpecial = true;
        }
        // The scan below must also stop at the escape character.
        special.append(1, escapeChar);
    } else if (!escape && !escapeLenBytes) {
        // Default to no escape character. escapeChar stays 0, which is
        // not in 'special' and so is never seen by the scan below.
    } else {
        // SQL99 Part 2 Section 8.5 General Rule 3.b.i1
        // Invalid Escape Character
        // (Also covers a null escape pointer with a length of 1, which
        // would otherwise be dereferenced.)
        throw "22019";
    }

    expPat.assign(pattern, patternLenBytes);

    // Escape all of ".", "|", "*", "?", "+",
    //        "(", ")", "{", "}", "[", "]", "^", "$", and "\"
    //        so they have no meaning to regex.
    // Convert pat from SQL to Posix (or Perl, tbd) RegExp
    //         _ -> .
    //         % -> .*
    size_t pos = 0;
    while ((pos = expPat.find_first_of(special, pos)) !=
           std::string::npos) {
        if (expPat[pos] == escapeChar) {
            if (pos + 1 >= expPat.size() ||
                (expPat[pos + 1] != '_'
                 && expPat[pos + 1] != '%'
                 && expPat[pos + 1] != escapeChar))
            {
                // SQL99 Part 2 Section 8.5 General Rule
                // 3.d.ii, I think.
                // Invalid Escape Sequence
                throw "22025";
            }
            if (escapeIsRegexpSpecial
                && expPat[pos + 1] == escapeChar)
            {
                expPat[pos] = '\\'; // replace escape char
                pos += 2;           // move past subsequent escape char
            } else {
                expPat.erase(pos, 1); // remove escape char
                pos++;               // move past subsequent '_' or '%'
            }
        } else {
            switch (expPat[pos]) {
            case '_':   // SQL '_' -> regex '.'
                expPat[pos] = '.';
                pos++;
                break;
            case '%':   // SQL '%' -> regex '.*'
                expPat.replace(pos, 1, ".*");
                pos += 2;
                break;
            default:
                // Escape regex special chars; a single \ is the regex
                // escape char. This includes '\' itself: it is not a
                // special character in LIKE (unless it is the escape
                // char, handled above), but it must be escaped from
                // regex. Note that this also has the side effect of
                // turning off various character escape sequences
                // (e.g. \n) and operators (e.g. \w) that regex supports.
                expPat.insert(pos, 1, '\\');
                pos += 2;
                break;
            }
        }
    }
}
00185 
/**
 * Validates the escape argument of a SIMILAR pattern and records it.
 *
 * Does nothing unless instantiated for single-byte characters
 * (CodeUnitBytes == 1 and MaxCodeUnitsPerCodePoint == 1).
 *
 * @param escape escape character, or null when there is none
 * @param escapeLenBytes length of escape in bytes: 1 when an escape
 *        character is supplied, 0 together with a null escape when not
 * @param escapeChar receives the escape character, or 0 when there is none
 * @param expPat the SIMILAR pattern; only inspected, never modified
 * @param sqlSpecial set of characters the rewrite step scans for; the
 *        escape character, when there is one, is appended to it
 *
 * @throw char const * "22019" (Invalid Escape Character) when the escape
 *        arguments are neither (null, 0) nor (non-null, 1)
 * @throw char const * "2200C" (Invalid Use of Escape Character) when the
 *        escape character is itself a special character and occurs in
 *        expPat without being followed by a special character
 * @throw char const * "2200B" (Escape Character Conflict) when the escape
 *        character is ':' and expPat contains "[:" or ":]"
 */
template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
void
SqlSimilarPrepEscapeProcessing(
    char const * const escape,
    int escapeLenBytes,
    char& escapeChar,
    std::string const & expPat,
    std::string& sqlSpecial)
{
    if (CodeUnitBytes != MaxCodeUnitsPerCodePoint || CodeUnitBytes != 1) {
        return;
    }

    // ASCII

    if (!escape && !escapeLenBytes) {
        // Default to no escape character
        escapeChar = 0; // should not match anything
        return;
    }
    if (!escape || escapeLenBytes != 1) {
        // SQL2003 Part 2 Section 8.6 General Rule 3,
        // Invalid Escape Character
        // (Also covers a null escape pointer with a length of 1, which
        // would otherwise be dereferenced.)
        throw "22019";
    }

    escapeChar = *escape;
    sqlSpecial.append(1, escapeChar);

    // Define special characters for SQL2003 Part 2 Section 8.6 General
    // Rule 3.b. (See also Syntax Rule 6.)  Added <right brace> to these
    // list as it appears at first glance to be an omission from the
    // rules.  Could easily be wrong though. There could be a subtle
    // reason why '}' is omitted from these rules
    //
    // Held in a std::string and searched with find() rather than
    // strchr(): strchr() also matches the terminating NUL, which would
    // classify a NUL character as special.
    std::string const generalRule3b("[]()|^-+*_%?{}");

    if (generalRule3b.find(escapeChar) != std::string::npos) {
        // Escape char is special char. Must not be
        // present in pattern unless it part of a
        // correctly formed <escape character>
        size_t pos = 0;
        while ((pos = expPat.find(escapeChar, pos)) !=
               std::string::npos) {
            if (pos + 1 >= expPat.size() ||
                generalRule3b.find(expPat[pos + 1]) == std::string::npos)
            {
                // SQL2003 Part 2 Section 8.6 General Rule 3.b
                // Data Exception - Invalid Use of Escape Character
                throw "2200C";
            }
            pos += 2; // skip by <escape><special char>
        }
    }
    if (escapeChar == ':' &&
        (expPat.find("[:") != std::string::npos ||
         expPat.find(":]") != std::string::npos)) {
        // SQL2003 Part 2 Section 8.6 General Rule 3.c
        // Data Exception -- Escape Character Conflict
        throw "2200B";
    }
}
00250 
00251 
// SqlSimilarPrepRewriteCharEnumeration
// Helper to SqlSimilarPrepReWrite -
// Changes regular character set identifier strings (e.g.: [:ALPHA:])
// into corresponding regex strings, and rejects character enumerations
// that contain an unescaped special character.
//
// expPat     - pattern being rewritten; modified in place when a regular
//              character set identifier is replaced
// pos        - on entry, index of the character following the opening '['.
//              Advanced past a leading '^' and past the replacement text
//              when an identifier is rewritten; left unchanged otherwise.
// SqlSimilarPrepSyntaxRule6 - not used by this function
// escapeChar - user-defined escape character, or 0 when there is none
//
// Throws (char const *) "2201B", Invalid Regular Expression, for an
// unescaped special character inside the enumeration or an unknown
// regular character set identifier.
// Does nothing unless instantiated for single-byte characters.
template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
void
SqlSimilarPrepRewriteCharEnumeration(
    std::string& expPat,
    size_t& pos,
    char const * const SqlSimilarPrepSyntaxRule6,
    char escapeChar)
{
    if (CodeUnitBytes == MaxCodeUnitsPerCodePoint &&
        CodeUnitBytes == 1) {
        // ASCII

        // If a <character enumeration> contains a <regular character
        // set identifier> it must be either [[:foo:]] or
        // [^[:foo:]]. All other patterns containing a <regular
        // character set identifier>, e.g. [abc[:foo:]], are assumed
        // (by my reading of the BNF) to be ill-formed.
        if (!expPat.compare(pos, 3, "^[:")) {
            // skip past ^ and process as a usual (e.g. as [:foo:])
            pos++;
        } else if (!expPat.compare(pos, 2, "[:")) {
            // no-op
        } else {
            // The <character enumeration> does not contain a
            // <regular character set identifier>.
            //
            // SQL2003 Part 2 Section 8.6 Syntax Rule 5 and Syntax Rule 6.  Only
            // <escaped character> and <non-escaped character> are
            // legal between [ and ]. i.e. Unescaped special
            // characters not allowed in <character enumeration>.
            // Of course the exception is '-' <minus sign> and '^'
            // <circumflex>.

            std::string syntaxRule6ForCharEnum("[]()|+*_%?{}");
            if (escapeChar != 0) {
                // Add exactly one copy of the escape character so the
                // scan below stops at it. Note the argument order of
                // std::string::append(count, ch). With no escape
                // character (0) nothing is added, so NUL bytes in the
                // pattern are not mistaken for an escape.
                syntaxRule6ForCharEnum.append(1, escapeChar);
            }

            size_t pos2 = pos;
            while ((pos2 = expPat.find_first_of(syntaxRule6ForCharEnum, pos2))
                   != std::string::npos) {
                if (expPat[pos2] == escapeChar) {
                    // skip over next char, assume that it is special
                    pos2 += 2;
                } else if (expPat[pos2] == ']') {
                    // no more special chars. Set is OK
                    break;
                } else {
                    // A special char (as defined by Syntax Rule 6) found
                    // unescaped inside character enumeration
                    //
                    // SQL2003 Part 2 Section 8.6 General Rule 2
                    // Data Exception - Invalid Regular Expression
                    throw "2201B";
                }
            }
            return;
        }

        //
        // Continue with <regular character set identifier> processing
        //

        // SQL2003 Part 2 Section 8.6 Syntax Rule 3
        // and 8.6 BNF <character enumeration>
        // Must make a few substitutions as regex doesn't match
        // SIMILAR. Also, regex doesn't use [:ALPHA:], only [:alpha:].
        // See General Rule 7.m - 7.s
        //
        // SQL2003 Part 2 Section 8.6 General Rule 7.r, Note 189 refers to
        // SQL2003 3.1.6.42: Whitespace is defined as:
        // U+0009, Horizontal Tabulation
        // U+000A, Line Feed
        // U+000B, Vertical Tabulation
        // U+000C, Form Feed
        // U+000D, Carriage Return
        // U+0085, Next Line
        // Plus Unicode General Category classes Zs, Zl, Zp:
        // (see NOTE 6 & NOTE 7), ignoring those > U+00FF
        // U+0020, Space
        // U+00A0, No-Break Space
        //
        // Table below assumes that only [[:foo:]] and [^[:foo:]] are
        // legal. All other patterns, e.g. [abc[:foo:]] are assumed
        // (by my reading of the BNF) to be ill-formed.
        //
        // Each row is { SQL identifier, regex replacement }; the row with
        // an empty identifier terminates the table.
        //
        // TODO: Move this table to .cpp file.
        char const * const regCharSetIdent[][2] = {
            { "[:ALPHA:]", "[:alpha:]" },
            { "[:alpha:]", "[:alpha:]" },
            { "[:UPPER:]", "[:upper:]" },
            { "[:upper:]", "[:upper:]" },
            { "[:LOWER:]", "[:lower:]" },
            { "[:lower:]", "[:lower:]" },
            { "[:DIGIT:]", "[:digit:]" },
            { "[:digit:]", "[:digit:]" },
            { "[:SPACE:]", " " },
            { "[:space:]", " " },
            { "[:WHITESPACE:]", "\x20\xa0\x09\x0a\x0b\x0c\x0d\x85" },
            { "[:whitespace:]", "\x20\xa0\x09\x0a\x0b\x0c\x0d\x85" },
            { "[:ALNUM:]", "[:alnum:]" },
            { "[:alnum:]", "[:alnum:]" },
            { "", "" },
        };
        for (size_t i = 0; *regCharSetIdent[i][0]; i++) {
            size_t const identLen = strlen(regCharSetIdent[i][0]);
            if (!expPat.compare(pos, identLen, regCharSetIdent[i][0])) {
                expPat.replace(pos, identLen, regCharSetIdent[i][1]);
                // Leave pos on the character after the replacement.
                pos += strlen(regCharSetIdent[i][1]);
                return;
            }
        }
        // SQL2003 Part 2 Section 8.6 General Rule 2
        // Data Exception - Invalid Regular Expression
        throw "2201B";
    }
}
00372 
00373 
00374 // StrSimilarPrepRewrite
00375 // helper to StrSimilarPrep - changes SQL SIMILAR format to Boost::RegEx format
00376 template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
00377 void
00378 SqlSimilarPrepReWrite(
00379     char escapeChar,
00380     std::string& expPat,
00381     std::string& sqlSpecial)
00382 {
00383     if (CodeUnitBytes == MaxCodeUnitsPerCodePoint &&
00384         CodeUnitBytes == 1) {
00385         // ASCII
00386 
00387         // Define special characters for SQL2003 Part 2 Section 8.6 Syntax Rule
00388         // 6 (similar to Section 8.6 General Rule 3.b).
00389         //
00390         // Added <right brace> to these list as it appears at first glance to
00391         // be an omission from the rules.  Could easily be wrong though. There
00392         // could be a subtle reason why '}' is omitted from these rules
00393         //
00394         char const * const SqlSimilarPrepSyntaxRule6 = "[]()|^-+*_%?{}";
00395 
00396         char const * const BoostRegExEscapeChar = "\\";
00397 
00398         // Escape only "\" so it has no meaning to regex.
00399         // Convert pat from SQL to Posix (or Perl, tbd) RegExp
00400         //         _ -> .
00401         //         % -> .*
00402 
00403         size_t pos = 0;
00404         bool characterEnumeration = false; // e.g. [A-Z]
00405         while ((pos = expPat.find_first_of(sqlSpecial, pos)) !=
00406                std::string::npos)
00407         {
00408             if (expPat[pos] == escapeChar) {
00409                 if (pos + 1 >= expPat.size()) {
00410                     // Escape char at end of string. See large note above
00411                     // SQL2003 Part 2 Section 8.6 General Rule 2
00412                     // Data Exception - Invalid Regular Expression
00413                     throw "2201B";
00414                 }
00415                 if (strchr(SqlSimilarPrepSyntaxRule6, expPat[pos + 1])) {
00416                     // Valid <escaped char>, per SQL2003 Part 2 Section 8.6
00417                     // Syntax Rule 6.  Replace user defined escape char with
00418                     // regex escape char.
00419                     expPat.replace(pos, 1, BoostRegExEscapeChar);
00420                     // Move past subsequent special character.
00421                     pos += 2;
00422                 } else if (expPat[pos + 1] == escapeChar) {
00423                     // By inference, escapeChar is not a special char.
00424                     // Can let the escape char fall through w/o an regex esc.
00425                     // Delete one of the two <escape><escape> chars:
00426                     expPat.erase(pos, 1);
00427                     // Move past the sole remaining <escape> char:
00428                     pos++;
00429                 } else {
00430                     // Malformed <escaped char>. Attempt to escape a
00431                     // non special character.  SQL2003 Part 2 Section 8.6 Syntax
00432                     // Rules 5 & 6, combined with General Rule 2 imply
00433                     // that if an escape character is not followed by
00434                     // a special character, then the result does not
00435                     // have the format of a <regular expression> since
00436                     // the character is neither an <non-escaped
00437                     // character> nor an <escaped character>.
00438                     //
00439                     // SQL2003 Part 2 Section 8.6 General Rule 2
00440                     // Data Exception - Invalid Regular Expression
00441                     throw "2201B";
00442                 }
00443             } else {
00444                 switch (expPat[pos]) {
00445                 case '[':
00446                     // See long note above on SR5, SR6 and GR 2: by
00447                     // the BNF, a non-escaped special character is not
00448                     // legal inside a character enumeration. Besides,
00449                     // it doesn't make sense.
00450                     characterEnumeration = true;
00451                     pos++;
00452                     SqlSimilarPrepRewriteCharEnumeration
00453                         <CodeUnitBytes, MaxCodeUnitsPerCodePoint>
00454                         (expPat, pos, SqlSimilarPrepSyntaxRule6, escapeChar);
00455                     break;
00456                 case ']':
00457                     if (!characterEnumeration) {
00458                         // Closing ']'  w/o opening ']'
00459                         // SQL2003 Part 2 Section 8.6 General Rule 2
00460                         // Data Exception - Invalid Regular Expression
00461                         throw "2201B";
00462                     }
00463                     characterEnumeration = false;
00464                     pos++;
00465                     break;
00466                 case '_':   // SQL '_' -> regex '.'
00467                     expPat.replace(pos, 1, ".");
00468                     pos++;
00469                     break;
00470                 case '%':   // SQL '%' -> regex '.*'
00471                     expPat.replace(pos, 1, ".*");
00472                     pos += 2;
00473                     break;
00474                 case '\\':
00475                     //
00476                     // Characters that boost::regex treats as special:
00477                     // ".|*?+(){}[]^$\\":
00478                     // Characters boost::regex treats as special, on top of
00479                     // SqlSimilarPrepSyntaxRule6:
00480                     // "\\.$"
00481                     //
00482                     // \ is not a special character in SIMILAR, but it
00483                     // must be escaped from regex.  Is treated
00484                     // specially only if it is *not* the escape
00485                     // char. Note that this also has the side effect
00486                     // of turning off various character escape
00487                     // sequences (e.g. \n) and operators (e.g. \w)
00488                     // that regex supports.
00489                     expPat.replace(pos, 1, "\\\\");
00490                     pos += 2;
00491                     break;
00492                 case '.':
00493                     // see comment just above for '\\'
00494                     expPat.replace(pos, 1, "\\.");
00495                     pos += 2;
00496                     break;
00497                 case '$':
00498                     // see comment just above for '\\'
00499                     expPat.replace(pos, 1, "\\$");
00500                     pos += 2;
00501                     break;
00502                 default:
00503                     throw std::logic_error("SqlSimilarPrep:escapeSwitch");
00504                 }
00505             }
00506         } // while()
00507 
00508         if (characterEnumeration) {
00509             // Opening '[' w/o closing ']'
00510             // SQL2003 Part 2 Section 8.6 General Rule 2
00511             // Data Exception - Invalid Regular Expression
00512             throw "2201B";
00513         }
00514     }
00515 }
00516 
00536 template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
00537 void
00538 SqlSimilarPrep(
00539     char const * const pattern,
00540     int patternLenBytes,
00541     char const * const escape,  // may be null
00542     int escapeLenBytes,
00543     std::string& expPat)
00544 {
00545     if (CodeUnitBytes == MaxCodeUnitsPerCodePoint &&
00546         CodeUnitBytes == 1) {
00547         // ASCII
00548 
00549         if (patternLenBytes == 0) {
00550             // SQL99 and SQL2003 Part 2 Section 8.6 General Rule 2 may
00551             // come into play here if boost::regex doesn't
00552             // handle this case properly. Also see
00553             // SQL2003 Part 2 Section 8.6 General Rule 7.u.
00554             expPat.assign("UNUSED");
00555             return;
00556         }
00557 
00558         expPat.assign(pattern, patternLenBytes);
00559 
00560         // Chars that have different meanings in SIMILAR & regex
00561         // Note that \\ becomes \ in the string.
00562         std::string sqlSpecial("\\.$_%[]");
00563         char escapeChar;
00564 
00565         SqlSimilarPrepEscapeProcessing<CodeUnitBytes, MaxCodeUnitsPerCodePoint>(
00566                 escape,
00567                 escapeLenBytes,
00568                 escapeChar,
00569                 expPat,
00570                 sqlSpecial);
00571 
00572         SqlSimilarPrepReWrite<CodeUnitBytes, MaxCodeUnitsPerCodePoint>(
00573             escapeChar,
00574             expPat,
00575             sqlSpecial);
00576 
00577     } else if (CodeUnitBytes == MaxCodeUnitsPerCodePoint &&
00578                CodeUnitBytes == 2) {
00579         // TODO: Add UCS2 here
00580         // Convert pattern to ICU regex pattern.
00581         //
00582         // Use of std::string in function signature may have to change
00583         // when ICU support is added.
00584         throw std::logic_error("no UCS2");
00585     } else if (CodeUnitBytes == MaxCodeUnitsPerCodePoint) {
00586         throw std::logic_error("no such encoding");
00587     } else {
00588         throw std::logic_error("no UTF8/16/32");
00589     }
00590 }
00591 
00601 template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
00602 bool
00603 SqlRegExp(
00604     char const * const matchValue,
00605     int matchValueLenBytes,
00606     int patternLenBytes,
00607     const boost::regex& exp)
00608 {
00609     if (CodeUnitBytes == MaxCodeUnitsPerCodePoint) {
00610         if (CodeUnitBytes == 1) {
00611             // ASCII
00612 
00613             if (patternLenBytes == 0) {
00614                 if (matchValueLenBytes == 0) {
00615                     // SQL99 Part 2 Section 8.5 General Rule 3.d.i.  Not
00616                     // explicitly defined in SQL2003 Part 2 Section 8.6 for
00617                     // SIMILAR but let this pass as seems reasonable.
00618                     return true;
00619                 } else {
00620                     // Believe that this cannot match anything.
00621                     // Avoid tussle with regex over empty exp
00622                     return false;
00623                 }
00624             }
00625 
00626             bool result;
00627             try {
00628                 result = boost::regex_match(
00629                     matchValue,
00630                     matchValue + matchValueLenBytes,
00631                     exp);
00632             } catch (...) {
00633                 // TODO: Make this a catch bad_expression or similar
00634                 // TODO: and rethrow a SQL error code.
00635                 throw std::logic_error("boost::regex error in SqlLike");
00636             }
00637             return result;
00638 
00639         } else if (CodeUnitBytes == 2) {
00640             // TODO: Add UCS2 here
00641             throw std::logic_error("no UCS2");
00642         } else {
00643             throw std::logic_error("no such encoding");
00644         }
00645     } else {
00646         throw std::logic_error("no UTF8/16/32");
00647     }
00648 }
00649 
00650 
00651 
00652 FENNEL_END_NAMESPACE
00653 
00654 #endif
00655 
00656 // End SqlRegExp.h

Generated on Mon Jun 22 04:00:17 2009 for Fennel by  doxygen 1.5.1