Go to the source code of this file.
Functions | |
template<int CodeUnitBytes, int MaxCodeUnitsPerCodePoint> | |
void | SqlLikePrep (char const *const pattern, int patternLenBytes, char const *const escape, int escapeLenBytes, std::string &expPat) |
StrLikePrep. | |
template<int CodeUnitBytes, int MaxCodeUnitsPerCodePoint> | |
void | SqlSimilarPrepEscapeProcessing (char const *const escape, int escapeLenBytes, char &escapeChar, std::string const &expPat, std::string &sqlSpecial) |
StrSimilarPrepEscapeProcessing helper to StrSimilarPrep. | |
template<int CodeUnitBytes, int MaxCodeUnitsPerCodePoint> | |
void | SqlSimilarPrepRewriteCharEnumeration (std::string &expPat, size_t &pos, char const *const SqlSimilarPrepSyntaxRule6, char escapeChar) |
template<int CodeUnitBytes, int MaxCodeUnitsPerCodePoint> | |
void | SqlSimilarPrepReWrite (char escapeChar, std::string &expPat, std::string &sqlSpecial) |
template<int CodeUnitBytes, int MaxCodeUnitsPerCodePoint> | |
void | SqlSimilarPrep (char const *const pattern, int patternLenBytes, char const *const escape, int escapeLenBytes, std::string &expPat) |
StrSimilarPrep. | |
template<int CodeUnitBytes, int MaxCodeUnitsPerCodePoint> | |
bool | SqlRegExp (char const *const matchValue, int matchValueLenBytes, int patternLenBytes, const boost::regex &exp) |
SqlRegExp. |
These functions are called by ExtendedInstructions in ExtRegExp.h
See also file SqlString.h
Definition in file SqlRegExp.h.
//! StrLikePrep.
//!
//! Prepares a SQL LIKE pattern string to feed to regex and perhaps also
//! ICU's regex. See SQL99 Part 2 Section 8.5.
//!
//! The pattern is copied into expPat and rewritten in place:
//!   - SQL '_' becomes regex '.'
//!   - SQL '%' becomes regex '.*'
//!   - every character that is special to regex
//!     (. | * ? + ( ) { } [ ] ^ $ \) is prefixed with a backslash
//!   - <escape>'_' / <escape>'%' / <escape><escape> yield the literal
//!     second character.
//!
//! Set escape and escapeLenBytes to 0 if the escape character is not
//! defined.
//!
//! @param pattern          LIKE pattern, not necessarily NUL terminated
//! @param patternLenBytes  length of pattern in bytes
//! @param escape           pointer to the escape character, or 0
//! @param escapeLenBytes   length of escape in bytes; must be 1 or 0
//! @param expPat           receives the regex pattern. Receives the
//!                         placeholder "UNUSED" when patternLenBytes is 0.
//!
//! May throw "22019" (Invalid Escape Character) or "22025" (Invalid Escape
//! Sequence) as char const *. Throws std::logic_error for any encoding
//! other than single byte (CodeUnitBytes == MaxCodeUnitsPerCodePoint == 1).
template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
void
SqlLikePrep(
    char const * const pattern,
    int patternLenBytes,
    char const * const escape,
    int escapeLenBytes,
    std::string& expPat)
{
    if (CodeUnitBytes != MaxCodeUnitsPerCodePoint) {
        throw std::logic_error("no UTF8/16/32");
    }
    if (CodeUnitBytes == 2) {
        // TODO: Add UCS2 here
        // Convert pattern to ICU regex pattern
        throw std::logic_error("no UCS2");
    }
    if (CodeUnitBytes != 1) {
        throw std::logic_error("no such encoding");
    }

    // ASCII

    if (patternLenBytes == 0) {
        // SQL99 Part 2 Section 8.5 General Rule 3.d.i.
        // LIKE always matches if matchValueLenBytes == 0
        // if != 0, then I believe this cannot match anything
        // Must still assign a valid regex here.
        expPat.assign("UNUSED");
        return;
    }

    bool escapeIsRegexpSpecial = false;
    std::string special("_%.|*?+(){}[]^$\\");
    char escapeChar = 0; // 0: no escape character, should not match anything

    if (escapeLenBytes == 1 && escape) {
        escapeChar = *escape;
        if (escapeChar != '_' &&
            escapeChar != '%' &&
            special.find(escapeChar) != std::string::npos)
        {
            // escape char is a special char to regex (not sql, just
            // regex) and must be escaped if it makes it through to the
            // pattern. (e.g.: escape = '*', pattern = '**' then regex
            // should be fed '\*'
            escapeIsRegexpSpecial = true;
        }
        special.append(1, escapeChar);
    } else if (escape || escapeLenBytes) {
        // SQL99 Part 2 Section 8.5 General Rule 3.b.i1
        // Invalid Escape Character. This also covers a null escape
        // pointer passed with a length of 1, which previously was
        // dereferenced.
        throw "22019";
    }

    expPat.assign(pattern, patternLenBytes);

    // Escape all of ".", "|", "*", "?", "+",
    // "(", ")", "{", "}", "[", "]", "^", "$", and "\"
    // so they have no meaning to regex.
    // Convert pat from SQL to Posix (or Perl, tbd) RegExp
    //   _ -> .
    //   % -> .*
    size_t pos = 0;
    while ((pos = expPat.find_first_of(special, pos)) != std::string::npos) {
        char const current = expPat[pos];

        if (current == escapeChar) {
            bool const validSequence =
                pos + 1 < expPat.size() &&
                (expPat[pos + 1] == '_' ||
                 expPat[pos + 1] == '%' ||
                 expPat[pos + 1] == escapeChar);
            if (!validSequence) {
                // SQL99 Part 2 Section 8.5 General Rule
                // 3.d.ii, I think.
                // Invalid Escape Sequence
                throw "22025";
            }
            if (escapeIsRegexpSpecial && expPat[pos + 1] == escapeChar) {
                expPat[pos] = '\\';   // replace escape char
                pos += 2;             // move past subsequent escape char
            } else {
                expPat.erase(pos, 1); // remove escape char
                pos++;                // move past subsequent '_' or '%'
            }
            continue;
        }

        switch (current) {
        case '_': // SQL '_' -> regex '.'
            expPat[pos] = '.';
            pos++;
            break;
        case '%': // SQL '%' -> regex '.*'
            expPat.replace(pos, 1, ".*");
            pos += 2;
            break;
        default:
            // Escape regex special chars; a single \ is the regex escape
            // char. This includes '\' itself: \ is not a special character
            // in LIKE, but it must be escaped from regex. It is treated
            // this way only if it is *not* the escape char. Note that this
            // also has the side effect of turning off various character
            // escape sequences (e.g. \n) and operators (e.g. \w) that
            // regex supports.
            expPat.insert(pos, 1, '\\');
            pos += 2;
            break;
        }
    }
}
bool SqlRegExp | ( | char const *const | matchValue, | |
int | matchValueLenBytes, | |||
int | patternLenBytes, | |||
const boost::regex & | exp | |||
) |
SqlRegExp.
Execs LIKE and SIMILAR. SQL VARCHAR & CHAR. Ascii. No UCS2 yet.
See SQL99 Part 2 Section 8.5 & SQL2003 Part 2 Section 8.6
patternLenBytes must be passed in to support SQL99 Part 2 Section 8.5 General Rule 3.d.i, patLen = matchLen = 0.
TODO: Function signature will change when unicode is supported TODO: to allow either/or regex and ICU regex to be passed in.
Definition at line 603 of file SqlRegExp.h.
00608 { 00609 if (CodeUnitBytes == MaxCodeUnitsPerCodePoint) { 00610 if (CodeUnitBytes == 1) { 00611 // ASCII 00612 00613 if (patternLenBytes == 0) { 00614 if (matchValueLenBytes == 0) { 00615 // SQL99 Part 2 Section 8.5 General Rule 3.d.i. Not 00616 // explicitly defined in SQL2003 Part 2 Section 8.6 for 00617 // SIMILAR but let this pass as seems reasonable. 00618 return true; 00619 } else { 00620 // Believe that this cannot match anything. 00621 // Avoid tussle with regex over empty exp 00622 return false; 00623 } 00624 } 00625 00626 bool result; 00627 try { 00628 result = boost::regex_match( 00629 matchValue, 00630 matchValue + matchValueLenBytes, 00631 exp); 00632 } catch (...) { 00633 // TODO: Make this a catch bad_expression or similar 00634 // TODO: and rethrow a SQL error code. 00635 throw std::logic_error("boost::regex error in SqlLike"); 00636 } 00637 return result; 00638 00639 } else if (CodeUnitBytes == 2) { 00640 // TODO: Add UCS2 here 00641 throw std::logic_error("no UCS2"); 00642 } else { 00643 throw std::logic_error("no such encoding"); 00644 } 00645 } else { 00646 throw std::logic_error("no UTF8/16/32"); 00647 } 00648 }
void SqlSimilarPrep | ( | char const *const | pattern, | |
int | patternLenBytes, | |||
char const *const | escape, | |||
int | escapeLenBytes, | |||
std::string & | expPat | |||
) |
StrSimilarPrep.
Prepares a pattern string to feed to regex and perhaps also ICU's regex.
Set escape and escapeLenBytes to 0 if escape character is not defined.
May throw "2200B" "22019", "2201B", or "2200C"
See SQL99 Part 2 Section 8.6 and SQL2003 Part 2 Section 8.6. This routine adheres to SQL2003 as published in the working draft, except as noted below:
Does not support General Rule 7L which allows the definition of both included and excluded characters from sets at the same time. e.g.: [abc^def]. Seems to be of low value. TODO: Add support for General Rule 7L? TODO: Add support For Note 190: Handling of blanks at the end of the pattern. TODO: Understand and implement General Rule 7.t. (Confused.)
Definition at line 538 of file SqlRegExp.h.
00544 { 00545 if (CodeUnitBytes == MaxCodeUnitsPerCodePoint && 00546 CodeUnitBytes == 1) { 00547 // ASCII 00548 00549 if (patternLenBytes == 0) { 00550 // SQL99 and SQL2003 Part 2 Section 8.6 General Rule 2 may 00551 // come into play here if boost::regex doesn't 00552 // handle this case properly. Also see 00553 // SQL2003 Part 2 Section 8.6 General Rule 7.u. 00554 expPat.assign("UNUSED"); 00555 return; 00556 } 00557 00558 expPat.assign(pattern, patternLenBytes); 00559 00560 // Chars that have different meanings in SIMILAR & regex 00561 // Note that \\ becomes \ in the string. 00562 std::string sqlSpecial("\\.$_%[]"); 00563 char escapeChar; 00564 00565 SqlSimilarPrepEscapeProcessing<CodeUnitBytes, MaxCodeUnitsPerCodePoint>( 00566 escape, 00567 escapeLenBytes, 00568 escapeChar, 00569 expPat, 00570 sqlSpecial); 00571 00572 SqlSimilarPrepReWrite<CodeUnitBytes, MaxCodeUnitsPerCodePoint>( 00573 escapeChar, 00574 expPat, 00575 sqlSpecial); 00576 00577 } else if (CodeUnitBytes == MaxCodeUnitsPerCodePoint && 00578 CodeUnitBytes == 2) { 00579 // TODO: Add UCS2 here 00580 // Convert pattern to ICU regex pattern. 00581 // 00582 // Use of std::string in function signature may have to change 00583 // when ICU support is added. 00584 throw std::logic_error("no UCS2"); 00585 } else if (CodeUnitBytes == MaxCodeUnitsPerCodePoint) { 00586 throw std::logic_error("no such encoding"); 00587 } else { 00588 throw std::logic_error("no UTF8/16/32"); 00589 } 00590 }
//! StrSimilarPrepEscapeProcessing helper to StrSimilarPrep.
//!
//! Validates the escape character of a SIMILAR predicate against the
//! pattern, and registers it as a character the rewrite step must examine.
//! Does nothing unless CodeUnitBytes == MaxCodeUnitsPerCodePoint == 1.
//!
//! @param escape          pointer to the escape character, or 0 if no
//!                        escape character is defined
//! @param escapeLenBytes  length of escape in bytes; must be 1, or 0 when
//!                        escape is 0
//! @param escapeChar      receives the escape character, or 0 if there is
//!                        none
//! @param expPat          the SIMILAR pattern (read only)
//! @param sqlSpecial      set of special characters; the escape character
//!                        is appended to it
//!
//! May throw (as char const *):
//!   "22019" Invalid Escape Character: length is not 1, or the pointer and
//!           length disagree (including a null pointer with length 1)
//!   "2200C" Invalid Use of Escape Character
//!           (SQL2003 Part 2 Section 8.6 General Rule 3.b)
//!   "2200B" Escape Character Conflict
//!           (SQL2003 Part 2 Section 8.6 General Rule 3.c)
template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
void
SqlSimilarPrepEscapeProcessing(
    char const * const escape,
    int escapeLenBytes,
    char& escapeChar,
    std::string const & expPat,
    std::string& sqlSpecial)
{
    if (CodeUnitBytes != MaxCodeUnitsPerCodePoint || CodeUnitBytes != 1) {
        return;
    }

    // ASCII

    if (escapeLenBytes != 1 || !escape) {
        if (!escape && !escapeLenBytes) {
            // Default to no escape character
            escapeChar = 0; // should not match anything
            return;
        }
        // SQL2003 Part 2 Section 8.6 General Rule 3,
        // Invalid Escape Character
        throw "22019";
    }

    escapeChar = *escape;
    sqlSpecial.append(1, escapeChar);

    // Define special characters for SQL2003 Part 2 Section 8.6 General
    // Rule 3.b. (See also Syntax Rule 6.) Added <right brace> to these
    // list as it appears at first glance to be an omission from the
    // rules. Could easily be wrong though. There could be a subtle
    // reason why '}' is omitted from these rules
    //
    // Held as a std::string and searched with find() rather than strchr():
    // strchr() treats the terminating NUL as part of the string, so a NUL
    // byte would wrongly be classified as a special character.
    std::string const generalRule3b("[]()|^-+*_%?{}");

    if (generalRule3b.find(escapeChar) != std::string::npos) {
        // Escape char is special char. Must not be present in pattern
        // unless it is part of a correctly formed <escaped character>
        size_t pos = 0;
        while ((pos = expPat.find(escapeChar, pos)) != std::string::npos) {
            bool const followedBySpecial =
                pos + 1 < expPat.size() &&
                generalRule3b.find(expPat[pos + 1]) != std::string::npos;
            if (!followedBySpecial) {
                // SQL2003 Part 2 Section 8.6 General Rule 3.b
                // Data Exception - Invalid Use of Escape Character
                throw "2200C";
            }
            pos += 2; // skip by <escape><special char>
        }
    }

    if (escapeChar == ':' &&
        (expPat.find("[:") != std::string::npos ||
         expPat.find(":]") != std::string::npos))
    {
        // SQL2003 Part 2 Section 8.6 General Rule 3.c
        // Data Exception -- Escape Character Conflict
        throw "2200B";
    }
}
void SqlSimilarPrepReWrite | ( | char | escapeChar, | |
std::string & | expPat, | |||
std::string & | sqlSpecial | |||
) |
Definition at line 378 of file SqlRegExp.h.
References SqlSimilarPrepRewriteCharEnumeration().
00382 { 00383 if (CodeUnitBytes == MaxCodeUnitsPerCodePoint && 00384 CodeUnitBytes == 1) { 00385 // ASCII 00386 00387 // Define special characters for SQL2003 Part 2 Section 8.6 Syntax Rule 00388 // 6 (similar to Section 8.6 General Rule 3.b). 00389 // 00390 // Added <right brace> to these list as it appears at first glance to 00391 // be an omission from the rules. Could easily be wrong though. There 00392 // could be a subtle reason why '}' is omitted from these rules 00393 // 00394 char const * const SqlSimilarPrepSyntaxRule6 = "[]()|^-+*_%?{}"; 00395 00396 char const * const BoostRegExEscapeChar = "\\"; 00397 00398 // Escape only "\" so it has no meaning to regex. 00399 // Convert pat from SQL to Posix (or Perl, tbd) RegExp 00400 // _ -> . 00401 // % -> .* 00402 00403 size_t pos = 0; 00404 bool characterEnumeration = false; // e.g. [A-Z] 00405 while ((pos = expPat.find_first_of(sqlSpecial, pos)) != 00406 std::string::npos) 00407 { 00408 if (expPat[pos] == escapeChar) { 00409 if (pos + 1 >= expPat.size()) { 00410 // Escape char at end of string. See large note above 00411 // SQL2003 Part 2 Section 8.6 General Rule 2 00412 // Data Exception - Invalid Regular Expression 00413 throw "2201B"; 00414 } 00415 if (strchr(SqlSimilarPrepSyntaxRule6, expPat[pos + 1])) { 00416 // Valid <escaped char>, per SQL2003 Part 2 Section 8.6 00417 // Syntax Rule 6. Replace user defined escape char with 00418 // regex escape char. 00419 expPat.replace(pos, 1, BoostRegExEscapeChar); 00420 // Move past subsequent special character. 00421 pos += 2; 00422 } else if (expPat[pos + 1] == escapeChar) { 00423 // By inference, escapeChar is not a special char. 00424 // Can let the escape char fall through w/o an regex esc. 00425 // Delete one of the two <escape><escape> chars: 00426 expPat.erase(pos, 1); 00427 // Move past the sole remaining <escape> char: 00428 pos++; 00429 } else { 00430 // Malformed <escaped char>. Attempt to escape a 00431 // non special character. 
SQL2003 Part 2 Section 8.6 Syntax 00432 // Rules 5 & 6, combined with General Rule 2 imply 00433 // that if an escape character is not followed by 00434 // a special character, then the result does not 00435 // have the format of a <regular expression> since 00436 // the character is neither an <non-escaped 00437 // character> nor an <escaped character>. 00438 // 00439 // SQL2003 Part 2 Section 8.6 General Rule 2 00440 // Data Exception - Invalid Regular Expression 00441 throw "2201B"; 00442 } 00443 } else { 00444 switch (expPat[pos]) { 00445 case '[': 00446 // See long note above on SR5, SR6 and GR 2: by 00447 // the BNF, a non-escaped special character is not 00448 // legal inside a character enumeration. Besides, 00449 // it doesn't make sense. 00450 characterEnumeration = true; 00451 pos++; 00452 SqlSimilarPrepRewriteCharEnumeration 00453 <CodeUnitBytes, MaxCodeUnitsPerCodePoint> 00454 (expPat, pos, SqlSimilarPrepSyntaxRule6, escapeChar); 00455 break; 00456 case ']': 00457 if (!characterEnumeration) { 00458 // Closing ']' w/o opening ']' 00459 // SQL2003 Part 2 Section 8.6 General Rule 2 00460 // Data Exception - Invalid Regular Expression 00461 throw "2201B"; 00462 } 00463 characterEnumeration = false; 00464 pos++; 00465 break; 00466 case '_': // SQL '_' -> regex '.' 00467 expPat.replace(pos, 1, "."); 00468 pos++; 00469 break; 00470 case '%': // SQL '%' -> regex '.*' 00471 expPat.replace(pos, 1, ".*"); 00472 pos += 2; 00473 break; 00474 case '\\': 00475 // 00476 // Characters that boost::regex treats as special: 00477 // ".|*?+(){}[]^$\\": 00478 // Characters boost::regex treats as special, on top of 00479 // SqlSimilarPrepSyntaxRule6: 00480 // "\\.$" 00481 // 00482 // \ is not a special character in SIMILAR, but it 00483 // must be escaped from regex. Is treated 00484 // specially only if it is *not* the escape 00485 // char. Note that this also has the side effect 00486 // of turning off various character escape 00487 // sequences (e.g. 
\n) and operators (e.g. \w) 00488 // that regex supports. 00489 expPat.replace(pos, 1, "\\\\"); 00490 pos += 2; 00491 break; 00492 case '.': 00493 // see comment just above for '\\' 00494 expPat.replace(pos, 1, "\\."); 00495 pos += 2; 00496 break; 00497 case '$': 00498 // see comment just above for '\\' 00499 expPat.replace(pos, 1, "\\$"); 00500 pos += 2; 00501 break; 00502 default: 00503 throw std::logic_error("SqlSimilarPrep:escapeSwitch"); 00504 } 00505 } 00506 } // while() 00507 00508 if (characterEnumeration) { 00509 // Opening '[' w/o closing ']' 00510 // SQL2003 Part 2 Section 8.6 General Rule 2 00511 // Data Exception - Invalid Regular Expression 00512 throw "2201B"; 00513 } 00514 } 00515 }
//! Checks, and where necessary rewrites, the <character enumeration> of a
//! SIMILAR pattern that starts at expPat[pos].
//!
//! Referenced by SqlSimilarPrepReWrite().
//! Does nothing unless CodeUnitBytes == MaxCodeUnitsPerCodePoint == 1.
//!
//! Two forms are handled:
//!
//!   - A <regular character set identifier>, i.e. [[:foo:]] or [^[:foo:]].
//!     The identifier is replaced by its regex equivalent, and pos is
//!     advanced to just past the replacement text.
//!   - Any other enumeration. It is scanned up to the first unescaped ']'
//!     (or the end of the pattern) for special characters that are not
//!     escaped. expPat and pos are left untouched.
//!
//! @param expPat      pattern being rewritten
//! @param pos         on entry, index of the first character after the
//!                    opening '['
//! @param SqlSimilarPrepSyntaxRule6  special characters per SQL2003 Part 2
//!                    Section 8.6 Syntax Rule 6. Currently unused; a
//!                    reduced list without '-' and '^' is used instead.
//! @param escapeChar  escape character, or 0 if none is defined
//!
//! Throws "2201B" (Invalid Regular Expression, as char const *) if an
//! unescaped special character is found inside the enumeration, or if the
//! <regular character set identifier> is not a known one.
template <int CodeUnitBytes, int MaxCodeUnitsPerCodePoint>
void
SqlSimilarPrepRewriteCharEnumeration(
    std::string& expPat,
    size_t& pos,
    char const * const SqlSimilarPrepSyntaxRule6,
    char escapeChar)
{
    (void) SqlSimilarPrepSyntaxRule6; // see parameter documentation

    if (CodeUnitBytes != MaxCodeUnitsPerCodePoint || CodeUnitBytes != 1) {
        return;
    }

    // ASCII

    // If a <character enumeration> contains a <regular character
    // set identifier> it must be either [[:foo:]] or
    // [^[:foo:]]. All other patterns containing a <regular
    // character set identifier>, e.g. [abc[:foo:]], are assumed
    // (by my reading of the BNF) to be ill-formed.
    bool hasCharSetIdentifier = false;
    if (!expPat.compare(pos, 3, "^[:")) {
        // skip past ^ and process as a usual (e.g. as [:foo:])
        pos++;
        hasCharSetIdentifier = true;
    } else if (!expPat.compare(pos, 2, "[:")) {
        hasCharSetIdentifier = true;
    }

    if (!hasCharSetIdentifier) {
        // The <character enumeration> does not contain a
        // <regular character set identifier>.
        //
        // SQL2003 Part 2 Section 8.6 Syntax Rule 5 and Syntax Rule 6. Only
        // <escaped character> and <non-escaped character> are
        // legal between [ and ]. i.e. Unescaped special
        // characters not allowed in <character enumeration>.
        // Of course the exception is '-' <minus sign> and '^'
        // <circumflex>.
        std::string syntaxRule6ForCharEnum("[]()|+*_%?{}");
        if (escapeChar != 0) {
            // Note the argument order: append(count, char). The scan
            // below has to stop at the escape character so that it can
            // skip the character the escape protects.
            syntaxRule6ForCharEnum.append(1, escapeChar);
        }

        size_t pos2 = pos;
        while ((pos2 = expPat.find_first_of(syntaxRule6ForCharEnum, pos2))
               != std::string::npos)
        {
            if (expPat[pos2] == escapeChar) {
                // skip over next char, assume that it is special
                pos2 += 2;
            } else if (expPat[pos2] == ']') {
                // no more special chars. Set is OK
                break;
            } else {
                // A special char (as defined by Syntax Rule 6) found
                // unescaped inside character enumeration
                //
                // SQL2003 Part 2 Section 8.6 General Rule 2
                // Data Exception - Invalid Regular Expression
                throw "2201B";
            }
        }
        return;
    }

    //
    // Continue with <regular character set identifier> processing
    //

    // SQL2003 Part 2 Section 8.6 Syntax Rule 3
    // and 8.6 BNF <character enumeration>
    // Must make a few substitutions as regex doesn't match
    // SIMILAR. Also, regex doesn't use [:ALPHA:], only [:alpha:].
    // See General Rule 7.m - 7.s
    //
    // SQL2003 Part 2 Section 8.6 General Rule 7.r, Note 189 refers to
    // SQL2003 3.1.6.42: Whitespace is defined as:
    // U+0009, Horizontal Tabulation
    // U+000A, Line Feed
    // U+000B, Vertical Tabulation
    // U+000C, Form Feed
    // U+000D, Carriage Return
    // U+0085, Next Line
    // Plus Unicode General Category classes Zs, Zl, Zp:
    // (see NOTE 6 & NOTE 7), ignoring those > U+00FF
    // U+0020, Space
    // U+00A0, No-Break Space
    //
    // Table below assumes that only [[:foo:]] and [^[:foo:]] are
    // legal. All other patterns, e.g. [abc[:foo:]] are assumed
    // (by my reading of the BNF) to be ill-formed.
    //
    // TODO: Move this table to .cpp file.
    struct CharSetIdent
    {
        char const *sqlName;   // identifier as written in SIMILAR
        char const *regexText; // text to hand to regex instead
    };
    CharSetIdent const regCharSetIdent[] = {
        { "[:ALPHA:]", "[:alpha:]" },
        { "[:alpha:]", "[:alpha:]" },
        { "[:UPPER:]", "[:upper:]" },
        { "[:upper:]", "[:upper:]" },
        { "[:LOWER:]", "[:lower:]" },
        { "[:lower:]", "[:lower:]" },
        { "[:DIGIT:]", "[:digit:]" },
        { "[:digit:]", "[:digit:]" },
        { "[:SPACE:]", " " },
        { "[:space:]", " " },
        { "[:WHITESPACE:]", "\x20\xa0\x09\x0a\x0b\x0c\x0d\x85" },
        { "[:whitespace:]", "\x20\xa0\x09\x0a\x0b\x0c\x0d\x85" },
        { "[:ALNUM:]", "[:alnum:]" },
        { "[:alnum:]", "[:alnum:]" },
    };
    size_t const identCount =
        sizeof(regCharSetIdent) / sizeof(regCharSetIdent[0]);

    for (size_t i = 0; i < identCount; i++) {
        size_t const nameLen = strlen(regCharSetIdent[i].sqlName);
        if (!expPat.compare(pos, nameLen, regCharSetIdent[i].sqlName)) {
            expPat.replace(pos, nameLen, regCharSetIdent[i].regexText);
            pos += strlen(regCharSetIdent[i].regexText);
            return;
        }
    }

    // SQL2003 Part 2 Section 8.6 General Rule 2
    // Data Exception - Invalid Regular Expression
    throw "2201B";
}