#include <FlatFileParser.h>
Public Member Functions | |
FlatFileParser (const char fieldDelim, const char rowDelim, const char quote, const char escape, bool doTrim=false) | |
Constructs a FlatFileParser. | |
void | scanRow (const char *buffer, int size, const FlatFileRowDescriptor &columns, FlatFileRowParseResult &result) |
Scans through buffer until the end of a row is reached, and locates columns within the row. | |
void | scanColumn (const char *buffer, uint size, uint maxLength, FlatFileColumnParseResult &result) |
Scans through buffer to find the length of a column value. | |
void | scanFixedColumn (const char *buffer, uint size, uint maxLength, FlatFileColumnParseResult &result) |
Scans a fixed format column. | |
void | stripQuoting (FlatFileRowParseResult &rowResult, bool trim) |
Remove quoting and escape characters from a row result, saving the results into the row result. | |
uint | stripQuoting (char *buffer, uint size, bool untrimmed) |
Removes quoting and escape characters from a column value. | |
uint | trim (char *buffer, uint size) |
Trim spaces from beginning and end of text. | |
Private Member Functions | |
const char * | scanRowEnd (const char *buffer, int size, bool rowDelim, FlatFileRowParseResult &result) |
Scans through buffer to recover from any row errors. | |
const char * | scanRowDelim (const char *buffer, int size, bool search) |
Scans through buffer to find a row delimiter, or a character that is not a row delimiter. | |
bool | isRowDelim (char c) |
Determines whether or not character is a row delimiter. | |
Private Attributes | |
char | fieldDelim |
char | rowDelim |
char | quote |
char | escape |
bool | doTrim |
bool | fixed |
Whether to perform fixed mode parsing. |
The main entry point is scanRow(), which returns pointers into a text buffer that mark the columns of a row.
The method stripQuoting() may be useful for decoding a quoted value.
Other methods are made public primarily for testing purposes.
Definition at line 389 of file FlatFileParser.h.
FlatFileParser::FlatFileParser | ( | const char | fieldDelim, | |
const char | rowDelim, | |||
const char | quote, | |||
const char | escape, | |||
bool | doTrim = false | |||
) |
Constructs a FlatFileParser.
See FlatFileExecStreamParams for more detail on the parameters.
[in] | fieldDelim | character delimiter |
[in] | rowDelim | row delimiter |
[in] | quote | quote character |
[in] | escape | escape character |
[in] | doTrim | whether to trim column values before processing them. A column value is trimmed before it is unquoted. |
Definition at line 78 of file FlatFileParser.cpp.
References fixed.
00080 { 00081 this->fieldDelim = fieldDelim; 00082 this->rowDelim = rowDelim; 00083 this->quote = quote; 00084 this->escape = escape; 00085 this->doTrim = doTrim; 00086 00087 fixed = (fieldDelim == 0); 00088 if (fixed) { 00089 assert(quote == 0); 00090 assert(escape == 0); 00091 } 00092 }
const char * FlatFileParser::scanRowEnd | ( | const char * | buffer, | |
int | size, | |||
bool | rowDelim, | |||
FlatFileRowParseResult & | result | |||
) | [private] |
Scans through buffer to recover from any row errors.
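If no row delimiter was found while scanning the row, scans forward to the next one; if the end of the buffer is reached first, returns the end of the buffer without incrementing the row delimiter count. Otherwise increments the row delimiter count and scans past any additional (spurious) row delimiters. If the row status is INCOMPLETE_COLUMN or ROW_TOO_LARGE, returns immediately without scanning.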
[in] | buffer | flat file buffer to be parsed |
[in] | size | size of buffer |
[in] | rowDelim | whether a row delimiter was encountered while scanning the row |
[in,out] | result | result from scanning for row |
Definition at line 187 of file FlatFileParser.cpp.
References FlatFileRowParseResult::INCOMPLETE_COLUMN, FlatFileRowParseResult::nRowDelimsRead, FlatFileRowParseResult::ROW_TOO_LARGE, scanRowDelim(), and FlatFileRowParseResult::status.
Referenced by scanRow().
00192 { 00193 const char *read = buffer; 00194 const char *end = buffer + size; 00195 switch (result.status) { 00196 case FlatFileRowParseResult::INCOMPLETE_COLUMN: 00197 case FlatFileRowParseResult::ROW_TOO_LARGE: 00198 assert(read == end); 00199 return read; 00200 default: 00201 break; 00202 } 00203 00204 // if a row delimiter was not encountered while scanning the row, 00205 // search for the next row delimiter character 00206 if (!rowDelim) { 00207 read = scanRowDelim(read, end - read, true); 00208 if (read == end) { 00209 return read; 00210 } 00211 } 00212 result.nRowDelimsRead++; 00213 00214 // search for the first non- row delimiter character 00215 read = scanRowDelim(read, end - read, false); 00216 return read; 00217 }
const char * FlatFileParser::scanRowDelim | ( | const char * | buffer, | |
int | size, | |||
bool | search | |||
) | [private] |
Scans through buffer to find a row delimiter, or a character that is not a row delimiter.
[in] | buffer | flat file buffer to be parsed |
[in] | size | size of buffer |
[in] | search | if true, look for the first row delimiter; otherwise look for the first character that is not a row delimiter |
Definition at line 219 of file FlatFileParser.cpp.
References isRowDelim().
Referenced by scanRow(), and scanRowEnd().
00223 { 00224 const char *read = buffer; 00225 const char *end = buffer + size; 00226 while (read < end) { 00227 if (isRowDelim(*read) == search) { 00228 break; 00229 } else { 00230 read++; 00231 } 00232 } 00233 return read; 00234 }
bool FlatFileParser::isRowDelim | ( | char | c | ) | [private] |
Determines whether or not character is a row delimiter.
If the row delimiter is any of the line characters (\r or \n), then it must be encoded as newline (\n), in which case it matches either line character (\r or \n).
Definition at line 236 of file FlatFileParser.cpp.
References rowDelim.
Referenced by scanColumn(), scanFixedColumn(), and scanRowDelim().
00237 { 00238 assert(rowDelim != '\r'); 00239 return (rowDelim == '\n') ? (c == '\r' || c == '\n') : (c == rowDelim); 00240 }
void FlatFileParser::scanRow | ( | const char * | buffer, | |
int | size, | |||
const FlatFileRowDescriptor & | columns, | |||
FlatFileRowParseResult & | result | |||
) |
Scans through buffer until the end of a row is reached, and locates columns within the row.
The main options are "bounded", "lenient", and "mapped".
If a scan is "bounded", the output must match an expected format as specified by the column descriptions. Rows with the wrong number of columns are treated as bad rows. However, if a "bounded" scan is also "lenient", the parser is forgiving. Missing values are filled in with null, and extra values are discarded. If columns are "mapped" then they default to null, and columns read are assigned to output columns according to the mapping.
If a scan is "unbounded", then the output may have any number of columns. The other options are not applicable to unbounded mode.
[in] | buffer | buffer with text to be parsed |
[in] | size | size of buffer, in characters |
[in] | columns | description of columns to be parsed |
[out] | result | result of parsing row |
Definition at line 94 of file FlatFileParser.cpp.
References FlatFileRowParseResult::addColumn(), FlatFileRowParseResult::clear(), FlatFileRowParseResult::current, FlatFileColumnParseResult::FIELD_DELIM, FlatFileRowDescriptor::getMap(), FlatFileRowDescriptor::getMaxColumns(), FlatFileRowDescriptor::getMaxLength(), FlatFileRowParseResult::INCOMPLETE_COLUMN, FlatFileRowDescriptor::isBounded(), FlatFileRowDescriptor::isLenient(), FlatFileRowDescriptor::isMapped(), FlatFileColumnParseResult::MAX_LENGTH, FlatFileRowParseResult::next, FlatFileColumnParseResult::next, FlatFileRowParseResult::NO_COLUMN_DELIM, FlatFileColumnParseResult::NO_DELIM, FlatFileRowParseResult::NO_STATUS, FlatFileRowParseResult::resize(), FlatFileColumnParseResult::ROW_DELIM, rowDelim, scanColumn(), scanRowDelim(), scanRowEnd(), FlatFileRowParseResult::setColumn(), FlatFileRowParseResult::setNull(), FlatFileColumnParseResult::size, FlatFileRowParseResult::status, FlatFileRowParseResult::TOO_FEW_COLUMNS, FlatFileRowParseResult::TOO_MANY_COLUMNS, and FlatFileColumnParseResult::type.
00099 { 00100 assert(size >= 0); 00101 const char *row = buffer; 00102 uint offset = 0; 00103 FlatFileColumnParseResult columnResult; 00104 00105 result.status = FlatFileRowParseResult::NO_STATUS; 00106 bool bounded = columns.isBounded(); 00107 bool lenient = columns.isLenient(); 00108 bool mapped = columns.isMapped(); 00109 bool strict = (bounded && (!lenient)); 00110 00111 uint maxColumns = columns.getMaxColumns(); 00112 uint resultColumns = columns.size(); 00113 if (bounded) { 00114 result.resize(resultColumns); 00115 for (uint i = 0; i < resultColumns; i++) { 00116 result.setNull(i); 00117 } 00118 } else { 00119 result.clear(); 00120 } 00121 00122 // Scan any initial row delimiters, helps for the case when a row 00123 // delimiter is multiple characters like \r\n and the delimiter 00124 // characters are split between two buffers. (The previous row could 00125 // be complete due to \r, and parsing could begin at \n. 00126 const char *nonDelim = scanRowDelim(row, size, false); 00127 offset = nonDelim - row; 00128 00129 bool done = false; 00130 bool rowDelim = false; 00131 for (uint i = 0; i < maxColumns; i++) { 00132 uint maxLength = columns.getMaxLength(i); 00133 scanColumn( 00134 row + offset, 00135 size - offset, 00136 maxLength, 00137 columnResult); 00138 switch (columnResult.type) { 00139 case FlatFileColumnParseResult::NO_DELIM: 00140 result.status = FlatFileRowParseResult::INCOMPLETE_COLUMN; 00141 done = true; 00142 break; 00143 case FlatFileColumnParseResult::ROW_DELIM: 00144 if (strict && (i+1 != columns.size())) { 00145 if (i == 0) { 00146 result.status = FlatFileRowParseResult::NO_COLUMN_DELIM; 00147 } else { 00148 result.status = FlatFileRowParseResult::TOO_FEW_COLUMNS; 00149 } 00150 } 00151 done = true; 00152 rowDelim = true; 00153 break; 00154 case FlatFileColumnParseResult::MAX_LENGTH: 00155 case FlatFileColumnParseResult::FIELD_DELIM: 00156 if (strict && (i+1 == columns.size())) { 00157 result.status = FlatFileRowParseResult::TOO_MANY_COLUMNS; 
00158 done = true; 00159 } 00160 break; 00161 default: 00162 permAssert(false); 00163 } 00164 if (bounded) { 00165 int target = mapped ? columns.getMap(i) : i; 00166 if (target >= 0) { 00167 assert (target < resultColumns); 00168 result.setColumn(target, offset, columnResult.size); 00169 } 00170 } else { 00171 result.addColumn(offset, columnResult.size); 00172 } 00173 offset = columnResult.next - row; 00174 if (done) { 00175 break; 00176 } 00177 } 00178 result.current = const_cast<char *>(row); 00179 result.next = const_cast<char *>( 00180 scanRowEnd( 00181 columnResult.next, 00182 buffer + size - columnResult.next, 00183 rowDelim, 00184 result)); 00185 }
void FlatFileParser::scanColumn | ( | const char * | buffer, | |
uint | size, | |||
uint | maxLength, | |||
FlatFileColumnParseResult & | result | |||
) |
Scans through buffer to find the length of a column value.
Keeps going until it reads a delimiter or it completes a fixed column. A column is considered to be quoted if and only if the first character is a quote character.
[in] | buffer | buffer containing text to scan |
[in] | size | size of buffer contents, in characters |
[in] | maxLength | max length of column, excluding escapes and quotes |
[out] | result | result of scanning buffer |
Definition at line 242 of file FlatFileParser.cpp.
References doTrim, escape, FlatFileColumnParseResult::FIELD_DELIM, fieldDelim, fixed, isRowDelim(), FlatFileColumnParseResult::NO_DELIM, quote, FlatFileColumnParseResult::ROW_DELIM, scanFixedColumn(), FlatFileColumnParseResult::setResult(), and SPACE_CHAR.
Referenced by FlatFileExecStreamTest::checkColumnScan(), and scanRow().
00247 { 00248 if (fixed) { 00249 return scanFixedColumn(buffer, size, maxLength, result); 00250 } 00251 00252 assert(buffer != NULL); 00253 const char *read = buffer; 00254 const char *end = buffer + size; 00255 00256 // read past leading spaces before checking for quotes 00257 if (doTrim) { 00258 while (read < end && SPACE_CHAR == *read) { 00259 read++; 00260 } 00261 } 00262 00263 bool quoted = (read < end && *read == quote); 00264 bool quoteEscape = (quoted && quote == escape); 00265 00266 FlatFileColumnParseResult::DelimiterType type = 00267 FlatFileColumnParseResult::NO_DELIM; 00268 if (quoted) { 00269 read++; 00270 } 00271 while (read < end) { 00272 if (*read == quote) { 00273 read++; 00274 if (quoteEscape) { 00275 // read next character to determine whether purpose of 00276 // this character is an escape character or an end quote 00277 if (read == end) { 00278 break; 00279 } 00280 if (*read == quote) { 00281 // two consecutive quote/escape characters is an 00282 // escaped quote 00283 read++; 00284 continue; 00285 } 00286 } 00287 if (quoted) { 00288 // otherwise a quote may be a close quote 00289 quoteEscape = quoted = false; 00290 } 00291 } else if (*read == escape) { 00292 read++; 00293 // an escape escapes the next character 00294 if (read == end) { 00295 break; 00296 } 00297 read++; 00298 } else if (quoted) { 00299 read++; 00300 } else if (*read == fieldDelim) { 00301 type = FlatFileColumnParseResult::FIELD_DELIM; 00302 break; 00303 } else if (isRowDelim(*read)) { 00304 type = FlatFileColumnParseResult::ROW_DELIM; 00305 break; 00306 } else { 00307 read++; 00308 } 00309 } 00310 00311 uint resultSize = read - buffer; 00312 result.setResult(type, const_cast<char *>(buffer), resultSize); 00313 }
void FlatFileParser::scanFixedColumn | ( | const char * | buffer, | |
uint | size, | |||
uint | maxLength, | |||
FlatFileColumnParseResult & | result | |||
) |
Scans a fixed format column.
In this mode, the quote character and column delimiter are ignored. Parsing can stop because (1) a row delimiter is read, (2) the max length is reached, or (3) the end of the buffer is reached.
Definition at line 315 of file FlatFileParser.cpp.
References isRowDelim(), FlatFileColumnParseResult::MAX_LENGTH, FlatFileColumnParseResult::NO_DELIM, FlatFileColumnParseResult::ROW_DELIM, and FlatFileColumnParseResult::setResult().
Referenced by scanColumn().
00320 { 00321 assert(buffer != NULL); 00322 const char *read = buffer; 00323 const char *end = buffer + size; 00324 uint remaining = maxLength; 00325 00326 FlatFileColumnParseResult::DelimiterType type = 00327 FlatFileColumnParseResult::NO_DELIM; 00328 while (read < end && remaining > 0) { 00329 if (isRowDelim(*read)) { 00330 type = FlatFileColumnParseResult::ROW_DELIM; 00331 break; 00332 } 00333 read++; 00334 remaining--; 00335 } 00336 00337 // Resolve delimiter type if another character can be read. This allows 00338 // us to catch the case where a row delimiter follows a max length field. 00339 if (type == FlatFileColumnParseResult::NO_DELIM && read < end) { 00340 if (isRowDelim(*read)) { 00341 type = FlatFileColumnParseResult::ROW_DELIM; 00342 } else if (remaining == 0) { 00343 type = FlatFileColumnParseResult::MAX_LENGTH; 00344 } 00345 } 00346 00347 uint resultSize = read - buffer; 00348 result.setResult(type, const_cast<char *>(buffer), resultSize); 00349 }
void FlatFileParser::stripQuoting | ( | FlatFileRowParseResult & | rowResult, | |
bool | trim | |||
) |
Remove quoting and escape characters from a row result, saving the results into the row result.
[in,out] | rowResult | the row result to be stripped |
[in] | trim | whether to trim columns before processing them |
Definition at line 351 of file FlatFileParser.cpp.
References FlatFileRowParseResult::getColumn(), FlatFileRowParseResult::getRawColumnSize(), FlatFileRowParseResult::getReadCount(), and FlatFileRowParseResult::strippedSizes.
Referenced by FlatFileExecStreamTest::checkStrip().
00354 { 00355 int nFields = rowResult.getReadCount(); 00356 00357 if (rowResult.strippedSizes.size() < nFields) { 00358 rowResult.strippedSizes.resize(nFields); 00359 } 00360 00361 for (uint i = 0; i < nFields; i++) { 00362 char *value = rowResult.getColumn(i); 00363 uint newSize = 0; 00364 if (value != NULL) { 00365 uint oldSize = rowResult.getRawColumnSize(i); 00366 newSize = stripQuoting(value, oldSize, trim); 00367 } 00368 rowResult.strippedSizes[i] = newSize; 00369 } 00370 }
uint FlatFileParser::stripQuoting (char *buffer, uint size, bool untrimmed)
Removes quoting and escape characters from a column value.
If untrimmed is set, then the value will be trimmed first. Otherwise, quoted values are expected to begin and end with a quote.
Examples (assuming both quote and escape are double quote):
"a quote"
becomes a quote
"""a quote"""
becomes "a quote"
"a quote
becomes a quote
a quote"
becomes a quote"
""aquote"
becomes (empty string)
[in,out] | buffer | buffer containing column value text |
[in] | size | size of column value text, in characters |
[in] | untrimmed | if value is untrimmed, it will be trimmed first |
Definition at line 372 of file FlatFileParser.cpp.
References escape, quote, and trim().
00374 { 00375 assert(buffer != NULL); 00376 if (sizeIn == 0) { 00377 return 0; 00378 } 00379 int size = untrimmed ? trim(buffer, sizeIn) : sizeIn; 00380 bool quoted = false; 00381 char *read = buffer; 00382 char *end = buffer + size; 00383 char *write = buffer; 00384 00385 if (*buffer == quote) { 00386 quoted = true; 00387 read++; 00388 } 00389 bool quoteEscape = (quoted && quote == escape); 00390 while (read < end) { 00391 if (quoteEscape && *read == quote) { 00392 read++; 00393 if ((read < end) && (*read == quote)) { 00394 // two consecutive quote/escape characters is an escaped quote 00395 *write++ = *read++; 00396 } else { 00397 // single quote/escape is end quote 00398 break; 00399 } 00400 } else if (quoted && *read == quote) { 00401 break; 00402 } else if (*read == escape) { 00403 read++; 00404 if (read < end) { 00405 *write++ = *read++; 00406 } 00407 } else { 00408 *write++ = *read++; 00409 } 00410 } 00411 return write - buffer; 00412 }
uint FlatFileParser::trim (char *buffer, uint size)
Trim spaces from beginning and end of text.
[in,out] | buffer | buffer containing text |
[in] | size | size of text, in characters |
Definition at line 414 of file FlatFileParser.cpp.
Referenced by FlatFileExecStreamTest::checkTrim(), and stripQuoting().
00415 { 00416 assert(buffer != NULL); 00417 if (size == 0) { 00418 return 0; 00419 } 00420 char *read = buffer; 00421 char *write = buffer; 00422 char *end = buffer + size; 00423 00424 while (read < end && *read == ' ') { 00425 read++; 00426 } 00427 end--; 00428 while (end >= read && *end == ' ') { 00429 end--; 00430 } 00431 end++; 00432 while (read < end) { 00433 *write++ = *read++; 00434 } 00435 return write - buffer; 00436 }
char FlatFileParser::fieldDelim [private] |
char FlatFileParser::rowDelim [private] |
char FlatFileParser::quote [private] |
char FlatFileParser::escape [private] |
bool FlatFileParser::doTrim [private] |
bool FlatFileParser::fixed [private] |
Whether to perform fixed mode parsing.
Definition at line 400 of file FlatFileParser.h.
Referenced by FlatFileParser(), and scanColumn().