00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "fennel/common/CommonPreamble.h"
00024 #include "fennel/flatfile/FlatFileParser.h"
00025
00026 FENNEL_BEGIN_CPPFILE("$Id: //open/dev/fennel/flatfile/FlatFileParser.cpp#1 $");
00027
// The blank character that scanColumn() skips at the start of a column
// when trimming is enabled.
const char SPACE_CHAR = ' ';
00029
00030 void FlatFileColumnParseResult::setResult(
00031 FlatFileColumnParseResult::DelimiterType type, char *buffer, uint size)
00032 {
00033 this->type = type;
00034 this->size = size;
00035
00036 next = buffer + size;
00037 switch (type) {
00038 case NO_DELIM:
00039 case MAX_LENGTH:
00040 break;
00041 case FlatFileColumnParseResult::FIELD_DELIM:
00042 case FlatFileColumnParseResult::ROW_DELIM:
00043 next++;
00044 break;
00045 default:
00046 permAssert(false);
00047 }
00048 }
00049
00050 FlatFileRowDescriptor::FlatFileRowDescriptor() :
00051 std::vector<FlatFileColumnDescriptor>()
00052 {
00053 bounded = true;
00054 }
00055
// Marks this descriptor as unbounded, so that isBounded() returns false.
void FlatFileRowDescriptor::setUnbounded()
{
    this->bounded = false;
}
00060
// @return true unless setUnbounded() has been called on this descriptor
bool FlatFileRowDescriptor::isBounded() const
{
    return this->bounded;
}
00065
// Creates a parse result in its cleared state (see reset()).
FlatFileRowParseResult::FlatFileRowParseResult()
{
    this->reset();
}
00070
// Clears the row-level parse state: no status, no row pointers, and a
// row delimiter count of zero.
void FlatFileRowParseResult::reset()
{
    nRowDelimsRead = 0;
    next = NULL;
    current = NULL;
    status = NO_STATUS;
}
00077
// Configures a parser with its special characters.
//
// @param fieldDelim character separating columns; 0 selects fixed-width
//                   parsing
// @param rowDelim   character separating rows
// @param quote      quote character; must be 0 in fixed-width mode
// @param escape     escape character; must be 0 in fixed-width mode
// @param doTrim     whether scanColumn() skips leading spaces of a column
FlatFileParser::FlatFileParser(
    char fieldDelim, char rowDelim, char quote, char escape, bool doTrim)
{
    this->doTrim = doTrim;
    this->escape = escape;
    this->quote = quote;
    this->rowDelim = rowDelim;
    this->fieldDelim = fieldDelim;

    // A zero field delimiter means columns are delimited by length only.
    fixed = (fieldDelim == 0);
    if (!fixed) {
        return;
    }
    // Quoting and escaping are not supported for fixed-width columns.
    assert(quote == 0);
    assert(escape == 0);
}
00093
// Scans one row of text, locating each column and recording its offset and
// size in result.
//
// Any row delimiter characters at the very start of the buffer are skipped
// before the first column is scanned. Columns are then scanned one at a time
// (up to columns.getMaxColumns()) until a row delimiter is found, the buffer
// runs out, or a strict descriptor's column count is violated. Finally
// scanRowEnd() determines where the next row begins.
//
// On return:
//  - result.status is NO_STATUS, INCOMPLETE_COLUMN, NO_COLUMN_DELIM,
//    TOO_FEW_COLUMNS or TOO_MANY_COLUMNS;
//  - result.current points at the start of buffer (before any skipped
//    leading row delimiters) and column offsets are relative to it;
//  - result.next is the position returned by scanRowEnd().
//
// @param buffer  text to scan
// @param size    number of bytes available in buffer; must be >= 0
// @param columns description of the expected columns
// @param result  receives the column positions and row status
void FlatFileParser::scanRow(
    const char *buffer,
    int size,
    const FlatFileRowDescriptor &columns,
    FlatFileRowParseResult &result)
{
    assert(size >= 0);
    const char *row = buffer;
    uint offset = 0;
    FlatFileColumnParseResult columnResult;

    result.status = FlatFileRowParseResult::NO_STATUS;
    bool bounded = columns.isBounded();
    bool lenient = columns.isLenient();
    bool mapped = columns.isMapped();
    // Column count mismatches are only errors for bounded, non-lenient rows.
    bool strict = (bounded && (!lenient));

    uint maxColumns = columns.getMaxColumns();
    uint resultColumns = columns.size();
    if (bounded) {
        // Bounded rows have a fixed set of result slots, all null until a
        // scanned column is stored into them.
        result.resize(resultColumns);
        for (uint i = 0; i < resultColumns; i++) {
            result.setNull(i);
        }
    } else {
        result.clear();
    }

    // Skip row delimiter characters at the start of the buffer.
    const char *nonDelim = scanRowDelim(row, size, false);
    offset = nonDelim - row;

    // Seed the column result so that columnResult.next is well defined even
    // when the loop below never runs (maxColumns == 0). Without this, the
    // scanRowEnd() call after the loop would read a member that no visible
    // code has set. The first scanColumn() call overwrites the seed.
    columnResult.setResult(
        FlatFileColumnParseResult::NO_DELIM,
        const_cast<char *>(nonDelim),
        0);

    bool done = false;
    bool rowDelim = false;
    for (uint i = 0; i < maxColumns; i++) {
        uint maxLength = columns.getMaxLength(i);
        scanColumn(
            row + offset,
            size - offset,
            maxLength,
            columnResult);
        switch (columnResult.type) {
        case FlatFileColumnParseResult::NO_DELIM:
            // Ran out of buffer in the middle of a column.
            result.status = FlatFileRowParseResult::INCOMPLETE_COLUMN;
            done = true;
            break;
        case FlatFileColumnParseResult::ROW_DELIM:
            // The row ended; in strict mode it must end on the last column.
            if (strict && (i+1 != columns.size())) {
                if (i == 0) {
                    result.status = FlatFileRowParseResult::NO_COLUMN_DELIM;
                } else {
                    result.status = FlatFileRowParseResult::TOO_FEW_COLUMNS;
                }
            }
            done = true;
            rowDelim = true;
            break;
        case FlatFileColumnParseResult::MAX_LENGTH:
        case FlatFileColumnParseResult::FIELD_DELIM:
            // Another column follows; in strict mode that is an error once
            // the last described column has been read.
            if (strict && (i+1 == columns.size())) {
                result.status = FlatFileRowParseResult::TOO_MANY_COLUMNS;
                done = true;
            }
            break;
        default:
            permAssert(false);
        }
        if (bounded) {
            // A negative target means the column is scanned but not stored.
            int target = mapped ? columns.getMap(i) : i;
            if (target >= 0) {
                assert(static_cast<uint>(target) < resultColumns);
                result.setColumn(target, offset, columnResult.size);
            }
        } else {
            result.addColumn(offset, columnResult.size);
        }
        offset = columnResult.next - row;
        if (done) {
            break;
        }
    }
    result.current = const_cast<char *>(row);
    result.next = const_cast<char *>(
        scanRowEnd(
            columnResult.next,
            buffer + size - columnResult.next,
            rowDelim,
            result));
}
00186
// Finds where the next row starts once the columns of a row have been
// scanned.
//
// If the row status is INCOMPLETE_COLUMN or ROW_TOO_LARGE, the buffer is
// expected to be exhausted and its start is returned unchanged. Otherwise,
// unless a row delimiter was already consumed, the text is searched for one;
// if none is found, the end of the buffer is returned. When a delimiter has
// been seen, result.nRowDelimsRead is incremented and the run of row
// delimiter characters that follows is skipped.
//
// @param buffer   position just past the last scanned column
// @param size     number of bytes remaining at buffer
// @param rowDelim whether the column scan already consumed a row delimiter
// @param result   row result whose status is read and whose delimiter count
//                 is updated
// @return position at which the next row scan should begin
const char *FlatFileParser::scanRowEnd(
    const char *buffer,
    int size,
    bool rowDelim,
    FlatFileRowParseResult &result)
{
    const char *end = buffer + size;
    const char *read = buffer;

    const bool incompleteRow =
        (result.status == FlatFileRowParseResult::INCOMPLETE_COLUMN)
        || (result.status == FlatFileRowParseResult::ROW_TOO_LARGE);
    if (incompleteRow) {
        assert(read == end);
        return read;
    }

    if (!rowDelim) {
        // Search forward for the row delimiter.
        read = scanRowDelim(read, end - read, true);
        if (read == end) {
            return read;
        }
    }
    ++result.nRowDelimsRead;

    // Skip the delimiter character(s) to reach the start of the next row.
    return scanRowDelim(read, end - read, false);
}
00218
// Advances through the buffer until the delimiter-ness of the current
// character equals `search`.
//
// With search == true this finds the first row delimiter character; with
// search == false it finds the first character that is not a row delimiter.
//
// @param buffer text to scan
// @param size   number of bytes available in buffer
// @param search the isRowDelim() value to stop at
// @return pointer to the first matching character, or buffer + size if
//         there is none
const char *FlatFileParser::scanRowDelim(
    const char *buffer,
    int size,
    bool search)
{
    const char *const limit = buffer + size;
    const char *cursor = buffer;
    for (; cursor < limit; ++cursor) {
        if (isRowDelim(*cursor) == search) {
            break;
        }
    }
    return cursor;
}
00235
// Tests whether a character is a row delimiter for this parser.
//
// When the configured row delimiter is '\n', both '\n' and '\r' count as
// row delimiters; otherwise only the configured character does. A configured
// delimiter of '\r' is rejected by assertion.
bool FlatFileParser::isRowDelim(char c)
{
    assert(rowDelim != '\r');
    if (rowDelim == '\n') {
        return (c == '\n') || (c == '\r');
    }
    return c == rowDelim;
}
00241
// Scans a single column and reports how it ended.
//
// Fixed-width parsers delegate to scanFixedColumn(). Otherwise the text is
// scanned for an unquoted, unescaped field or row delimiter. The reported
// size is measured from buffer, so it includes any skipped leading spaces as
// well as quote and escape characters. If the buffer ends first, the result
// type is NO_DELIM.
//
// @param buffer    start of the column text; must not be NULL
// @param size      number of bytes available in buffer
// @param maxLength maximum column length; only used in fixed-width mode
// @param result    receives the delimiter type, size and resume position
void FlatFileParser::scanColumn(
    const char *buffer,
    uint size,
    uint maxLength,
    FlatFileColumnParseResult &result)
{
    if (fixed) {
        scanFixedColumn(buffer, size, maxLength, result);
        return;
    }

    assert(buffer != NULL);
    const char *end = buffer + size;
    const char *read = buffer;

    // Leading spaces are skipped before checking for an opening quote.
    if (doTrim) {
        for (; read < end && SPACE_CHAR == *read; ++read) {
        }
    }

    // The column is quoted only if its first remaining character is a quote.
    bool quoted = (read < end && *read == quote);
    // When the quote doubles as the escape, "" inside quotes is a literal.
    bool quoteEscape = (quoted && quote == escape);
    if (quoted) {
        ++read;
    }

    FlatFileColumnParseResult::DelimiterType type =
        FlatFileColumnParseResult::NO_DELIM;
    while (read < end) {
        const char ch = *read;
        if (ch == quote) {
            ++read;
            if (quoteEscape) {
                if (read == end) {
                    // Cannot tell yet whether this quote is doubled.
                    break;
                }
                if (*read == quote) {
                    // Doubled quote: consume both and stay inside quotes.
                    ++read;
                    continue;
                }
            }
            if (quoted) {
                // Closing quote: the rest of the column is unquoted.
                quoteEscape = quoted = false;
            }
        } else if (ch == escape) {
            // Consume the escape and, if present, the escaped character.
            ++read;
            if (read < end) {
                ++read;
            }
        } else if (!quoted && ch == fieldDelim) {
            type = FlatFileColumnParseResult::FIELD_DELIM;
            break;
        } else if (!quoted && isRowDelim(ch)) {
            type = FlatFileColumnParseResult::ROW_DELIM;
            break;
        } else {
            // Ordinary character, or any character inside quotes.
            ++read;
        }
    }

    uint resultSize = read - buffer;
    result.setResult(type, const_cast<char *>(buffer), resultSize);
}
00314
// Scans a fixed-width column: consumes up to maxLength characters, stopping
// early at a row delimiter or at the end of the buffer.
//
// The result type is ROW_DELIM if a row delimiter stopped the scan or
// immediately follows the consumed characters, MAX_LENGTH if the length
// budget was used up with more text remaining, and NO_DELIM if the buffer
// ran out.
//
// @param buffer    start of the column text; must not be NULL
// @param size      number of bytes available in buffer
// @param maxLength maximum number of characters in the column
// @param result    receives the delimiter type, size and resume position
void FlatFileParser::scanFixedColumn(
    const char *buffer,
    uint size,
    uint maxLength,
    FlatFileColumnParseResult &result)
{
    assert(buffer != NULL);
    const char *end = buffer + size;
    const char *read = buffer;

    FlatFileColumnParseResult::DelimiterType type =
        FlatFileColumnParseResult::NO_DELIM;
    uint remaining = maxLength;
    for (; read < end && remaining > 0; ++read, --remaining) {
        if (isRowDelim(*read)) {
            type = FlatFileColumnParseResult::ROW_DELIM;
            break;
        }
    }

    // The loop ended without seeing a delimiter but text remains: check
    // whether the very next character ends the row.
    if (read < end && type == FlatFileColumnParseResult::NO_DELIM) {
        if (isRowDelim(*read)) {
            type = FlatFileColumnParseResult::ROW_DELIM;
        } else if (remaining == 0) {
            type = FlatFileColumnParseResult::MAX_LENGTH;
        }
    }

    uint resultSize = read - buffer;
    result.setResult(type, const_cast<char *>(buffer), resultSize);
}
00350
// Strips quoting from every column read for a row, in place, and records
// each column's resulting size in rowResult.strippedSizes.
//
// Columns whose value pointer is NULL get a stripped size of 0.
//
// @param rowResult row whose columns are rewritten and whose strippedSizes
//                  entries are filled in (the vector is grown if needed)
// @param trim      passed through to the per-column stripQuoting()
void FlatFileParser::stripQuoting(
    FlatFileRowParseResult &rowResult,
    bool trim)
{
    int nFields = rowResult.getReadCount();

    // Make room for one size per field; never shrink the vector.
    if (rowResult.strippedSizes.size() < nFields) {
        rowResult.strippedSizes.resize(nFields);
    }

    for (uint i = 0; i < nFields; i++) {
        char *value = rowResult.getColumn(i);
        uint strippedSize = 0;
        if (value != NULL) {
            strippedSize =
                stripQuoting(value, rowResult.getRawColumnSize(i), trim);
        }
        rowResult.strippedSizes[i] = strippedSize;
    }
}
00371
// Removes quote and escape characters from a column value in place.
//
// If the value starts with the quote character it is treated as quoted:
// copying stops at the closing quote and anything after it is dropped. When
// the quote character is also the escape character, a doubled quote inside
// the quoted text yields one literal quote. Elsewhere an escape character is
// dropped and the character after it is copied verbatim.
//
// @param buffer    column text, rewritten in place; must not be NULL
// @param sizeIn    number of bytes in buffer
// @param untrimmed if true, the value is first trimmed with trim()
// @return number of bytes of stripped text now at the start of buffer
uint FlatFileParser::stripQuoting(
    char *buffer, uint sizeIn, bool untrimmed)
{
    assert(buffer != NULL);
    if (sizeIn == 0) {
        return 0;
    }
    int size = untrimmed ? trim(buffer, sizeIn) : sizeIn;
    char *end = buffer + size;
    char *read = buffer;
    char *write = buffer;

    const bool quoted = (*buffer == quote);
    if (quoted) {
        // Skip the opening quote.
        read++;
    }
    const bool quoteEscape = (quoted && quote == escape);

    while (read < end) {
        const char ch = *read;
        if (quoted && ch == quote) {
            if (quoteEscape) {
                read++;
                if ((read < end) && (*read == quote)) {
                    // Doubled quote: keep one of the pair.
                    *write++ = *read++;
                    continue;
                }
            }
            // Closing quote: stop copying.
            break;
        } else if (ch == escape) {
            // Drop the escape and copy the escaped character, if any.
            read++;
            if (read < end) {
                *write++ = *read++;
            }
        } else {
            *write++ = *read++;
        }
    }
    return write - buffer;
}
00413
// Removes leading and trailing spaces from a value in place by shifting the
// remaining characters to the start of the buffer.
//
// @param buffer text to trim; must not be NULL
// @param size   number of bytes in buffer
// @return number of bytes of trimmed text now at the start of buffer
uint FlatFileParser::trim(char *buffer, uint size)
{
    assert(buffer != NULL);
    if (size == 0) {
        return 0;
    }

    // Narrow [first, last) until it has no space at either edge.
    char *first = buffer;
    char *last = buffer + size;
    while (first < last && *first == ' ') {
        ++first;
    }
    while (last > first && *(last - 1) == ' ') {
        --last;
    }

    // Copy forward; the destination never runs ahead of the source.
    char *write = buffer;
    for (char *from = first; from < last; ++from) {
        *write++ = *from;
    }
    return write - buffer;
}
00437
00438 FENNEL_END_CPPFILE("$Id: //open/dev/fennel/flatfile/FlatFileParser.cpp#1 $");
00439
00440