FlatFileParser.cpp

Go to the documentation of this file.
00001 /*
00002 // $Id: //open/dev/fennel/flatfile/FlatFileParser.cpp#1 $
00003 // Fennel is a library of data storage and processing components.
00004 // Copyright (C) 2005-2009 The Eigenbase Project
00005 // Copyright (C) 2009-2009 SQLstream, Inc.
00006 // Copyright (C) 2005-2009 LucidEra, Inc.
00007 //
00008 // This program is free software; you can redistribute it and/or modify it
00009 // under the terms of the GNU General Public License as published by the Free
00010 // Software Foundation; either version 2 of the License, or (at your option)
00011 // any later version approved by The Eigenbase Project.
00012 //
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public License for more details.
00017 //
00018 // You should have received a copy of the GNU General Public License
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00021 */
00022 
00023 #include "fennel/common/CommonPreamble.h"
00024 #include "fennel/flatfile/FlatFileParser.h"
00025 
00026 FENNEL_BEGIN_CPPFILE("$Id: //open/dev/fennel/flatfile/FlatFileParser.cpp#1 $");
00027 
// The blank character that scanColumn() skips at the start of a column
// when trimming is enabled.
const char SPACE_CHAR(' ');
00029 
00030 void FlatFileColumnParseResult::setResult(
00031     FlatFileColumnParseResult::DelimiterType type, char *buffer, uint size)
00032 {
00033     this->type = type;
00034     this->size = size;
00035 
00036     next = buffer + size;
00037     switch (type) {
00038     case NO_DELIM:
00039     case MAX_LENGTH:
00040         break;
00041     case FlatFileColumnParseResult::FIELD_DELIM:
00042     case FlatFileColumnParseResult::ROW_DELIM:
00043         next++;
00044         break;
00045     default:
00046         permAssert(false);
00047     }
00048 }
00049 
00050 FlatFileRowDescriptor::FlatFileRowDescriptor() :
00051     std::vector<FlatFileColumnDescriptor>()
00052 {
00053     bounded = true;
00054 }
00055 
00056 void FlatFileRowDescriptor::setUnbounded()
00057 {
00058     bounded = false;
00059 }
00060 
00061 bool FlatFileRowDescriptor::isBounded() const
00062 {
00063     return bounded;
00064 }
00065 
00066 FlatFileRowParseResult::FlatFileRowParseResult()
00067 {
00068     reset();
00069 }
00070 
00071 void FlatFileRowParseResult::reset()
00072 {
00073     status = NO_STATUS;
00074     current = next = NULL;
00075     nRowDelimsRead = 0;
00076 }
00077 
00078 FlatFileParser::FlatFileParser(
00079     char fieldDelim, char rowDelim, char quote, char escape, bool doTrim)
00080 {
00081     this->fieldDelim = fieldDelim;
00082     this->rowDelim = rowDelim;
00083     this->quote = quote;
00084     this->escape = escape;
00085     this->doTrim = doTrim;
00086 
00087     fixed = (fieldDelim == 0);
00088     if (fixed) {
00089         assert(quote == 0);
00090         assert(escape == 0);
00091     }
00092 }
00093 
00094 void FlatFileParser::scanRow(
00095     const char *buffer,
00096     int size,
00097     const FlatFileRowDescriptor &columns,
00098     FlatFileRowParseResult &result)
00099 {
00100     assert(size >= 0);
00101     const char *row = buffer;
00102     uint offset = 0;
00103     FlatFileColumnParseResult columnResult;
00104 
00105     result.status = FlatFileRowParseResult::NO_STATUS;
00106     bool bounded = columns.isBounded();
00107     bool lenient = columns.isLenient();
00108     bool mapped = columns.isMapped();
00109     bool strict = (bounded && (!lenient));
00110 
00111     uint maxColumns = columns.getMaxColumns();
00112     uint resultColumns = columns.size();
00113     if (bounded) {
00114         result.resize(resultColumns);
00115         for (uint i = 0; i < resultColumns; i++) {
00116             result.setNull(i);
00117         }
00118     } else {
00119         result.clear();
00120     }
00121 
00122     // Scan any initial row delimiters, helps for the case when a row
00123     // delimiter is multiple characters like \r\n and the delimiter
00124     // characters are split between two buffers. (The previous row could
00125     // be complete due to \r, and parsing could begin at \n.
00126     const char *nonDelim = scanRowDelim(row, size, false);
00127     offset = nonDelim - row;
00128 
00129     bool done = false;
00130     bool rowDelim = false;
00131     for (uint i = 0; i < maxColumns; i++) {
00132         uint maxLength = columns.getMaxLength(i);
00133         scanColumn(
00134             row + offset,
00135             size - offset,
00136             maxLength,
00137             columnResult);
00138         switch (columnResult.type) {
00139         case FlatFileColumnParseResult::NO_DELIM:
00140             result.status = FlatFileRowParseResult::INCOMPLETE_COLUMN;
00141             done = true;
00142             break;
00143         case FlatFileColumnParseResult::ROW_DELIM:
00144             if (strict && (i+1 != columns.size())) {
00145                 if (i == 0) {
00146                     result.status = FlatFileRowParseResult::NO_COLUMN_DELIM;
00147                 } else {
00148                     result.status = FlatFileRowParseResult::TOO_FEW_COLUMNS;
00149                 }
00150             }
00151             done = true;
00152             rowDelim = true;
00153             break;
00154         case FlatFileColumnParseResult::MAX_LENGTH:
00155         case FlatFileColumnParseResult::FIELD_DELIM:
00156             if (strict && (i+1 == columns.size())) {
00157                 result.status = FlatFileRowParseResult::TOO_MANY_COLUMNS;
00158                 done = true;
00159             }
00160             break;
00161         default:
00162             permAssert(false);
00163         }
00164         if (bounded) {
00165             int target = mapped ? columns.getMap(i) : i;
00166             if (target >= 0) {
00167                 assert (target < resultColumns);
00168                 result.setColumn(target, offset, columnResult.size);
00169             }
00170         } else {
00171             result.addColumn(offset, columnResult.size);
00172         }
00173         offset = columnResult.next - row;
00174         if (done) {
00175             break;
00176         }
00177     }
00178     result.current = const_cast<char *>(row);
00179     result.next = const_cast<char *>(
00180         scanRowEnd(
00181             columnResult.next,
00182             buffer + size - columnResult.next,
00183             rowDelim,
00184             result));
00185 }
00186 
00187 const char *FlatFileParser::scanRowEnd(
00188     const char *buffer,
00189     int size,
00190     bool rowDelim,
00191     FlatFileRowParseResult &result)
00192 {
00193     const char *read = buffer;
00194     const char *end = buffer + size;
00195     switch (result.status) {
00196     case FlatFileRowParseResult::INCOMPLETE_COLUMN:
00197     case FlatFileRowParseResult::ROW_TOO_LARGE:
00198         assert(read == end);
00199         return read;
00200     default:
00201         break;
00202     }
00203 
00204     // if a row delimiter was not encountered while scanning the row,
00205     // search for the next row delimiter character
00206     if (!rowDelim) {
00207         read = scanRowDelim(read, end - read, true);
00208         if (read == end) {
00209             return read;
00210         }
00211     }
00212     result.nRowDelimsRead++;
00213 
00214     // search for the first non- row delimiter character
00215     read = scanRowDelim(read, end - read, false);
00216     return read;
00217 }
00218 
00219 const char *FlatFileParser::scanRowDelim(
00220     const char *buffer,
00221     int size,
00222     bool search)
00223 {
00224     const char *read = buffer;
00225     const char *end = buffer + size;
00226     while (read < end) {
00227         if (isRowDelim(*read) == search) {
00228             break;
00229         } else {
00230             read++;
00231         }
00232     }
00233     return read;
00234 }
00235 
00236 bool FlatFileParser::isRowDelim(char c)
00237 {
00238     assert(rowDelim != '\r');
00239     return (rowDelim == '\n') ? (c == '\r' || c == '\n') : (c == rowDelim);
00240 }
00241 
00242 void FlatFileParser::scanColumn(
00243     const char *buffer,
00244     uint size,
00245     uint maxLength,
00246     FlatFileColumnParseResult &result)
00247 {
00248     if (fixed) {
00249         return scanFixedColumn(buffer, size, maxLength, result);
00250     }
00251 
00252     assert(buffer != NULL);
00253     const char *read = buffer;
00254     const char *end = buffer + size;
00255 
00256     // read past leading spaces before checking for quotes
00257     if (doTrim) {
00258         while (read < end && SPACE_CHAR == *read) {
00259             read++;
00260         }
00261     }
00262 
00263     bool quoted = (read < end && *read == quote);
00264     bool quoteEscape = (quoted && quote == escape);
00265 
00266     FlatFileColumnParseResult::DelimiterType type =
00267         FlatFileColumnParseResult::NO_DELIM;
00268     if (quoted) {
00269         read++;
00270     }
00271     while (read < end) {
00272         if (*read == quote) {
00273             read++;
00274             if (quoteEscape) {
00275                 // read next character to determine whether purpose of
00276                 // this character is an escape character or an end quote
00277                 if (read == end) {
00278                     break;
00279                 }
00280                 if (*read == quote) {
00281                     // two consecutive quote/escape characters is an
00282                     // escaped quote
00283                     read++;
00284                     continue;
00285                 }
00286             }
00287             if (quoted) {
00288                 // otherwise a quote may be a close quote
00289                 quoteEscape = quoted = false;
00290             }
00291         } else if (*read == escape) {
00292             read++;
00293             // an escape escapes the next character
00294             if (read == end) {
00295                 break;
00296             }
00297             read++;
00298         } else if (quoted) {
00299             read++;
00300         } else if (*read == fieldDelim) {
00301             type = FlatFileColumnParseResult::FIELD_DELIM;
00302             break;
00303         } else if (isRowDelim(*read)) {
00304             type = FlatFileColumnParseResult::ROW_DELIM;
00305             break;
00306         } else {
00307             read++;
00308         }
00309     }
00310 
00311     uint resultSize = read - buffer;
00312     result.setResult(type, const_cast<char *>(buffer), resultSize);
00313 }
00314 
00315 void FlatFileParser::scanFixedColumn(
00316     const char *buffer,
00317     uint size,
00318     uint maxLength,
00319     FlatFileColumnParseResult &result)
00320 {
00321     assert(buffer != NULL);
00322     const char *read = buffer;
00323     const char *end = buffer + size;
00324     uint remaining = maxLength;
00325 
00326     FlatFileColumnParseResult::DelimiterType type =
00327         FlatFileColumnParseResult::NO_DELIM;
00328     while (read < end && remaining > 0) {
00329         if (isRowDelim(*read)) {
00330             type = FlatFileColumnParseResult::ROW_DELIM;
00331             break;
00332         }
00333         read++;
00334         remaining--;
00335     }
00336 
00337     // Resolve delimiter type if another character can be read. This allows
00338     // us to catch the case where a row delimiter follows a max length field.
00339     if (type == FlatFileColumnParseResult::NO_DELIM && read < end) {
00340         if (isRowDelim(*read)) {
00341             type = FlatFileColumnParseResult::ROW_DELIM;
00342         } else if (remaining == 0) {
00343             type = FlatFileColumnParseResult::MAX_LENGTH;
00344         }
00345     }
00346 
00347     uint resultSize = read - buffer;
00348     result.setResult(type, const_cast<char *>(buffer), resultSize);
00349 }
00350 
00351 void FlatFileParser::stripQuoting(
00352     FlatFileRowParseResult &rowResult,
00353     bool trim)
00354 {
00355     int nFields = rowResult.getReadCount();
00356 
00357     if (rowResult.strippedSizes.size() < nFields) {
00358         rowResult.strippedSizes.resize(nFields);
00359     }
00360 
00361     for (uint i = 0; i < nFields; i++) {
00362         char *value = rowResult.getColumn(i);
00363         uint newSize = 0;
00364         if (value != NULL) {
00365             uint oldSize = rowResult.getRawColumnSize(i);
00366             newSize = stripQuoting(value, oldSize, trim);
00367         }
00368         rowResult.strippedSizes[i] = newSize;
00369     }
00370 }
00371 
00372 uint FlatFileParser::stripQuoting(
00373     char *buffer, uint sizeIn, bool untrimmed)
00374 {
00375     assert(buffer != NULL);
00376     if (sizeIn == 0) {
00377         return 0;
00378     }
00379     int size = untrimmed ? trim(buffer, sizeIn) : sizeIn;
00380     bool quoted = false;
00381     char *read = buffer;
00382     char *end = buffer + size;
00383     char *write = buffer;
00384 
00385     if (*buffer == quote) {
00386         quoted = true;
00387         read++;
00388     }
00389     bool quoteEscape = (quoted && quote == escape);
00390     while (read < end) {
00391         if (quoteEscape && *read == quote) {
00392             read++;
00393             if ((read < end) && (*read == quote)) {
00394                 // two consecutive quote/escape characters is an escaped quote
00395                 *write++ = *read++;
00396             } else {
00397                 // single quote/escape is end quote
00398                 break;
00399             }
00400         } else if (quoted && *read == quote) {
00401             break;
00402         } else if (*read == escape) {
00403             read++;
00404             if (read < end) {
00405                 *write++ = *read++;
00406             }
00407         } else {
00408             *write++ = *read++;
00409         }
00410     }
00411     return write - buffer;
00412 }
00413 
00414 uint FlatFileParser::trim(char *buffer, uint size)
00415 {
00416     assert(buffer != NULL);
00417     if (size == 0) {
00418         return 0;
00419     }
00420     char *read = buffer;
00421     char *write = buffer;
00422     char *end = buffer + size;
00423 
00424     while (read < end && *read == ' ') {
00425         read++;
00426     }
00427     end--;
00428     while (end >= read && *end == ' ') {
00429         end--;
00430     }
00431     end++;
00432     while (read < end) {
00433         *write++ = *read++;
00434     }
00435     return write - buffer;
00436 }
00437 
00438 FENNEL_END_CPPFILE("$Id: //open/dev/fennel/flatfile/FlatFileParser.cpp#1 $");
00439 
00440 // End FlatFileParser.cpp

Generated on Mon Jun 22 04:00:19 2009 for Fennel by  doxygen 1.5.1