FlatFileParser.h

Go to the documentation of this file.
00001 /*
00002 // $Id: //open/dev/fennel/flatfile/FlatFileParser.h#2 $
00003 // Fennel is a library of data storage and processing components.
00004 // Copyright (C) 2005-2009 The Eigenbase Project
00005 // Copyright (C) 2009-2009 SQLstream, Inc.
00006 // Copyright (C) 2005-2009 LucidEra, Inc.
00007 //
00008 // This program is free software; you can redistribute it and/or modify it
00009 // under the terms of the GNU General Public License as published by the Free
00010 // Software Foundation; either version 2 of the License, or (at your option)
00011 // any later version approved by The Eigenbase Project.
00012 //
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public License for more details.
00017 //
00018 // You should have received a copy of the GNU General Public License
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00021 */
00022 
00023 #ifndef Fennel_FlatFileParser_Included
00024 #define Fennel_FlatFileParser_Included
00025 
00026 
00027 #include <vector>
00028 
00029 FENNEL_BEGIN_NAMESPACE
00030 
00031 class FlatFileParser;
00032 typedef boost::shared_ptr<FlatFileParser> SharedFlatFileParser;
00033 
00037 class FENNEL_FLATFILE_EXPORT FlatFileColumnParseResult
00038 {
00039 public:
00041     enum DelimiterType {
00043         NO_DELIM = 0,
00045         FIELD_DELIM,
00047         ROW_DELIM,
00049         MAX_LENGTH
00050     };
00051 
00055     DelimiterType type;
00056 
00060     uint size;
00061 
00065     char *next;
00066 
00071     void setResult(DelimiterType type, char *buffer, uint size);
00072 };
00073 
00077 class FENNEL_FLATFILE_EXPORT FlatFileRowParseResult
00078 {
00079 public:
00081     enum RowStatus {
00085         NO_STATUS = 0,
00090         INCOMPLETE_COLUMN,
00094         ROW_TOO_LARGE,
00099         NO_COLUMN_DELIM,
00103         TOO_FEW_COLUMNS,
00107         TOO_MANY_COLUMNS
00108     };
00109 
00110     explicit FlatFileRowParseResult();
00111     void reset();
00112 
00116     RowStatus status;
00117 
00121     VectorOfUint offsets;
00122 
00126     VectorOfUint sizes;
00127 
00131     VectorOfUint strippedSizes;
00132 
00136     char *current;
00137 
00142     char *next;
00143 
00147     uint nRowDelimsRead;
00148 
00152     uint getReadCount()
00153     {
00154         return offsets.size();
00155     }
00156 
00163     char *getColumn(uint iColumn)
00164     {
00165         if (sizes[iColumn] == 0) {
00166             return NULL;
00167         }
00168         return current + offsets[iColumn];
00169     }
00170 
00174     uint getRawColumnSize(uint iColumn)
00175     {
00176         return sizes[iColumn];
00177     }
00178 
00182     uint getColumnSize(uint iColumn)
00183     {
00184         return strippedSizes[iColumn];
00185     }
00186 
00190     void clear()
00191     {
00192         offsets.clear();
00193         sizes.clear();
00194     }
00195 
00199     void resize(uint nColumns)
00200     {
00201         offsets.resize(nColumns);
00202         sizes.resize(nColumns);
00203     }
00204 
00208     void setColumn(uint iColumn, uint offset, uint size)
00209     {
00210         offsets[iColumn] = offset;
00211         sizes[iColumn] = size;
00212     }
00213 
00217     void setNull(uint iColumn)
00218     {
00219         setColumn(iColumn, 0, 0);
00220     }
00221 
00225     void addColumn(uint offset, uint size)
00226     {
00227         offsets.push_back(offset);
00228         sizes.push_back(size);
00229     }
00230 };
00231 
00235 class FENNEL_FLATFILE_EXPORT FlatFileColumnDescriptor
00236 {
00237 public:
00238     uint maxLength;
00239 
00240 #ifdef __MSVC__
00241     explicit FlatFileColumnDescriptor()
00242     {
00243         maxLength = 0;
00244     }
00245 #endif
00246 
00247     explicit FlatFileColumnDescriptor(uint maxLengthInit)
00248     {
00249         maxLength = maxLengthInit;
00250     }
00251 };
00252 
00261 class FENNEL_FLATFILE_EXPORT FlatFileRowDescriptor
00262     : public std::vector<FlatFileColumnDescriptor>
00263 {
00264     bool bounded;
00265     bool lenient;
00266 
00267     VectorOfUint columnMap;
00268 
00269 public:
00273     static const int MAX_COLUMNS = 1024;
00274 
00279     static const int MAX_COLUMN_LENGTH = 65535;
00280 
00284     FlatFileRowDescriptor();
00285 
00290     void setUnbounded();
00291 
00296     bool isBounded() const;
00297 
00304     void setMap(VectorOfUint map)
00305     {
00306         columnMap = map;
00307     }
00308 
00312     bool isMapped() const
00313     {
00314         return columnMap.size() > 0;
00315     }
00316 
00321     int getMap(uint iSource) const
00322     {
00323         if (iSource >= columnMap.size()) {
00324             return -1;
00325         }
00326         return columnMap[iSource];
00327     }
00328 
00329     void setLenient(bool lenientIn)
00330     {
00331         lenient = lenientIn;
00332     }
00333 
00334     bool isLenient() const
00335     {
00336         return lenient;
00337     }
00338 
00345     uint getMaxColumns() const
00346     {
00347         if (!bounded) {
00348             return MAX_COLUMNS;
00349         } else if (isMapped()) {
00350             return columnMap.size();
00351         } else {
00352             return size();
00353         }
00354     }
00355 
00361     uint getMaxLength(uint i) const
00362     {
00363         uint realIndex = 0;
00364         if (!bounded) {
00365             return MAX_COLUMN_LENGTH;
00366         } else if (isMapped()) {
00367             realIndex = getMap(i);
00368         } else {
00369             realIndex = i;
00370         }
00371         if (realIndex < 0 || realIndex >= size()) {
00372             return MAX_COLUMN_LENGTH;
00373         } else {
00374             return (*this)[realIndex].maxLength;
00375         }
00376     }
00377 };
00378 
00389 class FENNEL_FLATFILE_EXPORT FlatFileParser
00390 {
00391     char fieldDelim;
00392     char rowDelim;
00393     char quote;
00394     char escape;
00395     bool doTrim;
00396 
00400     bool fixed;
00401 
00417     const char *scanRowEnd(
00418         const char *buffer,
00419         int size,
00420         bool rowDelim,
00421         FlatFileRowParseResult &result);
00422 
00435     const char *scanRowDelim(
00436         const char *buffer,
00437         int size,
00438         bool search);
00439 
00445     bool isRowDelim(char c);
00446 
00447 public:
00463     FlatFileParser(
00464         const char fieldDelim,
00465         const char rowDelim,
00466         const char quote,
00467         const char escape,
00468         bool doTrim = false);
00469 
00498     void scanRow(
00499         const char *buffer,
00500         int size,
00501         const FlatFileRowDescriptor &columns,
00502         FlatFileRowParseResult &result);
00503 
00518     void scanColumn(
00519         const char *buffer,
00520         uint size,
00521         uint maxLength,
00522         FlatFileColumnParseResult &result);
00523 
00524 
00531     void scanFixedColumn(
00532         const char *buffer,
00533         uint size,
00534         uint maxLength,
00535         FlatFileColumnParseResult &result);
00536 
00545     void stripQuoting(
00546         FlatFileRowParseResult &rowResult,
00547         bool trim);
00548 
00571     uint stripQuoting(char *buffer, uint size, bool untrimmed);
00572 
00582     uint trim(char *buffer, uint size);
00583 };
00584 
00585 FENNEL_END_NAMESPACE
00586 
00587 #endif
00588 
00589 // End FlatFileParser.h

Generated on Mon Jun 22 04:00:19 2009 for Fennel by  doxygen 1.5.1