BTreeBuilder.cpp

Go to the documentation of this file.
00001 /*
00002 // $Id: //open/dev/fennel/btree/BTreeBuilder.cpp#12 $
00003 // Fennel is a library of data storage and processing components.
00004 // Copyright (C) 2005-2009 The Eigenbase Project
00005 // Copyright (C) 2005-2009 SQLstream, Inc.
00006 // Copyright (C) 2005-2009 LucidEra, Inc.
00007 // Portions Copyright (C) 1999-2009 John V. Sichi
00008 //
00009 // This program is free software; you can redistribute it and/or modify it
00010 // under the terms of the GNU General Public License as published by the Free
00011 // Software Foundation; either version 2 of the License, or (at your option)
00012 // any later version approved by The Eigenbase Project.
00013 //
00014 // This program is distributed in the hope that it will be useful,
00015 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 // GNU General Public License for more details.
00018 //
00019 // You should have received a copy of the GNU General Public License
00020 // along with this program; if not, write to the Free Software
00021 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00022 */
00023 
00024 #include "fennel/common/CommonPreamble.h"
00025 #include "fennel/btree/BTreeBuilder.h"
00026 #include "fennel/btree/BTreeBuildLevel.h"
00027 #include "fennel/btree/BTreeAccessBaseImpl.h"
00028 #include "fennel/btree/BTreeReader.h"
00029 #include "fennel/segment/SegInputStream.h"
00030 #include "fennel/segment/SegOutputStream.h"
00031 
00032 FENNEL_BEGIN_CPPFILE("$Id: //open/dev/fennel/btree/BTreeBuilder.cpp#12 $");
00033 
00034 BTreeBuilder::BTreeBuilder(
00035     BTreeDescriptor const &descriptor,
00036     SharedSegment pTempSegmentInit)
00037     : BTreeAccessBase(descriptor)
00038 {
00039     pTempSegment = pTempSegmentInit;
00040 }
00041 
00042 BTreeBuilder::~BTreeBuilder()
00043 {
00044 }
00045 
00046 uint BTreeBuilder::calculateNodesOnLevel(
00047     uint nEntries,uint nEntriesPerNode)
00048 {
00049     uint nNodes = nEntries / nEntriesPerNode;
00050     if (nEntries % nEntriesPerNode) {
00051         // need an extra node for the remainder
00052         ++nNodes;
00053     }
00054     return nNodes;
00055 }
00056 
00057 void BTreeBuilder::build(
00058     ByteInputStream &sortedInputStream,
00059     RecordNum nEntriesTotal,
00060     double fillFactor)
00061 {
00062     assert(fillFactor <= 1.0);
00063     assert(fillFactor > 0);
00064     try {
00065         if (pLeafNodeAccessor->hasFixedWidthEntries()
00066             && pNonLeafNodeAccessor->hasFixedWidthEntries())
00067         {
00068             buildBalanced(sortedInputStream,0,nEntriesTotal,fillFactor);
00069         } else if (pNonLeafNodeAccessor->hasFixedWidthEntries()) {
00070             buildTwoPass(
00071                 sortedInputStream,nEntriesTotal,fillFactor);
00072         } else {
00073             assert(!pLeafNodeAccessor->hasFixedWidthEntries());
00074             buildUnbalanced(sortedInputStream,nEntriesTotal,fillFactor);
00075         }
00076     } catch (...) {
00077         try {
00078             levels.clear();
00079         } catch (...) {
00080             // TODO:  trace suppressed excn
00081         }
00082         throw;
00083     }
00084     levels.clear();
00085 }
00086 
00087 void BTreeBuilder::buildTwoPass(
00088     ByteInputStream &sortedInputStream,
00089     RecordNum nEntriesTotal,
00090     double fillFactor)
00091 {
00092     // calculate amount of space to reserve on each leaf from fillfactor
00093     uint cbReserved = getSegment()->getUsablePageSize() - sizeof(BTreeNode);
00094     cbReserved = uint(cbReserved*(1-fillFactor));
00095 
00096     levels.resize(1);
00097     BTreeNodeAccessor &nodeAccessor = *pLeafNodeAccessor;
00098     assert(!nodeAccessor.hasFixedWidthEntries());
00099     VariableBuildLevel *pLeafLevel = new VariableBuildLevel(*this,nodeAccessor);
00100     levels[0].reset(pLeafLevel);
00101 
00102     BTreeBuildLevel &level = getLevel(0);
00103     level.nEntriesTotal = nEntriesTotal;
00104     level.cbReserved = cbReserved;
00105 
00106     level.allocatePage();
00107 
00108     // feed data into the leaf level; this will collect the entries for
00109     // level 1 (the parent of the leaf level) in a temp stream
00110     level.processInput(sortedInputStream);
00111 
00112     level.indexLastKey(true);
00113 
00114     // now we know how many entries to expect for level 1:  the number of
00115     // leaf nodes just filled
00116     RecordNum nEntriesParent = level.iNode + 1;
00117 
00118     if (nEntriesParent == 1) {
00119         // The leaf we just filled turns out to be the root.
00120         swapRoot();
00121         return;
00122     }
00123 
00124     // REVIEW:  distinguish leaf fillFactor from non-leaf fillFactor?
00125 
00126     SharedSegInputStream pParentInputStream =
00127         pLeafLevel->getParentKeyStream();
00128     assert(pParentInputStream);
00129     // feed buffered entries into level 1, and build up balanced from there
00130     buildBalanced(*pParentInputStream,1,nEntriesParent,fillFactor);
00131 }
00132 
00133 void BTreeBuilder::swapRoot()
00134 {
00135     BTreeBuildLevel &rootLevel = getLevel(getRootHeight());
00136     if (getRootPageId() == NULL_PAGE_ID) {
00137         // No rootPageId has been assigned yet, so no need to copy.
00138         treeDescriptor.rootPageId = rootLevel.pageId;
00139         return;
00140     }
00141 
00142     // We're supposed to preserve the PageId of the existing root, so
00143     // swap buffers with the original root.
00144     BTreePageLock reusedRootPageLock(treeDescriptor.segmentAccessor);
00145     reusedRootPageLock.lockExclusive(getRootPageId());
00146     BTreeNode &reusedRoot = reusedRootPageLock.getNodeForWrite();
00147     assert(!reusedRoot.nEntries);
00148     reusedRootPageLock.swapBuffers(rootLevel.pageLock);
00149 
00150     // deallocate the abandonded tempRoot
00151     rootLevel.pageLock.deallocateLockedPage();
00152 }
00153 
00154 void BTreeBuilder::buildUnbalanced(
00155     ByteInputStream &sortedInputStream,
00156     RecordNum nEntriesTotal,
00157     double fillFactor)
00158 {
00159     // TODO:  common fcn
00160     // calculate amount of space to reserve on each leaf from fillfactor
00161     uint cbReserved = getSegment()->getUsablePageSize() - sizeof(BTreeNode);
00162     cbReserved = uint(cbReserved*(1-fillFactor));
00163 
00164     // start with just a leaf level
00165     growTree();
00166     BTreeBuildLevel &level = getLevel(0);
00167     level.cbReserved = cbReserved;
00168     level.nEntriesTotal = nEntriesTotal;
00169     level.processInput(sortedInputStream);
00170 
00171     // NOTE:  It's important to realize that growTree() could be called inside
00172     // this loop, so it's necessary to recompute levels.size() after each
00173     // iteration.
00174     for (uint i = 0; i < levels.size(); ++i) {
00175         BTreeBuildLevel &level = getLevel(i);
00176         level.indexLastKey(true);
00177     }
00178 
00179     swapRoot();
00180 }
00181 
00182 void BTreeBuilder::buildBalanced(
00183     ByteInputStream &sortedInputStream,
00184     uint iInputLevel,
00185     RecordNum nEntriesTotal,
00186     double fillFactor)
00187 {
00188     // First, determine node capacities based on fixed-width entry sizes.  This
00189     // gives us the maximum fanout.
00190 
00191     uint nEntriesPerNonLeaf =
00192         getSegment()->getUsablePageSize() - sizeof(BTreeNode);
00193     nEntriesPerNonLeaf /=
00194         pNonLeafNodeAccessor->getEntryByteCount(
00195             pNonLeafNodeAccessor->tupleAccessor.getMaxByteCount());
00196 
00197     uint nEntriesPerLeaf =
00198         getSegment()->getUsablePageSize() - sizeof(BTreeNode);
00199     nEntriesPerLeaf /=
00200         pLeafNodeAccessor->getEntryByteCount(
00201             pLeafNodeAccessor->tupleAccessor.getMaxByteCount());
00202 
00203     nEntriesPerNonLeaf = uint(nEntriesPerNonLeaf*fillFactor);
00204     nEntriesPerLeaf = uint(nEntriesPerLeaf*fillFactor);
00205 
00206     if (!nEntriesPerNonLeaf) {
00207         nEntriesPerNonLeaf = 1;
00208     }
00209     if (!nEntriesPerLeaf) {
00210         nEntriesPerLeaf = 1;
00211     }
00212 
00213     // Next, calculate how high a "full" tree with this fanout would have to be
00214     // in order to accommodate the expected number of entries.  In most cases
00215     // the tree won't actually be full, but the height can't be any lower than
00216     // this.
00217 
00218     RecordNum nEntriesFull = iInputLevel ? nEntriesPerNonLeaf : nEntriesPerLeaf;
00219     uint nLevels = iInputLevel + 1;
00220     while (nEntriesFull < nEntriesTotal) {
00221         ++nLevels;
00222         nEntriesFull *= nEntriesPerNonLeaf;
00223     }
00224     levels.resize(nLevels);
00225 
00226     // Now, calculate how many entries to expect on each level.  We could do
00227     // the per-level balancing here as well, but then we'd have to keep around
00228     // an in-memory structure proportional to the number of nodes in the tree.
00229     // Instead, we calculate the balancing on the fly later.
00230     RecordNum nEntriesInLevel = nEntriesTotal;
00231     for (uint i = iInputLevel; i < levels.size(); i++) {
00232         BTreeNodeAccessor &nodeAccessor =
00233             i ? *pNonLeafNodeAccessor : *pLeafNodeAccessor;
00234 
00235         assert(nodeAccessor.hasFixedWidthEntries());
00236         levels[i].reset(new FixedBuildLevel(*this,nodeAccessor));
00237 
00238         BTreeBuildLevel &level = getLevel(i);
00239         level.iLevel = i;
00240         level.nEntriesTotal = nEntriesInLevel;
00241 
00242         // number of parent entries is same as number of child nodes
00243         nEntriesInLevel = calculateNodesOnLevel(
00244             nEntriesInLevel,
00245             i ? nEntriesPerNonLeaf : nEntriesPerLeaf);
00246 
00247         if (i == getRootHeight()) {
00248             // Set up the root info, which can be fully determined ahead of
00249             // time.
00250             level.nEntriesPerNode = level.nEntriesTotal;
00251             if (getRootPageId() == NULL_PAGE_ID) {
00252                 level.allocatePage();
00253                 treeDescriptor.rootPageId = level.pageId;
00254             } else {
00255                 // We're from Berkeley, so we reuse the existing root rather
00256                 // than allocating a new one.
00257                 level.pageId = getRootPageId();
00258                 level.pageLock.lockExclusive(level.pageId);
00259                 BTreeNode &node = level.pageLock.getNodeForWrite();
00260                 assert(!node.nEntries);
00261                 level.nodeAccessor.clearNode(
00262                     node,getSegment()->getUsablePageSize());
00263                 node.height = i;
00264             }
00265         } else {
00266             // Allocate the first empty page of a non-root level.
00267             level.allocatePage();
00268         }
00269         if (i) {
00270             // Prepare the first page of a non-leaf level.
00271             // Calculate balancing for first child node.
00272             BTreeBuildLevel &childLevel = getLevel(i - 1);
00273             childLevel.nEntriesPerNode = calculateChildEntriesPerNode(
00274                 level.nEntriesTotal,
00275                 childLevel.nEntriesTotal,
00276                 0);
00277         }
00278     }
00279 
00280     // should end up with exactly one node at the root level, which corresponds
00281     // to one entry in an imaginary parent of the root
00282     assert(nEntriesInLevel == 1);
00283 
00284     // feed data into the correct level
00285     getLevel(iInputLevel).processInput(sortedInputStream);
00286 
00287     // finalize rightist fringe
00288     for (uint i = iInputLevel; i < levels.size(); ++i) {
00289         BTreeBuildLevel &level = getLevel(i);
00290         level.indexLastKey(true);
00291         assert(level.isFinished());
00292     }
00293 }
00294 
00295 uint BTreeBuilder::calculateChildEntriesPerNode(
00296     RecordNum parentLevelTotalEntries,
00297     RecordNum childLevelTotalEntries,
00298     RecordNum parentLevelProcessedEntries)
00299 {
00300     // Determine the desired fanout.  This is non-integral, so our job is to
00301     // distribute it as evenly as possible.
00302     // TODO:  we could remember this instead of recalculating it.
00303     double fanout =
00304         double(childLevelTotalEntries) / double(parentLevelTotalEntries);
00305 
00306     uint childLevelNewTotal;
00307     if (parentLevelProcessedEntries == (parentLevelTotalEntries - 1)) {
00308         // Last child node needs to make everything come out exact.
00309         childLevelNewTotal = childLevelTotalEntries;
00310     } else {
00311         childLevelNewTotal =
00312             uint(0.5 + fanout * (parentLevelProcessedEntries + 1));
00313     }
00314 
00315     // Use what we returned from the previous call as a baseline.
00316     // TODO:  we could remember this instead of recalculating it.
00317     uint childLevelPrevTotal = uint(0.5 + fanout * parentLevelProcessedEntries);
00318 
00319     uint n = childLevelNewTotal - childLevelPrevTotal;
00320     assert(n > 0);
00321     return n;
00322 }
00323 
00324 void BTreeBuilder::createEmptyRoot()
00325 {
00326     assert(getRootPageId() == NULL_PAGE_ID);
00327     BTreePageLock pageLock;
00328     pageLock.accessSegment(treeDescriptor.segmentAccessor);
00329     treeDescriptor.rootPageId = pageLock.allocatePage(getPageOwnerId());
00330     BTreeNode &node = pageLock.getNodeForWrite();
00331     pLeafNodeAccessor->clearNode(
00332         node,getSegment()->getUsablePageSize());
00333 }
00334 
00335 void BTreeBuilder::growTree()
00336 {
00337     BTreeNodeAccessor &nodeAccessor =
00338         levels.size() ? *pNonLeafNodeAccessor : *pLeafNodeAccessor;
00339     levels.resize(levels.size() + 1);
00340     levels[getRootHeight()].reset(new DynamicBuildLevel(*this,nodeAccessor));
00341     BTreeBuildLevel &level = *levels.back();
00342     level.iLevel = getRootHeight();
00343     if (level.iLevel) {
00344         // inherit cbReserved
00345         level.cbReserved = getLevel(level.iLevel - 1).cbReserved;
00346     }
00347     level.allocatePage();
00348 }
00349 
00350 
00351 void BTreeBuilder::truncate(
00352     bool rootless, TupleProjection const *pLeafPageIdProj)
00353 {
00354     if (pLeafPageIdProj) {
00355         truncateExternal(*pLeafPageIdProj);
00356     }
00357     BTreePageLock pageLock;
00358     pageLock.accessSegment(treeDescriptor.segmentAccessor);
00359 
00360     pageLock.lockExclusive(getRootPageId());
00361 
00362     // Try a read-only access to see if we can skip dirtying a page.
00363     BTreeNode const &rootReadOnly = pageLock.getNodeForRead();
00364     if (!rootReadOnly.height) {
00365         if (rootless) {
00366             pageLock.deallocateLockedPage();
00367             treeDescriptor.rootPageId = NULL_PAGE_ID;
00368             return;
00369         }
00370         if (!rootReadOnly.nEntries) {
00371             // root is already empty
00372             return;
00373         }
00374     }
00375 
00376     BTreeNode &root = pageLock.getNodeForWrite();
00377     if (root.height) {
00378         truncateChildren(root);
00379     }
00380     if (rootless) {
00381         pageLock.deallocateLockedPage();
00382         treeDescriptor.rootPageId = NULL_PAGE_ID;
00383     } else {
00384         pLeafNodeAccessor->clearNode(
00385             root,getSegment()->getUsablePageSize());
00386     }
00387 }
00388 
00389 void BTreeBuilder::truncateChildren(BTreeNode const &node)
00390 {
00391     assert(node.height);
00392     assert(node.nEntries);
00393     PageId pageId = getChild(node,0);
00394     BTreePageLock pageLock;
00395     pageLock.accessSegment(treeDescriptor.segmentAccessor);
00396     if (node.height > 1) {
00397         pageLock.lockExclusive(pageId);
00398         truncateChildren(pageLock.getNodeForRead());
00399         pageLock.unlock();
00400     }
00401     while (pageId != NULL_PAGE_ID) {
00402         PageId nextPageId = getRightSibling(pageId);
00403         pageLock.deallocateUnlockedPage(pageId);
00404         pageId = nextPageId;
00405     }
00406 }
00407 
00408 void BTreeBuilder::truncateExternal(TupleProjection const &leafPageIdProj)
00409 {
00410     // REVIEW jvs 24-Dec-2005:  Here we pre-scan the tree, dropping
00411     // the external pages.  This scan could be combined with
00412     // the main truncate traversal for better efficiency.
00413     BTreeReader reader(treeDescriptor);
00414     TupleProjectionAccessor projAccessor;
00415     projAccessor.bind(
00416         reader.getTupleAccessorForRead(),
00417         leafPageIdProj);
00418     TupleDescriptor projDesc;
00419     projDesc.projectFrom(treeDescriptor.tupleDescriptor, leafPageIdProj);
00420     TupleData projData(projDesc);
00421     BTreePageLock pageLock;
00422     pageLock.accessSegment(treeDescriptor.segmentAccessor);
00423     if (reader.searchFirst()) {
00424         do {
00425             projAccessor.unmarshal(projData);
00426             for (uint i = 0; i < projData.size(); ++i) {
00427                 if (!projData[i].pData) {
00428                     continue;
00429                 }
00430                 PageId pageId = *reinterpret_cast<PageId const *>(
00431                     projData[i].pData);
00432                 pageLock.deallocateUnlockedPage(pageId);
00433             }
00434         } while (reader.searchNext());
00435     }
00436 }
00437 
00438 FENNEL_END_CPPFILE("$Id: //open/dev/fennel/btree/BTreeBuilder.cpp#12 $");
00439 
00440 // End BTreeBuilder.cpp

Generated on Mon Jun 22 04:00:13 2009 for Fennel by  doxygen 1.5.1