diff options
Diffstat (limited to 'src/backend/access/gist')
-rw-r--r-- | src/backend/access/gist/Makefile | 2 | ||||
-rw-r--r-- | src/backend/access/gist/README | 299 | ||||
-rw-r--r-- | src/backend/access/gist/gist.c | 1063 | ||||
-rw-r--r-- | src/backend/access/gist/gistget.c | 838 | ||||
-rw-r--r-- | src/backend/access/gist/gistproc.c | 113 | ||||
-rw-r--r-- | src/backend/access/gist/gistscan.c | 222 | ||||
-rw-r--r-- | src/backend/access/gist/gistsplit.c | 85 | ||||
-rw-r--r-- | src/backend/access/gist/gistutil.c | 108 | ||||
-rw-r--r-- | src/backend/access/gist/gistvacuum.c | 70 | ||||
-rw-r--r-- | src/backend/access/gist/gistxlog.c | 804 |
10 files changed, 1876 insertions, 1728 deletions
diff --git a/src/backend/access/gist/Makefile b/src/backend/access/gist/Makefile index 298e9309f5..f8051a2b45 100644 --- a/src/backend/access/gist/Makefile +++ b/src/backend/access/gist/Makefile @@ -4,7 +4,7 @@ # Makefile for access/gist # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/gist/Makefile,v 1.18 2008/02/19 10:30:06 petere Exp $ +# src/backend/access/gist/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 6c90e508bf..2d78dcb0df 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/gist/README,v 1.5 2010/04/14 20:17:26 rhaas Exp $ +src/backend/access/gist/README GiST Indexing ============= @@ -20,33 +20,34 @@ The current implementation of GiST supports: * Variable length keys * Composite keys (multi-key) + * Ordered search (nearest-neighbor search) * provides NULL-safe interface to GiST core * Concurrency * Recovery support via WAL logging -The support for concurrency implemented in PostgreSQL was developed based on -the paper "Access Methods for Next-Generation Database Systems" by +The support for concurrency implemented in PostgreSQL was developed based on +the paper "Access Methods for Next-Generation Database Systems" by Marcel Kornacker: http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz The original algorithms were modified in several ways: -* They should be adapted to PostgreSQL conventions. For example, the SEARCH - algorithm was considerably changed, because in PostgreSQL function search - should return one tuple (next), not all tuples at once. Also, it should +* They had to be adapted to PostgreSQL conventions. 
For example, the SEARCH + algorithm was considerably changed, because in PostgreSQL the search function + should return one tuple (next), not all tuples at once. Also, it should release page locks between calls. -* Since we added support for variable length keys, it's not possible to - guarantee enough free space for all keys on pages after splitting. User - defined function picksplit doesn't have information about size of tuples +* Since we added support for variable length keys, it's not possible to + guarantee enough free space for all keys on pages after splitting. User + defined function picksplit doesn't have information about size of tuples (each tuple may contain several keys as in multicolumn index while picksplit could work with only one key) and pages. -* We modified original INSERT algorithm for performance reason. In particular, +* We modified original INSERT algorithm for performance reasons. In particular, it is now a single-pass algorithm. * Since the papers were theoretical, some details were omitted and we - have to find out ourself how to solve some specific problems. + had to find out ourself how to solve some specific problems. -Because of the above reasons, we have to revised interaction of GiST +Because of the above reasons, we have revised the interaction of GiST core and PostgreSQL WAL system. Moreover, we encountered (and solved) a problem of uncompleted insertions when recovering after crash, which was not touched in the paper. @@ -54,96 +55,127 @@ was not touched in the paper. Search Algorithm ---------------- -Function gettuple finds a tuple which satisfies the search -predicate. It store their state and returns next tuple under -subsequent calls. Stack contains page, its LSN and LSN of parent page -and currentposition is saved between calls. 
+The search code maintains a queue of unvisited items, where an "item" is +either a heap tuple known to satisfy the search conditions, or an index +page that is consistent with the search conditions according to inspection +of its parent page's downlink item. Initially the root page is searched +to find unvisited items in it. Then we pull items from the queue. A +heap tuple pointer is just returned immediately; an index page entry +causes that page to be searched, generating more queue entries. -gettuple(search-pred) - if ( firsttime ) - push(stack, [root, 0, 0]) // page, LSN, parentLSN - currentposition=0 - end - ptr = top of stack - while(true) - latch( ptr->page, S-mode ) - if ( ptr->page->lsn != ptr->lsn ) - ptr->lsn = ptr->page->lsn - currentposition=0 - if ( ptr->parentlsn < ptr->page->nsn ) - add to stack rightlink - else - currentposition++ - end +The queue is kept ordered with heap tuple items at the front, then +index page entries, with any newly-added index page entry inserted +before existing index page entries. This ensures depth-first traversal +of the index, and in particular causes the first few heap tuples to be +returned as soon as possible. That is helpful in case there is a LIMIT +that requires only a few tuples to be produced. - while(true) - currentposition = find_first_match( currentposition ) - if ( currentposition is invalid ) - unlatch( ptr->page ) - pop stack - ptr = top of stack - if (ptr is NULL) - return NULL - break loop - else if ( ptr->page is leaf ) - unlatch( ptr->page ) - return tuple - else - add to stack child page - end - currentposition++ - end - end +To implement nearest-neighbor search, the queue entries are augmented +with distance data: heap tuple entries are labeled with exact distance +from the search argument, while index-page entries must be labeled with +the minimum distance that any of their children could have. 
Then, +queue entries are retrieved in smallest-distance-first order, with +entries having identical distances managed as stated in the previous +paragraph. + +The search algorithm keeps an index page locked only long enough to scan +its entries and queue those that satisfy the search conditions. Since +insertions can occur concurrently with searches, it is possible for an +index child page to be split between the time we make a queue entry for it +(while visiting its parent page) and the time we actually reach and scan +the child page. To avoid missing the entries that were moved to the right +sibling, we detect whether a split has occurred by comparing the child +page's NSN to the LSN that the parent had when visited. If it did, the +sibling page is immediately added to the front of the queue, ensuring that +its items will be scanned in the same order as if they were still on the +original child page. + +As is usual in Postgres, the search algorithm only guarantees to find index +entries that existed before the scan started; index entries added during +the scan might or might not be visited. This is okay as long as all +searches use MVCC snapshot rules to reject heap tuples newer than the time +of scan start. In particular, this means that we need not worry about +cases where a parent page's downlink key is "enlarged" after we look at it. +Any such enlargement would be to add child items that we aren't interested +in returning anyway. Insert Algorithm ---------------- -INSERT guarantees that the GiST tree remains balanced. User defined key method -Penalty is used for choosing a subtree to insert; method PickSplit is used for -the node splitting algorithm; method Union is used for propagating changes +INSERT guarantees that the GiST tree remains balanced. 
User defined key method +Penalty is used for choosing a subtree to insert; method PickSplit is used for +the node splitting algorithm; method Union is used for propagating changes upward to maintain the tree properties. -NOTICE: We modified original INSERT algorithm for performance reason. In -particularly, it is now a single-pass algorithm. +To insert a tuple, we first have to find a suitable leaf page to insert to. +The algorithm walks down the tree, starting from the root, along the path +of smallest Penalty. At each step: -Function findLeaf is used to identify subtree for insertion. Page, in which -insertion is proceeded, is locked as well as its parent page. Functions -findParent and findPath are used to find parent pages, which could be changed -because of concurrent access. Function pageSplit is recurrent and could split -page by more than 2 pages, which could be necessary if keys have different -lengths or more than one key are inserted (in such situation, user defined -function pickSplit cannot guarantee free space on page). +1. Has this page been split since we looked at the parent? If so, it's +possible that we should be inserting to the other half instead, so retreat +back to the parent. +2. If this is a leaf node, we've found our target node. +3. Otherwise use Penalty to pick a new target subtree. +4. Check the key representing the target subtree. If it doesn't already cover +the key we're inserting, replace it with the Union of the old downlink key +and the key being inserted. (Actually, we always call Union, and just skip +the replacement if the Unioned key is the same as the existing key) +5. Replacing the key in step 4 might cause the page to be split. In that case, +propagate the change upwards and restart the algorithm from the first parent +that didn't need to be split. +6. Walk down to the target subtree, and goto 1. + +This differs from the insertion algorithm in the original paper. 
In the +original paper, you first walk down the tree until you reach a leaf page, and +then you adjust the downlink in the parent, and propagating the adjustment up, +all the way up to the root in the worst case. But we adjust the downlinks to +cover the new key already when we walk down, so that when we reach the leaf +page, we don't need to update the parents anymore, except to insert the +downlinks if we have to split the page. This makes crash recovery simpler: +after inserting a key to the page, the tree is immediately self-consistent +without having to update the parents. Even if we split a page and crash before +inserting the downlink to the parent, the tree is self-consistent because the +right half of the split is accessible via the rightlink of the left page +(which replaced the original page). + +Note that the algorithm can walk up and down the tree before reaching a leaf +page, if internal pages need to split while adjusting the downlinks for the +new key. Eventually, you should reach the bottom, and proceed with the +insertion of the new tuple. + +Once we've found the target page to insert to, we check if there's room +for the new tuple. If there is, the tuple is inserted, and we're done. +If it doesn't fit, however, the page needs to be split. Note that it is +possible that a page needs to be split into more than two pages, if keys have +different lengths or more than one key is being inserted at a time (which can +happen when inserting downlinks for a page split that resulted in more than +two pages at the lower level). After splitting a page, the parent page needs +to be updated. The downlink for the new page needs to be inserted, and the +downlink for the old page, which became the left half of the split, needs to +be updated to only cover those tuples that stayed on the left page. Inserting +the downlink in the parent can again lead to a page split, recursing up to the +root page in the worst case. 
+ +gistplacetopage is the workhorse function that performs one step of the +insertion. If the tuple fits, it inserts it to the given page, otherwise +it splits the page, and constructs the new downlink tuples for the split +pages. The caller must then call gistplacetopage() on the parent page to +insert the downlink tuples. The parent page that holds the downlink to +the child might have migrated as a result of concurrent splits of the +parent, gistfindCorrectParent() is used to find the parent page. + +Splitting the root page works slightly differently. At root split, +gistplacetopage() allocates the new child pages and replaces the old root +page with the new root containing downlinks to the new children, all in one +operation. -findLeaf(new-key) - push(stack, [root, 0]) //page, LSN - while(true) - ptr = top of stack - latch( ptr->page, S-mode ) - ptr->lsn = ptr->page->lsn - if ( exists ptr->parent AND ptr->parent->lsn < ptr->page->nsn ) - unlatch( ptr->page ) - pop stack - else if ( ptr->page is not leaf ) - push( stack, [get_best_child(ptr->page, new-key), 0] ) - unlatch( ptr->page ) - else - unlatch( ptr->page ) - latch( ptr->page, X-mode ) - if ( ptr->page is not leaf ) - //the only root page can become a non-leaf - unlatch( ptr->page ) - else if ( ptr->parent->lsn < ptr->page->nsn ) - unlatch( ptr->page ) - pop stack - else - return stack - end - end - end + +findPath is a subroutine of findParent, used when the correct parent page +can't be found by following the rightlinks at the parent level: findPath( stack item ) - push stack, [root, 0, 0] // page, LSN, parent + push stack, [root, 0, 0] // page, LSN, parent while( stack ) ptr = top of stack latch( ptr->page, S-mode ) @@ -152,7 +184,7 @@ findPath( stack item ) end for( each tuple on page ) if ( tuple->pagepointer == item->page ) - return stack + return stack else add to stack at the end [tuple->pagepointer,0, ptr] end @@ -160,12 +192,16 @@ findPath( stack item ) unlatch( ptr->page ) pop stack end - + + 
+gistFindCorrectParent is used to re-find the parent of a page during +insertion. It might have migrated to the right since we traversed down the +tree because of page splits. + findParent( stack item ) parent = item->parent - latch( parent->page, X-mode ) if ( parent->page->lsn != parent->lsn ) - while(true) + while(true) search parent tuple on parent->page, if found the return rightlink = parent->page->rightlink unlatch( parent->page ) @@ -177,9 +213,13 @@ findParent( stack item ) end newstack = findPath( item->parent ) replace part of stack to new one + latch( parent->page, X-mode ) return findParent( item ) end +pageSplit function decides how to distribute keys to the new pages after +page split: + pageSplit(page, allkeys) (lkeys, rkeys) = pickSplit( allkeys ) if ( page is root ) @@ -200,40 +240,45 @@ pageSplit(page, allkeys) return newkeys -placetopage(page, keysarray) - if ( no space left on page ) - keysarray = pageSplit(page, [ extract_keys(page), keysarray]) - last page in chain gets old NSN, - original and others - new NSN equals to LSN - if ( page is root ) - make new root with keysarray - end - else - put keysarray on page - if ( length of keysarray > 1 ) - keysarray = [ union(keysarray) ] - end - end - -insert(new-key) - stack = findLeaf(new-key) - keysarray = [new-key] - ptr = top of stack - while(true) - findParent( ptr ) //findParent latches parent page - keysarray = placetopage(ptr->page, keysarray) - unlatch( ptr->page ) - pop stack; - ptr = top of stack - if (length of keysarray == 1) - newboundingkey = union(oldboundingkey, keysarray) - if (newboundingkey == oldboundingkey) - unlatch ptr->page - break loop - end - end - end + +Concurrency control +------------------- +As a rule of thumb, if you need to hold a lock on multiple pages at the +same time, the locks should be acquired in the following order: child page +before parent, and left-to-right at the same level. Always acquiring the +locks in the same order avoids deadlocks. 
+ +The search algorithm only looks at and locks one page at a time. Consequently +there's a race condition between a search and a page split. A page split +happens in two phases: 1. The page is split 2. The downlink is inserted to the +parent. If a search looks at the parent page between those steps, before the +downlink is inserted, it will still find the new right half by following the +rightlink on the left half. But it must not follow the rightlink if it saw the +downlink in the parent, or the page will be visited twice! + +A split initially marks the left page with the F_FOLLOW_RIGHT flag. If a scan +sees that flag set, it knows that the right page is missing the downlink, and +should be visited too. When split inserts the downlink to the parent, it +clears the F_FOLLOW_RIGHT flag in the child, and sets the NSN field in the +child page header to match the LSN of the insertion on the parent. If the +F_FOLLOW_RIGHT flag is not set, a scan compares the NSN on the child and the +LSN it saw in the parent. If NSN < LSN, the scan looked at the parent page +before the downlink was inserted, so it should follow the rightlink. Otherwise +the scan saw the downlink in the parent page, and will/did follow that as +usual. + +A scan can't normally see a page with the F_FOLLOW_RIGHT flag set, because +a page split keeps the child pages locked until the downlink has been inserted +to the parent and the flag cleared again. But if a crash happens in the middle +of a page split, before the downlinks are inserted into the parent, that will +leave a page with F_FOLLOW_RIGHT in the tree. Scans handle that just fine, +but we'll eventually want to fix that for performance reasons. And more +importantly, dealing with pages with missing downlink pointers in the parent +would complicate the insertion algorithm. So when an insertion sees a page +with F_FOLLOW_RIGHT set, it immediately tries to bring the split that +crashed in the middle to completion by adding the downlink in the parent. 
+ Authors: Teodor Sigaev <teodor@sigaev.ru> - Oleg Bartunov <oleg@sai.msu.su> + Oleg Bartunov <oleg@sai.msu.su> diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index cec08c7226..8227bfdb88 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -4,11 +4,11 @@ * interface routines for the postgres GiST index access method. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.158 2010/01/02 16:57:33 momjian Exp $ + * src/backend/access/gist/gist.c * *------------------------------------------------------------------------- */ @@ -17,13 +17,12 @@ #include "access/genam.h" #include "access/gist_private.h" #include "catalog/index.h" +#include "catalog/pg_collation.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/indexfsm.h" #include "utils/memutils.h" -const XLogRecPtr XLogRecPtrForTemp = {1, 1}; - /* Working state for gistbuild and its callback */ typedef struct { @@ -33,6 +32,12 @@ typedef struct MemoryContext tmpCtx; } GISTBuildState; +/* A List of these is used to represent a split-in-progress. */ +typedef struct +{ + Buffer buf; /* the split page "half" */ + IndexTuple downlink; /* downlink for this half. 
*/ +} GISTPageSplitInfo; /* non-export function prototypes */ static void gistbuildCallback(Relation index, @@ -45,8 +50,13 @@ static void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *GISTstate); -static void gistfindleaf(GISTInsertState *state, - GISTSTATE *giststate); +static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate); +static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild); +static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo); #define ROTATEDIST(d) do { \ @@ -117,7 +127,7 @@ gistbuild(PG_FUNCTION_ARGS) MarkBufferDirty(buffer); - if (!index->rd_istemp) + if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; @@ -132,7 +142,7 @@ gistbuild(PG_FUNCTION_ARGS) PageSetTLI(page, ThisTimeLineID); } else - PageSetLSN(page, XLogRecPtrForTemp); + PageSetLSN(page, GetXLogRecPtrForTemp()); UnlockReleaseBuffer(buffer); @@ -210,6 +220,19 @@ gistbuildCallback(Relation index, } /* + * gistbuildempty() -- build an empty gist index in the initialization fork + */ +Datum +gistbuildempty(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unlogged GiST indexes are not supported"))); + + PG_RETURN_VOID(); +} + +/* * gistinsert -- wrapper for GiST tuple insertion. * * This is the public interface routine for tuple insertion in GiSTs. @@ -253,41 +276,52 @@ gistinsert(PG_FUNCTION_ARGS) /* - * Workhouse routine for doing insertion into a GiST index. Note that - * this routine assumes it is invoked in a short-lived memory context, - * so it does not bother releasing palloc'd allocations. + * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple + * at that offset is atomically removed along with inserting the new tuples. + * This is used to replace a tuple with a new one. 
+ * + * If 'leftchildbuf' is valid, we're inserting the downlink for the page + * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. + * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. + * + * If there is not enough room on the page, it is split. All the split + * pages are kept pinned and locked and returned in *splitinfo, the caller + * is responsible for inserting the downlinks for them. However, if + * 'buffer' is the root page and it needs to be split, gistplacetopage() + * performs the split as one atomic operation, and *splitinfo is set to NIL. + * In that case, we continue to hold the root page locked, and the child + * pages are released; note that new tuple(s) are *not* on the root page + * but in one of the new child pages. */ -static void -gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) +static bool +gistplacetopage(GISTInsertState *state, GISTSTATE *giststate, + Buffer buffer, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + Buffer leftchildbuf, + List **splitinfo) { - GISTInsertState state; - - memset(&state, 0, sizeof(GISTInsertState)); - - state.itup = (IndexTuple *) palloc(sizeof(IndexTuple)); - state.itup[0] = (IndexTuple) palloc(IndexTupleSize(itup)); - memcpy(state.itup[0], itup, IndexTupleSize(itup)); - state.ituplen = 1; - state.freespace = freespace; - state.r = r; - state.key = itup->t_tid; - state.needInsertComplete = true; + Page page = BufferGetPage(buffer); + bool is_leaf = (GistPageIsLeaf(page)) ? true : false; + XLogRecPtr recptr; + int i; + bool is_split; - state.stack = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); - state.stack->blkno = GIST_ROOT_BLKNO; + /* + * Refuse to modify a page that's incompletely split. This should not + * happen because we finish any incomplete splits while we walk down the + * tree. However, it's remotely possible that another concurrent inserter + * splits a parent page, and errors out before completing the split. 
We + * will just throw an error in that case, and leave any split we had in + * progress unfinished too. The next insert that comes along will clean up + * the mess. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); - gistfindleaf(&state, giststate); - gistmakedeal(&state, giststate); -} - -static bool -gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) -{ - bool is_splitted = false; - bool is_leaf = (GistPageIsLeaf(state->stack->page)) ? true : false; + *splitinfo = NIL; /* - * if (!is_leaf) remove old key: This node's key has been modified, either + * if isupdate, remove old key: This node's key has been modified, either * because a child split occurred or because we needed to adjust our key * for an insert in a child node. Therefore, remove the old version of * this node's key. @@ -295,77 +329,136 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) * for WAL replay, in the non-split case we handle this by setting up a * one-element todelete array; in the split case, it's handled implicitly * because the tuple vector passed to gistSplit won't include this tuple. - * - * XXX: If we want to change fillfactors between node and leaf, fillfactor - * = (is_leaf ? state->leaf_fillfactor : state->node_fillfactor) */ - if (gistnospace(state->stack->page, state->itup, state->ituplen, - is_leaf ? 
InvalidOffsetNumber : state->stack->childoffnum, - state->freespace)) + is_split = gistnospace(page, itup, ntup, oldoffnum, state->freespace); + if (is_split) { /* no space for insertion */ IndexTuple *itvec; int tlen; SplitedPageLayout *dist = NULL, *ptr; - BlockNumber rrlink = InvalidBlockNumber; - GistNSN oldnsn; + BlockNumber oldrlink = InvalidBlockNumber; + GistNSN oldnsn = {0, 0}; + SplitedPageLayout rootpg; + BlockNumber blkno = BufferGetBlockNumber(buffer); + bool is_rootsplit; - is_splitted = true; + is_rootsplit = (blkno == GIST_ROOT_BLKNO); /* - * Form index tuples vector to split: remove old tuple if t's needed - * and add new tuples to vector + * Form index tuples vector to split. If we're replacing an old tuple, + * remove the old version from the vector. */ - itvec = gistextractpage(state->stack->page, &tlen); - if (!is_leaf) + itvec = gistextractpage(page, &tlen); + if (OffsetNumberIsValid(oldoffnum)) { /* on inner page we should remove old tuple */ - int pos = state->stack->childoffnum - FirstOffsetNumber; + int pos = oldoffnum - FirstOffsetNumber; tlen--; if (pos != tlen) memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); } - itvec = gistjoinvector(itvec, &tlen, state->itup, state->ituplen); - dist = gistSplit(state->r, state->stack->page, itvec, tlen, giststate); + itvec = gistjoinvector(itvec, &tlen, itup, ntup); + dist = gistSplit(state->r, page, itvec, tlen, giststate); - state->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * tlen); - state->ituplen = 0; - - if (state->stack->blkno != GIST_ROOT_BLKNO) + /* + * Set up pages to work with. Allocate new buffers for all but the + * leftmost page. The original page becomes the new leftmost page, and + * is just replaced with the new contents. + * + * For a root-split, allocate new buffers for all child pages, the + * original page is overwritten with new root page containing + * downlinks to the new child pages. 
+ */ + ptr = dist; + if (!is_rootsplit) { - /* - * if non-root split then we should not allocate new buffer, but - * we must create temporary page to operate - */ - dist->buffer = state->stack->buffer; - dist->page = PageGetTempPageCopySpecial(BufferGetPage(dist->buffer)); + /* save old rightlink and NSN */ + oldrlink = GistPageGetOpaque(page)->rightlink; + oldnsn = GistPageGetOpaque(page)->nsn; + + dist->buffer = buffer; + dist->block.blkno = BufferGetBlockNumber(buffer); + dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); /* clean all flags except F_LEAF */ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; + + ptr = ptr->next; + } + for (; ptr; ptr = ptr->next) + { + /* Allocate new page */ + ptr->buffer = gistNewBuffer(state->r); + GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); + ptr->page = BufferGetPage(ptr->buffer); + ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); } - /* make new pages and fills them */ + /* + * Now that we know which blocks the new pages go to, set up downlink + * tuples to point to them. + */ for (ptr = dist; ptr; ptr = ptr->next) { + ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); + GistTupleSetValid(ptr->itup); + } + + /* + * If this is a root split, we construct the new root page with the + * downlinks here directly, instead of requiring the caller to insert + * them. Add the new root page to the list along with the child pages. 
+ */ + if (is_rootsplit) + { + IndexTuple *downlinks; + int ndownlinks = 0; int i; - char *data; - /* get new page */ - if (ptr->buffer == InvalidBuffer) + rootpg.buffer = buffer; + rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); + GistPageGetOpaque(rootpg.page)->flags = 0; + + /* Prepare a vector of all the downlinks */ + for (ptr = dist; ptr; ptr = ptr->next) + ndownlinks++; + downlinks = palloc(sizeof(IndexTuple) * ndownlinks); + for (i = 0, ptr = dist; ptr; ptr = ptr->next) + downlinks[i++] = ptr->itup; + + rootpg.block.blkno = GIST_ROOT_BLKNO; + rootpg.block.num = ndownlinks; + rootpg.list = gistfillitupvec(downlinks, ndownlinks, + &(rootpg.lenlist)); + rootpg.itup = NULL; + + rootpg.next = dist; + dist = &rootpg; + } + else + { + /* Prepare split-info to be returned to caller */ + for (ptr = dist; ptr; ptr = ptr->next) { - ptr->buffer = gistNewBuffer(state->r); - GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); - ptr->page = BufferGetPage(ptr->buffer); + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + + si->buf = ptr->buffer; + si->downlink = ptr->itup; + *splitinfo = lappend(*splitinfo, si); } - ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); + } + + /* + * Fill all pages. All the pages are new, ie. freshly allocated empty + * pages, or a temporary copy of the old page. 
+ */ + for (ptr = dist; ptr; ptr = ptr->next) + { + char *data = (char *) (ptr->list); - /* - * fill page, we can do it because all these pages are new (ie not - * linked in tree or masked by temp page - */ - data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) @@ -373,276 +466,388 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) data += IndexTupleSize((IndexTuple) data); } - /* set up ItemPointer and remember it for parent */ - ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); - state->itup[state->ituplen] = ptr->itup; - state->ituplen++; - } + /* Set up rightlinks */ + if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) + GistPageGetOpaque(ptr->page)->rightlink = + ptr->next->block.blkno; + else + GistPageGetOpaque(ptr->page)->rightlink = oldrlink; - /* saves old rightlink */ - if (state->stack->blkno != GIST_ROOT_BLKNO) - rrlink = GistPageGetOpaque(dist->page)->rightlink; + if (ptr->next && !is_rootsplit) + GistMarkFollowRight(ptr->page); + else + GistClearFollowRight(ptr->page); + + /* + * Copy the NSN of the original page to all pages. The + * F_FOLLOW_RIGHT flags ensure that scans will follow the + * rightlinks until the downlinks are inserted. + */ + GistPageGetOpaque(ptr->page)->nsn = oldnsn; + } START_CRIT_SECTION(); /* - * must mark buffers dirty before XLogInsert, even though we'll still - * be changing their opaque fields below. set up right links. + * Must mark buffers dirty before XLogInsert, even though we'll still + * be changing their opaque fields below. */ for (ptr = dist; ptr; ptr = ptr->next) - { MarkBufferDirty(ptr->buffer); - GistPageGetOpaque(ptr->page)->rightlink = (ptr->next) ? 
- ptr->next->block.blkno : rrlink; - } - - /* restore splitted non-root page */ - if (state->stack->blkno != GIST_ROOT_BLKNO) - { - PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); - dist->page = BufferGetPage(dist->buffer); - } - - if (!state->r->rd_istemp) - { - XLogRecPtr recptr; - XLogRecData *rdata; - - rdata = formSplitRdata(state->r->rd_node, state->stack->blkno, - is_leaf, &(state->key), dist); + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata); + /* + * The first page in the chain was a temporary working copy meant to + * replace the old page. Copy it over the old page. + */ + PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); + dist->page = BufferGetPage(dist->buffer); - for (ptr = dist; ptr; ptr = ptr->next) - { - PageSetLSN(ptr->page, recptr); - PageSetTLI(ptr->page, ThisTimeLineID); - } - } + /* Write the WAL record */ + if (RelationNeedsWAL(state->r)) + recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf, + dist, oldrlink, oldnsn, leftchildbuf); else - { - for (ptr = dist; ptr; ptr = ptr->next) - { - PageSetLSN(ptr->page, XLogRecPtrForTemp); - } - } - - /* set up NSN */ - oldnsn = GistPageGetOpaque(dist->page)->nsn; - if (state->stack->blkno == GIST_ROOT_BLKNO) - /* if root split we should put initial value */ - oldnsn = PageGetLSN(dist->page); + recptr = GetXLogRecPtrForTemp(); for (ptr = dist; ptr; ptr = ptr->next) { - /* only for last set oldnsn */ - GistPageGetOpaque(ptr->page)->nsn = (ptr->next) ? - PageGetLSN(ptr->page) : oldnsn; + PageSetLSN(ptr->page, recptr); + PageSetTLI(ptr->page, ThisTimeLineID); } /* - * release buffers, if it was a root split then release all buffers - * because we create all buffers + * Return the new child buffers to the caller. + * + * If this was a root split, we've already inserted the downlink + * pointers, in the form of a new root page. 
Therefore we can release + * all the new buffers, and keep just the root page locked. */ - ptr = (state->stack->blkno == GIST_ROOT_BLKNO) ? dist : dist->next; - for (; ptr; ptr = ptr->next) - UnlockReleaseBuffer(ptr->buffer); - - if (state->stack->blkno == GIST_ROOT_BLKNO) + if (is_rootsplit) { - gistnewroot(state->r, state->stack->buffer, state->itup, state->ituplen, &(state->key)); - state->needInsertComplete = false; + for (ptr = dist->next; ptr; ptr = ptr->next) + UnlockReleaseBuffer(ptr->buffer); } - - END_CRIT_SECTION(); } else { - /* enough space */ + /* + * Enough space. We also get here if ntuples==0. + */ START_CRIT_SECTION(); - if (!is_leaf) - PageIndexTupleDelete(state->stack->page, state->stack->childoffnum); - gistfillbuffer(state->stack->page, state->itup, state->ituplen, InvalidOffsetNumber); + if (OffsetNumberIsValid(oldoffnum)) + PageIndexTupleDelete(page, oldoffnum); + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); - MarkBufferDirty(state->stack->buffer); + MarkBufferDirty(buffer); - if (!state->r->rd_istemp) + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); + + if (RelationNeedsWAL(state->r)) { - OffsetNumber noffs = 0, - offs[1]; - XLogRecPtr recptr; - XLogRecData *rdata; + OffsetNumber ndeloffs = 0, + deloffs[1]; - if (!is_leaf) + if (OffsetNumberIsValid(oldoffnum)) { - /* only on inner page we should delete previous version */ - offs[0] = state->stack->childoffnum; - noffs = 1; + deloffs[0] = oldoffnum; + ndeloffs = 1; } - rdata = formUpdateRdata(state->r->rd_node, state->stack->buffer, - offs, noffs, - state->itup, state->ituplen, - &(state->key)); + recptr = gistXLogUpdate(state->r->rd_node, buffer, + deloffs, ndeloffs, itup, ntup, + leftchildbuf); - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); - PageSetLSN(state->stack->page, recptr); - PageSetTLI(state->stack->page, ThisTimeLineID); + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); } else - PageSetLSN(state->stack->page, 
XLogRecPtrForTemp); - - if (state->stack->blkno == GIST_ROOT_BLKNO) - state->needInsertComplete = false; + { + recptr = GetXLogRecPtrForTemp(); + PageSetLSN(page, recptr); + } - END_CRIT_SECTION(); + *splitinfo = NIL; + } - if (state->ituplen > 1) - { /* previous is_splitted==true */ + /* + * If we inserted the downlink for a child page, set NSN and clear + * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to + * follow the rightlink if and only if they looked at the parent page + * before we inserted the downlink. + * + * Note that we do this *after* writing the WAL record. That means that + * the possible full page image in the WAL record does not include these + * changes, and they must be replayed even if the page is restored from + * the full page image. There's a chicken-and-egg problem: if we updated + * the child pages first, we wouldn't know the recptr of the WAL record + * we're about to write. + */ + if (BufferIsValid(leftchildbuf)) + { + Page leftpg = BufferGetPage(leftchildbuf); - /* - * child was splited, so we must form union for insertion in - * parent - */ - IndexTuple newtup = gistunion(state->r, state->itup, state->ituplen, giststate); + GistPageGetOpaque(leftpg)->nsn = recptr; + GistClearFollowRight(leftpg); - ItemPointerSetBlockNumber(&(newtup->t_tid), state->stack->blkno); - state->itup[0] = newtup; - state->ituplen = 1; - } - else if (is_leaf) - { - /* - * itup[0] store key to adjust parent, we set it to valid to - * correct check by GistTupleIsInvalid macro in gistgetadjusted() - */ - ItemPointerSetBlockNumber(&(state->itup[0]->t_tid), state->stack->blkno); - GistTupleSetValid(state->itup[0]); - } + PageSetLSN(leftpg, recptr); + PageSetTLI(leftpg, ThisTimeLineID); } - return is_splitted; + + END_CRIT_SECTION(); + + return is_split; } /* - * returns stack of pages, all pages in stack are pinned, and - * leaf is X-locked + * Workhorse routine for doing insertion into a GiST index.
Note that + * this routine assumes it is invoked in a short-lived memory context, + * so it does not bother releasing palloc'd allocations. */ - static void -gistfindleaf(GISTInsertState *state, GISTSTATE *giststate) +gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { ItemId iid; IndexTuple idxtuple; - GISTPageOpaque opaque; + GISTInsertStack firststack; + GISTInsertStack *stack; + GISTInsertState state; + bool xlocked = false; + + memset(&state, 0, sizeof(GISTInsertState)); + state.freespace = freespace; + state.r = r; + + /* Start from the root */ + firststack.blkno = GIST_ROOT_BLKNO; + firststack.lsn.xrecoff = 0; + firststack.parent = NULL; + state.stack = stack = &firststack; /* - * walk down, We don't lock page for a long time, but so we should be - * ready to recheck path in a bad case... We remember, that page->lsn - * should never be invalid. + * Walk down along the path of smallest penalty, updating the parent + * pointers with the key we're inserting as we go. If we crash in the + * middle, the tree is consistent, although the possible parent updates + * were a waste. */ for (;;) { - if (XLogRecPtrIsInvalid(state->stack->lsn)) - state->stack->buffer = ReadBuffer(state->r, state->stack->blkno); - LockBuffer(state->stack->buffer, GIST_SHARE); - gistcheckpage(state->r, state->stack->buffer); + if (XLogRecPtrIsInvalid(stack->lsn)) + stack->buffer = ReadBuffer(state.r, stack->blkno); + + /* + * Be optimistic and grab shared lock first. Swap it for an exclusive + * lock later if we need to update the page. 
+ */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_SHARE); + gistcheckpage(state.r, stack->buffer); + } - state->stack->page = (Page) BufferGetPage(state->stack->buffer); - opaque = GistPageGetOpaque(state->stack->page); + stack->page = (Page) BufferGetPage(stack->buffer); + stack->lsn = PageGetLSN(stack->page); + Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); - state->stack->lsn = PageGetLSN(state->stack->page); - Assert(state->r->rd_istemp || !XLogRecPtrIsInvalid(state->stack->lsn)); + /* + * If this page was split but the downlink was never inserted to the + * parent because the inserting backend crashed before doing that, fix + * that now. + */ + if (GistFollowRight(stack->page)) + { + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + /* someone might've completed the split when we unlocked */ + if (!GistFollowRight(stack->page)) + continue; + } + gistfixsplit(&state, giststate); - if (state->stack->blkno != GIST_ROOT_BLKNO && - XLByteLT(state->stack->parent->lsn, opaque->nsn)) + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + + if (stack->blkno != GIST_ROOT_BLKNO && + XLByteLT(stack->parent->lsn, + GistPageGetOpaque(stack->page)->nsn)) { /* - * caused split non-root page is detected, go up to parent to - * choose best child + * Concurrent split detected. There's no guarantee that the + * downlink for this page is consistent with the tuple we're + * inserting anymore, so go back to parent and rechoose the best + * child. */ - UnlockReleaseBuffer(state->stack->buffer); - state->stack = state->stack->parent; + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; continue; } - if (!GistPageIsLeaf(state->stack->page)) + if (!GistPageIsLeaf(stack->page)) { /* - * This is an internal page, so continue to walk down the tree. 
We - * find the child node that has the minimum insertion penalty and - * recursively invoke ourselves to modify that node. Once the - * recursive call returns, we may need to adjust the parent node - * for two reasons: the child node split, or the key in this node - * needs to be adjusted for the newly inserted key below us. + * This is an internal page so continue to walk down the tree. + * Find the child node that has the minimum insertion penalty. */ - GISTInsertStack *item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); - - state->stack->childoffnum = gistchoose(state->r, state->stack->page, state->itup[0], giststate); + BlockNumber childblkno; + IndexTuple newtup; + GISTInsertStack *item; - iid = PageGetItemId(state->stack->page, state->stack->childoffnum); - idxtuple = (IndexTuple) PageGetItem(state->stack->page, iid); - item->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); - LockBuffer(state->stack->buffer, GIST_UNLOCK); + stack->childoffnum = gistchoose(state.r, stack->page, itup, giststate); + iid = PageGetItemId(stack->page, stack->childoffnum); + idxtuple = (IndexTuple) PageGetItem(stack->page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); - item->parent = state->stack; - item->child = NULL; - if (state->stack) - state->stack->child = item; - state->stack = item; - } - else - { - /* be carefull, during unlock/lock page may be changed... 
*/ - LockBuffer(state->stack->buffer, GIST_UNLOCK); - LockBuffer(state->stack->buffer, GIST_EXCLUSIVE); - state->stack->page = (Page) BufferGetPage(state->stack->buffer); - opaque = GistPageGetOpaque(state->stack->page); + /* + * Check that it's not a leftover invalid tuple from pre-9.1 + */ + if (GistTupleIsInvalid(idxtuple)) + ereport(ERROR, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(r)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."), + errhint("Please REINDEX it."))); - if (state->stack->blkno == GIST_ROOT_BLKNO) + /* + * Check that the key representing the target child node is + * consistent with the key we're inserting. Update it if it's not. + */ + newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); + if (newtup) { /* - * the only page can become inner instead of leaf is a root - * page, so for root we should recheck it + * Swap shared lock for an exclusive one. Beware, the page may + * change while we unlock/lock the page... */ - if (!GistPageIsLeaf(state->stack->page)) + if (!xlocked) { - /* - * very rarely situation: during unlock/lock index with - * number of pages = 1 was increased - */ - LockBuffer(state->stack->buffer, GIST_UNLOCK); - continue; + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + + if (!XLByteEQ(PageGetLSN(stack->page), stack->lsn)) + { + /* the page was changed while we unlocked it, retry */ + continue; + } } /* - * we don't need to check root split, because checking - * leaf/inner is enough to recognize split for root + * Update the tuple. + * + * We still hold the lock after gistinserttuples(), but it + * might have to split the page to make the updated tuple fit. 
+ * In that case the updated tuple might migrate to the other + * half of the split, so we have to go back to the parent and + * descend back to the half that's a better fit for the new + * tuple. */ - + if (gistinserttuples(&state, stack, giststate, &newtup, 1, + stack->childoffnum, InvalidBuffer)) + { + /* + * If this was a root split, the root page continues to be + * the parent and the updated tuple went to one of the + * child pages, so we just need to retry from the root + * page. + */ + if (stack->blkno != GIST_ROOT_BLKNO) + { + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + } + continue; + } } - else if (XLByteLT(state->stack->parent->lsn, opaque->nsn)) + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + + /* descend to the chosen child */ + item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + item->blkno = childblkno; + item->parent = stack; + state.stack = stack = item; + } + else + { + /* + * Leaf page. Insert the new key. We've already updated all the + * parents on the way down, but we might have to split the page if + * it doesn't fit. gistinserttuples() will take care of that. + */ + + /* + * Swap shared lock for an exclusive one. Be careful, the page may + * change while we unlock/lock the page...
+ */ + if (!xlocked) { - /* - * detecting split during unlock/lock, so we should find - * better child on parent - */ + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + stack->lsn = PageGetLSN(stack->page); - /* forget buffer */ - UnlockReleaseBuffer(state->stack->buffer); + if (stack->blkno == GIST_ROOT_BLKNO) + { + /* + * the only page that can become inner instead of leaf is + * the root page, so for root we should recheck it + */ + if (!GistPageIsLeaf(stack->page)) + { + /* + * very rare situation: during unlock/lock index with + * number of pages = 1 was increased + */ + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + continue; + } - state->stack = state->stack->parent; - continue; + /* + * we don't need to check root split, because checking + * leaf/inner is enough to recognize split for root + */ + } + else if (GistFollowRight(stack->page) || + XLByteLT(stack->parent->lsn, + GistPageGetOpaque(stack->page)->nsn)) + { + /* + * The page was split while we momentarily unlocked the + * page. Go back to parent. 
+ */ + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } } - state->stack->lsn = PageGetLSN(state->stack->page); + /* now state.stack->(page, buffer and blkno) points to leaf page */ + + gistinserttuples(&state, stack, giststate, &itup, 1, + InvalidOffsetNumber, InvalidBuffer); + LockBuffer(stack->buffer, GIST_UNLOCK); - /* ok we found a leaf page and it X-locked */ + /* Release any pins we might still hold before exiting */ + for (; stack; stack = stack->parent) + ReleaseBuffer(stack->buffer); break; } } - - /* now state->stack->(page, buffer and blkno) points to leaf page */ } /* @@ -650,7 +855,7 @@ gistfindleaf(GISTInsertState *state, GISTSTATE *giststate) * * returns from the beginning of closest parent; * - * To prevent deadlocks, this should lock only one page simultaneously. + * To prevent deadlocks, this should lock only one page at a time. */ GISTInsertStack * gistFindPath(Relation r, BlockNumber child) @@ -685,6 +890,13 @@ gistFindPath(Relation r, BlockNumber child) top->lsn = PageGetLSN(page); + /* + * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a + * downlink. This should not normally happen.. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); + if (top->parent && XLByteLT(top->parent->lsn, GistPageGetOpaque(page)->nsn) && GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) { @@ -713,8 +925,6 @@ gistFindPath(Relation r, BlockNumber child) ptr = top; while (ptr->parent) { - /* set child link */ - ptr->parent->child = ptr; /* move childoffnum.. */ if (ptr == top) { @@ -756,17 +966,16 @@ gistFindPath(Relation r, BlockNumber child) return NULL; } - /* - * Returns X-locked parent of stack page + * Updates the stack so that child->parent is the correct parent of the + * child. child->parent must be exclusively locked on entry, and will + * remain so at exit, but it might not be the same page anymore. 
*/ - static void gistFindCorrectParent(Relation r, GISTInsertStack *child) { GISTInsertStack *parent = child->parent; - LockBuffer(parent->buffer, GIST_EXCLUSIVE); gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); @@ -838,83 +1047,232 @@ gistFindCorrectParent(Relation r, GISTInsertStack *child) /* install new chain of parents to stack */ child->parent = parent; - parent->child = child; /* make recursive call to normal processing */ + LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); gistFindCorrectParent(r, child); } return; } -void -gistmakedeal(GISTInsertState *state, GISTSTATE *giststate) +/* + * Form a downlink pointer for the page in 'buf'. + */ +static IndexTuple +gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate, + GISTInsertStack *stack) { - int is_splitted; - ItemId iid; - IndexTuple oldtup, - newtup; + Page page = BufferGetPage(buf); + OffsetNumber maxoff; + OffsetNumber offset; + IndexTuple downlink = NULL; - /* walk up */ - while (true) + maxoff = PageGetMaxOffsetNumber(page); + for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset)) { - /* - * After this call: 1. if child page was splited, then itup contains - * keys for each page 2. if child page wasn't splited, then itup - * contains additional for adjustment of current key - */ + IndexTuple ituple = (IndexTuple) + PageGetItem(page, PageGetItemId(page, offset)); - if (state->stack->parent) + if (downlink == NULL) + downlink = CopyIndexTuple(ituple); + else { - /* - * X-lock parent page before proceed child, gistFindCorrectParent - * should find and lock it - */ - gistFindCorrectParent(state->r, state->stack); + IndexTuple newdownlink; + + newdownlink = gistgetadjusted(rel, downlink, ituple, + giststate); + if (newdownlink) + downlink = newdownlink; } - is_splitted = gistplacetopage(state, giststate); + } + + /* + * If the page is completely empty, we can't form a meaningful downlink + * for it. 
But we have to insert a downlink for the page. Any key will do, + * as long as it's consistent with the downlink of parent page, so that we + * can legally insert it to the parent. A minimal one that matches as few + * scans as possible would be best, to keep scans from doing useless work, + * but we don't know how to construct that. So we just use the downlink of + * the original page that was split - that's as far from optimal as it can + * get but will do. + */ + if (!downlink) + { + ItemId iid; - /* parent locked above, so release child buffer */ - UnlockReleaseBuffer(state->stack->buffer); + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(rel, stack); + iid = PageGetItemId(stack->parent->page, stack->parent->childoffnum); + downlink = (IndexTuple) PageGetItem(stack->parent->page, iid); + downlink = CopyIndexTuple(downlink); + LockBuffer(stack->parent->buffer, GIST_UNLOCK); + } - /* pop parent page from stack */ - state->stack = state->stack->parent; + ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf)); + GistTupleSetValid(downlink); - /* stack is void */ - if (!state->stack) - break; + return downlink; +} - /* - * child did not split, so we can check is it needed to update parent - * tuple - */ - if (!is_splitted) - { - /* parent's tuple */ - iid = PageGetItemId(state->stack->page, state->stack->childoffnum); - oldtup = (IndexTuple) PageGetItem(state->stack->page, iid); - newtup = gistgetadjusted(state->r, oldtup, state->itup[0], giststate); - - if (!newtup) - { /* not need to update key */ - LockBuffer(state->stack->buffer, GIST_UNLOCK); - break; - } - state->itup[0] = newtup; +/* + * Complete the incomplete split of state->stack->page.
+ */ +static void +gistfixsplit(GISTInsertState *state, GISTSTATE *giststate) +{ + GISTInsertStack *stack = state->stack; + Buffer buf; + Page page; + List *splitinfo = NIL; + + elog(LOG, "fixing incomplete split in index \"%s\", block %u", + RelationGetRelationName(state->r), stack->blkno); + + Assert(GistFollowRight(stack->page)); + Assert(OffsetNumberIsValid(stack->parent->childoffnum)); + + buf = stack->buffer; + + /* + * Read the chain of split pages, following the rightlinks. Construct a + * downlink tuple for each page. + */ + for (;;) + { + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + IndexTuple downlink; + + page = BufferGetPage(buf); + + /* Form the new downlink tuples to insert to parent */ + downlink = gistformdownlink(state->r, buf, giststate, stack); + + si->buf = buf; + si->downlink = downlink; + + splitinfo = lappend(splitinfo, si); + + if (GistFollowRight(page)) + { + /* lock next page */ + buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink); + LockBuffer(buf, GIST_EXCLUSIVE); } - } /* while */ + else + break; + } + + /* Insert the downlinks */ + gistfinishsplit(state, stack, giststate, splitinfo); +} + +/* + * Insert tuples to stack->buffer. If 'oldoffnum' is valid, the new tuples + * replace an old tuple at oldoffnum. The caller must hold an exclusive lock + * on the page. + * + * If leftchild is valid, we're inserting/updating the downlink for the + * page to the right of leftchild. We clear the F_FOLLOW_RIGHT flag and + * update NSN on leftchild, atomically with the insertion of the downlink. + * + * Returns 'true' if the page had to be split. On return, we will continue + * to hold an exclusive lock on state->stack->buffer, but if we had to split + * the page, it might not contain the tuple we just inserted/updated. 
+ */ +static bool +gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild) +{ + List *splitinfo; + bool is_split; + + is_split = gistplacetopage(state, giststate, stack->buffer, + tuples, ntup, oldoffnum, + leftchild, + &splitinfo); + if (splitinfo) + gistfinishsplit(state, stack, giststate, splitinfo); + + return is_split; +} + +/* + * Finish an incomplete split by inserting/updating the downlinks in + * parent page. 'splitinfo' contains all the child pages, exclusively-locked, + * involved in the split, from left-to-right. + */ +static void +gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo) +{ + ListCell *lc; + List *reversed; + GISTPageSplitInfo *right; + GISTPageSplitInfo *left; + IndexTuple tuples[2]; + + /* A split always contains at least two halves */ + Assert(list_length(splitinfo) >= 2); + + /* + * We need to insert downlinks for each new page, and update the downlink + * for the original (leftmost) page in the split. Begin at the rightmost + * page, inserting one downlink at a time until there's only two pages + * left. Finally insert the downlink for the last new page and update the + * downlink for the original page as one operation. 
+ */ + + /* for convenience, create a copy of the list in reverse order */ + reversed = NIL; + foreach(lc, splitinfo) + { + reversed = lcons(lfirst(lc), reversed); + } - /* release all parent buffers */ - while (state->stack) + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(state->r, stack); + + while (list_length(reversed) > 2) { - ReleaseBuffer(state->stack->buffer); - state->stack = state->stack->parent; + right = (GISTPageSplitInfo *) linitial(reversed); + left = (GISTPageSplitInfo *) lsecond(reversed); + + if (gistinserttuples(state, stack->parent, giststate, + &right->downlink, 1, + InvalidOffsetNumber, + left->buf)) + { + /* + * If the parent page was split, need to relocate the original + * parent pointer. + */ + gistFindCorrectParent(state->r, stack); + } + UnlockReleaseBuffer(right->buf); + reversed = list_delete_first(reversed); } - /* say to xlog that insert is completed */ - if (state->needInsertComplete && !state->r->rd_istemp) - gistxlogInsertCompletion(state->r->rd_node, &(state->key), 1); + right = (GISTPageSplitInfo *) linitial(reversed); + left = (GISTPageSplitInfo *) lsecond(reversed); + + /* + * Finally insert downlink for the remaining right page and update the + * downlink for the original page to not contain the tuples that were + * moved to the new pages. + */ + tuples[0] = left->downlink; + tuples[1] = right->downlink; + gistinserttuples(state, stack->parent, giststate, + tuples, 2, + stack->parent->childoffnum, + left->buf); + LockBuffer(stack->parent->buffer, GIST_UNLOCK); + UnlockReleaseBuffer(right->buf); + Assert(left->buf == stack->buffer); } /* @@ -965,8 +1323,7 @@ gistSplit(Relation r, ROTATEDIST(res); res->block.num = v.splitVector.spl_nright; res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist)); - res->itup = (v.spl_rightvalid) ? 
gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false) - : gist_form_invalid_tuple(GIST_ROOT_BLKNO); + res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false); } if (!gistfitpage(lvectup, v.splitVector.spl_nleft)) @@ -988,51 +1345,16 @@ gistSplit(Relation r, ROTATEDIST(res); res->block.num = v.splitVector.spl_nleft; res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist)); - res->itup = (v.spl_leftvalid) ? gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false) - : gist_form_invalid_tuple(GIST_ROOT_BLKNO); + res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false); } return res; } /* - * buffer must be pinned and locked by caller + * Fill a GISTSTATE with information about the index */ void -gistnewroot(Relation r, Buffer buffer, IndexTuple *itup, int len, ItemPointer key) -{ - Page page; - - Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); - page = BufferGetPage(buffer); - - START_CRIT_SECTION(); - - GISTInitBuffer(buffer, 0); - gistfillbuffer(page, itup, len, FirstOffsetNumber); - - MarkBufferDirty(buffer); - - if (!r->rd_istemp) - { - XLogRecPtr recptr; - XLogRecData *rdata; - - rdata = formUpdateRdata(r->rd_node, buffer, - NULL, 0, - itup, len, key); - - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_NEW_ROOT, rdata); - PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); - } - else - PageSetLSN(page, XLogRecPtrForTemp); - - END_CRIT_SECTION(); -} - -void initGISTstate(GISTSTATE *giststate, Relation index) { int i; @@ -1066,6 +1388,29 @@ initGISTstate(GISTSTATE *giststate, Relation index) fmgr_info_copy(&(giststate->equalFn[i]), index_getprocinfo(index, i + 1, GIST_EQUAL_PROC), CurrentMemoryContext); + /* opclasses are not required to provide a Distance method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC))) + fmgr_info_copy(&(giststate->distanceFn[i]), + index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC), + CurrentMemoryContext); + else + 
giststate->distanceFn[i].fn_oid = InvalidOid; + + /* + * If the index column has a specified collation, we should honor that + * while doing comparisons. However, we may have a collatable storage + * type for a noncollatable indexed data type. If there's no index + * collation then specify default collation in case the support + * functions need collation. This is harmless if the support + * functions don't care about collation, so we just do it + * unconditionally. (We could alternatively call get_typcollation, + * but that seems like expensive overkill --- there aren't going to be + * any cases where a GiST storage type has a nondefault collation.) + */ + if (OidIsValid(index->rd_indcollation[i])) + giststate->supportCollation[i] = index->rd_indcollation[i]; + else + giststate->supportCollation[i] = DEFAULT_COLLATION_OID; } } diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 216910307a..1aba686844 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -4,11 +4,11 @@ * fetch tuples from a GiST scan. 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistget.c,v 1.85 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gist/gistget.c * *------------------------------------------------------------------------- */ @@ -20,504 +20,568 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "utils/builtins.h" #include "utils/memutils.h" -static OffsetNumber gistfindnext(IndexScanDesc scan, OffsetNumber n); -static int64 gistnext(IndexScanDesc scan, TIDBitmap *tbm); -static bool gistindex_keytest(IndexTuple tuple, IndexScanDesc scan, - OffsetNumber offset); - -static void -killtuple(Relation r, GISTScanOpaque so, ItemPointer iptr) +/* + * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? + * + * The index tuple might represent either a heap tuple or a lower index page, + * depending on whether the containing page is a leaf page or not. + * + * On success return for a heap tuple, *recheck_p is set to indicate + * whether recheck is needed. We recheck if any of the consistent() functions + * request it. recheck is not interesting when examining a non-leaf entry, + * since we must visit the lower index page if there's any doubt. + * + * If we are doing an ordered scan, so->distances[] is filled with distance + * data from the distance() functions before returning success. + * + * We must decompress the key in the IndexTuple before passing it to the + * sk_funcs (which actually are the opclass Consistent or Distance methods). + * + * Note that this function is always invoked in a short-lived memory context, + * so we don't need to worry about cleaning up allocated memory, either here + * or in the implementation of any Consistent or Distance methods. 
+ */ +static bool +gistindex_keytest(IndexScanDesc scan, + IndexTuple tuple, + Page page, + OffsetNumber offset, + bool *recheck_p) { - Page p; - OffsetNumber offset; + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTSTATE *giststate = so->giststate; + ScanKey key = scan->keyData; + int keySize = scan->numberOfKeys; + double *distance_p; + Relation r = scan->indexRelation; - LockBuffer(so->curbuf, GIST_SHARE); - gistcheckpage(r, so->curbuf); - p = (Page) BufferGetPage(so->curbuf); + *recheck_p = false; - if (XLByteEQ(so->stack->lsn, PageGetLSN(p))) + /* + * If it's a leftover invalid tuple from pre-9.1, treat it as a match with + * minimum possible distances. This means we'll always follow it to the + * referenced page. + */ + if (GistTupleIsInvalid(tuple)) { - /* page unchanged, so all is simple */ - offset = ItemPointerGetOffsetNumber(iptr); - ItemIdMarkDead(PageGetItemId(p, offset)); - SetBufferCommitInfoNeedsSave(so->curbuf); + int i; + + if (GistPageIsLeaf(page)) /* shouldn't happen */ + elog(ERROR, "invalid GiST tuple found on leaf page"); + for (i = 0; i < scan->numberOfOrderBys; i++) + so->distances[i] = -get_float8_infinity(); + return true; } - else + + /* Check whether it matches according to the Consistent functions */ + while (keySize > 0) { - OffsetNumber maxoff = PageGetMaxOffsetNumber(p); + Datum datum; + bool isNull; - for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset)) - { - IndexTuple ituple = (IndexTuple) PageGetItem(p, PageGetItemId(p, offset)); + datum = index_getattr(tuple, + key->sk_attno, + giststate->tupdesc, + &isNull); - if (ItemPointerEquals(&(ituple->t_tid), iptr)) + if (key->sk_flags & SK_ISNULL) + { + /* + * On non-leaf page we can't conclude that child hasn't NULL + * values because of assumption in GiST: union (VAL, NULL) is VAL. + * But if on non-leaf page key IS NULL, then all children are + * NULL. 
+ */ + if (key->sk_flags & SK_SEARCHNULL) { - /* found */ - ItemIdMarkDead(PageGetItemId(p, offset)); - SetBufferCommitInfoNeedsSave(so->curbuf); - break; + if (GistPageIsLeaf(page) && !isNull) + return false; + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (isNull) + return false; } } - } + else if (isNull) + { + return false; + } + else + { + Datum test; + bool recheck; + GISTENTRY de; - LockBuffer(so->curbuf, GIST_UNLOCK); -} + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + FALSE, isNull); -/* - * gistgettuple() -- Get the next tuple in the scan - */ -Datum -gistgettuple(PG_FUNCTION_ARGS) -{ - IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); - GISTScanOpaque so; - bool res; + /* + * Call the Consistent function to evaluate the test. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, the comparison operator's strategy number and subtype + * from pg_amop, and the recheck flag. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * We initialize the recheck flag to true (the safest assumption) + * in case the Consistent function forgets to set it. + */ + recheck = true; - so = (GISTScanOpaque) scan->opaque; + test = FunctionCall5Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int32GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype), + PointerGetDatum(&recheck)); - if (dir != ForwardScanDirection) - elog(ERROR, "GiST doesn't support other scan directions than forward"); + if (!DatumGetBool(test)) + return false; + *recheck_p |= recheck; + } - /* - * If we have produced an index tuple in the past and the executor has - * informed us we need to mark it as "killed", do so now. 
- */ - if (scan->kill_prior_tuple && ItemPointerIsValid(&(so->curpos))) - killtuple(scan->indexRelation, so, &(so->curpos)); + key++; + keySize--; + } - /* - * Get the next tuple that matches the search key. - */ - res = (gistnext(scan, NULL) > 0); + /* OK, it passes --- now let's compute the distances */ + key = scan->orderByData; + distance_p = so->distances; + keySize = scan->numberOfOrderBys; + while (keySize > 0) + { + Datum datum; + bool isNull; - PG_RETURN_BOOL(res); -} + datum = index_getattr(tuple, + key->sk_attno, + giststate->tupdesc, + &isNull); -Datum -gistgetbitmap(PG_FUNCTION_ARGS) -{ - IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); - int64 ntids; + if ((key->sk_flags & SK_ISNULL) || isNull) + { + /* Assume distance computes as null and sorts to the end */ + *distance_p = get_float8_infinity(); + } + else + { + Datum dist; + GISTENTRY de; - ntids = gistnext(scan, tbm); + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + FALSE, isNull); - PG_RETURN_INT64(ntids); + /* + * Call the Distance function to evaluate the distance. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, and the ordering operator's strategy number and subtype + * from pg_amop. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * Note that Distance functions don't get a recheck argument. We + * can't tolerate lossy distance calculations on leaf tuples; + * there is no opportunity to re-sort the tuples afterwards. 
+ */ + dist = FunctionCall4Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int32GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype)); + + *distance_p = DatumGetFloat8(dist); + } + + key++; + distance_p++; + keySize--; + } + + return true; } /* - * Fetch tuple(s) that match the search key; this can be invoked - * either to fetch the first such tuple or subsequent matching tuples. + * Scan all items on the GiST index page identified by *pageItem, and insert + * them into the queue (or directly to output areas) + * + * scan: index scan we are executing + * pageItem: search queue item identifying an index page to scan + * myDistances: distances array associated with pageItem, or NULL at the root + * tbm: if not NULL, gistgetbitmap's output bitmap + * ntids: if not NULL, gistgetbitmap's output tuple counter * - * This function is used by both gistgettuple and gistgetbitmap. When - * invoked from gistgettuple, tbm is null and the next matching tuple - * is returned in scan->xs_ctup.t_self. When invoked from getbitmap, - * tbm is non-null and all matching tuples are added to tbm before - * returning. In both cases, the function result is the number of - * returned tuples. + * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap + * tuples should be reported directly into the bitmap. If they are NULL, + * we're doing a plain or ordered indexscan. For a plain indexscan, heap + * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, + * heap tuple TIDs are pushed into individual search queue items. * - * If scan specifies to skip killed tuples, continue looping until we find a - * non-killed tuple that matches the search key. + * If we detect that the index page has split since we saw its downlink + * in the parent, we push its new right sibling onto the queue so the + * sibling will be processed next. 
*/ -static int64 -gistnext(IndexScanDesc scan, TIDBitmap *tbm) +static void +gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, + TIDBitmap *tbm, int64 *ntids) { - Page p; - OffsetNumber n; - GISTScanOpaque so; - GISTSearchStack *stk; - IndexTuple it; + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + Buffer buffer; + Page page; GISTPageOpaque opaque; - int64 ntids = 0; + OffsetNumber maxoff; + OffsetNumber i; + GISTSearchTreeItem *tmpItem = so->tmpTreeItem; + bool isNew; + MemoryContext oldcxt; - so = (GISTScanOpaque) scan->opaque; + Assert(!GISTSearchItemIsHeap(*pageItem)); - if (so->qual_ok == false) - return 0; + buffer = ReadBuffer(scan->indexRelation, pageItem->blkno); + LockBuffer(buffer, GIST_SHARE); + gistcheckpage(scan->indexRelation, buffer); + page = BufferGetPage(buffer); + opaque = GistPageGetOpaque(page); - if (so->curbuf == InvalidBuffer) + /* + * Check if we need to follow the rightlink. We need to follow it if the + * page was concurrently split since we visited the parent (in which case + * parentlsn < nsn), or if the the system crashed after a page split but + * before the downlink was inserted into the parent. 
+ */ + if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) && + (GistFollowRight(page) || + XLByteLT(pageItem->data.parentlsn, opaque->nsn)) && + opaque->rightlink != InvalidBlockNumber /* sanity check */ ) { - if (ItemPointerIsValid(&so->curpos) == false) - { - /* Being asked to fetch the first entry, so start at the root */ - Assert(so->curbuf == InvalidBuffer); - Assert(so->stack == NULL); + /* There was a page split, follow right link to add pages */ + GISTSearchItem *item; - so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO); + /* This can't happen when starting at the root */ + Assert(myDistances != NULL); - stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack)); + oldcxt = MemoryContextSwitchTo(so->queueCxt); - stk->next = NULL; - stk->block = GIST_ROOT_BLKNO; + /* Create new GISTSearchItem for the right sibling index page */ + item = palloc(sizeof(GISTSearchItem)); + item->next = NULL; + item->blkno = opaque->rightlink; + item->data.parentlsn = pageItem->data.parentlsn; - pgstat_count_index_scan(scan->indexRelation); - } - else - { - /* scan is finished */ - return 0; - } + /* Insert it into the queue using same distances as for this page */ + tmpItem->head = item; + tmpItem->lastHeap = NULL; + memcpy(tmpItem->distances, myDistances, + sizeof(double) * scan->numberOfOrderBys); + + (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew); + + MemoryContextSwitchTo(oldcxt); } + so->nPageData = so->curPageData = 0; + /* - * check stored pointers from last visit + * check all tuples on page */ - if (so->nPageData > 0) + maxoff = PageGetMaxOffsetNumber(page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { + IndexTuple it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + bool match; + bool recheck; + /* - * gistgetmulti never should go here + * Must call gistindex_keytest in tempCxt, and clean up any leftover + * junk afterward. 
*/ - Assert(tbm == NULL); + oldcxt = MemoryContextSwitchTo(so->tempCxt); - if (so->curPageData < so->nPageData) - { - scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr; - scan->xs_recheck = so->pageData[so->curPageData].recheck; + match = gistindex_keytest(scan, it, page, i, &recheck); - ItemPointerSet(&so->curpos, - BufferGetBlockNumber(so->curbuf), - so->pageData[so->curPageData].pageOffset); + MemoryContextSwitchTo(oldcxt); + MemoryContextReset(so->tempCxt); - so->curPageData++; + /* Ignore tuple if it doesn't match */ + if (!match) + continue; - return 1; + if (tbm && GistPageIsLeaf(page)) + { + /* + * getbitmap scan, so just push heap tuple TIDs into the bitmap + * without worrying about ordering + */ + tbm_add_tuples(tbm, &it->t_tid, 1, recheck); + (*ntids)++; + } + else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) + { + /* + * Non-ordered scan, so report heap tuples in so->pageData[] + */ + so->pageData[so->nPageData].heapPtr = it->t_tid; + so->pageData[so->nPageData].recheck = recheck; + so->nPageData++; } else { /* - * Go to the next page + * Must push item into search queue. We get here for any lower + * index page, and also for heap tuples if doing an ordered + * search. 
*/ - stk = so->stack->next; - pfree(so->stack); - so->stack = stk; + GISTSearchItem *item; - /* If we're out of stack entries, we're done */ - if (so->stack == NULL) + oldcxt = MemoryContextSwitchTo(so->queueCxt); + + /* Create new GISTSearchItem for this item */ + item = palloc(sizeof(GISTSearchItem)); + item->next = NULL; + + if (GistPageIsLeaf(page)) + { + /* Creating heap-tuple GISTSearchItem */ + item->blkno = InvalidBlockNumber; + item->data.heap.heapPtr = it->t_tid; + item->data.heap.recheck = recheck; + } + else { - ReleaseBuffer(so->curbuf); - so->curbuf = InvalidBuffer; - return 0; + /* Creating index-page GISTSearchItem */ + item->blkno = ItemPointerGetBlockNumber(&it->t_tid); + /* lsn of current page is lsn of parent page for child */ + item->data.parentlsn = PageGetLSN(page); } - so->curbuf = ReleaseAndReadBuffer(so->curbuf, - scan->indexRelation, - stk->block); + /* Insert it into the queue using new distance data */ + tmpItem->head = item; + tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL; + memcpy(tmpItem->distances, so->distances, + sizeof(double) * scan->numberOfOrderBys); + + (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew); + + MemoryContextSwitchTo(oldcxt); } } + UnlockReleaseBuffer(buffer); +} + +/* + * Extract next item (in order) from search queue + * + * Returns a GISTSearchItem or NULL. Caller must pfree item when done with it. + * + * NOTE: on successful return, so->curTreeItem is the GISTSearchTreeItem that + * contained the result item. Callers can use so->curTreeItem->distances as + * the distances value for the item. 
+ */ +static GISTSearchItem * +getNextGISTSearchItem(GISTScanOpaque so) +{ for (;;) { - CHECK_FOR_INTERRUPTS(); + GISTSearchItem *item; - /* First of all, we need lock buffer */ - Assert(so->curbuf != InvalidBuffer); - LockBuffer(so->curbuf, GIST_SHARE); - gistcheckpage(scan->indexRelation, so->curbuf); - p = BufferGetPage(so->curbuf); - opaque = GistPageGetOpaque(p); - - /* remember lsn to identify page changed for tuple's killing */ - so->stack->lsn = PageGetLSN(p); - - /* check page split, occured since visit to parent */ - if (!XLogRecPtrIsInvalid(so->stack->parentlsn) && - XLByteLT(so->stack->parentlsn, opaque->nsn) && - opaque->rightlink != InvalidBlockNumber /* sanity check */ && - (so->stack->next == NULL || so->stack->next->block != opaque->rightlink) /* check if already - added */ ) + /* Update curTreeItem if we don't have one */ + if (so->curTreeItem == NULL) { - /* detect page split, follow right link to add pages */ - - stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack)); - stk->next = so->stack->next; - stk->block = opaque->rightlink; - stk->parentlsn = so->stack->parentlsn; - memset(&(stk->lsn), 0, sizeof(GistNSN)); - so->stack->next = stk; + so->curTreeItem = (GISTSearchTreeItem *) rb_leftmost(so->queue); + /* Done when tree is empty */ + if (so->curTreeItem == NULL) + break; } - /* if page is empty, then just skip it */ - if (PageIsEmpty(p)) + item = so->curTreeItem->head; + if (item != NULL) { - LockBuffer(so->curbuf, GIST_UNLOCK); - stk = so->stack->next; - pfree(so->stack); - so->stack = stk; - - if (so->stack == NULL) - { - ReleaseBuffer(so->curbuf); - so->curbuf = InvalidBuffer; - return ntids; - } - - so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation, - stk->block); - continue; + /* Delink item from chain */ + so->curTreeItem->head = item->next; + if (item == so->curTreeItem->lastHeap) + so->curTreeItem->lastHeap = NULL; + /* Return item; caller is responsible to pfree it */ + return item; } - n = FirstOffsetNumber; - - 
/* wonderful, we can look at page */ - so->nPageData = so->curPageData = 0; - - for (;;) - { - n = gistfindnext(scan, n); - - if (!OffsetNumberIsValid(n)) - { - /* - * If we was called from gistgettuple and current buffer - * contains something matched then make a recursive call - it - * will return ItemPointer from so->pageData. But we save - * buffer pinned to support tuple's killing - */ - if (!tbm && so->nPageData > 0) - { - LockBuffer(so->curbuf, GIST_UNLOCK); - return gistnext(scan, NULL); - } + /* curTreeItem is exhausted, so remove it from rbtree */ + rb_delete(so->queue, (RBNode *) so->curTreeItem); + so->curTreeItem = NULL; + } - /* - * We ran out of matching index entries on the current page, - * so pop the top stack entry and use it to continue the - * search. - */ - LockBuffer(so->curbuf, GIST_UNLOCK); - stk = so->stack->next; - pfree(so->stack); - so->stack = stk; - - /* If we're out of stack entries, we're done */ - - if (so->stack == NULL) - { - ReleaseBuffer(so->curbuf); - so->curbuf = InvalidBuffer; - return ntids; - } - - so->curbuf = ReleaseAndReadBuffer(so->curbuf, - scan->indexRelation, - stk->block); - /* XXX go up */ - break; - } + return NULL; +} - if (GistPageIsLeaf(p)) - { - /* - * We've found a matching index entry in a leaf page, so - * return success. Note that we keep "curbuf" pinned so that - * we can efficiently resume the index scan later. 
- */ +/* + * Fetch next heap tuple in an ordered search + */ +static bool +getNextNearest(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + bool res = false; - if (!(scan->ignore_killed_tuples && - ItemIdIsDead(PageGetItemId(p, n)))) - { - it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); - ntids++; - if (tbm != NULL) - tbm_add_tuples(tbm, &it->t_tid, 1, scan->xs_recheck); - else - { - so->pageData[so->nPageData].heapPtr = it->t_tid; - so->pageData[so->nPageData].pageOffset = n; - so->pageData[so->nPageData].recheck = scan->xs_recheck; - so->nPageData++; - } - } - } - else - { - /* - * We've found an entry in an internal node whose key is - * consistent with the search key, so push it to stack - */ - stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack)); + do + { + GISTSearchItem *item = getNextGISTSearchItem(so); - it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); - stk->block = ItemPointerGetBlockNumber(&(it->t_tid)); - memset(&(stk->lsn), 0, sizeof(GistNSN)); - stk->parentlsn = so->stack->lsn; + if (!item) + break; - stk->next = so->stack->next; - so->stack->next = stk; - } + if (GISTSearchItemIsHeap(*item)) + { + /* found a heap item at currently minimal distance */ + scan->xs_ctup.t_self = item->data.heap.heapPtr; + scan->xs_recheck = item->data.heap.recheck; + res = true; + } + else + { + /* visit an index page, extract its items into queue */ + CHECK_FOR_INTERRUPTS(); - n = OffsetNumberNext(n); + gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL); } - } - return ntids; + pfree(item); + } while (!res); + + return res; } /* - * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? - * - * On success return for a leaf tuple, scan->xs_recheck is set to indicate - * whether recheck is needed. We recheck if any of the consistent() functions - * request it. 
- * - * We must decompress the key in the IndexTuple before passing it to the - * sk_func (and we have previously overwritten the sk_func to use the - * user-defined Consistent method, so we actually are invoking that). - * - * Note that this function is always invoked in a short-lived memory context, - * so we don't need to worry about cleaning up allocated memory, either here - * or in the implementation of any Consistent methods. + * gistgettuple() -- Get the next tuple in the scan */ -static bool -gistindex_keytest(IndexTuple tuple, - IndexScanDesc scan, - OffsetNumber offset) +Datum +gistgettuple(PG_FUNCTION_ARGS) { - int keySize = scan->numberOfKeys; - ScanKey key = scan->keyData; - Relation r = scan->indexRelation; - GISTScanOpaque so; - Page p; - GISTSTATE *giststate; - - so = (GISTScanOpaque) scan->opaque; - giststate = so->giststate; - p = BufferGetPage(so->curbuf); + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - scan->xs_recheck = false; + if (dir != ForwardScanDirection) + elog(ERROR, "GiST only supports forward scan direction"); - /* - * Tuple doesn't restore after crash recovery because of incomplete insert - */ - if (!GistPageIsLeaf(p) && GistTupleIsInvalid(tuple)) - return true; + if (!so->qual_ok) + PG_RETURN_BOOL(false); - while (keySize > 0) + if (so->firstCall) { - Datum datum; - bool isNull; - Datum test; - bool recheck; - GISTENTRY de; + /* Begin the scan by processing the root page */ + GISTSearchItem fakeItem; - datum = index_getattr(tuple, - key->sk_attno, - giststate->tupdesc, - &isNull); + pgstat_count_index_scan(scan->indexRelation); - if (key->sk_flags & SK_ISNULL) + so->firstCall = false; + so->curTreeItem = NULL; + so->curPageData = so->nPageData = 0; + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, NULL, NULL); + } + + if 
(scan->numberOfOrderBys > 0) + { + /* Must fetch tuples in strict distance order */ + PG_RETURN_BOOL(getNextNearest(scan)); + } + else + { + /* Fetch tuples index-page-at-a-time */ + for (;;) { - /* - * On non-leaf page we can't conclude that child hasn't NULL - * values because of assumption in GiST: union (VAL, NULL) is VAL. - * But if on non-leaf page key IS NULL, then all children are - * NULL. - */ - if (key->sk_flags & SK_SEARCHNULL) + if (so->curPageData < so->nPageData) { - if (GistPageIsLeaf(p) && !isNull) - return false; + /* continuing to return tuples from a leaf page */ + scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr; + scan->xs_recheck = so->pageData[so->curPageData].recheck; + so->curPageData++; + PG_RETURN_BOOL(true); } - else + + /* find and process the next index page */ + do { - Assert(key->sk_flags & SK_SEARCHNOTNULL); - if (isNull) - return false; - } - } - else if (isNull) - { - return false; - } - else - { - gistdentryinit(giststate, key->sk_attno - 1, &de, - datum, r, p, offset, - FALSE, isNull); + GISTSearchItem *item = getNextGISTSearchItem(so); - /* - * Call the Consistent function to evaluate the test. The - * arguments are the index datum (as a GISTENTRY*), the comparison - * datum, the comparison operator's strategy number and subtype - * from pg_amop, and the recheck flag. - * - * (Presently there's no need to pass the subtype since it'll - * always be zero, but might as well pass it for possible future - * use.) - * - * We initialize the recheck flag to true (the safest assumption) - * in case the Consistent function forgets to set it. 
- */ - recheck = true; + if (!item) + PG_RETURN_BOOL(false); - test = FunctionCall5(&key->sk_func, - PointerGetDatum(&de), - key->sk_argument, - Int32GetDatum(key->sk_strategy), - ObjectIdGetDatum(key->sk_subtype), - PointerGetDatum(&recheck)); + CHECK_FOR_INTERRUPTS(); - if (!DatumGetBool(test)) - return false; - scan->xs_recheck |= recheck; - } + /* + * While scanning a leaf page, ItemPointers of matching heap + * tuples are stored in so->pageData. If there are any on + * this page, we fall out of the inner "do" and loop around to + * return them. + */ + gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL); - keySize--; - key++; + pfree(item); + } while (so->nPageData == 0); + } } - return true; + PG_RETURN_BOOL(false); /* keep compiler quiet */ } /* - * Return the offset of the first index entry that is consistent with - * the search key after offset 'n' in the current page. If there are - * no more consistent entries, return InvalidOffsetNumber. - * On success, scan->xs_recheck is set correctly, too. - * Page should be locked.... 
+ * gistgetbitmap() -- Get a bitmap of all heap tuple locations */ -static OffsetNumber -gistfindnext(IndexScanDesc scan, OffsetNumber n) +Datum +gistgetbitmap(PG_FUNCTION_ARGS) { - OffsetNumber maxoff; - IndexTuple it; - GISTScanOpaque so; - MemoryContext oldcxt; - Page p; + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + int64 ntids = 0; + GISTSearchItem fakeItem; - so = (GISTScanOpaque) scan->opaque; - p = BufferGetPage(so->curbuf); - maxoff = PageGetMaxOffsetNumber(p); + if (!so->qual_ok) + PG_RETURN_INT64(0); + + pgstat_count_index_scan(scan->indexRelation); + + /* Begin the scan by processing the root page */ + so->curTreeItem = NULL; + so->curPageData = so->nPageData = 0; + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, tbm, &ntids); /* - * Make sure we're in a short-lived memory context when we invoke a - * user-supplied GiST method in gistindex_keytest(), so we don't leak - * memory + * While scanning a leaf page, ItemPointers of matching heap tuples will + * be stored directly into tbm, so we don't need to deal with them here. */ - oldcxt = MemoryContextSwitchTo(so->tempCxt); - - while (n >= FirstOffsetNumber && n <= maxoff) + for (;;) { - it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); - if (gistindex_keytest(it, scan, n)) + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (!item) break; - n = OffsetNumberNext(n); - } + CHECK_FOR_INTERRUPTS(); - MemoryContextSwitchTo(oldcxt); - MemoryContextReset(so->tempCxt); + gistScanPage(scan, item, so->curTreeItem->distances, tbm, &ntids); - /* - * If we found a matching entry, return its offset; otherwise return - * InvalidOffsetNumber to inform the caller to go to the next page. 
- */ - if (n >= FirstOffsetNumber && n <= maxoff) - return n; - else - return InvalidOffsetNumber; + pfree(item); + } + + PG_RETURN_INT64(ntids); } diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c index cb34b26113..43c4b1251b 100644 --- a/src/backend/access/gist/gistproc.c +++ b/src/backend/access/gist/gistproc.c @@ -6,11 +6,11 @@ * This gives R-tree behavior, with Guttman's poly-time split algorithm. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistproc.c,v 1.21 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gist/gistproc.c * *------------------------------------------------------------------------- */ @@ -380,12 +380,12 @@ gist_box_picksplit(PG_FUNCTION_ARGS) for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i)) { cur = DatumGetBoxP(entryvec->vector[i].key); - if (allisequal == true && ( - pageunion.high.x != cur->high.x || - pageunion.high.y != cur->high.y || - pageunion.low.x != cur->low.x || - pageunion.low.y != cur->low.y - )) + if (allisequal && ( + pageunion.high.x != cur->high.x || + pageunion.high.y != cur->high.y || + pageunion.low.x != cur->low.x || + pageunion.low.y != cur->low.y + )) allisequal = false; adjustBox(&pageunion, cur); @@ -904,6 +904,76 @@ gist_point_compress(PG_FUNCTION_ARGS) PG_RETURN_POINTER(entry); } +#define point_point_distance(p1,p2) \ + DatumGetFloat8(DirectFunctionCall2(point_distance, \ + PointPGetDatum(p1), PointPGetDatum(p2))) + +static double +computeDistance(bool isLeaf, BOX *box, Point *point) +{ + double result = 0.0; + + if (isLeaf) + { + /* simple point to point distance */ + result = point_point_distance(point, &box->low); + } + else if (point->x <= box->high.x && point->x >= box->low.x && + point->y <= 
box->high.y && point->y >= box->low.y) + { + /* point inside the box */ + result = 0.0; + } + else if (point->x <= box->high.x && point->x >= box->low.x) + { + /* point is over or below box */ + Assert(box->low.y <= box->high.y); + if (point->y > box->high.y) + result = point->y - box->high.y; + else if (point->y < box->low.y) + result = box->low.y - point->y; + else + elog(ERROR, "inconsistent point values"); + } + else if (point->y <= box->high.y && point->y >= box->low.y) + { + /* point is to left or right of box */ + Assert(box->low.x <= box->high.x); + if (point->x > box->high.x) + result = point->x - box->high.x; + else if (point->x < box->low.x) + result = box->low.x - point->x; + else + elog(ERROR, "inconsistent point values"); + } + else + { + /* closest point will be a vertex */ + Point p; + double subresult; + + result = point_point_distance(point, &box->low); + + subresult = point_point_distance(point, &box->high); + if (result > subresult) + result = subresult; + + p.x = box->low.x; + p.y = box->high.y; + subresult = point_point_distance(point, &p); + if (result > subresult) + result = subresult; + + p.x = box->high.x; + p.y = box->low.y; + subresult = point_point_distance(point, &p); + if (result > subresult) + result = subresult; + } + + return result; +} + static bool gist_point_consistent_internal(StrategyNumber strategy, bool isLeaf, BOX *key, Point *query) @@ -954,8 +1024,8 @@ gist_point_consistent(PG_FUNCTION_ARGS) { GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); - bool result; bool *recheck = (bool *) PG_GETARG_POINTER(4); + bool result; StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset; switch (strategyGroup) @@ -1034,9 +1104,32 @@ gist_point_consistent(PG_FUNCTION_ARGS) } break; default: - result = false; /* silence compiler warning */ elog(ERROR, "unknown strategy number: %d", strategy); + result = false; /* keep compiler quiet */ } 
PG_RETURN_BOOL(result); } + +Datum +gist_point_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + double distance; + StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset; + + switch (strategyGroup) + { + case PointStrategyNumberGroup: + distance = computeDistance(GIST_LEAF(entry), + DatumGetBoxP(entry->key), + PG_GETARG_POINT_P(1)); + break; + default: + elog(ERROR, "unknown strategy number: %d", strategy); + distance = 0.0; /* keep compiler quiet */ + } + + PG_RETURN_FLOAT8(distance); +} diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index a53d8cd087..5662a3a4aa 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -4,11 +4,11 @@ * routines to manage scans on GiST index relations * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistscan.c,v 1.79 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gist/gistscan.c * *------------------------------------------------------------------------- */ @@ -20,18 +20,112 @@ #include "access/relscan.h" #include "storage/bufmgr.h" #include "utils/memutils.h" +#include "utils/rel.h" -static void gistfreestack(GISTSearchStack *s); + +/* + * RBTree support functions for the GISTSearchTreeItem queue + */ + +static int +GISTSearchTreeItemComparator(const RBNode *a, const RBNode *b, void *arg) +{ + const GISTSearchTreeItem *sa = (const GISTSearchTreeItem *) a; + const GISTSearchTreeItem *sb = (const GISTSearchTreeItem *) b; + IndexScanDesc scan = (IndexScanDesc) arg; + int i; + + /* Order according to distance comparison */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + if (sa->distances[i] != 
sb->distances[i]) + return (sa->distances[i] > sb->distances[i]) ? 1 : -1; + } + + return 0; +} + +static void +GISTSearchTreeItemCombiner(RBNode *existing, const RBNode *newrb, void *arg) +{ + GISTSearchTreeItem *scurrent = (GISTSearchTreeItem *) existing; + const GISTSearchTreeItem *snew = (const GISTSearchTreeItem *) newrb; + GISTSearchItem *newitem = snew->head; + + /* snew should have just one item in its chain */ + Assert(newitem && newitem->next == NULL); + + /* + * If new item is heap tuple, it goes to front of chain; otherwise insert + * it before the first index-page item, so that index pages are visited in + * LIFO order, ensuring depth-first search of index pages. See comments + * in gist_private.h. + */ + if (GISTSearchItemIsHeap(*newitem)) + { + newitem->next = scurrent->head; + scurrent->head = newitem; + if (scurrent->lastHeap == NULL) + scurrent->lastHeap = newitem; + } + else if (scurrent->lastHeap == NULL) + { + newitem->next = scurrent->head; + scurrent->head = newitem; + } + else + { + newitem->next = scurrent->lastHeap->next; + scurrent->lastHeap->next = newitem; + } +} + +static RBNode * +GISTSearchTreeItemAllocator(void *arg) +{ + IndexScanDesc scan = (IndexScanDesc) arg; + + return palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys); +} + +static void +GISTSearchTreeItemDeleter(RBNode *rb, void *arg) +{ + pfree(rb); +} + + +/* + * Index AM API functions for scanning GiST indexes + */ Datum gistbeginscan(PG_FUNCTION_ARGS) { Relation r = (Relation) PG_GETARG_POINTER(0); int nkeys = PG_GETARG_INT32(1); - ScanKey key = (ScanKey) PG_GETARG_POINTER(2); + int norderbys = PG_GETARG_INT32(2); IndexScanDesc scan; + GISTScanOpaque so; - scan = RelationGetIndexScan(r, nkeys, key); + scan = RelationGetIndexScan(r, nkeys, norderbys); + + /* initialize opaque data */ + so = (GISTScanOpaque) palloc0(sizeof(GISTScanOpaqueData)); + so->queueCxt = AllocSetContextCreate(CurrentMemoryContext, + "GiST queue context", + ALLOCSET_DEFAULT_MINSIZE, + 
ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + so->tempCxt = createTempGistContext(); + so->giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE)); + initGISTstate(so->giststate, scan->indexRelation); + /* workspaces with size dependent on numberOfOrderBys: */ + so->tmpTreeItem = palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys); + so->distances = palloc(sizeof(double) * scan->numberOfOrderBys); + so->qual_ok = true; /* in case there are zero keys */ + + scan->opaque = so; PG_RETURN_POINTER(scan); } @@ -41,42 +135,28 @@ gistrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey key = (ScanKey) PG_GETARG_POINTER(1); - GISTScanOpaque so; + ScanKey orderbys = (ScanKey) PG_GETARG_POINTER(3); + + /* nkeys and norderbys arguments are ignored */ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; int i; + MemoryContext oldCxt; - so = (GISTScanOpaque) scan->opaque; - if (so != NULL) - { - /* rescan an existing indexscan --- reset state */ - gistfreestack(so->stack); - so->stack = NULL; - /* drop pins on buffers -- no locks held */ - if (BufferIsValid(so->curbuf)) - { - ReleaseBuffer(so->curbuf); - so->curbuf = InvalidBuffer; - } - } - else - { - /* initialize opaque data */ - so = (GISTScanOpaque) palloc(sizeof(GISTScanOpaqueData)); - so->stack = NULL; - so->tempCxt = createTempGistContext(); - so->curbuf = InvalidBuffer; - so->giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE)); - initGISTstate(so->giststate, scan->indexRelation); - - scan->opaque = so; - } + /* rescan an existing indexscan --- reset state */ + MemoryContextReset(so->queueCxt); + so->curTreeItem = NULL; - /* - * Clear all the pointers. 
- */ - ItemPointerSetInvalid(&so->curpos); - so->nPageData = so->curPageData = 0; + /* create new, empty RBTree for search queue */ + oldCxt = MemoryContextSwitchTo(so->queueCxt); + so->queue = rb_create(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys, + GISTSearchTreeItemComparator, + GISTSearchTreeItemCombiner, + GISTSearchTreeItemAllocator, + GISTSearchTreeItemDeleter, + scan); + MemoryContextSwitchTo(oldCxt); - so->qual_ok = true; + so->firstCall = true; /* Update scan key, if a new one is given */ if (key && scan->numberOfKeys > 0) @@ -85,8 +165,8 @@ gistrescan(PG_FUNCTION_ARGS) scan->numberOfKeys * sizeof(ScanKeyData)); /* - * Modify the scan key so that all the Consistent method is called for - * all comparisons. The original operator is passed to the Consistent + * Modify the scan key so that the Consistent method is called for all + * comparisons. The original operator is passed to the Consistent * function in the form of its strategy number, which is available * from the sk_strategy field, and its subtype from the sk_subtype * field. @@ -95,9 +175,11 @@ gistrescan(PG_FUNCTION_ARGS) * SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we * assume all indexable operators are strict). */ + so->qual_ok = true; + for (i = 0; i < scan->numberOfKeys; i++) { - ScanKey skey = &(scan->keyData[i]); + ScanKey skey = scan->keyData + i; skey->sk_func = so->giststate->consistentFn[skey->sk_attno - 1]; @@ -109,6 +191,33 @@ gistrescan(PG_FUNCTION_ARGS) } } + /* Update order-by key, if a new one is given */ + if (orderbys && scan->numberOfOrderBys > 0) + { + memmove(scan->orderByData, orderbys, + scan->numberOfOrderBys * sizeof(ScanKeyData)); + + /* + * Modify the order-by key so that the Distance method is called for + * all comparisons. The original operator is passed to the Distance + * function in the form of its strategy number, which is available + * from the sk_strategy field, and its subtype from the sk_subtype + * field. 
+ */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + ScanKey skey = scan->orderByData + i; + + skey->sk_func = so->giststate->distanceFn[skey->sk_attno - 1]; + + /* Check we actually have a distance function ... */ + if (!OidIsValid(skey->sk_func.fn_oid)) + elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", + GIST_DISTANCE_PROC, skey->sk_attno, + RelationGetRelationName(scan->indexRelation)); + } + } + PG_RETURN_VOID(); } @@ -130,33 +239,14 @@ Datum gistendscan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - GISTScanOpaque so; - - so = (GISTScanOpaque) scan->opaque; + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - if (so != NULL) - { - gistfreestack(so->stack); - if (so->giststate != NULL) - freeGISTstate(so->giststate); - /* drop pins on buffers -- we aren't holding any locks */ - if (BufferIsValid(so->curbuf)) - ReleaseBuffer(so->curbuf); - MemoryContextDelete(so->tempCxt); - pfree(scan->opaque); - } + freeGISTstate(so->giststate); + MemoryContextDelete(so->queueCxt); + MemoryContextDelete(so->tempCxt); + pfree(so->tmpTreeItem); + pfree(so->distances); + pfree(so); PG_RETURN_VOID(); } - -static void -gistfreestack(GISTSearchStack *s) -{ - while (s != NULL) - { - GISTSearchStack *p = s->next; - - pfree(s); - s = p; - } -} diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c index 5700e530fe..bd846cecca 100644 --- a/src/backend/access/gist/gistsplit.c +++ b/src/backend/access/gist/gistsplit.c @@ -4,11 +4,11 @@ * Split page algorithm * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistsplit.c,v 1.12 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/gist/gistsplit.c * 
*------------------------------------------------------------------------- */ @@ -325,16 +325,18 @@ genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC evec->n = v->spl_nleft; memcpy(evec->vector, entryvec->vector + FirstOffsetNumber, sizeof(GISTENTRY) * evec->n); - v->spl_ldatum = FunctionCall2(&giststate->unionFn[attno], - PointerGetDatum(evec), - PointerGetDatum(&nbytes)); + v->spl_ldatum = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&nbytes)); evec->n = v->spl_nright; memcpy(evec->vector, entryvec->vector + FirstOffsetNumber + v->spl_nleft, sizeof(GISTENTRY) * evec->n); - v->spl_rdatum = FunctionCall2(&giststate->unionFn[attno], - PointerGetDatum(evec), - PointerGetDatum(&nbytes)); + v->spl_rdatum = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&nbytes)); } /* @@ -361,9 +363,10 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec sv->spl_ldatum = v->spl_lattr[attno]; sv->spl_rdatum = v->spl_rattr[attno]; - FunctionCall2(&giststate->picksplitFn[attno], - PointerGetDatum(entryvec), - PointerGetDatum(sv)); + FunctionCall2Coll(&giststate->picksplitFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(entryvec), + PointerGetDatum(sv)); if (sv->spl_nleft == 0 || sv->spl_nright == 0) { @@ -500,58 +503,6 @@ gistSplitHalf(GIST_SPLITVEC *v, int len) } /* - * if it was invalid tuple then we need special processing. - * We move all invalid tuples on right page. - * - * if there is no place on left page, gistSplit will be called one more - * time for left page. 
- * - * Normally, we never exec this code, but after crash replay it's possible - * to get 'invalid' tuples (probability is low enough) - */ -static void -gistSplitByInvalid(GISTSTATE *giststate, GistSplitVector *v, IndexTuple *itup, int len) -{ - int i; - static OffsetNumber offInvTuples[MaxOffsetNumber]; - int nOffInvTuples = 0; - - for (i = 1; i <= len; i++) - if (GistTupleIsInvalid(itup[i - 1])) - offInvTuples[nOffInvTuples++] = i; - - if (nOffInvTuples == len) - { - /* corner case, all tuples are invalid */ - v->spl_rightvalid = v->spl_leftvalid = false; - gistSplitHalf(&v->splitVector, len); - } - else - { - GistSplitUnion gsvp; - - v->splitVector.spl_right = offInvTuples; - v->splitVector.spl_nright = nOffInvTuples; - v->spl_rightvalid = false; - - v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); - v->splitVector.spl_nleft = 0; - for (i = 1; i <= len; i++) - if (!GistTupleIsInvalid(itup[i - 1])) - v->splitVector.spl_left[v->splitVector.spl_nleft++] = i; - v->spl_leftvalid = true; - - gsvp.equiv = NULL; - gsvp.attr = v->spl_lattr; - gsvp.len = v->splitVector.spl_nleft; - gsvp.entries = v->splitVector.spl_left; - gsvp.isnull = v->spl_lisnull; - - gistunionsubkeyvec(giststate, itup, &gsvp, 0); - } -} - -/* * trys to split page by attno key, in a case of null * values move its to separate page. 
*/ @@ -568,12 +519,6 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *gist Datum datum; bool IsNull; - if (!GistPageIsLeaf(page) && GistTupleIsInvalid(itup[i - 1])) - { - gistSplitByInvalid(giststate, v, itup, len); - return; - } - datum = index_getattr(itup[i - 1], attno + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, attno, &(entryvec->vector[i]), datum, r, page, i, @@ -582,8 +527,6 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *gist offNullTuples[nOffNullTuples++] = i; } - v->spl_leftvalid = v->spl_rightvalid = true; - if (nOffNullTuples == len) { /* diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 03c5773d4d..1754a10369 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -4,15 +4,17 @@ * utilities routines for the postgres GiST index access method. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.35 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/gist/gistutil.c *------------------------------------------------------------------------- */ #include "postgres.h" +#include <math.h> + #include "access/gist_private.h" #include "access/reloptions.h" #include "storage/freespace.h" @@ -152,7 +154,7 @@ gistfillitupvec(IndexTuple *vec, int veclen, int *memlen) * invalid tuple. Resulting Datums aren't compressed. 
*/ -bool +void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startkey, Datum *attr, bool *isnull) { @@ -180,10 +182,6 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startke Datum datum; bool IsNull; - if (GistTupleIsInvalid(itvec[j])) - return FALSE; /* signals that union with invalid tuple => - * result is invalid */ - datum = index_getattr(itvec[j], i + 1, giststate->tupdesc, &IsNull); if (IsNull) continue; @@ -211,15 +209,14 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startke } /* Make union and store in attr array */ - attr[i] = FunctionCall2(&giststate->unionFn[i], - PointerGetDatum(evec), - PointerGetDatum(&attrsize)); + attr[i] = FunctionCall2Coll(&giststate->unionFn[i], + giststate->supportCollation[i], + PointerGetDatum(evec), + PointerGetDatum(&attrsize)); isnull[i] = FALSE; } } - - return TRUE; } /* @@ -231,8 +228,7 @@ gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate) { memset(isnullS, TRUE, sizeof(bool) * giststate->tupdesc->natts); - if (!gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS)) - return gist_form_invalid_tuple(InvalidBlockNumber); + gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS); return gistFormTuple(giststate, r, attrS, isnullS, false); } @@ -278,9 +274,10 @@ gistMakeUnionKey(GISTSTATE *giststate, int attno, } *dstisnull = FALSE; - *dst = FunctionCall2(&giststate->unionFn[attno], - PointerGetDatum(evec), - PointerGetDatum(&dstsize)); + *dst = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&dstsize)); } } @@ -289,9 +286,10 @@ gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b) { bool result; - FunctionCall3(&giststate->equalFn[attno], - a, b, - PointerGetDatum(&result)); + FunctionCall3Coll(&giststate->equalFn[attno], + giststate->supportCollation[attno], + a, b, + PointerGetDatum(&result)); return result; } @@ -328,9 
+326,6 @@ gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *gis IndexTuple newtup = NULL; int i; - if (GistTupleIsInvalid(oldtup) || GistTupleIsInvalid(addtup)) - return gist_form_invalid_tuple(ItemPointerGetBlockNumber(&(oldtup->t_tid))); - gistDeCompressAtt(giststate, r, oldtup, NULL, (OffsetNumber) 0, oldentries, oldisnull); @@ -401,14 +396,6 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ int j; IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); - if (!GistPageIsLeaf(p) && GistTupleIsInvalid(itup)) - { - ereport(LOG, - (errmsg("index \"%s\" needs VACUUM or REINDEX to finish crash recovery", - RelationGetRelationName(r)))); - continue; - } - sum_grow = 0; for (j = 0; j < r->rd_att->natts; j++) { @@ -460,8 +447,9 @@ gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, gistentryinit(*e, k, r, pg, o, l); dep = (GISTENTRY *) - DatumGetPointer(FunctionCall1(&giststate->decompressFn[nkey], - PointerGetDatum(e))); + DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey], + giststate->supportCollation[nkey], + PointerGetDatum(e))); /* decompressFn may just return the given pointer */ if (dep != e) gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset, @@ -486,8 +474,9 @@ gistcentryinit(GISTSTATE *giststate, int nkey, gistentryinit(*e, k, r, pg, o, l); cep = (GISTENTRY *) - DatumGetPointer(FunctionCall1(&giststate->compressFn[nkey], - PointerGetDatum(e))); + DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[nkey], + giststate->supportCollation[nkey], + PointerGetDatum(e))); /* compressFn may just return the given pointer */ if (cep != e) gistentryinit(*e, cep->key, cep->rel, cep->page, cep->offset, @@ -521,7 +510,12 @@ gistFormTuple(GISTSTATE *giststate, Relation r, } res = index_form_tuple(giststate->tupdesc, compatt, isnull); - GistTupleSetValid(res); + + /* + * The offset number on tuples on internal pages is unused. For historical + * reasons, it is set 0xffff. 
+ */ + ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff); return res; } @@ -532,16 +526,23 @@ gistpenalty(GISTSTATE *giststate, int attno, { float penalty = 0.0; - if (giststate->penaltyFn[attno].fn_strict == FALSE || (isNullOrig == FALSE && isNullAdd == FALSE)) - FunctionCall3(&giststate->penaltyFn[attno], - PointerGetDatum(orig), - PointerGetDatum(add), - PointerGetDatum(&penalty)); + if (giststate->penaltyFn[attno].fn_strict == FALSE || + (isNullOrig == FALSE && isNullAdd == FALSE)) + { + FunctionCall3Coll(&giststate->penaltyFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(orig), + PointerGetDatum(add), + PointerGetDatum(&penalty)); + /* disallow negative or NaN penalty */ + if (isnan(penalty) || penalty < 0.0) + penalty = 0.0; + } else if (isNullOrig && isNullAdd) penalty = 0.0; else - penalty = 1e10; /* try to prevent to mix null and non-null - * value */ + penalty = 1e10; /* try to prevent mixing null and non-null + * values */ return penalty; } @@ -677,3 +678,24 @@ gistoptions(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(result); PG_RETURN_NULL(); } + +/* + * Temporary GiST indexes are not WAL-logged, but we need LSNs to detect + * concurrent page splits anyway. GetXLogRecPtrForTemp() provides a fake + * sequence of LSNs for that purpose. Each call generates an LSN that is + * greater than any previous value returned by this function in the same + * session. + */ +XLogRecPtr +GetXLogRecPtrForTemp(void) +{ + static XLogRecPtr counter = {0, 1}; + + counter.xrecoff++; + if (counter.xrecoff == 0) + { + counter.xlogid++; + counter.xrecoff++; + } + return counter; +} diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index abd3d99956..33e6f34154 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -4,11 +4,11 @@ * vacuuming routines for the postgres GiST index access method. 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.48 2010/02/08 05:17:31 tgl Exp $ + * src/backend/access/gist/gistvacuum.c * *------------------------------------------------------------------------- */ @@ -26,13 +26,6 @@ #include "utils/memutils.h" -typedef struct GistBulkDeleteResult -{ - IndexBulkDeleteResult std; /* common state */ - bool needReindex; -} GistBulkDeleteResult; - - /* * VACUUM cleanup: update FSM */ @@ -40,13 +33,11 @@ Datum gistvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); - GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); + IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; - BlockNumber lastBlock = GIST_ROOT_BLKNO, - lastFilledBlock = GIST_ROOT_BLKNO; bool needLock; /* No-op in ANALYZE ONLY mode */ @@ -56,10 +47,10 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { - stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* use heap's tuple count */ - stats->std.num_index_tuples = info->num_heap_tuples; - stats->std.estimated_count = info->estimated_count; + stats->num_index_tuples = info->num_heap_tuples; + stats->estimated_count = info->estimated_count; /* * XXX the above is wrong if index is partial. 
Would it be OK to just @@ -67,11 +58,6 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) */ } - if (stats->needReindex) - ereport(NOTICE, - (errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery", - RelationGetRelationName(rel)))); - /* * Need lock unless it's local to this backend. */ @@ -102,20 +88,17 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) totFreePages++; RecordFreeIndexPage(rel, blkno); } - else - lastFilledBlock = blkno; UnlockReleaseBuffer(buffer); } - lastBlock = npages - 1; /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); /* return statistics */ - stats->std.pages_free = totFreePages; + stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); - stats->std.num_pages = RelationGetNumberOfBlocks(rel); + stats->num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); @@ -135,7 +118,7 @@ pushStackIfSplited(Page page, GistBDItem *stack) GISTPageOpaque opaque = GistPageGetOpaque(page); if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) && - XLByteLT(stack->parentlsn, opaque->nsn) && + (GistFollowRight(page) || XLByteLT(stack->parentlsn, opaque->nsn)) && opaque->rightlink != InvalidBlockNumber /* sanity check */ ) { /* split page detected, install right link to the stack */ @@ -162,7 +145,7 @@ Datum gistbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); - GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); + IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; @@ -171,10 +154,10 @@ gistbulkdelete(PG_FUNCTION_ARGS) /* first time through? 
*/ if (stats == NULL) - stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ - stats->std.estimated_count = false; - stats->std.num_index_tuples = 0; + stats->estimated_count = false; + stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; @@ -232,10 +215,10 @@ gistbulkdelete(PG_FUNCTION_ARGS) { todelete[ntodelete] = i - ntodelete; ntodelete++; - stats->std.tuples_removed += 1; + stats->tuples_removed += 1; } else - stats->std.num_index_tuples += 1; + stats->num_index_tuples += 1; } if (ntodelete) @@ -248,27 +231,18 @@ gistbulkdelete(PG_FUNCTION_ARGS) PageIndexTupleDelete(page, todelete[i]); GistMarkTuplesDeleted(page); - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { - XLogRecData *rdata; XLogRecPtr recptr; - gistxlogPageUpdate *xlinfo; - rdata = formUpdateRdata(rel->rd_node, buffer, + recptr = gistXLogUpdate(rel->rd_node, buffer, todelete, ntodelete, - NULL, 0, - NULL); - xlinfo = (gistxlogPageUpdate *) rdata->next->data; - - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); + NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); - - pfree(xlinfo); - pfree(rdata); } else - PageSetLSN(page, XLogRecPtrForTemp); + PageSetLSN(page, GetXLogRecPtrForTemp()); END_CRIT_SECTION(); } @@ -293,7 +267,11 @@ gistbulkdelete(PG_FUNCTION_ARGS) stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) - stats->needReindex = true; + ereport(LOG, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(rel)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."), + errhint("Please REINDEX it."))); } } diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 7f5dd990c8..02c4ec3a6f 100644 --- a/src/backend/access/gist/gistxlog.c +++ 
b/src/backend/access/gist/gistxlog.c @@ -4,11 +4,11 @@ * WAL replay logic for GiST. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistxlog.c,v 1.35 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/gist/gistxlog.c *------------------------------------------------------------------------- */ #include "postgres.h" @@ -20,15 +20,6 @@ #include "utils/memutils.h" #include "utils/rel.h" - -typedef struct -{ - gistxlogPageUpdate *data; - int len; - IndexTuple *itup; - OffsetNumber *todelete; -} PageUpdateRecord; - typedef struct { gistxlogPage *header; @@ -41,144 +32,37 @@ typedef struct NewPage *page; } PageSplitRecord; -/* track for incomplete inserts, idea was taken from nbtxlog.c */ - -typedef struct gistIncompleteInsert -{ - RelFileNode node; - BlockNumber origblkno; /* for splits */ - ItemPointerData key; - int lenblk; - BlockNumber *blkno; - XLogRecPtr lsn; - BlockNumber *path; - int pathlen; -} gistIncompleteInsert; - - static MemoryContext opCtx; /* working memory for operations */ -static MemoryContext insertCtx; /* holds incomplete_inserts list */ -static List *incomplete_inserts; - - -#define ItemPointerEQ(a, b) \ - ( ItemPointerGetOffsetNumber(a) == ItemPointerGetOffsetNumber(b) && \ - ItemPointerGetBlockNumber (a) == ItemPointerGetBlockNumber(b) ) - +/* + * Replay the clearing of F_FOLLOW_RIGHT flag. 
+ */ static void -pushIncompleteInsert(RelFileNode node, XLogRecPtr lsn, ItemPointerData key, - BlockNumber *blkno, int lenblk, - PageSplitRecord *xlinfo /* to extract blkno info */ ) +gistRedoClearFollowRight(RelFileNode node, XLogRecPtr lsn, + BlockNumber leftblkno) { - MemoryContext oldCxt; - gistIncompleteInsert *ninsert; + Buffer buffer; - if (!ItemPointerIsValid(&key)) + buffer = XLogReadBuffer(node, leftblkno, false); + if (BufferIsValid(buffer)) + { + Page page = (Page) BufferGetPage(buffer); /* - * if key is null then we should not store insertion as incomplete, - * because it's a vacuum operation.. + * Note that we still update the page even if page LSN is equal to the + * LSN of this record, because the updated NSN is not included in the + * full page image. */ - return; - - oldCxt = MemoryContextSwitchTo(insertCtx); - ninsert = (gistIncompleteInsert *) palloc(sizeof(gistIncompleteInsert)); - - ninsert->node = node; - ninsert->key = key; - ninsert->lsn = lsn; - - if (lenblk && blkno) - { - ninsert->lenblk = lenblk; - ninsert->blkno = (BlockNumber *) palloc(sizeof(BlockNumber) * ninsert->lenblk); - memcpy(ninsert->blkno, blkno, sizeof(BlockNumber) * ninsert->lenblk); - ninsert->origblkno = *blkno; - } - else - { - int i; - - Assert(xlinfo); - ninsert->lenblk = xlinfo->data->npage; - ninsert->blkno = (BlockNumber *) palloc(sizeof(BlockNumber) * ninsert->lenblk); - for (i = 0; i < ninsert->lenblk; i++) - ninsert->blkno[i] = xlinfo->page[i].header->blkno; - ninsert->origblkno = xlinfo->data->origblkno; - } - Assert(ninsert->lenblk > 0); - - /* - * Stick the new incomplete insert onto the front of the list, not the - * back. This is so that gist_xlog_cleanup will process incompletions in - * last-in-first-out order. 
- */ - incomplete_inserts = lcons(ninsert, incomplete_inserts); - - MemoryContextSwitchTo(oldCxt); -} - -static void -forgetIncompleteInsert(RelFileNode node, ItemPointerData key) -{ - ListCell *l; - - if (!ItemPointerIsValid(&key)) - return; - - if (incomplete_inserts == NIL) - return; - - foreach(l, incomplete_inserts) - { - gistIncompleteInsert *insert = (gistIncompleteInsert *) lfirst(l); - - if (RelFileNodeEquals(node, insert->node) && ItemPointerEQ(&(insert->key), &(key))) + if (!XLByteLT(lsn, PageGetLSN(page))) { - /* found */ - incomplete_inserts = list_delete_ptr(incomplete_inserts, insert); - pfree(insert->blkno); - pfree(insert); - break; - } - } -} + GistPageGetOpaque(page)->nsn = lsn; + GistClearFollowRight(page); -static void -decodePageUpdateRecord(PageUpdateRecord *decoded, XLogRecord *record) -{ - char *begin = XLogRecGetData(record), - *ptr; - int i = 0, - addpath = 0; - - decoded->data = (gistxlogPageUpdate *) begin; - - if (decoded->data->ntodelete) - { - decoded->todelete = (OffsetNumber *) (begin + sizeof(gistxlogPageUpdate) + addpath); - addpath = MAXALIGN(sizeof(OffsetNumber) * decoded->data->ntodelete); - } - else - decoded->todelete = NULL; - - decoded->len = 0; - ptr = begin + sizeof(gistxlogPageUpdate) + addpath; - while (ptr - begin < record->xl_len) - { - decoded->len++; - ptr += IndexTupleSize((IndexTuple) ptr); - } - - decoded->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * decoded->len); - - ptr = begin + sizeof(gistxlogPageUpdate) + addpath; - while (ptr - begin < record->xl_len) - { - decoded->itup[i] = (IndexTuple) ptr; - ptr += IndexTupleSize(decoded->itup[i]); - i++; + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); } } @@ -186,29 +70,22 @@ decodePageUpdateRecord(PageUpdateRecord *decoded, XLogRecord *record) * redo any page update (except page split) */ static void -gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record, bool isnewroot) 
+gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) { - gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); - PageUpdateRecord xlrec; + char *begin = XLogRecGetData(record); + gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin; Buffer buffer; Page page; + char *data; - /* we must fix incomplete_inserts list even if XLR_BKP_BLOCK_1 is set */ - forgetIncompleteInsert(xldata->node, xldata->key); + if (BlockNumberIsValid(xldata->leftchild)) + gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); - if (!isnewroot && xldata->blkno != GIST_ROOT_BLKNO) - /* operation with root always finalizes insertion */ - pushIncompleteInsert(xldata->node, lsn, xldata->key, - &(xldata->blkno), 1, - NULL); - - /* nothing else to do if page was backed up (and no info to do it with) */ + /* nothing more to do if page was backed up (and no info to do it with) */ if (record->xl_info & XLR_BKP_BLOCK_1) return; - decodePageUpdateRecord(&xlrec, record); - - buffer = XLogReadBuffer(xlrec.data->node, xlrec.data->blkno, false); + buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); @@ -219,28 +96,52 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record, bool isnewroot) return; } - if (isnewroot) - GISTInitBuffer(buffer, 0); - else if (xlrec.data->ntodelete) + data = begin + sizeof(gistxlogPageUpdate); + + /* Delete old tuples */ + if (xldata->ntodelete > 0) { int i; + OffsetNumber *todelete = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntodelete; - for (i = 0; i < xlrec.data->ntodelete; i++) - PageIndexTupleDelete(page, xlrec.todelete[i]); + for (i = 0; i < xldata->ntodelete; i++) + PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ - if (xlrec.len > 0) - gistfillbuffer(page, xlrec.itup, xlrec.len, InvalidOffsetNumber); + if (data - begin < record->xl_len) + { + OffsetNumber 
off = (PageIsEmpty(page)) ? FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); - /* - * special case: leafpage, nothing to insert, nothing to delete, then - * vacuum marks page - */ - if (GistPageIsLeaf(page) && xlrec.len == 0 && xlrec.data->ntodelete == 0) - GistClearTuplesDeleted(page); + while (data - begin < record->xl_len) + { + IndexTuple itup = (IndexTuple) data; + Size sz = IndexTupleSize(itup); + OffsetNumber l; + + data += sz; + + l = PageAddItem(page, (Item) itup, sz, off, false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to GiST index page, size %d bytes", + (int) sz); + off++; + } + } + else + { + /* + * special case: leafpage, nothing to insert, nothing to delete, then + * vacuum marks page + */ + if (GistPageIsLeaf(page) && xldata->ntodelete == 0) + GistClearTuplesDeleted(page); + } if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO) @@ -315,41 +216,67 @@ decodePageSplitRecord(PageSplitRecord *decoded, XLogRecord *record) static void gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) { + gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); PageSplitRecord xlrec; Buffer buffer; Page page; int i; - int flags; + bool isrootsplit = false; + if (BlockNumberIsValid(xldata->leftchild)) + gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); decodePageSplitRecord(&xlrec, record); - flags = xlrec.data->origleaf ? 
F_LEAF : 0; /* loop around all pages */ for (i = 0; i < xlrec.data->npage; i++) { NewPage *newpage = xlrec.page + i; + int flags; + + if (newpage->header->blkno == GIST_ROOT_BLKNO) + { + Assert(i == 0); + isrootsplit = true; + } buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); /* ok, clear buffer */ + if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO) + flags = F_LEAF; + else + flags = 0; GISTInitBuffer(buffer, flags); /* and fill it */ gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber); + if (newpage->header->blkno == GIST_ROOT_BLKNO) + { + GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; + GistPageGetOpaque(page)->nsn = xldata->orignsn; + GistClearFollowRight(page); + } + else + { + if (i < xlrec.data->npage - 1) + GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno; + else + GistPageGetOpaque(page)->rightlink = xldata->origrlink; + GistPageGetOpaque(page)->nsn = xldata->orignsn; + if (i < xlrec.data->npage - 1 && !isrootsplit) + GistMarkFollowRight(page); + else + GistClearFollowRight(page); + } + PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } - - forgetIncompleteInsert(xlrec.data->node, xlrec.data->key); - - pushIncompleteInsert(xlrec.data->node, lsn, xlrec.data->key, - NULL, 0, - &xlrec); } static void @@ -372,24 +299,6 @@ gistRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); } -static void -gistRedoCompleteInsert(XLogRecPtr lsn, XLogRecord *record) -{ - char *begin = XLogRecGetData(record), - *ptr; - gistxlogInsertComplete *xlrec; - - xlrec = (gistxlogInsertComplete *) begin; - - ptr = begin + sizeof(gistxlogInsertComplete); - while (ptr - begin < record->xl_len) - { - Assert(record->xl_len - (ptr - begin) >= sizeof(ItemPointerData)); - forgetIncompleteInsert(xlrec->node, *((ItemPointerData *) ptr)); - 
ptr += sizeof(ItemPointerData); - } -} - void gist_redo(XLogRecPtr lsn, XLogRecord *record) { @@ -397,34 +306,27 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) MemoryContext oldCxt; /* - * GIST indexes do not require any conflict processing. NB: If we ever + * GiST indexes do not require any conflict processing. NB: If we ever * implement a similar optimization we have in b-tree, and remove killed * tuples outside VACUUM, we'll need to handle that here. */ - RestoreBkpBlocks(lsn, record, false); oldCxt = MemoryContextSwitchTo(opCtx); switch (info) { case XLOG_GIST_PAGE_UPDATE: - gistRedoPageUpdateRecord(lsn, record, false); + gistRedoPageUpdateRecord(lsn, record); break; case XLOG_GIST_PAGE_DELETE: gistRedoPageDeleteRecord(lsn, record); break; - case XLOG_GIST_NEW_ROOT: - gistRedoPageUpdateRecord(lsn, record, true); - break; case XLOG_GIST_PAGE_SPLIT: gistRedoPageSplitRecord(lsn, record); break; case XLOG_GIST_CREATE_INDEX: gistRedoCreateIndex(lsn, record); break; - case XLOG_GIST_INSERT_COMPLETE: - gistRedoCompleteInsert(lsn, record); - break; default: elog(PANIC, "gist_redo: unknown op code %u", info); } @@ -434,20 +336,16 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) } static void -out_target(StringInfo buf, RelFileNode node, ItemPointerData key) +out_target(StringInfo buf, RelFileNode node) { appendStringInfo(buf, "rel %u/%u/%u", node.spcNode, node.dbNode, node.relNode); - if (ItemPointerIsValid(&key)) - appendStringInfo(buf, "; tid %u/%u", - ItemPointerGetBlockNumber(&key), - ItemPointerGetOffsetNumber(&key)); } static void out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) { - out_target(buf, xlrec->node, xlrec->key); + out_target(buf, xlrec->node); appendStringInfo(buf, "; block number %u", xlrec->blkno); } @@ -463,7 +361,7 @@ static void out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec) { appendStringInfo(buf, "page_split: "); - out_target(buf, xlrec->node, xlrec->key); + out_target(buf, xlrec->node); appendStringInfo(buf, 
"; block number %u splits to %d pages", xlrec->origblkno, xlrec->npage); } @@ -482,10 +380,6 @@ gist_desc(StringInfo buf, uint8 xl_info, char *rec) case XLOG_GIST_PAGE_DELETE: out_gistxlogPageDelete(buf, (gistxlogPageDelete *) rec); break; - case XLOG_GIST_NEW_ROOT: - appendStringInfo(buf, "new_root: "); - out_target(buf, ((gistxlogPageUpdate *) rec)->node, ((gistxlogPageUpdate *) rec)->key); - break; case XLOG_GIST_PAGE_SPLIT: out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec); break; @@ -495,415 +389,102 @@ gist_desc(StringInfo buf, uint8 xl_info, char *rec) ((RelFileNode *) rec)->dbNode, ((RelFileNode *) rec)->relNode); break; - case XLOG_GIST_INSERT_COMPLETE: - appendStringInfo(buf, "complete_insert: rel %u/%u/%u", - ((gistxlogInsertComplete *) rec)->node.spcNode, - ((gistxlogInsertComplete *) rec)->node.dbNode, - ((gistxlogInsertComplete *) rec)->node.relNode); - break; default: appendStringInfo(buf, "unknown gist op code %u", info); break; } } -IndexTuple -gist_form_invalid_tuple(BlockNumber blkno) -{ - /* - * we don't alloc space for null's bitmap, this is invalid tuple, be - * carefull in read and write code - */ - Size size = IndexInfoFindDataOffset(0); - IndexTuple tuple = (IndexTuple) palloc0(size); - - tuple->t_info |= size; - - ItemPointerSetBlockNumber(&(tuple->t_tid), blkno); - GistTupleSetInvalid(tuple); - - return tuple; -} - - -static void -gistxlogFindPath(Relation index, gistIncompleteInsert *insert) -{ - GISTInsertStack *top; - - insert->pathlen = 0; - insert->path = NULL; - - if ((top = gistFindPath(index, insert->origblkno)) != NULL) - { - int i; - GISTInsertStack *ptr; - - for (ptr = top; ptr; ptr = ptr->parent) - insert->pathlen++; - - insert->path = (BlockNumber *) palloc(sizeof(BlockNumber) * insert->pathlen); - - i = 0; - for (ptr = top; ptr; ptr = ptr->parent) - insert->path[i++] = ptr->blkno; - } - else - elog(ERROR, "lost parent for block %u", insert->origblkno); -} - -static SplitedPageLayout * -gistMakePageLayout(Buffer *buffers, 
int nbuffers) -{ - SplitedPageLayout *res = NULL, - *resptr; - - while (nbuffers-- > 0) - { - Page page = BufferGetPage(buffers[nbuffers]); - IndexTuple *vec; - int veclen; - - resptr = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout)); - - resptr->block.blkno = BufferGetBlockNumber(buffers[nbuffers]); - resptr->block.num = PageGetMaxOffsetNumber(page); - - vec = gistextractpage(page, &veclen); - resptr->list = gistfillitupvec(vec, veclen, &(resptr->lenlist)); - - resptr->next = res; - res = resptr; - } - - return res; -} - -/* - * Continue insert after crash. In normal situations, there aren't any - * incomplete inserts, but if a crash occurs partway through an insertion - * sequence, we'll need to finish making the index valid at the end of WAL - * replay. - * - * Note that we assume the index is now in a valid state, except for the - * unfinished insertion. In particular it's safe to invoke gistFindPath(); - * there shouldn't be any garbage pages for it to run into. - * - * To complete insert we can't use basic insertion algorithm because - * during insertion we can't call user-defined support functions of opclass. - * So, we insert 'invalid' tuples without real key and do it by separate algorithm. - * 'invalid' tuple should be updated by vacuum full. 
- */ -static void -gistContinueInsert(gistIncompleteInsert *insert) -{ - IndexTuple *itup; - int i, - lenitup; - Relation index; - - index = CreateFakeRelcacheEntry(insert->node); - - /* - * needed vector itup never will be more than initial lenblkno+2, because - * during this processing Indextuple can be only smaller - */ - lenitup = insert->lenblk; - itup = (IndexTuple *) palloc(sizeof(IndexTuple) * (lenitup + 2 /* guarantee root split */ )); - - for (i = 0; i < insert->lenblk; i++) - itup[i] = gist_form_invalid_tuple(insert->blkno[i]); - - /* - * any insertion of itup[] should make LOG message about - */ - - if (insert->origblkno == GIST_ROOT_BLKNO) - { - /* - * it was split root, so we should only make new root. it can't be - * simple insert into root, we should replace all content of root. - */ - Buffer buffer = XLogReadBuffer(insert->node, GIST_ROOT_BLKNO, true); - - gistnewroot(index, buffer, itup, lenitup, NULL); - UnlockReleaseBuffer(buffer); - } - else - { - Buffer *buffers; - Page *pages; - int numbuffer; - OffsetNumber *todelete; - - /* construct path */ - gistxlogFindPath(index, insert); - - Assert(insert->pathlen > 0); - - buffers = (Buffer *) palloc(sizeof(Buffer) * (insert->lenblk + 2 /* guarantee root split */ )); - pages = (Page *) palloc(sizeof(Page) * (insert->lenblk + 2 /* guarantee root split */ )); - todelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (insert->lenblk + 2 /* guarantee root split */ )); - - for (i = 0; i < insert->pathlen; i++) - { - int j, - k, - pituplen = 0; - uint8 xlinfo; - XLogRecData *rdata; - XLogRecPtr recptr; - Buffer tempbuffer = InvalidBuffer; - int ntodelete = 0; - - numbuffer = 1; - buffers[0] = ReadBuffer(index, insert->path[i]); - LockBuffer(buffers[0], GIST_EXCLUSIVE); - - /* - * we check buffer, because we restored page earlier - */ - gistcheckpage(index, buffers[0]); - - pages[0] = BufferGetPage(buffers[0]); - Assert(!GistPageIsLeaf(pages[0])); - - pituplen = PageGetMaxOffsetNumber(pages[0]); - - /* 
find remove old IndexTuples to remove */ - for (j = 0; j < pituplen && ntodelete < lenitup; j++) - { - BlockNumber blkno; - ItemId iid = PageGetItemId(pages[0], j + FirstOffsetNumber); - IndexTuple idxtup = (IndexTuple) PageGetItem(pages[0], iid); - - blkno = ItemPointerGetBlockNumber(&(idxtup->t_tid)); - - for (k = 0; k < lenitup; k++) - if (ItemPointerGetBlockNumber(&(itup[k]->t_tid)) == blkno) - { - todelete[ntodelete] = j + FirstOffsetNumber - ntodelete; - ntodelete++; - break; - } - } - - if (ntodelete == 0) - elog(PANIC, "gistContinueInsert: cannot find pointer to page(s)"); - - /* - * we check space with subtraction only first tuple to delete, - * hope, that wiil be enough space.... - */ - - if (gistnospace(pages[0], itup, lenitup, *todelete, 0)) - { - - /* no space left on page, so we must split */ - buffers[numbuffer] = ReadBuffer(index, P_NEW); - LockBuffer(buffers[numbuffer], GIST_EXCLUSIVE); - GISTInitBuffer(buffers[numbuffer], 0); - pages[numbuffer] = BufferGetPage(buffers[numbuffer]); - gistfillbuffer(pages[numbuffer], itup, lenitup, FirstOffsetNumber); - numbuffer++; - - if (BufferGetBlockNumber(buffers[0]) == GIST_ROOT_BLKNO) - { - Buffer tmp; - - /* - * we split root, just copy content from root to new page - */ - - /* sanity check */ - if (i + 1 != insert->pathlen) - elog(PANIC, "unexpected pathlen in index \"%s\"", - RelationGetRelationName(index)); - - /* fill new page, root will be changed later */ - tempbuffer = ReadBuffer(index, P_NEW); - LockBuffer(tempbuffer, GIST_EXCLUSIVE); - memcpy(BufferGetPage(tempbuffer), pages[0], BufferGetPageSize(tempbuffer)); - - /* swap buffers[0] (was root) and temp buffer */ - tmp = buffers[0]; - buffers[0] = tempbuffer; - tempbuffer = tmp; /* now in tempbuffer GIST_ROOT_BLKNO, - * it is still unchanged */ - - pages[0] = BufferGetPage(buffers[0]); - } - - START_CRIT_SECTION(); - - for (j = 0; j < ntodelete; j++) - PageIndexTupleDelete(pages[0], todelete[j]); - - xlinfo = XLOG_GIST_PAGE_SPLIT; - rdata = 
formSplitRdata(index->rd_node, insert->path[i], - false, &(insert->key), - gistMakePageLayout(buffers, numbuffer)); - - } - else - { - START_CRIT_SECTION(); - - for (j = 0; j < ntodelete; j++) - PageIndexTupleDelete(pages[0], todelete[j]); - gistfillbuffer(pages[0], itup, lenitup, InvalidOffsetNumber); - - xlinfo = XLOG_GIST_PAGE_UPDATE; - rdata = formUpdateRdata(index->rd_node, buffers[0], - todelete, ntodelete, - itup, lenitup, &(insert->key)); - } - - /* - * use insert->key as mark for completion of insert (form*Rdata() - * above) for following possible replays - */ - - /* write pages, we should mark it dirty befor XLogInsert() */ - for (j = 0; j < numbuffer; j++) - { - GistPageGetOpaque(pages[j])->rightlink = InvalidBlockNumber; - MarkBufferDirty(buffers[j]); - } - recptr = XLogInsert(RM_GIST_ID, xlinfo, rdata); - for (j = 0; j < numbuffer; j++) - { - PageSetLSN(pages[j], recptr); - PageSetTLI(pages[j], ThisTimeLineID); - } - - END_CRIT_SECTION(); - - lenitup = numbuffer; - for (j = 0; j < numbuffer; j++) - { - itup[j] = gist_form_invalid_tuple(BufferGetBlockNumber(buffers[j])); - UnlockReleaseBuffer(buffers[j]); - } - - if (tempbuffer != InvalidBuffer) - { - /* - * it was a root split, so fill it by new values - */ - gistnewroot(index, tempbuffer, itup, lenitup, &(insert->key)); - UnlockReleaseBuffer(tempbuffer); - } - } - } - - FreeFakeRelcacheEntry(index); - - ereport(LOG, - (errmsg("index %u/%u/%u needs VACUUM FULL or REINDEX to finish crash recovery", - insert->node.spcNode, insert->node.dbNode, insert->node.relNode), - errdetail("Incomplete insertion detected during crash replay."))); -} - void gist_xlog_startup(void) { - incomplete_inserts = NIL; - insertCtx = AllocSetContextCreate(CurrentMemoryContext, - "GiST recovery temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); opCtx = createTempGistContext(); } void gist_xlog_cleanup(void) { - ListCell *l; - MemoryContext oldCxt; - - oldCxt = 
MemoryContextSwitchTo(opCtx); - - foreach(l, incomplete_inserts) - { - gistIncompleteInsert *insert = (gistIncompleteInsert *) lfirst(l); - - gistContinueInsert(insert); - MemoryContextReset(opCtx); - } - MemoryContextSwitchTo(oldCxt); - MemoryContextDelete(opCtx); - MemoryContextDelete(insertCtx); -} - -bool -gist_safe_restartpoint(void) -{ - if (incomplete_inserts) - return false; - return true; } - -XLogRecData * -formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf, - ItemPointer key, SplitedPageLayout *dist) +/* + * Write WAL record of a page split. + */ +XLogRecPtr +gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, + SplitedPageLayout *dist, + BlockNumber origrlink, GistNSN orignsn, + Buffer leftchildbuf) { XLogRecData *rdata; - gistxlogPageSplit *xlrec = (gistxlogPageSplit *) palloc(sizeof(gistxlogPageSplit)); + gistxlogPageSplit xlrec; SplitedPageLayout *ptr; int npage = 0, - cur = 1; + cur; + XLogRecPtr recptr; - ptr = dist; - while (ptr) - { + for (ptr = dist; ptr; ptr = ptr->next) npage++; - ptr = ptr->next; - } rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (npage * 2 + 2)); - xlrec->node = node; - xlrec->origblkno = blkno; - xlrec->origleaf = page_is_leaf; - xlrec->npage = (uint16) npage; - if (key) - xlrec->key = *key; - else - ItemPointerSetInvalid(&(xlrec->key)); + xlrec.node = node; + xlrec.origblkno = blkno; + xlrec.origrlink = origrlink; + xlrec.orignsn = orignsn; + xlrec.origleaf = page_is_leaf; + xlrec.npage = (uint16) npage; + xlrec.leftchild = + BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber; - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) xlrec; + rdata[0].data = (char *) &xlrec; rdata[0].len = sizeof(gistxlogPageSplit); - rdata[0].next = NULL; + rdata[0].buffer = InvalidBuffer; + + cur = 1; - ptr = dist; - while (ptr) + /* + * Include a full page image of the child buf. 
(only necessary if a + * checkpoint happened since the child page was split) + */ + if (BufferIsValid(leftchildbuf)) { + rdata[cur - 1].next = &(rdata[cur]); + rdata[cur].data = NULL; + rdata[cur].len = 0; + rdata[cur].buffer = leftchildbuf; + rdata[cur].buffer_std = true; + cur++; + } + + for (ptr = dist; ptr; ptr = ptr->next) + { + rdata[cur - 1].next = &(rdata[cur]); rdata[cur].buffer = InvalidBuffer; rdata[cur].data = (char *) &(ptr->block); rdata[cur].len = sizeof(gistxlogPage); - rdata[cur - 1].next = &(rdata[cur]); cur++; + rdata[cur - 1].next = &(rdata[cur]); rdata[cur].buffer = InvalidBuffer; rdata[cur].data = (char *) (ptr->list); rdata[cur].len = ptr->lenlist; - rdata[cur - 1].next = &(rdata[cur]); - rdata[cur].next = NULL; cur++; - ptr = ptr->next; } + rdata[cur - 1].next = NULL; + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata); - return rdata; + pfree(rdata); + return recptr; } /* - * Construct the rdata array for an XLOG record describing a page update - * (deletion and/or insertion of tuples on a single index page). + * Write XLOG record describing a page update. The update can include any + * number of deletions and/or insertions of tuples on a single index page. + * + * If this update inserts a downlink for a split page, also record that + * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set. * * Note that both the todelete array and the tuples are marked as belonging * to the target buffer; they need not be stored in XLOG if XLogInsert decides @@ -911,27 +492,26 @@ formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf, * at least one rdata item referencing the buffer, even when ntodelete and * ituplen are both zero; this ensures that XLogInsert knows about the buffer. 
*/ -XLogRecData * -formUpdateRdata(RelFileNode node, Buffer buffer, - OffsetNumber *todelete, int ntodelete, - IndexTuple *itup, int ituplen, ItemPointer key) +XLogRecPtr +gistXLogUpdate(RelFileNode node, Buffer buffer, + OffsetNumber *todelete, int ntodelete, + IndexTuple *itup, int ituplen, + Buffer leftchildbuf) { XLogRecData *rdata; gistxlogPageUpdate *xlrec; int cur, i; + XLogRecPtr recptr; - rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (3 + ituplen)); + rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (4 + ituplen)); xlrec = (gistxlogPageUpdate *) palloc(sizeof(gistxlogPageUpdate)); xlrec->node = node; xlrec->blkno = BufferGetBlockNumber(buffer); xlrec->ntodelete = ntodelete; - - if (key) - xlrec->key = *key; - else - ItemPointerSetInvalid(&(xlrec->key)); + xlrec->leftchild = + BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber; rdata[0].buffer = buffer; rdata[0].buffer_std = true; @@ -945,13 +525,13 @@ formUpdateRdata(RelFileNode node, Buffer buffer, rdata[1].next = &(rdata[2]); rdata[2].data = (char *) todelete; - rdata[2].len = MAXALIGN(sizeof(OffsetNumber) * ntodelete); + rdata[2].len = sizeof(OffsetNumber) * ntodelete; rdata[2].buffer = buffer; rdata[2].buffer_std = true; - rdata[2].next = NULL; - /* new tuples */ cur = 3; + + /* new tuples */ for (i = 0; i < ituplen; i++) { rdata[cur - 1].next = &(rdata[cur]); @@ -959,38 +539,26 @@ formUpdateRdata(RelFileNode node, Buffer buffer, rdata[cur].len = IndexTupleSize(itup[i]); rdata[cur].buffer = buffer; rdata[cur].buffer_std = true; - rdata[cur].next = NULL; cur++; } - return rdata; -} - -XLogRecPtr -gistxlogInsertCompletion(RelFileNode node, ItemPointerData *keys, int len) -{ - gistxlogInsertComplete xlrec; - XLogRecData rdata[2]; - XLogRecPtr recptr; - - Assert(len > 0); - xlrec.node = node; - - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(gistxlogInsertComplete); - rdata[0].next = &(rdata[1]); - - rdata[1].buffer 
= InvalidBuffer; - rdata[1].data = (char *) keys; - rdata[1].len = sizeof(ItemPointerData) * len; - rdata[1].next = NULL; - - START_CRIT_SECTION(); - - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_INSERT_COMPLETE, rdata); + /* + * Include a full page image of the child buf. (only necessary if a + * checkpoint happened since the child page was split) + */ + if (BufferIsValid(leftchildbuf)) + { + rdata[cur - 1].next = &(rdata[cur]); + rdata[cur].data = NULL; + rdata[cur].len = 0; + rdata[cur].buffer = leftchildbuf; + rdata[cur].buffer_std = true; + cur++; + } + rdata[cur - 1].next = NULL; - END_CRIT_SECTION(); + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); + pfree(rdata); return recptr; } |