10 files changed, 1876 insertions, 1728 deletions
diff --git a/src/backend/access/gist/Makefile b/src/backend/access/gist/Makefile
index 298e9309f5..f8051a2b45 100644
--- a/src/backend/access/gist/Makefile
+++ b/src/backend/access/gist/Makefile
@@ -4,7 +4,7 @@
 #    Makefile for access/gist
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/access/gist/Makefile,v 1.18 2008/02/19 10:30:06 petere Exp $
+#    src/backend/access/gist/Makefile
 #
 #-------------------------------------------------------------------------
 
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 6c90e508bf..2d78dcb0df 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/gist/README,v 1.5 2010/04/14 20:17:26 rhaas Exp $
+src/backend/access/gist/README
 
 GiST Indexing
 =============
@@ -20,33 +20,34 @@ The current implementation of GiST supports:
 
   * Variable length keys
   * Composite keys (multi-key)
+  * Ordered search (nearest-neighbor search)
   * provides NULL-safe interface to GiST core
   * Concurrency
   * Recovery support via WAL logging
 
-The support for concurrency implemented in PostgreSQL was developed based on 
-the paper "Access Methods for Next-Generation Database Systems" by 
+The support for concurrency implemented in PostgreSQL was developed based on
+the paper "Access Methods for Next-Generation Database Systems" by
 Marcel Kornaker:
 
     http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
 
 The original algorithms were modified in several ways:
 
-* They should be adapted to PostgreSQL conventions. For example, the SEARCH 
-  algorithm was considerably changed, because in PostgreSQL function search 
-  should return one tuple (next), not all tuples at once. Also, it should 
+* They had to be adapted to PostgreSQL conventions. For example, the SEARCH
+  algorithm was considerably changed, because in PostgreSQL the search function
+  should return one tuple (next), not all tuples at once. Also, it should
   release page locks between calls.
-* Since we added support for variable length keys, it's not possible to 
-  guarantee enough free space for all keys on pages after splitting. User 
-  defined function picksplit doesn't have information about size of tuples 
+* Since we added support for variable length keys, it's not possible to
+  guarantee enough free space for all keys on pages after splitting. User
+  defined function picksplit doesn't have information about size of tuples
   (each tuple may contain several keys as in multicolumn index while picksplit
   could work with only one key) and pages.
-* We modified original INSERT algorithm for performance reason. In particular,
+* We modified the original INSERT algorithm for performance reasons. In particular,
   it is now a single-pass algorithm.
 * Since the papers were theoretical, some details were omitted and we
-  have to find out ourself how to solve some specific problems.
+  had to find out ourselves how to solve some specific problems.
 
-Because of the above reasons, we have to revised interaction of GiST
+Because of the above reasons, we have revised the interaction of GiST
 core and PostgreSQL WAL system. Moreover, we encountered (and solved)
 a problem of uncompleted insertions when recovering after crash, which
 was not touched in the paper.
@@ -54,96 +55,127 @@ was not touched in the paper.
 Search Algorithm
 ----------------
 
-Function gettuple finds a tuple which satisfies the search
-predicate. It store their state and returns next tuple under
-subsequent calls. Stack contains page, its LSN and LSN of parent page
-and currentposition is saved between calls.
+The search code maintains a queue of unvisited items, where an "item" is
+either a heap tuple known to satisfy the search conditions, or an index
+page that is consistent with the search conditions according to inspection
+of its parent page's downlink item.  Initially the root page is searched
+to find unvisited items in it.  Then we pull items from the queue.  A
+heap tuple pointer is just returned immediately; an index page entry
+causes that page to be searched, generating more queue entries.
 
-gettuple(search-pred)
-	if ( firsttime )
-		push(stack, [root, 0, 0]) // page, LSN, parentLSN
-		currentposition=0
-	end
-	ptr = top of stack
-	while(true)
-		latch( ptr->page, S-mode )
-		if ( ptr->page->lsn != ptr->lsn ) 
-			ptr->lsn = ptr->page->lsn
-			currentposition=0
-			if ( ptr->parentlsn < ptr->page->nsn )
-				add to stack rightlink
-		else
-			currentposition++
-		end
+The queue is kept ordered with heap tuple items at the front, then
+index page entries, with any newly-added index page entry inserted
+before existing index page entries.  This ensures depth-first traversal
+of the index, and in particular causes the first few heap tuples to be
+returned as soon as possible.  That is helpful in case there is a LIMIT
+that requires only a few tuples to be produced.
 
-		while(true)
-			currentposition = find_first_match( currentposition )
-			if ( currentposition is invalid )
-				unlatch( ptr->page )
-				pop stack
-				ptr = top of stack
-				if (ptr is NULL)
-					return NULL
-				break loop
-			else if ( ptr->page is leaf )
-				unlatch( ptr->page )
-				return tuple
-			else 
-				add to stack child page
-			end
-			currentposition++
-		end
-	end
+To implement nearest-neighbor search, the queue entries are augmented
+with distance data: heap tuple entries are labeled with exact distance
+from the search argument, while index-page entries must be labeled with
+the minimum distance that any of their children could have.  Then,
+queue entries are retrieved in smallest-distance-first order, with
+entries having identical distances managed as stated in the previous
+paragraph.
+
+The search algorithm keeps an index page locked only long enough to scan
+its entries and queue those that satisfy the search conditions.  Since
+insertions can occur concurrently with searches, it is possible for an
+index child page to be split between the time we make a queue entry for it
+(while visiting its parent page) and the time we actually reach and scan
+the child page.  To avoid missing the entries that were moved to the right
+sibling, we detect whether a split has occurred by comparing the child
+page's NSN to the LSN that the parent had when visited.  If it did, the
+sibling page is immediately added to the front of the queue, ensuring that
+its items will be scanned in the same order as if they were still on the
+original child page.
+
+As is usual in Postgres, the search algorithm only guarantees to find index
+entries that existed before the scan started; index entries added during
+the scan might or might not be visited.  This is okay as long as all
+searches use MVCC snapshot rules to reject heap tuples newer than the time
+of scan start.  In particular, this means that we need not worry about
+cases where a parent page's downlink key is "enlarged" after we look at it.
+Any such enlargement would be to add child items that we aren't interested
+in returning anyway.
 
 
 Insert Algorithm
 ----------------
 
-INSERT guarantees that the GiST tree remains balanced. User defined key method 
-Penalty is used for choosing a subtree to insert; method PickSplit is used for 
-the node splitting algorithm; method Union is used for propagating changes 
+INSERT guarantees that the GiST tree remains balanced. User defined key method
+Penalty is used for choosing a subtree to insert; method PickSplit is used for
+the node splitting algorithm; method Union is used for propagating changes
 upward to maintain the tree properties.
 
-NOTICE: We modified original INSERT algorithm for performance reason. In 
-particularly, it is now a single-pass algorithm.
+To insert a tuple, we first have to find a suitable leaf page to insert to.
+The algorithm walks down the tree, starting from the root, along the path
+of smallest Penalty. At each step:
 
-Function findLeaf is used to identify subtree for insertion. Page, in which 
-insertion is proceeded, is locked as well as its parent page. Functions 
-findParent and findPath are used to find parent pages, which could be changed 
-because of concurrent access. Function pageSplit is recurrent and could split 
-page by more than 2 pages, which could be necessary if keys have different 
-lengths or more than one key are inserted (in such situation, user defined 
-function pickSplit cannot guarantee free space on page).
+1. Has this page been split since we looked at the parent? If so, it's
+possible that we should be inserting to the other half instead, so retreat
+back to the parent.
+2. If this is a leaf node, we've found our target node.
+3. Otherwise use Penalty to pick a new target subtree.
+4. Check the key representing the target subtree. If it doesn't already cover
+the key we're inserting, replace it with the Union of the old downlink key
+and the key being inserted. (Actually, we always call Union, and just skip
+the replacement if the Unioned key is the same as the existing key.)
+5. Replacing the key in step 4 might cause the page to be split. In that case,
+propagate the change upwards and restart the algorithm from the first parent
+that didn't need to be split.
+6. Walk down to the target subtree, and goto 1.
+
+This differs from the insertion algorithm in the original paper. In the
+original paper, you first walk down the tree until you reach a leaf page, and
+then you adjust the downlink in the parent, and propagate the adjustment up,
+all the way up to the root in the worst case. But we adjust the downlinks to
+cover the new key already when we walk down, so that when we reach the leaf
+page, we don't need to update the parents anymore, except to insert the
+downlinks if we have to split the page. This makes crash recovery simpler:
+after inserting a key to the page, the tree is immediately self-consistent
+without having to update the parents. Even if we split a page and crash before
+inserting the downlink to the parent, the tree is self-consistent because the
+right half of the split is accessible via the rightlink of the left page
+(which replaced the original page).
+
+Note that the algorithm can walk up and down the tree before reaching a leaf
+page, if internal pages need to split while adjusting the downlinks for the
+new key. Eventually, you should reach the bottom, and proceed with the
+insertion of the new tuple.
+
+Once we've found the target page to insert to, we check if there's room
+for the new tuple. If there is, the tuple is inserted, and we're done.
+If it doesn't fit, however, the page needs to be split. Note that it is
+possible that a page needs to be split into more than two pages, if keys have
+different lengths or more than one key is being inserted at a time (which can
+happen when inserting downlinks for a page split that resulted in more than
+two pages at the lower level). After splitting a page, the parent page needs
+to be updated. The downlink for the new page needs to be inserted, and the
+downlink for the old page, which became the left half of the split, needs to
+be updated to only cover those tuples that stayed on the left page. Inserting
+the downlink in the parent can again lead to a page split, recursing up to the
+root page in the worst case.
+
+gistplacetopage is the workhorse function that performs one step of the
+insertion. If the tuple fits, it inserts it to the given page, otherwise
+it splits the page, and constructs the new downlink tuples for the split
+pages. The caller must then call gistplacetopage() on the parent page to
+insert the downlink tuples. The parent page that holds the downlink to
+the child might have migrated as a result of concurrent splits of the
+parent; gistFindCorrectParent() is used to find the parent page.
+
+Splitting the root page works slightly differently. At root split,
+gistplacetopage() allocates the new child pages and replaces the old root
+page with the new root containing downlinks to the new children, all in one
+operation.
 
-findLeaf(new-key)
-	push(stack, [root, 0]) //page, LSN
-	while(true)
-		ptr = top of stack
-		latch( ptr->page, S-mode )
-		ptr->lsn = ptr->page->lsn
-		if ( exists ptr->parent AND ptr->parent->lsn < ptr->page->nsn )
-			unlatch( ptr->page )
-			pop stack
-		else if ( ptr->page is not leaf )
-			push( stack, [get_best_child(ptr->page, new-key), 0] )
-			unlatch( ptr->page )
-		else
-			unlatch( ptr->page )
-			latch( ptr->page, X-mode )
-			if ( ptr->page is not leaf )
-				//the only root page can become a non-leaf
-				unlatch( ptr->page )
-			else if ( ptr->parent->lsn < ptr->page->nsn )
-				unlatch( ptr->page )
-				pop stack
-			else
-				return stack
-			end
-		end
-	end
+
+findPath is a subroutine of findParent, used when the correct parent page
+can't be found by following the rightlinks at the parent level:
 
 findPath( stack item )
-	push stack, [root, 0, 0] // page, LSN, parent 
+	push stack, [root, 0, 0] // page, LSN, parent
 	while( stack )
 		ptr = top of stack
 		latch( ptr->page, S-mode )
@@ -152,7 +184,7 @@ findPath( stack item )
 		end
 		for( each tuple on page )
 			if ( tuple->pagepointer == item->page )
-				return stack	
+				return stack
 			else
 				add to stack at the end [tuple->pagepointer,0, ptr]
 			end
@@ -160,12 +192,16 @@ findPath( stack item )
 		unlatch( ptr->page )
 		pop stack
 	end
-	
+
+
+gistFindCorrectParent is used to re-find the parent of a page during
+insertion. The parent might have migrated to the right since we traversed
+down the tree because of page splits.
+
 findParent( stack item )
 	parent = item->parent
-	latch( parent->page, X-mode )
 	if ( parent->page->lsn != parent->lsn )
-		while(true) 
+		while(true)
 			search parent tuple on parent->page, if found the return
 			rightlink = parent->page->rightlink
 			unlatch( parent->page )
@@ -177,9 +213,13 @@ findParent( stack item )
 		end
 		newstack = findPath( item->parent )
 		replace part of stack to new one
+		latch( parent->page, X-mode )
 		return findParent( item )
 	end
 
+The pageSplit function decides how to distribute keys to the new pages after
+a page split:
+
 pageSplit(page, allkeys)
 	(lkeys, rkeys) = pickSplit( allkeys )
 	if ( page is root )
@@ -200,40 +240,45 @@ pageSplit(page, allkeys)
 	return newkeys
 
 
-placetopage(page, keysarray)
-	if ( no space left on page )
-		keysarray = pageSplit(page, [ extract_keys(page), keysarray])
-		last page in chain gets old NSN,
-		original and others - new NSN equals to LSN
-		if ( page is root )
-			make new root with keysarray
-		end
-	else
-		put keysarray on page
-		if ( length of keysarray > 1 )
-			keysarray = [ union(keysarray) ]
-		end
-	end
-	
-insert(new-key)
-	stack = findLeaf(new-key)
-	keysarray = [new-key]
-	ptr = top of stack
-	while(true)
-		findParent( ptr ) //findParent latches parent page
-		keysarray = placetopage(ptr->page, keysarray)
-		unlatch( ptr->page )
-		pop stack;
-		ptr = top of stack
-		if (length of keysarray == 1)
-			newboundingkey = union(oldboundingkey, keysarray)
-			if (newboundingkey == oldboundingkey)
-				unlatch ptr->page
-				break loop
-			end
-		end
-	end
+
+Concurrency control
+-------------------
+As a rule of thumb, if you need to hold a lock on multiple pages at the
+same time, the locks should be acquired in the following order: child page
+before parent, and left-to-right at the same level. Always acquiring the
+locks in the same order avoids deadlocks.
+
+The search algorithm only looks at and locks one page at a time. Consequently
+there's a race condition between a search and a page split. A page split
+happens in two phases: 1. The page is split 2. The downlink is inserted to the
+parent. If a search looks at the parent page between those steps, before the
+downlink is inserted, it will still find the new right half by following the
+rightlink on the left half. But it must not follow the rightlink if it saw the
+downlink in the parent, or the page will be visited twice!
+
+A split initially marks the left page with the F_FOLLOW_RIGHT flag. If a scan
+sees that flag set, it knows that the right page is missing the downlink, and
+should be visited too. When split inserts the downlink to the parent, it
+clears the F_FOLLOW_RIGHT flag in the child, and sets the NSN field in the
+child page header to match the LSN of the insertion on the parent. If the
+F_FOLLOW_RIGHT flag is not set, a scan compares the NSN on the child and the
+LSN it saw in the parent. If LSN < NSN, the scan looked at the parent page
+before the downlink was inserted, so it should follow the rightlink. Otherwise
+the scan saw the downlink in the parent page, and will/did follow that as
+usual.
+
+A scan can't normally see a page with the F_FOLLOW_RIGHT flag set, because
+a page split keeps the child pages locked until the downlink has been inserted
+to the parent and the flag cleared again. But if a crash happens in the middle
+of a page split, before the downlinks are inserted into the parent, that will
+leave a page with F_FOLLOW_RIGHT in the tree. Scans handle that just fine,
+but we'll eventually want to fix that for performance reasons. And more
+importantly, dealing with pages with missing downlink pointers in the parent
+would complicate the insertion algorithm. So when an insertion sees a page
+with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
+crashed in the middle to completion by adding the downlink in the parent.
+
 
 Authors:
 	Teodor Sigaev	<[email protected]>
-	Oleg Bartunov   <[email protected]>
+	Oleg Bartunov	<[email protected]>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index cec08c7226..8227bfdb88 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -4,11 +4,11 @@
  *	  interface routines for the postgres GiST index access method.
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.158 2010/01/02 16:57:33 momjian Exp $
+ *	  src/backend/access/gist/gist.c
  *
  *-------------------------------------------------------------------------
  */
@@ -17,13 +17,12 @@
 #include "access/genam.h"
 #include "access/gist_private.h"
 #include "catalog/index.h"
+#include "catalog/pg_collation.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
 #include "utils/memutils.h"
 
-const XLogRecPtr XLogRecPtrForTemp = {1, 1};
-
 /* Working state for gistbuild and its callback */
 typedef struct
 {
@@ -33,6 +32,12 @@ typedef struct
 	MemoryContext tmpCtx;
 } GISTBuildState;
 
+/* A List of these represents a split-in-progress, one entry per resulting page. */
+typedef struct
+{
+	Buffer		buf;			/* the split page "half" */
+	IndexTuple	downlink;		/* downlink for this half. */
+} GISTPageSplitInfo;
 
 /* non-export function prototypes */
 static void gistbuildCallback(Relation index,
@@ -45,8 +50,13 @@ static void gistdoinsert(Relation r,
 			 IndexTuple itup,
 			 Size freespace,
 			 GISTSTATE *GISTstate);
-static void gistfindleaf(GISTInsertState *state,
-			 GISTSTATE *giststate);
+static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
+static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+				 GISTSTATE *giststate,
+				 IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+				 Buffer leftchild);
+static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
+				GISTSTATE *giststate, List *splitinfo);
 
 
 #define ROTATEDIST(d) do { \
@@ -117,7 +127,7 @@ gistbuild(PG_FUNCTION_ARGS)
 
 	MarkBufferDirty(buffer);
 
-	if (!index->rd_istemp)
+	if (RelationNeedsWAL(index))
 	{
 		XLogRecPtr	recptr;
 		XLogRecData rdata;
@@ -132,7 +142,7 @@ gistbuild(PG_FUNCTION_ARGS)
 		PageSetTLI(page, ThisTimeLineID);
 	}
 	else
-		PageSetLSN(page, XLogRecPtrForTemp);
+		PageSetLSN(page, GetXLogRecPtrForTemp());
 
 	UnlockReleaseBuffer(buffer);
 
@@ -210,6 +220,19 @@ gistbuildCallback(Relation index,
 }
 
 /*
+ *	gistbuildempty() -- empty-index build for the initialization fork; currently always errors out
+ */
+Datum
+gistbuildempty(PG_FUNCTION_ARGS)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("unlogged GiST indexes are not supported")));
+
+	PG_RETURN_VOID();
+}
+
+/*
  *	gistinsert -- wrapper for GiST tuple insertion.
  *
  *	  This is the public interface routine for tuple insertion in GiSTs.
@@ -253,41 +276,52 @@ gistinsert(PG_FUNCTION_ARGS)
 
 
 /*
- * Workhouse routine for doing insertion into a GiST index. Note that
- * this routine assumes it is invoked in a short-lived memory context,
- * so it does not bother releasing palloc'd allocations.
+ * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
+ * at that offset is atomically removed along with inserting the new tuples.
+ * This is used to replace a tuple with a new one.
+ *
+ * If 'leftchildbuf' is valid, we're inserting the downlink for the page
+ * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
+ * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
+ *
+ * If there is not enough room on the page, it is split. All the split
+ * pages are kept pinned and locked and returned in *splitinfo, the caller
+ * is responsible for inserting the downlinks for them. However, if
+ * 'buffer' is the root page and it needs to be split, gistplacetopage()
+ * performs the split as one atomic operation, and *splitinfo is set to NIL.
+ * In that case, we continue to hold the root page locked, and the child
+ * pages are released; note that new tuple(s) are *not* on the root page
+ * but in one of the new child pages.
  */
-static void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+static bool
+gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
+				Buffer buffer,
+				IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+				Buffer leftchildbuf,
+				List **splitinfo)
 {
-	GISTInsertState state;
-
-	memset(&state, 0, sizeof(GISTInsertState));
-
-	state.itup = (IndexTuple *) palloc(sizeof(IndexTuple));
-	state.itup[0] = (IndexTuple) palloc(IndexTupleSize(itup));
-	memcpy(state.itup[0], itup, IndexTupleSize(itup));
-	state.ituplen = 1;
-	state.freespace = freespace;
-	state.r = r;
-	state.key = itup->t_tid;
-	state.needInsertComplete = true;
+	Page		page = BufferGetPage(buffer);
+	bool		is_leaf = (GistPageIsLeaf(page)) ? true : false;
+	XLogRecPtr	recptr;
+	int			i;
+	bool		is_split;
 
-	state.stack = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
-	state.stack->blkno = GIST_ROOT_BLKNO;
+	/*
+	 * Refuse to modify a page that's incompletely split. This should not
+	 * happen because we finish any incomplete splits while we walk down the
+	 * tree. However, it's remotely possible that another concurrent inserter
+	 * splits a parent page, and errors out before completing the split. We
+	 * will just throw an error in that case, and leave any split we had in
+	 * progress unfinished too. The next insert that comes along will clean up
+	 * the mess.
+	 */
+	if (GistFollowRight(page))
+		elog(ERROR, "concurrent GiST page split was incomplete");
 
-	gistfindleaf(&state, giststate);
-	gistmakedeal(&state, giststate);
-}
-
-static bool
-gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
-{
-	bool		is_splitted = false;
-	bool		is_leaf = (GistPageIsLeaf(state->stack->page)) ? true : false;
+	*splitinfo = NIL;
 
 	/*
-	 * if (!is_leaf) remove old key: This node's key has been modified, either
+	 * if oldoffnum is valid, remove old key: This node's key has been modified, either
 	 * because a child split occurred or because we needed to adjust our key
 	 * for an insert in a child node. Therefore, remove the old version of
 	 * this node's key.
@@ -295,77 +329,136 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
 	 * for WAL replay, in the non-split case we handle this by setting up a
 	 * one-element todelete array; in the split case, it's handled implicitly
 	 * because the tuple vector passed to gistSplit won't include this tuple.
-	 *
-	 * XXX: If we want to change fillfactors between node and leaf, fillfactor
-	 * = (is_leaf ? state->leaf_fillfactor : state->node_fillfactor)
 	 */
-	if (gistnospace(state->stack->page, state->itup, state->ituplen,
-					is_leaf ? InvalidOffsetNumber : state->stack->childoffnum,
-					state->freespace))
+	is_split = gistnospace(page, itup, ntup, oldoffnum, state->freespace);
+	if (is_split)
 	{
 		/* no space for insertion */
 		IndexTuple *itvec;
 		int			tlen;
 		SplitedPageLayout *dist = NULL,
 				   *ptr;
-		BlockNumber rrlink = InvalidBlockNumber;
-		GistNSN		oldnsn;
+		BlockNumber oldrlink = InvalidBlockNumber;
+		GistNSN		oldnsn = {0, 0};
+		SplitedPageLayout rootpg;
+		BlockNumber blkno = BufferGetBlockNumber(buffer);
+		bool		is_rootsplit;
 
-		is_splitted = true;
+		is_rootsplit = (blkno == GIST_ROOT_BLKNO);
 
 		/*
-		 * Form index tuples vector to split: remove old tuple if t's needed
-		 * and add new tuples to vector
+		 * Form index tuples vector to split. If we're replacing an old tuple,
+		 * remove the old version from the vector.
 		 */
-		itvec = gistextractpage(state->stack->page, &tlen);
-		if (!is_leaf)
+		itvec = gistextractpage(page, &tlen);
+		if (OffsetNumberIsValid(oldoffnum))
 		{
 			/* on inner page we should remove old tuple */
-			int			pos = state->stack->childoffnum - FirstOffsetNumber;
+			int			pos = oldoffnum - FirstOffsetNumber;
 
 			tlen--;
 			if (pos != tlen)
 				memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
 		}
-		itvec = gistjoinvector(itvec, &tlen, state->itup, state->ituplen);
-		dist = gistSplit(state->r, state->stack->page, itvec, tlen, giststate);
+		itvec = gistjoinvector(itvec, &tlen, itup, ntup);
+		dist = gistSplit(state->r, page, itvec, tlen, giststate);
 
-		state->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * tlen);
-		state->ituplen = 0;
-
-		if (state->stack->blkno != GIST_ROOT_BLKNO)
+		/*
+		 * Set up pages to work with. Allocate new buffers for all but the
+		 * leftmost page. The original page becomes the new leftmost page, and
+		 * is just replaced with the new contents.
+		 *
+		 * For a root-split, allocate new buffers for all child pages, the
+		 * original page is overwritten with new root page containing
+		 * downlinks to the new child pages.
+		 */
+		ptr = dist;
+		if (!is_rootsplit)
 		{
-			/*
-			 * if non-root split then we should not allocate new buffer, but
-			 * we must create temporary page to operate
-			 */
-			dist->buffer = state->stack->buffer;
-			dist->page = PageGetTempPageCopySpecial(BufferGetPage(dist->buffer));
+			/* save old rightlink and NSN */
+			oldrlink = GistPageGetOpaque(page)->rightlink;
+			oldnsn = GistPageGetOpaque(page)->nsn;
+
+			dist->buffer = buffer;
+			dist->block.blkno = BufferGetBlockNumber(buffer);
+			dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer));
 
 			/* clean all flags except F_LEAF */
 			GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;
+
+			ptr = ptr->next;
+		}
+		for (; ptr; ptr = ptr->next)
+		{
+			/* Allocate new page */
+			ptr->buffer = gistNewBuffer(state->r);
+			GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
+			ptr->page = BufferGetPage(ptr->buffer);
+			ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
 		}
 
-		/* make new pages and fills them */
+		/*
+		 * Now that we know which blocks the new pages go to, set up downlink
+		 * tuples to point to them.
+		 */
 		for (ptr = dist; ptr; ptr = ptr->next)
 		{
+			ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
+			GistTupleSetValid(ptr->itup);
+		}
+
+		/*
+		 * If this is a root split, we construct the new root page with the
+		 * downlinks here directly, instead of requiring the caller to insert
+		 * them. Add the new root page to the list along with the child pages.
+		 */
+		if (is_rootsplit)
+		{
+			IndexTuple *downlinks;
+			int			ndownlinks = 0;
 			int			i;
-			char	   *data;
 
-			/* get new page */
-			if (ptr->buffer == InvalidBuffer)
+			rootpg.buffer = buffer;
+			rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer));
+			GistPageGetOpaque(rootpg.page)->flags = 0;
+
+			/* Prepare a vector of all the downlinks */
+			for (ptr = dist; ptr; ptr = ptr->next)
+				ndownlinks++;
+			downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
+			for (i = 0, ptr = dist; ptr; ptr = ptr->next)
+				downlinks[i++] = ptr->itup;
+
+			rootpg.block.blkno = GIST_ROOT_BLKNO;
+			rootpg.block.num = ndownlinks;
+			rootpg.list = gistfillitupvec(downlinks, ndownlinks,
+										  &(rootpg.lenlist));
+			rootpg.itup = NULL;
+
+			rootpg.next = dist;
+			dist = &rootpg;
+		}
+		else
+		{
+			/* Prepare split-info to be returned to caller */
+			for (ptr = dist; ptr; ptr = ptr->next)
 			{
-				ptr->buffer = gistNewBuffer(state->r);
-				GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
-				ptr->page = BufferGetPage(ptr->buffer);
+				GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
+
+				si->buf = ptr->buffer;
+				si->downlink = ptr->itup;
+				*splitinfo = lappend(*splitinfo, si);
 			}
-			ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
+		}
+
+		/*
+		 * Fill all pages. All the pages are new, ie. freshly allocated empty
+		 * pages, or a temporary copy of the old page.
+		 */
+		for (ptr = dist; ptr; ptr = ptr->next)
+		{
+			char	   *data = (char *) (ptr->list);
 
-			/*
-			 * fill page, we can do it because all these pages are new (ie not
-			 * linked in tree or masked by temp page
-			 */
-			data = (char *) (ptr->list);
 			for (i = 0; i < ptr->block.num; i++)
 			{
 				if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
@@ -373,276 +466,388 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
 				data += IndexTupleSize((IndexTuple) data);
 			}
 
-			/* set up ItemPointer and remember it for parent */
-			ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
-			state->itup[state->ituplen] = ptr->itup;
-			state->ituplen++;
-		}
+			/* Set up rightlinks */
+			if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
+				GistPageGetOpaque(ptr->page)->rightlink =
+					ptr->next->block.blkno;
+			else
+				GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
 
-		/* saves old rightlink */
-		if (state->stack->blkno != GIST_ROOT_BLKNO)
-			rrlink = GistPageGetOpaque(dist->page)->rightlink;
+			if (ptr->next && !is_rootsplit)
+				GistMarkFollowRight(ptr->page);
+			else
+				GistClearFollowRight(ptr->page);
+
+			/*
+			 * Copy the NSN of the original page to all pages. The
+			 * F_FOLLOW_RIGHT flags ensure that scans will follow the
+			 * rightlinks until the downlinks are inserted.
+			 */
+			GistPageGetOpaque(ptr->page)->nsn = oldnsn;
+		}
 
 		START_CRIT_SECTION();
 
 		/*
-		 * must mark buffers dirty before XLogInsert, even though we'll still
-		 * be changing their opaque fields below. set up right links.
+		 * Must mark buffers dirty before XLogInsert, even though we'll still
+		 * be changing their opaque fields below.
 		 */
 		for (ptr = dist; ptr; ptr = ptr->next)
-		{
 			MarkBufferDirty(ptr->buffer);
-			GistPageGetOpaque(ptr->page)->rightlink = (ptr->next) ?
-				ptr->next->block.blkno : rrlink;
-		}
-
-		/* restore splitted non-root page */
-		if (state->stack->blkno != GIST_ROOT_BLKNO)
-		{
-			PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
-			dist->page = BufferGetPage(dist->buffer);
-		}
-
-		if (!state->r->rd_istemp)
-		{
-			XLogRecPtr	recptr;
-			XLogRecData *rdata;
-
-			rdata = formSplitRdata(state->r->rd_node, state->stack->blkno,
-								   is_leaf, &(state->key), dist);
+		if (BufferIsValid(leftchildbuf))
+			MarkBufferDirty(leftchildbuf);
 
-			recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata);
+		/*
+		 * The first page in the chain was a temporary working copy meant to
+		 * replace the old page. Copy it over the old page.
+		 */
+		PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
+		dist->page = BufferGetPage(dist->buffer);
 
-			for (ptr = dist; ptr; ptr = ptr->next)
-			{
-				PageSetLSN(ptr->page, recptr);
-				PageSetTLI(ptr->page, ThisTimeLineID);
-			}
-		}
+		/* Write the WAL record */
+		if (RelationNeedsWAL(state->r))
+			recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
+								   dist, oldrlink, oldnsn, leftchildbuf);
 		else
-		{
-			for (ptr = dist; ptr; ptr = ptr->next)
-			{
-				PageSetLSN(ptr->page, XLogRecPtrForTemp);
-			}
-		}
-
-		/* set up NSN */
-		oldnsn = GistPageGetOpaque(dist->page)->nsn;
-		if (state->stack->blkno == GIST_ROOT_BLKNO)
-			/* if root split we should put initial value */
-			oldnsn = PageGetLSN(dist->page);
+			recptr = GetXLogRecPtrForTemp();
 
 		for (ptr = dist; ptr; ptr = ptr->next)
 		{
-			/* only for last set oldnsn */
-			GistPageGetOpaque(ptr->page)->nsn = (ptr->next) ?
-				PageGetLSN(ptr->page) : oldnsn;
+			PageSetLSN(ptr->page, recptr);
+			PageSetTLI(ptr->page, ThisTimeLineID);
 		}
 
 		/*
-		 * release buffers, if it was a root split then release all buffers
-		 * because we create all buffers
+		 * Return the new child buffers to the caller.
+		 *
+		 * If this was a root split, we've already inserted the downlink
+		 * pointers, in the form of a new root page. Therefore we can release
+		 * all the new buffers, and keep just the root page locked.
 		 */
-		ptr = (state->stack->blkno == GIST_ROOT_BLKNO) ? dist : dist->next;
-		for (; ptr; ptr = ptr->next)
-			UnlockReleaseBuffer(ptr->buffer);
-
-		if (state->stack->blkno == GIST_ROOT_BLKNO)
+		if (is_rootsplit)
 		{
-			gistnewroot(state->r, state->stack->buffer, state->itup, state->ituplen, &(state->key));
-			state->needInsertComplete = false;
+			for (ptr = dist->next; ptr; ptr = ptr->next)
+				UnlockReleaseBuffer(ptr->buffer);
 		}
-
-		END_CRIT_SECTION();
 	}
 	else
 	{
-		/* enough space */
+		/*
+		 * Enough space. We also get here if ntuples==0.
+		 */
 		START_CRIT_SECTION();
 
-		if (!is_leaf)
-			PageIndexTupleDelete(state->stack->page, state->stack->childoffnum);
-		gistfillbuffer(state->stack->page, state->itup, state->ituplen, InvalidOffsetNumber);
+		if (OffsetNumberIsValid(oldoffnum))
+			PageIndexTupleDelete(page, oldoffnum);
+		gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
 
-		MarkBufferDirty(state->stack->buffer);
+		MarkBufferDirty(buffer);
 
-		if (!state->r->rd_istemp)
+		if (BufferIsValid(leftchildbuf))
+			MarkBufferDirty(leftchildbuf);
+
+		if (RelationNeedsWAL(state->r))
 		{
-			OffsetNumber noffs = 0,
-						offs[1];
-			XLogRecPtr	recptr;
-			XLogRecData *rdata;
+			OffsetNumber ndeloffs = 0,
+						deloffs[1];
 
-			if (!is_leaf)
+			if (OffsetNumberIsValid(oldoffnum))
 			{
-				/* only on inner page we should delete previous version */
-				offs[0] = state->stack->childoffnum;
-				noffs = 1;
+				deloffs[0] = oldoffnum;
+				ndeloffs = 1;
 			}
 
-			rdata = formUpdateRdata(state->r->rd_node, state->stack->buffer,
-									offs, noffs,
-									state->itup, state->ituplen,
-									&(state->key));
+			recptr = gistXLogUpdate(state->r->rd_node, buffer,
+									deloffs, ndeloffs, itup, ntup,
+									leftchildbuf);
 
-			recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
-			PageSetLSN(state->stack->page, recptr);
-			PageSetTLI(state->stack->page, ThisTimeLineID);
+			PageSetLSN(page, recptr);
+			PageSetTLI(page, ThisTimeLineID);
 		}
 		else
-			PageSetLSN(state->stack->page, XLogRecPtrForTemp);
-
-		if (state->stack->blkno == GIST_ROOT_BLKNO)
-			state->needInsertComplete = false;
+		{
+			recptr = GetXLogRecPtrForTemp();
+			PageSetLSN(page, recptr);
+		}
 
-		END_CRIT_SECTION();
+		*splitinfo = NIL;
+	}
 
-		if (state->ituplen > 1)
-		{						/* previous is_splitted==true */
+	/*
+	 * If we inserted the downlink for a child page, set NSN and clear
+	 * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to
+	 * follow the rightlink if and only if they looked at the parent page
+	 * before we inserted the downlink.
+	 *
+	 * Note that we do this *after* writing the WAL record. That means that
+	 * the possible full page image in the WAL record does not include these
+	 * changes, and they must be replayed even if the page is restored from
+	 * the full page image. There's a chicken-and-egg problem: if we updated
+	 * the child pages first, we wouldn't know the recptr of the WAL record
+	 * we're about to write.
+	 */
+	if (BufferIsValid(leftchildbuf))
+	{
+		Page		leftpg = BufferGetPage(leftchildbuf);
 
-			/*
-			 * child was splited, so we must form union for insertion in
-			 * parent
-			 */
-			IndexTuple	newtup = gistunion(state->r, state->itup, state->ituplen, giststate);
+		GistPageGetOpaque(leftpg)->nsn = recptr;
+		GistClearFollowRight(leftpg);
 
-			ItemPointerSetBlockNumber(&(newtup->t_tid), state->stack->blkno);
-			state->itup[0] = newtup;
-			state->ituplen = 1;
-		}
-		else if (is_leaf)
-		{
-			/*
-			 * itup[0] store key to adjust parent, we set it to valid to
-			 * correct check by GistTupleIsInvalid macro in gistgetadjusted()
-			 */
-			ItemPointerSetBlockNumber(&(state->itup[0]->t_tid), state->stack->blkno);
-			GistTupleSetValid(state->itup[0]);
-		}
+		PageSetLSN(leftpg, recptr);
+		PageSetTLI(leftpg, ThisTimeLineID);
 	}
-	return is_splitted;
+
+	END_CRIT_SECTION();
+
+	return is_split;
 }
 
 /*
- * returns stack of pages, all pages in stack are pinned, and
- * leaf is X-locked
+ * Workhorse routine for doing insertion into a GiST index. Note that
+ * this routine assumes it is invoked in a short-lived memory context,
+ * so it does not bother releasing palloc'd allocations.
  */
-
 static void
-gistfindleaf(GISTInsertState *state, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 {
 	ItemId		iid;
 	IndexTuple	idxtuple;
-	GISTPageOpaque opaque;
+	GISTInsertStack firststack;
+	GISTInsertStack *stack;
+	GISTInsertState state;
+	bool		xlocked = false;
+
+	memset(&state, 0, sizeof(GISTInsertState));
+	state.freespace = freespace;
+	state.r = r;
+
+	/* Start from the root */
+	firststack.blkno = GIST_ROOT_BLKNO;
+	firststack.lsn.xrecoff = 0;
+	firststack.parent = NULL;
+	state.stack = stack = &firststack;
 
 	/*
-	 * walk down, We don't lock page for a long time, but so we should be
-	 * ready to recheck path in a bad case... We remember, that page->lsn
-	 * should never be invalid.
+	 * Walk down along the path of smallest penalty, updating the parent
+	 * pointers with the key we're inserting as we go. If we crash in the
+	 * middle, the tree is consistent, although the possible parent updates
+	 * were a waste.
 	 */
 	for (;;)
 	{
-		if (XLogRecPtrIsInvalid(state->stack->lsn))
-			state->stack->buffer = ReadBuffer(state->r, state->stack->blkno);
-		LockBuffer(state->stack->buffer, GIST_SHARE);
-		gistcheckpage(state->r, state->stack->buffer);
+		if (XLogRecPtrIsInvalid(stack->lsn))
+			stack->buffer = ReadBuffer(state.r, stack->blkno);
+
+		/*
+		 * Be optimistic and grab shared lock first. Swap it for an exclusive
+		 * lock later if we need to update the page.
+		 */
+		if (!xlocked)
+		{
+			LockBuffer(stack->buffer, GIST_SHARE);
+			gistcheckpage(state.r, stack->buffer);
+		}
 
-		state->stack->page = (Page) BufferGetPage(state->stack->buffer);
-		opaque = GistPageGetOpaque(state->stack->page);
+		stack->page = (Page) BufferGetPage(stack->buffer);
+		stack->lsn = PageGetLSN(stack->page);
+		Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));
 
-		state->stack->lsn = PageGetLSN(state->stack->page);
-		Assert(state->r->rd_istemp || !XLogRecPtrIsInvalid(state->stack->lsn));
+		/*
+		 * If this page was split but the downlink was never inserted to the
+		 * parent because the inserting backend crashed before doing that, fix
+		 * that now.
+		 */
+		if (GistFollowRight(stack->page))
+		{
+			if (!xlocked)
+			{
+				LockBuffer(stack->buffer, GIST_UNLOCK);
+				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+				xlocked = true;
+				/* someone might've completed the split when we unlocked */
+				if (!GistFollowRight(stack->page))
+					continue;
+			}
+			gistfixsplit(&state, giststate);
 
-		if (state->stack->blkno != GIST_ROOT_BLKNO &&
-			XLByteLT(state->stack->parent->lsn, opaque->nsn))
+			UnlockReleaseBuffer(stack->buffer);
+			xlocked = false;
+			state.stack = stack = stack->parent;
+			continue;
+		}
+
+		if (stack->blkno != GIST_ROOT_BLKNO &&
+			XLByteLT(stack->parent->lsn,
+					 GistPageGetOpaque(stack->page)->nsn))
 		{
 			/*
-			 * caused split non-root page is detected, go up to parent to
-			 * choose best child
+			 * Concurrent split detected. There's no guarantee that the
+			 * downlink for this page is consistent with the tuple we're
+			 * inserting anymore, so go back to parent and rechoose the best
+			 * child.
 			 */
-			UnlockReleaseBuffer(state->stack->buffer);
-			state->stack = state->stack->parent;
+			UnlockReleaseBuffer(stack->buffer);
+			xlocked = false;
+			state.stack = stack = stack->parent;
 			continue;
 		}
 
-		if (!GistPageIsLeaf(state->stack->page))
+		if (!GistPageIsLeaf(stack->page))
 		{
 			/*
-			 * This is an internal page, so continue to walk down the tree. We
-			 * find the child node that has the minimum insertion penalty and
-			 * recursively invoke ourselves to modify that node. Once the
-			 * recursive call returns, we may need to adjust the parent node
-			 * for two reasons: the child node split, or the key in this node
-			 * needs to be adjusted for the newly inserted key below us.
+			 * This is an internal page so continue to walk down the tree.
+			 * Find the child node that has the minimum insertion penalty.
 			 */
-			GISTInsertStack *item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
-
-			state->stack->childoffnum = gistchoose(state->r, state->stack->page, state->itup[0], giststate);
+			BlockNumber childblkno;
+			IndexTuple	newtup;
+			GISTInsertStack *item;
 
-			iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
-			idxtuple = (IndexTuple) PageGetItem(state->stack->page, iid);
-			item->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
-			LockBuffer(state->stack->buffer, GIST_UNLOCK);
+			stack->childoffnum = gistchoose(state.r, stack->page, itup, giststate);
+			iid = PageGetItemId(stack->page, stack->childoffnum);
+			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
+			childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
 
-			item->parent = state->stack;
-			item->child = NULL;
-			if (state->stack)
-				state->stack->child = item;
-			state->stack = item;
-		}
-		else
-		{
-			/* be carefull, during unlock/lock page may be changed... */
-			LockBuffer(state->stack->buffer, GIST_UNLOCK);
-			LockBuffer(state->stack->buffer, GIST_EXCLUSIVE);
-			state->stack->page = (Page) BufferGetPage(state->stack->buffer);
-			opaque = GistPageGetOpaque(state->stack->page);
+			/*
+			 * Check that it's not a leftover invalid tuple from pre-9.1
+			 */
+			if (GistTupleIsInvalid(idxtuple))
+				ereport(ERROR,
+						(errmsg("index \"%s\" contains an inner tuple marked as invalid",
+								RelationGetRelationName(r)),
+						 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
+						 errhint("Please REINDEX it.")));
 
-			if (state->stack->blkno == GIST_ROOT_BLKNO)
+			/*
+			 * Check that the key representing the target child node is
+			 * consistent with the key we're inserting. Update it if it's not.
+			 */
+			newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
+			if (newtup)
 			{
 				/*
-				 * the only page can become inner instead of leaf is a root
-				 * page, so for root we should recheck it
+				 * Swap shared lock for an exclusive one. Beware, the page may
+				 * change while we unlock/lock the page...
 				 */
-				if (!GistPageIsLeaf(state->stack->page))
+				if (!xlocked)
 				{
-					/*
-					 * very rarely situation: during unlock/lock index with
-					 * number of pages = 1 was increased
-					 */
-					LockBuffer(state->stack->buffer, GIST_UNLOCK);
-					continue;
+					LockBuffer(stack->buffer, GIST_UNLOCK);
+					LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+					xlocked = true;
+					stack->page = (Page) BufferGetPage(stack->buffer);
+
+					if (!XLByteEQ(PageGetLSN(stack->page), stack->lsn))
+					{
+						/* the page was changed while we unlocked it, retry */
+						continue;
+					}
 				}
 
 				/*
-				 * we don't need to check root split, because checking
-				 * leaf/inner is enough to recognize split for root
+				 * Update the tuple.
+				 *
+				 * We still hold the lock after gistinserttuples(), but it
+				 * might have to split the page to make the updated tuple fit.
+				 * In that case the updated tuple might migrate to the other
+				 * half of the split, so we have to go back to the parent and
+				 * descend back to the half that's a better fit for the new
+				 * tuple.
 				 */
-
+				if (gistinserttuples(&state, stack, giststate, &newtup, 1,
+									 stack->childoffnum, InvalidBuffer))
+				{
+					/*
+					 * If this was a root split, the root page continues to be
+					 * the parent and the updated tuple went to one of the
+					 * child pages, so we just need to retry from the root
+					 * page.
+					 */
+					if (stack->blkno != GIST_ROOT_BLKNO)
+					{
+						UnlockReleaseBuffer(stack->buffer);
+						xlocked = false;
+						state.stack = stack = stack->parent;
+					}
+					continue;
+				}
 			}
-			else if (XLByteLT(state->stack->parent->lsn, opaque->nsn))
+			LockBuffer(stack->buffer, GIST_UNLOCK);
+			xlocked = false;
+
+			/* descend to the chosen child */
+			item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+			item->blkno = childblkno;
+			item->parent = stack;
+			state.stack = stack = item;
+		}
+		else
+		{
+			/*
+			 * Leaf page. Insert the new key. We've already updated all the
+			 * parents on the way down, but we might have to split the page if
+			 * it doesn't fit. gistinserttuples() will take care of that.
+			 */
+
+			/*
+			 * Swap shared lock for an exclusive one. Be careful, the page may
+			 * change while we unlock/lock the page...
+			 */
+			if (!xlocked)
 			{
-				/*
-				 * detecting split during unlock/lock, so we should find
-				 * better child on parent
-				 */
+				LockBuffer(stack->buffer, GIST_UNLOCK);
+				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+				xlocked = true;
+				stack->page = (Page) BufferGetPage(stack->buffer);
+				stack->lsn = PageGetLSN(stack->page);
 
-				/* forget buffer */
-				UnlockReleaseBuffer(state->stack->buffer);
+				if (stack->blkno == GIST_ROOT_BLKNO)
+				{
+					/*
+					 * the only page that can become inner instead of leaf is
+					 * the root page, so for root we should recheck it
+					 */
+					if (!GistPageIsLeaf(stack->page))
+					{
+						/*
+						 * very rare situation: the single-page index grew
+						 * while we had the root page unlocked
+						 */
+						LockBuffer(stack->buffer, GIST_UNLOCK);
+						xlocked = false;
+						continue;
+					}
 
-				state->stack = state->stack->parent;
-				continue;
+					/*
+					 * we don't need to check root split, because checking
+					 * leaf/inner is enough to recognize split for root
+					 */
+				}
+				else if (GistFollowRight(stack->page) ||
+						 XLByteLT(stack->parent->lsn,
+								  GistPageGetOpaque(stack->page)->nsn))
+				{
+					/*
+					 * The page was split while we momentarily unlocked the
+					 * page. Go back to parent.
+					 */
+					UnlockReleaseBuffer(stack->buffer);
+					xlocked = false;
+					state.stack = stack = stack->parent;
+					continue;
+				}
 			}
 
-			state->stack->lsn = PageGetLSN(state->stack->page);
+			/* now state.stack->(page, buffer and blkno) points to leaf page */
+
+			gistinserttuples(&state, stack, giststate, &itup, 1,
+							 InvalidOffsetNumber, InvalidBuffer);
+			LockBuffer(stack->buffer, GIST_UNLOCK);
 
-			/* ok we found a leaf page and it X-locked */
+			/* Release any pins we might still hold before exiting */
+			for (; stack; stack = stack->parent)
+				ReleaseBuffer(stack->buffer);
 			break;
 		}
 	}
-
-	/* now state->stack->(page, buffer and blkno) points to leaf page */
 }
 
 /*
@@ -650,7 +855,7 @@ gistfindleaf(GISTInsertState *state, GISTSTATE *giststate)
  *
  * returns from the beginning of closest parent;
  *
- * To prevent deadlocks, this should lock only one page simultaneously.
+ * To prevent deadlocks, this should lock only one page at a time.
  */
 GISTInsertStack *
 gistFindPath(Relation r, BlockNumber child)
@@ -685,6 +890,13 @@ gistFindPath(Relation r, BlockNumber child)
 
 		top->lsn = PageGetLSN(page);
 
+		/*
+		 * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a
+		 * downlink. This should not normally happen.
+		 */
+		if (GistFollowRight(page))
+			elog(ERROR, "concurrent GiST page split was incomplete");
+
 		if (top->parent && XLByteLT(top->parent->lsn, GistPageGetOpaque(page)->nsn) &&
 			GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
 		{
@@ -713,8 +925,6 @@ gistFindPath(Relation r, BlockNumber child)
 				ptr = top;
 				while (ptr->parent)
 				{
-					/* set child link */
-					ptr->parent->child = ptr;
 					/* move childoffnum.. */
 					if (ptr == top)
 					{
@@ -756,17 +966,16 @@ gistFindPath(Relation r, BlockNumber child)
 	return NULL;
 }
 
-
 /*
- * Returns X-locked parent of stack page
+ * Updates the stack so that child->parent is the correct parent of the
+ * child. child->parent must be exclusively locked on entry, and will
+ * remain so at exit, but it might not be the same page anymore.
  */
-
 static void
 gistFindCorrectParent(Relation r, GISTInsertStack *child)
 {
 	GISTInsertStack *parent = child->parent;
 
-	LockBuffer(parent->buffer, GIST_EXCLUSIVE);
 	gistcheckpage(r, parent->buffer);
 	parent->page = (Page) BufferGetPage(parent->buffer);
 
@@ -838,83 +1047,232 @@ gistFindCorrectParent(Relation r, GISTInsertStack *child)
 
 		/* install new chain of parents to stack */
 		child->parent = parent;
-		parent->child = child;
 
 		/* make recursive call to normal processing */
+		LockBuffer(child->parent->buffer, GIST_EXCLUSIVE);
 		gistFindCorrectParent(r, child);
 	}
 
 	return;
 }
 
-void
-gistmakedeal(GISTInsertState *state, GISTSTATE *giststate)
+/*
+ * Form a downlink pointer for the page in 'buf'.
+ */
+static IndexTuple
+gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate,
+				 GISTInsertStack *stack)
 {
-	int			is_splitted;
-	ItemId		iid;
-	IndexTuple	oldtup,
-				newtup;
+	Page		page = BufferGetPage(buf);
+	OffsetNumber maxoff;
+	OffsetNumber offset;
+	IndexTuple	downlink = NULL;
 
-	/* walk up */
-	while (true)
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
 	{
-		/*
-		 * After this call: 1. if child page was splited, then itup contains
-		 * keys for each page 2. if  child page wasn't splited, then itup
-		 * contains additional for adjustment of current key
-		 */
+		IndexTuple	ituple = (IndexTuple)
+		PageGetItem(page, PageGetItemId(page, offset));
 
-		if (state->stack->parent)
+		if (downlink == NULL)
+			downlink = CopyIndexTuple(ituple);
+		else
 		{
-			/*
-			 * X-lock parent page before proceed child, gistFindCorrectParent
-			 * should find and lock it
-			 */
-			gistFindCorrectParent(state->r, state->stack);
+			IndexTuple	newdownlink;
+
+			newdownlink = gistgetadjusted(rel, downlink, ituple,
+										  giststate);
+			if (newdownlink)
+				downlink = newdownlink;
 		}
-		is_splitted = gistplacetopage(state, giststate);
+	}
+
+	/*
+	 * If the page is completely empty, we can't form a meaningful downlink
+	 * for it. But we have to insert a downlink for the page. Any key will do,
+	 * as long as it's consistent with the downlink of parent page, so that we
+	 * can legally insert it to the parent. A minimal one that matches as few
+	 * scans as possible would be best, to keep scans from doing useless work,
+	 * but we don't know how to construct that. So we just use the downlink of
+	 * the original page that was split - that's as far from optimal as it can
+	 * get but will do.
+	 */
+	if (!downlink)
+	{
+		ItemId		iid;
 
-		/* parent locked above, so release child buffer */
-		UnlockReleaseBuffer(state->stack->buffer);
+		LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+		gistFindCorrectParent(rel, stack);
+		iid = PageGetItemId(stack->parent->page, stack->parent->childoffnum);
+		downlink = (IndexTuple) PageGetItem(stack->parent->page, iid);
+		downlink = CopyIndexTuple(downlink);
+		LockBuffer(stack->parent->buffer, GIST_UNLOCK);
+	}
 
-		/* pop parent page from stack */
-		state->stack = state->stack->parent;
+	ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf));
+	GistTupleSetValid(downlink);
 
-		/* stack is void */
-		if (!state->stack)
-			break;
+	return downlink;
+}
 
-		/*
-		 * child did not split, so we can check is it needed to update parent
-		 * tuple
-		 */
-		if (!is_splitted)
-		{
-			/* parent's tuple */
-			iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
-			oldtup = (IndexTuple) PageGetItem(state->stack->page, iid);
-			newtup = gistgetadjusted(state->r, oldtup, state->itup[0], giststate);
-
-			if (!newtup)
-			{					/* not need to update key */
-				LockBuffer(state->stack->buffer, GIST_UNLOCK);
-				break;
-			}
 
-			state->itup[0] = newtup;
+/*
+ * Complete the incomplete split of state->stack->page.
+ */
+static void
+gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
+{
+	GISTInsertStack *stack = state->stack;
+	Buffer		buf;
+	Page		page;
+	List	   *splitinfo = NIL;
+
+	elog(LOG, "fixing incomplete split in index \"%s\", block %u",
+		 RelationGetRelationName(state->r), stack->blkno);
+
+	Assert(GistFollowRight(stack->page));
+	Assert(OffsetNumberIsValid(stack->parent->childoffnum));
+
+	buf = stack->buffer;
+
+	/*
+	 * Read the chain of split pages, following the rightlinks. Construct a
+	 * downlink tuple for each page.
+	 */
+	for (;;)
+	{
+		GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
+		IndexTuple	downlink;
+
+		page = BufferGetPage(buf);
+
+		/* Form the new downlink tuples to insert to parent */
+		downlink = gistformdownlink(state->r, buf, giststate, stack);
+
+		si->buf = buf;
+		si->downlink = downlink;
+
+		splitinfo = lappend(splitinfo, si);
+
+		if (GistFollowRight(page))
+		{
+			/* lock next page */
+			buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink);
+			LockBuffer(buf, GIST_EXCLUSIVE);
 		}
-	}							/* while */
+		else
+			break;
+	}
+
+	/* Insert the downlinks */
+	gistfinishsplit(state, stack, giststate, splitinfo);
+}
+
+/*
+ * Insert tuples to stack->buffer. If 'oldoffnum' is valid, the new tuples
+ * replace an old tuple at oldoffnum. The caller must hold an exclusive lock
+ * on the page.
+ *
+ * If leftchild is valid, we're inserting/updating the downlink for the
+ * page to the right of leftchild. We clear the F_FOLLOW_RIGHT flag and
+ * update NSN on leftchild, atomically with the insertion of the downlink.
+ *
+ * Returns 'true' if the page had to be split. On return, we will continue
+ * to hold an exclusive lock on stack->buffer, but if we had to split
+ * the page, it might not contain the tuple we just inserted/updated.
+ */
+static bool
+gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+				 GISTSTATE *giststate,
+				 IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+				 Buffer leftchild)
+{
+	List	   *splitinfo;
+	bool		is_split;
+
+	is_split = gistplacetopage(state, giststate, stack->buffer,
+							   tuples, ntup, oldoffnum,
+							   leftchild,
+							   &splitinfo);
+	if (splitinfo)
+		gistfinishsplit(state, stack, giststate, splitinfo);
+
+	return is_split;
+}
+
+/*
+ * Finish an incomplete split by inserting/updating the downlinks in
+ * parent page. 'splitinfo' contains all the child pages, exclusively-locked,
+ * involved in the split, from left-to-right.
+ */
+static void
+gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
+				GISTSTATE *giststate, List *splitinfo)
+{
+	ListCell   *lc;
+	List	   *reversed;
+	GISTPageSplitInfo *right;
+	GISTPageSplitInfo *left;
+	IndexTuple	tuples[2];
+
+	/* A split always contains at least two halves */
+	Assert(list_length(splitinfo) >= 2);
+
+	/*
+	 * We need to insert downlinks for each new page, and update the downlink
+	 * for the original (leftmost) page in the split. Begin at the rightmost
+	 * page, inserting one downlink at a time until there are only two pages
+	 * left. Finally insert the downlink for the last new page and update the
+	 * downlink for the original page as one operation.
+	 */
+
+	/* for convenience, create a copy of the list in reverse order */
+	reversed = NIL;
+	foreach(lc, splitinfo)
+	{
+		reversed = lcons(lfirst(lc), reversed);
+	}
 
-	/* release all parent buffers */
-	while (state->stack)
+	LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+	gistFindCorrectParent(state->r, stack);
+
+	while (list_length(reversed) > 2)
 	{
-		ReleaseBuffer(state->stack->buffer);
-		state->stack = state->stack->parent;
+		right = (GISTPageSplitInfo *) linitial(reversed);
+		left = (GISTPageSplitInfo *) lsecond(reversed);
+
+		if (gistinserttuples(state, stack->parent, giststate,
+							 &right->downlink, 1,
+							 InvalidOffsetNumber,
+							 left->buf))
+		{
+			/*
+			 * If the parent page was split, need to relocate the original
+			 * parent pointer.
+			 */
+			gistFindCorrectParent(state->r, stack);
+		}
+		UnlockReleaseBuffer(right->buf);
+		reversed = list_delete_first(reversed);
 	}
 
-	/* say to xlog that insert is completed */
-	if (state->needInsertComplete && !state->r->rd_istemp)
-		gistxlogInsertCompletion(state->r->rd_node, &(state->key), 1);
+	right = (GISTPageSplitInfo *) linitial(reversed);
+	left = (GISTPageSplitInfo *) lsecond(reversed);
+
+	/*
+	 * Finally insert downlink for the remaining right page and update the
+	 * downlink for the original page to not contain the tuples that were
+	 * moved to the new pages.
+	 */
+	tuples[0] = left->downlink;
+	tuples[1] = right->downlink;
+	gistinserttuples(state, stack->parent, giststate,
+					 tuples, 2,
+					 stack->parent->childoffnum,
+					 left->buf);
+	LockBuffer(stack->parent->buffer, GIST_UNLOCK);
+	UnlockReleaseBuffer(right->buf);
+	Assert(left->buf == stack->buffer);
 }
 
 /*
@@ -965,8 +1323,7 @@ gistSplit(Relation r,
 		ROTATEDIST(res);
 		res->block.num = v.splitVector.spl_nright;
 		res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist));
-		res->itup = (v.spl_rightvalid) ? gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false)
-			: gist_form_invalid_tuple(GIST_ROOT_BLKNO);
+		res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false);
 	}
 
 	if (!gistfitpage(lvectup, v.splitVector.spl_nleft))
@@ -988,51 +1345,16 @@ gistSplit(Relation r,
 		ROTATEDIST(res);
 		res->block.num = v.splitVector.spl_nleft;
 		res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist));
-		res->itup = (v.spl_leftvalid) ? gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false)
-			: gist_form_invalid_tuple(GIST_ROOT_BLKNO);
+		res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false);
 	}
 
 	return res;
 }
 
 /*
- * buffer must be pinned and locked by caller
+ * Fill a GISTSTATE with information about the index
  */
 void
-gistnewroot(Relation r, Buffer buffer, IndexTuple *itup, int len, ItemPointer key)
-{
-	Page		page;
-
-	Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
-	page = BufferGetPage(buffer);
-
-	START_CRIT_SECTION();
-
-	GISTInitBuffer(buffer, 0);
-	gistfillbuffer(page, itup, len, FirstOffsetNumber);
-
-	MarkBufferDirty(buffer);
-
-	if (!r->rd_istemp)
-	{
-		XLogRecPtr	recptr;
-		XLogRecData *rdata;
-
-		rdata = formUpdateRdata(r->rd_node, buffer,
-								NULL, 0,
-								itup, len, key);
-
-		recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_NEW_ROOT, rdata);
-		PageSetLSN(page, recptr);
-		PageSetTLI(page, ThisTimeLineID);
-	}
-	else
-		PageSetLSN(page, XLogRecPtrForTemp);
-
-	END_CRIT_SECTION();
-}
-
-void
 initGISTstate(GISTSTATE *giststate, Relation index)
 {
 	int			i;
@@ -1066,6 +1388,29 @@ initGISTstate(GISTSTATE *giststate, Relation index)
 		fmgr_info_copy(&(giststate->equalFn[i]),
 					   index_getprocinfo(index, i + 1, GIST_EQUAL_PROC),
 					   CurrentMemoryContext);
+		/* opclasses are not required to provide a Distance method */
+		if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC)))
+			fmgr_info_copy(&(giststate->distanceFn[i]),
+						 index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC),
+						   CurrentMemoryContext);
+		else
+			giststate->distanceFn[i].fn_oid = InvalidOid;
+
+		/*
+		 * If the index column has a specified collation, we should honor that
+		 * while doing comparisons.  However, we may have a collatable storage
+		 * type for a noncollatable indexed data type.	If there's no index
+		 * collation then specify default collation in case the support
+		 * functions need collation.  This is harmless if the support
+		 * functions don't care about collation, so we just do it
+		 * unconditionally.  (We could alternatively call get_typcollation,
+		 * but that seems like expensive overkill --- there aren't going to be
+		 * any cases where a GiST storage type has a nondefault collation.)
+		 */
+		if (OidIsValid(index->rd_indcollation[i]))
+			giststate->supportCollation[i] = index->rd_indcollation[i];
+		else
+			giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
 	}
 }
 
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index 216910307a..1aba686844 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -4,11 +4,11 @@
  *	  fetch tuples from a GiST scan.
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gistget.c,v 1.85 2010/02/26 02:00:33 momjian Exp $
+ *	  src/backend/access/gist/gistget.c
  *
  *-------------------------------------------------------------------------
  */
@@ -20,504 +20,568 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "utils/builtins.h"
 #include "utils/memutils.h"
 
 
-static OffsetNumber gistfindnext(IndexScanDesc scan, OffsetNumber n);
-static int64 gistnext(IndexScanDesc scan, TIDBitmap *tbm);
-static bool gistindex_keytest(IndexTuple tuple, IndexScanDesc scan,
-				  OffsetNumber offset);
-
-static void
-killtuple(Relation r, GISTScanOpaque so, ItemPointer iptr)
+/*
+ * gistindex_keytest() -- does this index tuple satisfy the scan key(s)?
+ *
+ * The index tuple might represent either a heap tuple or a lower index page,
+ * depending on whether the containing page is a leaf page or not.
+ *
+ * On success return for a heap tuple, *recheck_p is set to indicate
+ * whether recheck is needed.  We recheck if any of the consistent() functions
+ * request it.  Recheck is not interesting when examining a non-leaf entry,
+ * since we must visit the lower index page if there's any doubt.
+ *
+ * If we are doing an ordered scan, so->distances[] is filled with distance
+ * data from the distance() functions before returning success.
+ *
+ * We must decompress the key in the IndexTuple before passing it to the
+ * sk_funcs (which actually are the opclass Consistent or Distance methods).
+ *
+ * Note that this function is always invoked in a short-lived memory context,
+ * so we don't need to worry about cleaning up allocated memory, either here
+ * or in the implementation of any Consistent or Distance methods.
+ */
+static bool
+gistindex_keytest(IndexScanDesc scan,
+				  IndexTuple tuple,
+				  Page page,
+				  OffsetNumber offset,
+				  bool *recheck_p)
 {
-	Page		p;
-	OffsetNumber offset;
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+	GISTSTATE  *giststate = so->giststate;
+	ScanKey		key = scan->keyData;
+	int			keySize = scan->numberOfKeys;
+	double	   *distance_p;
+	Relation	r = scan->indexRelation;
 
-	LockBuffer(so->curbuf, GIST_SHARE);
-	gistcheckpage(r, so->curbuf);
-	p = (Page) BufferGetPage(so->curbuf);
+	*recheck_p = false;
 
-	if (XLByteEQ(so->stack->lsn, PageGetLSN(p)))
+	/*
+	 * If it's a leftover invalid tuple from pre-9.1, treat it as a match with
+	 * minimum possible distances.  This means we'll always follow it to the
+	 * referenced page.
+	 */
+	if (GistTupleIsInvalid(tuple))
 	{
-		/* page unchanged, so all is simple */
-		offset = ItemPointerGetOffsetNumber(iptr);
-		ItemIdMarkDead(PageGetItemId(p, offset));
-		SetBufferCommitInfoNeedsSave(so->curbuf);
+		int			i;
+
+		if (GistPageIsLeaf(page))		/* shouldn't happen */
+			elog(ERROR, "invalid GiST tuple found on leaf page");
+		for (i = 0; i < scan->numberOfOrderBys; i++)
+			so->distances[i] = -get_float8_infinity();
+		return true;
 	}
-	else
+
+	/* Check whether it matches according to the Consistent functions */
+	while (keySize > 0)
 	{
-		OffsetNumber maxoff = PageGetMaxOffsetNumber(p);
+		Datum		datum;
+		bool		isNull;
 
-		for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
-		{
-			IndexTuple	ituple = (IndexTuple) PageGetItem(p, PageGetItemId(p, offset));
+		datum = index_getattr(tuple,
+							  key->sk_attno,
+							  giststate->tupdesc,
+							  &isNull);
 
-			if (ItemPointerEquals(&(ituple->t_tid), iptr))
+		if (key->sk_flags & SK_ISNULL)
+		{
+			/*
+			 * On a non-leaf page we can't conclude that the child has no NULL
+			 * values, because of a GiST assumption: union (VAL, NULL) is VAL.
+			 * But if the key on a non-leaf page IS NULL, then all children
+			 * are NULL.
+			 */
+			if (key->sk_flags & SK_SEARCHNULL)
 			{
-				/* found */
-				ItemIdMarkDead(PageGetItemId(p, offset));
-				SetBufferCommitInfoNeedsSave(so->curbuf);
-				break;
+				if (GistPageIsLeaf(page) && !isNull)
+					return false;
+			}
+			else
+			{
+				Assert(key->sk_flags & SK_SEARCHNOTNULL);
+				if (isNull)
+					return false;
 			}
 		}
-	}
+		else if (isNull)
+		{
+			return false;
+		}
+		else
+		{
+			Datum		test;
+			bool		recheck;
+			GISTENTRY	de;
 
-	LockBuffer(so->curbuf, GIST_UNLOCK);
-}
+			gistdentryinit(giststate, key->sk_attno - 1, &de,
+						   datum, r, page, offset,
+						   FALSE, isNull);
 
-/*
- * gistgettuple() -- Get the next tuple in the scan
- */
-Datum
-gistgettuple(PG_FUNCTION_ARGS)
-{
-	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
-	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
-	GISTScanOpaque so;
-	bool		res;
+			/*
+			 * Call the Consistent function to evaluate the test.  The
+			 * arguments are the index datum (as a GISTENTRY*), the comparison
+			 * datum, the comparison operator's strategy number and subtype
+			 * from pg_amop, and the recheck flag.
+			 *
+			 * (Presently there's no need to pass the subtype since it'll
+			 * always be zero, but might as well pass it for possible future
+			 * use.)
+			 *
+			 * We initialize the recheck flag to true (the safest assumption)
+			 * in case the Consistent function forgets to set it.
+			 */
+			recheck = true;
 
-	so = (GISTScanOpaque) scan->opaque;
+			test = FunctionCall5Coll(&key->sk_func,
+									 key->sk_collation,
+									 PointerGetDatum(&de),
+									 key->sk_argument,
+									 Int32GetDatum(key->sk_strategy),
+									 ObjectIdGetDatum(key->sk_subtype),
+									 PointerGetDatum(&recheck));
 
-	if (dir != ForwardScanDirection)
-		elog(ERROR, "GiST doesn't support other scan directions than forward");
+			if (!DatumGetBool(test))
+				return false;
+			*recheck_p |= recheck;
+		}
 
-	/*
-	 * If we have produced an index tuple in the past and the executor has
-	 * informed us we need to mark it as "killed", do so now.
-	 */
-	if (scan->kill_prior_tuple && ItemPointerIsValid(&(so->curpos)))
-		killtuple(scan->indexRelation, so, &(so->curpos));
+		key++;
+		keySize--;
+	}
 
-	/*
-	 * Get the next tuple that matches the search key.
-	 */
-	res = (gistnext(scan, NULL) > 0);
+	/* OK, it passes --- now let's compute the distances */
+	key = scan->orderByData;
+	distance_p = so->distances;
+	keySize = scan->numberOfOrderBys;
+	while (keySize > 0)
+	{
+		Datum		datum;
+		bool		isNull;
 
-	PG_RETURN_BOOL(res);
-}
+		datum = index_getattr(tuple,
+							  key->sk_attno,
+							  giststate->tupdesc,
+							  &isNull);
 
-Datum
-gistgetbitmap(PG_FUNCTION_ARGS)
-{
-	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
-	TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
-	int64		ntids;
+		if ((key->sk_flags & SK_ISNULL) || isNull)
+		{
+			/* Assume distance computes as null and sorts to the end */
+			*distance_p = get_float8_infinity();
+		}
+		else
+		{
+			Datum		dist;
+			GISTENTRY	de;
 
-	ntids = gistnext(scan, tbm);
+			gistdentryinit(giststate, key->sk_attno - 1, &de,
+						   datum, r, page, offset,
+						   FALSE, isNull);
 
-	PG_RETURN_INT64(ntids);
+			/*
+			 * Call the Distance function to evaluate the distance.  The
+			 * arguments are the index datum (as a GISTENTRY*), the comparison
+			 * datum, and the ordering operator's strategy number and subtype
+			 * from pg_amop.
+			 *
+			 * (Presently there's no need to pass the subtype since it'll
+			 * always be zero, but might as well pass it for possible future
+			 * use.)
+			 *
+			 * Note that Distance functions don't get a recheck argument. We
+			 * can't tolerate lossy distance calculations on leaf tuples;
+			 * there is no opportunity to re-sort the tuples afterwards.
+			 */
+			dist = FunctionCall4Coll(&key->sk_func,
+									 key->sk_collation,
+									 PointerGetDatum(&de),
+									 key->sk_argument,
+									 Int32GetDatum(key->sk_strategy),
+									 ObjectIdGetDatum(key->sk_subtype));
+
+			*distance_p = DatumGetFloat8(dist);
+		}
+
+		key++;
+		distance_p++;
+		keySize--;
+	}
+
+	return true;
 }
 
 /*
- * Fetch tuple(s) that match the search key; this can be invoked
- * either to fetch the first such tuple or subsequent matching tuples.
+ * Scan all items on the GiST index page identified by *pageItem, and insert
+ * them into the queue (or directly to output areas)
+ *
+ * scan: index scan we are executing
+ * pageItem: search queue item identifying an index page to scan
+ * myDistances: distances array associated with pageItem, or NULL at the root
+ * tbm: if not NULL, gistgetbitmap's output bitmap
+ * ntids: if not NULL, gistgetbitmap's output tuple counter
  *
- * This function is used by both gistgettuple and gistgetbitmap. When
- * invoked from gistgettuple, tbm is null and the next matching tuple
- * is returned in scan->xs_ctup.t_self.  When invoked from getbitmap,
- * tbm is non-null and all matching tuples are added to tbm before
- * returning.  In both cases, the function result is the number of
- * returned tuples.
+ * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap
+ * tuples should be reported directly into the bitmap.  If they are NULL,
+ * we're doing a plain or ordered indexscan.  For a plain indexscan, heap
+ * tuple TIDs are returned into so->pageData[].  For an ordered indexscan,
+ * heap tuple TIDs are pushed into individual search queue items.
  *
- * If scan specifies to skip killed tuples, continue looping until we find a
- * non-killed tuple that matches the search key.
+ * If we detect that the index page has split since we saw its downlink
+ * in the parent, we push its new right sibling onto the queue so the
+ * sibling will be processed next.
  */
-static int64
-gistnext(IndexScanDesc scan, TIDBitmap *tbm)
+static void
+gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
+			 TIDBitmap *tbm, int64 *ntids)
 {
-	Page		p;
-	OffsetNumber n;
-	GISTScanOpaque so;
-	GISTSearchStack *stk;
-	IndexTuple	it;
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+	Buffer		buffer;
+	Page		page;
 	GISTPageOpaque opaque;
-	int64		ntids = 0;
+	OffsetNumber maxoff;
+	OffsetNumber i;
+	GISTSearchTreeItem *tmpItem = so->tmpTreeItem;
+	bool		isNew;
+	MemoryContext oldcxt;
 
-	so = (GISTScanOpaque) scan->opaque;
+	Assert(!GISTSearchItemIsHeap(*pageItem));
 
-	if (so->qual_ok == false)
-		return 0;
+	buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
+	LockBuffer(buffer, GIST_SHARE);
+	gistcheckpage(scan->indexRelation, buffer);
+	page = BufferGetPage(buffer);
+	opaque = GistPageGetOpaque(page);
 
-	if (so->curbuf == InvalidBuffer)
+	/*
+	 * Check if we need to follow the rightlink. We need to follow it if the
+	 * page was concurrently split since we visited the parent (in which case
+	 * parentlsn < nsn), or if the system crashed after a page split but
+	 * before the downlink was inserted into the parent.
+	 */
+	if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) &&
+		(GistFollowRight(page) ||
+		 XLByteLT(pageItem->data.parentlsn, opaque->nsn)) &&
+		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
 	{
-		if (ItemPointerIsValid(&so->curpos) == false)
-		{
-			/* Being asked to fetch the first entry, so start at the root */
-			Assert(so->curbuf == InvalidBuffer);
-			Assert(so->stack == NULL);
+		/* There was a page split, follow right link to add pages */
+		GISTSearchItem *item;
 
-			so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO);
+		/* This can't happen when starting at the root */
+		Assert(myDistances != NULL);
 
-			stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack));
+		oldcxt = MemoryContextSwitchTo(so->queueCxt);
 
-			stk->next = NULL;
-			stk->block = GIST_ROOT_BLKNO;
+		/* Create new GISTSearchItem for the right sibling index page */
+		item = palloc(sizeof(GISTSearchItem));
+		item->next = NULL;
+		item->blkno = opaque->rightlink;
+		item->data.parentlsn = pageItem->data.parentlsn;
 
-			pgstat_count_index_scan(scan->indexRelation);
-		}
-		else
-		{
-			/* scan is finished */
-			return 0;
-		}
+		/* Insert it into the queue using same distances as for this page */
+		tmpItem->head = item;
+		tmpItem->lastHeap = NULL;
+		memcpy(tmpItem->distances, myDistances,
+			   sizeof(double) * scan->numberOfOrderBys);
+
+		(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);
+
+		MemoryContextSwitchTo(oldcxt);
 	}
 
+	so->nPageData = so->curPageData = 0;
+
 	/*
-	 * check stored pointers from last visit
+	 * check all tuples on page
 	 */
-	if (so->nPageData > 0)
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
+		IndexTuple	it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+		bool		match;
+		bool		recheck;
+
 		/*
-		 * gistgetmulti never should go here
+		 * Must call gistindex_keytest in tempCxt, and clean up any leftover
+		 * junk afterward.
 		 */
-		Assert(tbm == NULL);
+		oldcxt = MemoryContextSwitchTo(so->tempCxt);
 
-		if (so->curPageData < so->nPageData)
-		{
-			scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr;
-			scan->xs_recheck = so->pageData[so->curPageData].recheck;
+		match = gistindex_keytest(scan, it, page, i, &recheck);
 
-			ItemPointerSet(&so->curpos,
-						   BufferGetBlockNumber(so->curbuf),
-						   so->pageData[so->curPageData].pageOffset);
+		MemoryContextSwitchTo(oldcxt);
+		MemoryContextReset(so->tempCxt);
 
-			so->curPageData++;
+		/* Ignore tuple if it doesn't match */
+		if (!match)
+			continue;
 
-			return 1;
+		if (tbm && GistPageIsLeaf(page))
+		{
+			/*
+			 * getbitmap scan, so just push heap tuple TIDs into the bitmap
+			 * without worrying about ordering
+			 */
+			tbm_add_tuples(tbm, &it->t_tid, 1, recheck);
+			(*ntids)++;
+		}
+		else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page))
+		{
+			/*
+			 * Non-ordered scan, so report heap tuples in so->pageData[]
+			 */
+			so->pageData[so->nPageData].heapPtr = it->t_tid;
+			so->pageData[so->nPageData].recheck = recheck;
+			so->nPageData++;
 		}
 		else
 		{
 			/*
-			 * Go to the next page
+			 * Must push item into search queue.  We get here for any lower
+			 * index page, and also for heap tuples if doing an ordered
+			 * search.
 			 */
-			stk = so->stack->next;
-			pfree(so->stack);
-			so->stack = stk;
+			GISTSearchItem *item;
 
-			/* If we're out of stack entries, we're done */
-			if (so->stack == NULL)
+			oldcxt = MemoryContextSwitchTo(so->queueCxt);
+
+			/* Create new GISTSearchItem for this item */
+			item = palloc(sizeof(GISTSearchItem));
+			item->next = NULL;
+
+			if (GistPageIsLeaf(page))
+			{
+				/* Creating heap-tuple GISTSearchItem */
+				item->blkno = InvalidBlockNumber;
+				item->data.heap.heapPtr = it->t_tid;
+				item->data.heap.recheck = recheck;
+			}
+			else
 			{
-				ReleaseBuffer(so->curbuf);
-				so->curbuf = InvalidBuffer;
-				return 0;
+				/* Creating index-page GISTSearchItem */
+				item->blkno = ItemPointerGetBlockNumber(&it->t_tid);
+				/* lsn of current page is lsn of parent page for child */
+				item->data.parentlsn = PageGetLSN(page);
 			}
 
-			so->curbuf = ReleaseAndReadBuffer(so->curbuf,
-											  scan->indexRelation,
-											  stk->block);
+			/* Insert it into the queue using new distance data */
+			tmpItem->head = item;
+			tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL;
+			memcpy(tmpItem->distances, so->distances,
+				   sizeof(double) * scan->numberOfOrderBys);
+
+			(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);
+
+			MemoryContextSwitchTo(oldcxt);
 		}
 	}
 
+	UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Extract next item (in order) from search queue
+ *
+ * Returns a GISTSearchItem or NULL.  Caller must pfree item when done with it.
+ *
+ * NOTE: on successful return, so->curTreeItem is the GISTSearchTreeItem that
+ * contained the result item.  Callers can use so->curTreeItem->distances as
+ * the distances value for the item.
+ */
+static GISTSearchItem *
+getNextGISTSearchItem(GISTScanOpaque so)
+{
 	for (;;)
 	{
-		CHECK_FOR_INTERRUPTS();
+		GISTSearchItem *item;
 
-		/* First of all, we need lock buffer */
-		Assert(so->curbuf != InvalidBuffer);
-		LockBuffer(so->curbuf, GIST_SHARE);
-		gistcheckpage(scan->indexRelation, so->curbuf);
-		p = BufferGetPage(so->curbuf);
-		opaque = GistPageGetOpaque(p);
-
-		/* remember lsn to identify page changed for tuple's killing */
-		so->stack->lsn = PageGetLSN(p);
-
-		/* check page split, occured since visit to parent */
-		if (!XLogRecPtrIsInvalid(so->stack->parentlsn) &&
-			XLByteLT(so->stack->parentlsn, opaque->nsn) &&
-			opaque->rightlink != InvalidBlockNumber /* sanity check */ &&
-			(so->stack->next == NULL || so->stack->next->block != opaque->rightlink)	/* check if already
-				added */ )
+		/* Update curTreeItem if we don't have one */
+		if (so->curTreeItem == NULL)
 		{
-			/* detect page split, follow right link to add pages */
-
-			stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
-			stk->next = so->stack->next;
-			stk->block = opaque->rightlink;
-			stk->parentlsn = so->stack->parentlsn;
-			memset(&(stk->lsn), 0, sizeof(GistNSN));
-			so->stack->next = stk;
+			so->curTreeItem = (GISTSearchTreeItem *) rb_leftmost(so->queue);
+			/* Done when tree is empty */
+			if (so->curTreeItem == NULL)
+				break;
 		}
 
-		/* if page is empty, then just skip it */
-		if (PageIsEmpty(p))
+		item = so->curTreeItem->head;
+		if (item != NULL)
 		{
-			LockBuffer(so->curbuf, GIST_UNLOCK);
-			stk = so->stack->next;
-			pfree(so->stack);
-			so->stack = stk;
-
-			if (so->stack == NULL)
-			{
-				ReleaseBuffer(so->curbuf);
-				so->curbuf = InvalidBuffer;
-				return ntids;
-			}
-
-			so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation,
-											  stk->block);
-			continue;
+			/* Delink item from chain */
+			so->curTreeItem->head = item->next;
+			if (item == so->curTreeItem->lastHeap)
+				so->curTreeItem->lastHeap = NULL;
+			/* Return item; caller is responsible to pfree it */
+			return item;
 		}
 
-		n = FirstOffsetNumber;
-
-		/* wonderful, we can look at page */
-		so->nPageData = so->curPageData = 0;
-
-		for (;;)
-		{
-			n = gistfindnext(scan, n);
-
-			if (!OffsetNumberIsValid(n))
-			{
-				/*
-				 * If we was called from gistgettuple and current buffer
-				 * contains something matched then make a recursive call - it
-				 * will return ItemPointer from so->pageData. But we save
-				 * buffer pinned to support tuple's killing
-				 */
-				if (!tbm && so->nPageData > 0)
-				{
-					LockBuffer(so->curbuf, GIST_UNLOCK);
-					return gistnext(scan, NULL);
-				}
+		/* curTreeItem is exhausted, so remove it from rbtree */
+		rb_delete(so->queue, (RBNode *) so->curTreeItem);
+		so->curTreeItem = NULL;
+	}
 
-				/*
-				 * We ran out of matching index entries on the current page,
-				 * so pop the top stack entry and use it to continue the
-				 * search.
-				 */
-				LockBuffer(so->curbuf, GIST_UNLOCK);
-				stk = so->stack->next;
-				pfree(so->stack);
-				so->stack = stk;
-
-				/* If we're out of stack entries, we're done */
-
-				if (so->stack == NULL)
-				{
-					ReleaseBuffer(so->curbuf);
-					so->curbuf = InvalidBuffer;
-					return ntids;
-				}
-
-				so->curbuf = ReleaseAndReadBuffer(so->curbuf,
-												  scan->indexRelation,
-												  stk->block);
-				/* XXX	go up */
-				break;
-			}
+	return NULL;
+}
 
-			if (GistPageIsLeaf(p))
-			{
-				/*
-				 * We've found a matching index entry in a leaf page, so
-				 * return success. Note that we keep "curbuf" pinned so that
-				 * we can efficiently resume the index scan later.
-				 */
+/*
+ * Fetch next heap tuple in an ordered search
+ */
+static bool
+getNextNearest(IndexScanDesc scan)
+{
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+	bool		res = false;
 
-				if (!(scan->ignore_killed_tuples &&
-					  ItemIdIsDead(PageGetItemId(p, n))))
-				{
-					it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-					ntids++;
-					if (tbm != NULL)
-						tbm_add_tuples(tbm, &it->t_tid, 1, scan->xs_recheck);
-					else
-					{
-						so->pageData[so->nPageData].heapPtr = it->t_tid;
-						so->pageData[so->nPageData].pageOffset = n;
-						so->pageData[so->nPageData].recheck = scan->xs_recheck;
-						so->nPageData++;
-					}
-				}
-			}
-			else
-			{
-				/*
-				 * We've found an entry in an internal node whose key is
-				 * consistent with the search key, so push it to stack
-				 */
-				stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
+	do
+	{
+		GISTSearchItem *item = getNextGISTSearchItem(so);
 
-				it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-				stk->block = ItemPointerGetBlockNumber(&(it->t_tid));
-				memset(&(stk->lsn), 0, sizeof(GistNSN));
-				stk->parentlsn = so->stack->lsn;
+		if (!item)
+			break;
 
-				stk->next = so->stack->next;
-				so->stack->next = stk;
-			}
+		if (GISTSearchItemIsHeap(*item))
+		{
+			/* found a heap item at currently minimal distance */
+			scan->xs_ctup.t_self = item->data.heap.heapPtr;
+			scan->xs_recheck = item->data.heap.recheck;
+			res = true;
+		}
+		else
+		{
+			/* visit an index page, extract its items into queue */
+			CHECK_FOR_INTERRUPTS();
 
-			n = OffsetNumberNext(n);
+			gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL);
 		}
-	}
 
-	return ntids;
+		pfree(item);
+	} while (!res);
+
+	return res;
 }
 
 /*
- * gistindex_keytest() -- does this index tuple satisfy the scan key(s)?
- *
- * On success return for a leaf tuple, scan->xs_recheck is set to indicate
- * whether recheck is needed.  We recheck if any of the consistent() functions
- * request it.
- *
- * We must decompress the key in the IndexTuple before passing it to the
- * sk_func (and we have previously overwritten the sk_func to use the
- * user-defined Consistent method, so we actually are invoking that).
- *
- * Note that this function is always invoked in a short-lived memory context,
- * so we don't need to worry about cleaning up allocated memory, either here
- * or in the implementation of any Consistent methods.
+ * gistgettuple() -- Get the next tuple in the scan
  */
-static bool
-gistindex_keytest(IndexTuple tuple,
-				  IndexScanDesc scan,
-				  OffsetNumber offset)
+Datum
+gistgettuple(PG_FUNCTION_ARGS)
 {
-	int			keySize = scan->numberOfKeys;
-	ScanKey		key = scan->keyData;
-	Relation	r = scan->indexRelation;
-	GISTScanOpaque so;
-	Page		p;
-	GISTSTATE  *giststate;
-
-	so = (GISTScanOpaque) scan->opaque;
-	giststate = so->giststate;
-	p = BufferGetPage(so->curbuf);
+	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
 
-	scan->xs_recheck = false;
+	if (dir != ForwardScanDirection)
+		elog(ERROR, "GiST only supports forward scan direction");
 
-	/*
-	 * Tuple doesn't restore after crash recovery because of incomplete insert
-	 */
-	if (!GistPageIsLeaf(p) && GistTupleIsInvalid(tuple))
-		return true;
+	if (!so->qual_ok)
+		PG_RETURN_BOOL(false);
 
-	while (keySize > 0)
+	if (so->firstCall)
 	{
-		Datum		datum;
-		bool		isNull;
-		Datum		test;
-		bool		recheck;
-		GISTENTRY	de;
+		/* Begin the scan by processing the root page */
+		GISTSearchItem fakeItem;
 
-		datum = index_getattr(tuple,
-							  key->sk_attno,
-							  giststate->tupdesc,
-							  &isNull);
+		pgstat_count_index_scan(scan->indexRelation);
 
-		if (key->sk_flags & SK_ISNULL)
+		so->firstCall = false;
+		so->curTreeItem = NULL;
+		so->curPageData = so->nPageData = 0;
+
+		fakeItem.blkno = GIST_ROOT_BLKNO;
+		memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
+		gistScanPage(scan, &fakeItem, NULL, NULL, NULL);
+	}
+
+	if (scan->numberOfOrderBys > 0)
+	{
+		/* Must fetch tuples in strict distance order */
+		PG_RETURN_BOOL(getNextNearest(scan));
+	}
+	else
+	{
+		/* Fetch tuples index-page-at-a-time */
+		for (;;)
 		{
-			/*
-			 * On non-leaf page we can't conclude that child hasn't NULL
-			 * values because of assumption in GiST: union (VAL, NULL) is VAL.
-			 * But if on non-leaf page key IS NULL, then all children are
-			 * NULL.
-			 */
-			if (key->sk_flags & SK_SEARCHNULL)
+			if (so->curPageData < so->nPageData)
 			{
-				if (GistPageIsLeaf(p) && !isNull)
-					return false;
+				/* continuing to return tuples from a leaf page */
+				scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr;
+				scan->xs_recheck = so->pageData[so->curPageData].recheck;
+				so->curPageData++;
+				PG_RETURN_BOOL(true);
 			}
-			else
+
+			/* find and process the next index page */
+			do
 			{
-				Assert(key->sk_flags & SK_SEARCHNOTNULL);
-				if (isNull)
-					return false;
-			}
-		}
-		else if (isNull)
-		{
-			return false;
-		}
-		else
-		{
-			gistdentryinit(giststate, key->sk_attno - 1, &de,
-						   datum, r, p, offset,
-						   FALSE, isNull);
+				GISTSearchItem *item = getNextGISTSearchItem(so);
 
-			/*
-			 * Call the Consistent function to evaluate the test.  The
-			 * arguments are the index datum (as a GISTENTRY*), the comparison
-			 * datum, the comparison operator's strategy number and subtype
-			 * from pg_amop, and the recheck flag.
-			 *
-			 * (Presently there's no need to pass the subtype since it'll
-			 * always be zero, but might as well pass it for possible future
-			 * use.)
-			 *
-			 * We initialize the recheck flag to true (the safest assumption)
-			 * in case the Consistent function forgets to set it.
-			 */
-			recheck = true;
+				if (!item)
+					PG_RETURN_BOOL(false);
 
-			test = FunctionCall5(&key->sk_func,
-								 PointerGetDatum(&de),
-								 key->sk_argument,
-								 Int32GetDatum(key->sk_strategy),
-								 ObjectIdGetDatum(key->sk_subtype),
-								 PointerGetDatum(&recheck));
+				CHECK_FOR_INTERRUPTS();
 
-			if (!DatumGetBool(test))
-				return false;
-			scan->xs_recheck |= recheck;
-		}
+				/*
+				 * While scanning a leaf page, ItemPointers of matching heap
+				 * tuples are stored in so->pageData.  If there are any on
+				 * this page, we fall out of the inner "do" and loop around to
+				 * return them.
+				 */
+				gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL);
 
-		keySize--;
-		key++;
+				pfree(item);
+			} while (so->nPageData == 0);
+		}
 	}
 
-	return true;
+	PG_RETURN_BOOL(false);		/* keep compiler quiet */
 }
 
 /*
- * Return the offset of the first index entry that is consistent with
- * the search key after offset 'n' in the current page. If there are
- * no more consistent entries, return InvalidOffsetNumber.
- * On success, scan->xs_recheck is set correctly, too.
- * Page should be locked....
+ * gistgetbitmap() -- Get a bitmap of all heap tuple locations
  */
-static OffsetNumber
-gistfindnext(IndexScanDesc scan, OffsetNumber n)
+Datum
+gistgetbitmap(PG_FUNCTION_ARGS)
 {
-	OffsetNumber maxoff;
-	IndexTuple	it;
-	GISTScanOpaque so;
-	MemoryContext oldcxt;
-	Page		p;
+	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+	TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+	int64		ntids = 0;
+	GISTSearchItem fakeItem;
 
-	so = (GISTScanOpaque) scan->opaque;
-	p = BufferGetPage(so->curbuf);
-	maxoff = PageGetMaxOffsetNumber(p);
+	if (!so->qual_ok)
+		PG_RETURN_INT64(0);
+
+	pgstat_count_index_scan(scan->indexRelation);
+
+	/* Begin the scan by processing the root page */
+	so->curTreeItem = NULL;
+	so->curPageData = so->nPageData = 0;
+
+	fakeItem.blkno = GIST_ROOT_BLKNO;
+	memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
+	gistScanPage(scan, &fakeItem, NULL, tbm, &ntids);
 
 	/*
-	 * Make sure we're in a short-lived memory context when we invoke a
-	 * user-supplied GiST method in gistindex_keytest(), so we don't leak
-	 * memory
+	 * While scanning a leaf page, ItemPointers of matching heap tuples will
+	 * be stored directly into tbm, so we don't need to deal with them here.
 	 */
-	oldcxt = MemoryContextSwitchTo(so->tempCxt);
-
-	while (n >= FirstOffsetNumber && n <= maxoff)
+	for (;;)
 	{
-		it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-		if (gistindex_keytest(it, scan, n))
+		GISTSearchItem *item = getNextGISTSearchItem(so);
+
+		if (!item)
 			break;
 
-		n = OffsetNumberNext(n);
-	}
+		CHECK_FOR_INTERRUPTS();
 
-	MemoryContextSwitchTo(oldcxt);
-	MemoryContextReset(so->tempCxt);
+		gistScanPage(scan, item, so->curTreeItem->distances, tbm, &ntids);
 
-	/*
-	 * If we found a matching entry, return its offset; otherwise return
-	 * InvalidOffsetNumber to inform the caller to go to the next page.
-	 */
-	if (n >= FirstOffsetNumber && n <= maxoff)
-		return n;
-	else
-		return InvalidOffsetNumber;
+		pfree(item);
+	}
+
+	PG_RETURN_INT64(ntids);
 }
diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c
index cb34b26113..43c4b1251b 100644
--- a/src/backend/access/gist/gistproc.c
+++ b/src/backend/access/gist/gistproc.c
@@ -6,11 +6,11 @@
  * This gives R-tree behavior, with Guttman's poly-time split algorithm.
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/access/gist/gistproc.c,v 1.21 2010/02/26 02:00:33 momjian Exp $
+ *	src/backend/access/gist/gistproc.c
  *
  *-------------------------------------------------------------------------
  */
@@ -380,12 +380,12 @@ gist_box_picksplit(PG_FUNCTION_ARGS)
 	for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
 	{
 		cur = DatumGetBoxP(entryvec->vector[i].key);
-		if (allisequal == true && (
-								   pageunion.high.x != cur->high.x ||
-								   pageunion.high.y != cur->high.y ||
-								   pageunion.low.x != cur->low.x ||
-								   pageunion.low.y != cur->low.y
-								   ))
+		if (allisequal && (
+						   pageunion.high.x != cur->high.x ||
+						   pageunion.high.y != cur->high.y ||
+						   pageunion.low.x != cur->low.x ||
+						   pageunion.low.y != cur->low.y
+						   ))
 			allisequal = false;
 
 		adjustBox(&pageunion, cur);
@@ -904,6 +904,76 @@ gist_point_compress(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(entry);
 }
 
+#define point_point_distance(p1,p2) \
+	DatumGetFloat8(DirectFunctionCall2(point_distance, \
+									   PointPGetDatum(p1), PointPGetDatum(p2)))
+
+static double
+computeDistance(bool isLeaf, BOX *box, Point *point)
+{
+	double		result = 0.0;
+
+	if (isLeaf)
+	{
+		/* simple point to point distance */
+		result = point_point_distance(point, &box->low);
+	}
+	else if (point->x <= box->high.x && point->x >= box->low.x &&
+			 point->y <= box->high.y && point->y >= box->low.y)
+	{
+		/* point inside the box */
+		result = 0.0;
+	}
+	else if (point->x <= box->high.x && point->x >= box->low.x)
+	{
+		/* point is over or below box */
+		Assert(box->low.y <= box->high.y);
+		if (point->y > box->high.y)
+			result = point->y - box->high.y;
+		else if (point->y < box->low.y)
+			result = box->low.y - point->y;
+		else
+			elog(ERROR, "inconsistent point values");
+	}
+	else if (point->y <= box->high.y && point->y >= box->low.y)
+	{
+		/* point is to left or right of box */
+		Assert(box->low.x <= box->high.x);
+		if (point->x > box->high.x)
+			result = point->x - box->high.x;
+		else if (point->x < box->low.x)
+			result = box->low.x - point->x;
+		else
+			elog(ERROR, "inconsistent point values");
+	}
+	else
+	{
+		/* closest point will be a vertex */
+		Point		p;
+		double		subresult;
+
+		result = point_point_distance(point, &box->low);
+
+		subresult = point_point_distance(point, &box->high);
+		if (result > subresult)
+			result = subresult;
+
+		p.x = box->low.x;
+		p.y = box->high.y;
+		subresult = point_point_distance(point, &p);
+		if (result > subresult)
+			result = subresult;
+
+		p.x = box->high.x;
+		p.y = box->low.y;
+		subresult = point_point_distance(point, &p);
+		if (result > subresult)
+			result = subresult;
+	}
+
+	return result;
+}
+
 static bool
 gist_point_consistent_internal(StrategyNumber strategy,
 							   bool isLeaf, BOX *key, Point *query)
@@ -954,8 +1024,8 @@ gist_point_consistent(PG_FUNCTION_ARGS)
 {
 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
 	StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
-	bool		result;
 	bool	   *recheck = (bool *) PG_GETARG_POINTER(4);
+	bool		result;
 	StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset;
 
 	switch (strategyGroup)
@@ -1034,9 +1104,32 @@ gist_point_consistent(PG_FUNCTION_ARGS)
 			}
 			break;
 		default:
-			result = false;		/* silence compiler warning */
 			elog(ERROR, "unknown strategy number: %d", strategy);
+			result = false;		/* keep compiler quiet */
 	}
 
 	PG_RETURN_BOOL(result);
 }
+
+Datum
+gist_point_distance(PG_FUNCTION_ARGS)
+{
+	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+	StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+	StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset;
+	BOX		   *key;
+	Point	   *query;
+	double		distance;
+
+	/* the point strategy group is the only one handled here */
+	if (strategyGroup != PointStrategyNumberGroup)
+	{
+		elog(ERROR, "unknown strategy number: %d", strategy);
+		PG_RETURN_FLOAT8(0.0);	/* keep compiler quiet */
+	}
+
+	key = DatumGetBoxP(entry->key);
+	query = PG_GETARG_POINT_P(1);
+	distance = computeDistance(GIST_LEAF(entry), key, query);
+	PG_RETURN_FLOAT8(distance);
+}
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index a53d8cd087..5662a3a4aa 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -4,11 +4,11 @@
  *	  routines to manage scans on GiST index relations
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gistscan.c,v 1.79 2010/02/26 02:00:33 momjian Exp $
+ *	  src/backend/access/gist/gistscan.c
  *
  *-------------------------------------------------------------------------
  */
@@ -20,18 +20,112 @@
 #include "access/relscan.h"
 #include "storage/bufmgr.h"
 #include "utils/memutils.h"
+#include "utils/rel.h"
 
-static void gistfreestack(GISTSearchStack *s);
+
+/*
+ * RBTree support functions for the GISTSearchTreeItem queue
+ */
+
+static int
+GISTSearchTreeItemComparator(const RBNode *a, const RBNode *b, void *arg)
+{
+	const GISTSearchTreeItem *sa = (const GISTSearchTreeItem *) a;
+	const GISTSearchTreeItem *sb = (const GISTSearchTreeItem *) b;
+	int			i;
+
+	/* Order by distances in turn; NaN (x != x) ties with NaN, else sorts last */
+	for (i = 0; i < ((IndexScanDesc) arg)->numberOfOrderBys; i++)
+	{
+		double		da = sa->distances[i];
+		double		db = sb->distances[i];
+		if (da != db && !(da != da && db != db))
+			return (da != da || da > db) ? 1 : -1;
+	}
+	return 0;
+}
+
+static void
+GISTSearchTreeItemCombiner(RBNode *existing, const RBNode *newrb, void *arg)
+{
+	GISTSearchTreeItem *node = (GISTSearchTreeItem *) existing;
+	GISTSearchItem *item = ((const GISTSearchTreeItem *) newrb)->head;
+	bool		isHeap;
+
+	/* the incoming node is expected to carry exactly one item */
+	Assert(item && item->next == NULL);
+
+	isHeap = GISTSearchItemIsHeap(*item) ? true : false;
+
+	/*
+	 * Heap tuples are kept ahead of index pages within a chain.  A new heap
+	 * tuple is pushed on the very front; a new index page is placed just
+	 * ahead of the index pages already queued, so that pages are visited in
+	 * LIFO order, giving a depth-first search of index pages.  See comments
+	 * in gist_private.h.
+	 */
+	if (!isHeap && node->lastHeap != NULL)
+	{
+		/* index page, with heap tuples queued: splice in right behind them */
+		item->next = node->lastHeap->next;
+		node->lastHeap->next = item;
+		return;
+	}
+
+	/* heap tuple, or a chain holding no heap tuples: becomes the new head */
+	item->next = node->head;
+	node->head = item;
+	/* the first heap tuple added marks the end of the heap-tuple run */
+	if (isHeap && node->lastHeap == NULL)
+		node->lastHeap = item;
+}
+
+static RBNode *
+GISTSearchTreeItemAllocator(void *arg)
+{
+	int			nkeys = ((IndexScanDesc) arg)->numberOfOrderBys;
+
+	return palloc(GSTIHDRSZ + sizeof(double) * nkeys);	/* header + distances */
+}
+
+static void
+GISTSearchTreeItemDeleter(RBNode *rb, void *arg)
+{
+	pfree(rb);					/* release the node itself; "arg" is not used */
+}
+
+
+/*
+ * Index AM API functions for scanning GiST indexes
+ */
 
 Datum
 gistbeginscan(PG_FUNCTION_ARGS)
 {
 	Relation	r = (Relation) PG_GETARG_POINTER(0);
 	int			nkeys = PG_GETARG_INT32(1);
-	ScanKey		key = (ScanKey) PG_GETARG_POINTER(2);
+	int			norderbys = PG_GETARG_INT32(2);
 	IndexScanDesc scan;
+	GISTScanOpaque so;
 
-	scan = RelationGetIndexScan(r, nkeys, key);
+	scan = RelationGetIndexScan(r, nkeys, norderbys);
+
+	/* initialize opaque data (palloc0 zeroes every field not set below) */
+	so = (GISTScanOpaque) palloc0(sizeof(GISTScanOpaqueData));
+	so->queueCxt = AllocSetContextCreate(CurrentMemoryContext,
+										 "GiST queue context",
+										 ALLOCSET_DEFAULT_MINSIZE,
+										 ALLOCSET_DEFAULT_INITSIZE,
+										 ALLOCSET_DEFAULT_MAXSIZE);
+	so->tempCxt = createTempGistContext();
+	so->giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE));
+	initGISTstate(so->giststate, scan->indexRelation);
+	/* workspaces sized by numberOfOrderBys (one double per order-by key): */
+	so->tmpTreeItem = palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys);
+	so->distances = palloc(sizeof(double) * scan->numberOfOrderBys);
+	so->qual_ok = true;			/* in case there are zero keys */
+
+	scan->opaque = so;
 
 	PG_RETURN_POINTER(scan);
 }
@@ -41,42 +135,28 @@ gistrescan(PG_FUNCTION_ARGS)
 {
 	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
 	ScanKey		key = (ScanKey) PG_GETARG_POINTER(1);
-	GISTScanOpaque so;
+	ScanKey		orderbys = (ScanKey) PG_GETARG_POINTER(3);
+
+	/* nkeys and norderbys arguments are ignored */
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
 	int			i;
+	MemoryContext oldCxt;
 
-	so = (GISTScanOpaque) scan->opaque;
-	if (so != NULL)
-	{
-		/* rescan an existing indexscan --- reset state */
-		gistfreestack(so->stack);
-		so->stack = NULL;
-		/* drop pins on buffers -- no locks held */
-		if (BufferIsValid(so->curbuf))
-		{
-			ReleaseBuffer(so->curbuf);
-			so->curbuf = InvalidBuffer;
-		}
-	}
-	else
-	{
-		/* initialize opaque data */
-		so = (GISTScanOpaque) palloc(sizeof(GISTScanOpaqueData));
-		so->stack = NULL;
-		so->tempCxt = createTempGistContext();
-		so->curbuf = InvalidBuffer;
-		so->giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE));
-		initGISTstate(so->giststate, scan->indexRelation);
-
-		scan->opaque = so;
-	}
+	/* rescan an existing indexscan --- reset state */
+	MemoryContextReset(so->queueCxt);
+	so->curTreeItem = NULL;
 
-	/*
-	 * Clear all the pointers.
-	 */
-	ItemPointerSetInvalid(&so->curpos);
-	so->nPageData = so->curPageData = 0;
+	/* create new, empty RBTree for search queue */
+	oldCxt = MemoryContextSwitchTo(so->queueCxt);
+	so->queue = rb_create(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys,
+						  GISTSearchTreeItemComparator,
+						  GISTSearchTreeItemCombiner,
+						  GISTSearchTreeItemAllocator,
+						  GISTSearchTreeItemDeleter,
+						  scan);
+	MemoryContextSwitchTo(oldCxt);
 
-	so->qual_ok = true;
+	so->firstCall = true;
 
 	/* Update scan key, if a new one is given */
 	if (key && scan->numberOfKeys > 0)
@@ -85,8 +165,8 @@ gistrescan(PG_FUNCTION_ARGS)
 				scan->numberOfKeys * sizeof(ScanKeyData));
 
 		/*
-		 * Modify the scan key so that all the Consistent method is called for
-		 * all comparisons. The original operator is passed to the Consistent
+		 * Modify the scan key so that the Consistent method is called for all
+		 * comparisons. The original operator is passed to the Consistent
 		 * function in the form of its strategy number, which is available
 		 * from the sk_strategy field, and its subtype from the sk_subtype
 		 * field.
@@ -95,9 +175,11 @@ gistrescan(PG_FUNCTION_ARGS)
 		 * SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we
 		 * assume all indexable operators are strict).
 		 */
+		so->qual_ok = true;
+
 		for (i = 0; i < scan->numberOfKeys; i++)
 		{
-			ScanKey		skey = &(scan->keyData[i]);
+			ScanKey		skey = scan->keyData + i;
 
 			skey->sk_func = so->giststate->consistentFn[skey->sk_attno - 1];
 
@@ -109,6 +191,33 @@ gistrescan(PG_FUNCTION_ARGS)
 		}
 	}
 
+	/* Update order-by key, if a new one is given */
+	if (orderbys && scan->numberOfOrderBys > 0)
+	{
+		memmove(scan->orderByData, orderbys,
+				scan->numberOfOrderBys * sizeof(ScanKeyData));
+
+		/*
+		 * Modify the order-by key so that the Distance method is called for
+		 * all comparisons. The original operator is passed to the Distance
+		 * function in the form of its strategy number, which is available
+		 * from the sk_strategy field, and its subtype from the sk_subtype
+		 * field.
+		 */
+		for (i = 0; i < scan->numberOfOrderBys; i++)
+		{
+			ScanKey		skey = scan->orderByData + i;
+
+			skey->sk_func = so->giststate->distanceFn[skey->sk_attno - 1];
+
+			/* Check we actually have a distance function ... */
+			if (!OidIsValid(skey->sk_func.fn_oid))
+				elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
+					 GIST_DISTANCE_PROC, skey->sk_attno,
+					 RelationGetRelationName(scan->indexRelation));
+		}
+	}
+
 	PG_RETURN_VOID();
 }
 
@@ -130,33 +239,14 @@ Datum
 gistendscan(PG_FUNCTION_ARGS)
 {
 	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
-	GISTScanOpaque so;
-
-	so = (GISTScanOpaque) scan->opaque;
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
 
-	if (so != NULL)
-	{
-		gistfreestack(so->stack);
-		if (so->giststate != NULL)
-			freeGISTstate(so->giststate);
-		/* drop pins on buffers -- we aren't holding any locks */
-		if (BufferIsValid(so->curbuf))
-			ReleaseBuffer(so->curbuf);
-		MemoryContextDelete(so->tempCxt);
-		pfree(scan->opaque);
-	}
+	freeGISTstate(so->giststate);
+	MemoryContextDelete(so->queueCxt);
+	MemoryContextDelete(so->tempCxt);
+	pfree(so->tmpTreeItem);
+	pfree(so->distances);
+	pfree(so);
 
 	PG_RETURN_VOID();
 }
-
-static void
-gistfreestack(GISTSearchStack *s)
-{
-	while (s != NULL)
-	{
-		GISTSearchStack *p = s->next;
-
-		pfree(s);
-		s = p;
-	}
-}
diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c
index 5700e530fe..bd846cecca 100644
--- a/src/backend/access/gist/gistsplit.c
+++ b/src/backend/access/gist/gistsplit.c
@@ -4,11 +4,11 @@
  *	  Split page algorithm
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gistsplit.c,v 1.12 2010/01/02 16:57:34 momjian Exp $
+ *	  src/backend/access/gist/gistsplit.c
  *
  *-------------------------------------------------------------------------
  */
@@ -325,16 +325,18 @@ genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC
 	evec->n = v->spl_nleft;
 	memcpy(evec->vector, entryvec->vector + FirstOffsetNumber,
 		   sizeof(GISTENTRY) * evec->n);
-	v->spl_ldatum = FunctionCall2(&giststate->unionFn[attno],
-								  PointerGetDatum(evec),
-								  PointerGetDatum(&nbytes));
+	v->spl_ldatum = FunctionCall2Coll(&giststate->unionFn[attno],
+									  giststate->supportCollation[attno],
+									  PointerGetDatum(evec),
+									  PointerGetDatum(&nbytes));
 
 	evec->n = v->spl_nright;
 	memcpy(evec->vector, entryvec->vector + FirstOffsetNumber + v->spl_nleft,
 		   sizeof(GISTENTRY) * evec->n);
-	v->spl_rdatum = FunctionCall2(&giststate->unionFn[attno],
-								  PointerGetDatum(evec),
-								  PointerGetDatum(&nbytes));
+	v->spl_rdatum = FunctionCall2Coll(&giststate->unionFn[attno],
+									  giststate->supportCollation[attno],
+									  PointerGetDatum(evec),
+									  PointerGetDatum(&nbytes));
 }
 
 /*
@@ -361,9 +363,10 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec
 	sv->spl_ldatum = v->spl_lattr[attno];
 	sv->spl_rdatum = v->spl_rattr[attno];
 
-	FunctionCall2(&giststate->picksplitFn[attno],
-				  PointerGetDatum(entryvec),
-				  PointerGetDatum(sv));
+	FunctionCall2Coll(&giststate->picksplitFn[attno],
+					  giststate->supportCollation[attno],
+					  PointerGetDatum(entryvec),
+					  PointerGetDatum(sv));
 
 	if (sv->spl_nleft == 0 || sv->spl_nright == 0)
 	{
@@ -500,58 +503,6 @@ gistSplitHalf(GIST_SPLITVEC *v, int len)
 }
 
 /*
- * if it was invalid tuple then we need special processing.
- * We move all invalid tuples on right page.
- *
- * if there is no place on left page, gistSplit will be called one more
- * time for left page.
- *
- * Normally, we never exec this code, but after crash replay it's possible
- * to get 'invalid' tuples (probability is low enough)
- */
-static void
-gistSplitByInvalid(GISTSTATE *giststate, GistSplitVector *v, IndexTuple *itup, int len)
-{
-	int			i;
-	static OffsetNumber offInvTuples[MaxOffsetNumber];
-	int			nOffInvTuples = 0;
-
-	for (i = 1; i <= len; i++)
-		if (GistTupleIsInvalid(itup[i - 1]))
-			offInvTuples[nOffInvTuples++] = i;
-
-	if (nOffInvTuples == len)
-	{
-		/* corner case, all tuples are invalid */
-		v->spl_rightvalid = v->spl_leftvalid = false;
-		gistSplitHalf(&v->splitVector, len);
-	}
-	else
-	{
-		GistSplitUnion gsvp;
-
-		v->splitVector.spl_right = offInvTuples;
-		v->splitVector.spl_nright = nOffInvTuples;
-		v->spl_rightvalid = false;
-
-		v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber));
-		v->splitVector.spl_nleft = 0;
-		for (i = 1; i <= len; i++)
-			if (!GistTupleIsInvalid(itup[i - 1]))
-				v->splitVector.spl_left[v->splitVector.spl_nleft++] = i;
-		v->spl_leftvalid = true;
-
-		gsvp.equiv = NULL;
-		gsvp.attr = v->spl_lattr;
-		gsvp.len = v->splitVector.spl_nleft;
-		gsvp.entries = v->splitVector.spl_left;
-		gsvp.isnull = v->spl_lisnull;
-
-		gistunionsubkeyvec(giststate, itup, &gsvp, 0);
-	}
-}
-
-/*
  * trys to split page by attno key, in a case of null
  * values move its to separate page.
  */
@@ -568,12 +519,6 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *gist
 		Datum		datum;
 		bool		IsNull;
 
-		if (!GistPageIsLeaf(page) && GistTupleIsInvalid(itup[i - 1]))
-		{
-			gistSplitByInvalid(giststate, v, itup, len);
-			return;
-		}
-
 		datum = index_getattr(itup[i - 1], attno + 1, giststate->tupdesc, &IsNull);
 		gistdentryinit(giststate, attno, &(entryvec->vector[i]),
 					   datum, r, page, i,
@@ -582,8 +527,6 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *gist
 			offNullTuples[nOffNullTuples++] = i;
 	}
 
-	v->spl_leftvalid = v->spl_rightvalid = true;
-
 	if (nOffNullTuples == len)
 	{
 		/*
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 03c5773d4d..1754a10369 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -4,15 +4,17 @@
  *	  utilities routines for the postgres GiST index access method.
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *			$PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.35 2010/01/02 16:57:34 momjian Exp $
+ *			src/backend/access/gist/gistutil.c
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
+#include <math.h>
+
 #include "access/gist_private.h"
 #include "access/reloptions.h"
 #include "storage/freespace.h"
@@ -152,7 +154,7 @@ gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
  * invalid tuple. Resulting Datums aren't compressed.
  */
 
-bool
+void
 gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startkey,
 				   Datum *attr, bool *isnull)
 {
@@ -180,10 +182,6 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startke
 			Datum		datum;
 			bool		IsNull;
 
-			if (GistTupleIsInvalid(itvec[j]))
-				return FALSE;	/* signals that union with invalid tuple =>
-								 * result is invalid */
-
 			datum = index_getattr(itvec[j], i + 1, giststate->tupdesc, &IsNull);
 			if (IsNull)
 				continue;
@@ -211,15 +209,14 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startke
 			}
 
 			/* Make union and store in attr array */
-			attr[i] = FunctionCall2(&giststate->unionFn[i],
-									PointerGetDatum(evec),
-									PointerGetDatum(&attrsize));
+			attr[i] = FunctionCall2Coll(&giststate->unionFn[i],
+										giststate->supportCollation[i],
+										PointerGetDatum(evec),
+										PointerGetDatum(&attrsize));
 
 			isnull[i] = FALSE;
 		}
 	}
-
-	return TRUE;
 }
 
 /*
@@ -231,8 +228,7 @@ gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
 {
 	memset(isnullS, TRUE, sizeof(bool) * giststate->tupdesc->natts);
 
-	if (!gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS))
-		return gist_form_invalid_tuple(InvalidBlockNumber);
+	gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS);
 
 	return gistFormTuple(giststate, r, attrS, isnullS, false);
 }
@@ -278,9 +274,10 @@ gistMakeUnionKey(GISTSTATE *giststate, int attno,
 		}
 
 		*dstisnull = FALSE;
-		*dst = FunctionCall2(&giststate->unionFn[attno],
-							 PointerGetDatum(evec),
-							 PointerGetDatum(&dstsize));
+		*dst = FunctionCall2Coll(&giststate->unionFn[attno],
+								 giststate->supportCollation[attno],
+								 PointerGetDatum(evec),
+								 PointerGetDatum(&dstsize));
 	}
 }
 
@@ -289,9 +286,10 @@ gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b)
 {
 	bool		result;
 
-	FunctionCall3(&giststate->equalFn[attno],
-				  a, b,
-				  PointerGetDatum(&result));
+	FunctionCall3Coll(&giststate->equalFn[attno],
+					  giststate->supportCollation[attno],
+					  a, b,
+					  PointerGetDatum(&result));
 	return result;
 }
 
@@ -328,9 +326,6 @@ gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *gis
 	IndexTuple	newtup = NULL;
 	int			i;
 
-	if (GistTupleIsInvalid(oldtup) || GistTupleIsInvalid(addtup))
-		return gist_form_invalid_tuple(ItemPointerGetBlockNumber(&(oldtup->t_tid)));
-
 	gistDeCompressAtt(giststate, r, oldtup, NULL,
 					  (OffsetNumber) 0, oldentries, oldisnull);
 
@@ -401,14 +396,6 @@ gistchoose(Relation r, Page p, IndexTuple it,	/* it has compressed entry */
 		int			j;
 		IndexTuple	itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));
 
-		if (!GistPageIsLeaf(p) && GistTupleIsInvalid(itup))
-		{
-			ereport(LOG,
-					(errmsg("index \"%s\" needs VACUUM or REINDEX to finish crash recovery",
-							RelationGetRelationName(r))));
-			continue;
-		}
-
 		sum_grow = 0;
 		for (j = 0; j < r->rd_att->natts; j++)
 		{
@@ -460,8 +447,9 @@ gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
 
 		gistentryinit(*e, k, r, pg, o, l);
 		dep = (GISTENTRY *)
-			DatumGetPointer(FunctionCall1(&giststate->decompressFn[nkey],
-										  PointerGetDatum(e)));
+			DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey],
+										   giststate->supportCollation[nkey],
+											  PointerGetDatum(e)));
 		/* decompressFn may just return the given pointer */
 		if (dep != e)
 			gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset,
@@ -486,8 +474,9 @@ gistcentryinit(GISTSTATE *giststate, int nkey,
 
 		gistentryinit(*e, k, r, pg, o, l);
 		cep = (GISTENTRY *)
-			DatumGetPointer(FunctionCall1(&giststate->compressFn[nkey],
-										  PointerGetDatum(e)));
+			DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[nkey],
+										   giststate->supportCollation[nkey],
+											  PointerGetDatum(e)));
 		/* compressFn may just return the given pointer */
 		if (cep != e)
 			gistentryinit(*e, cep->key, cep->rel, cep->page, cep->offset,
@@ -521,7 +510,12 @@ gistFormTuple(GISTSTATE *giststate, Relation r,
 	}
 
 	res = index_form_tuple(giststate->tupdesc, compatt, isnull);
-	GistTupleSetValid(res);
+
+	/*
+	 * The offset number on tuples on internal pages is unused. For historical
+	 * reasons, it is set to 0xffff.
+	 */
+	ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
 	return res;
 }
 
@@ -532,16 +526,23 @@ gistpenalty(GISTSTATE *giststate, int attno,
 {
 	float		penalty = 0.0;
 
-	if (giststate->penaltyFn[attno].fn_strict == FALSE || (isNullOrig == FALSE && isNullAdd == FALSE))
-		FunctionCall3(&giststate->penaltyFn[attno],
-					  PointerGetDatum(orig),
-					  PointerGetDatum(add),
-					  PointerGetDatum(&penalty));
+	if (giststate->penaltyFn[attno].fn_strict == FALSE ||
+		(isNullOrig == FALSE && isNullAdd == FALSE))
+	{
+		FunctionCall3Coll(&giststate->penaltyFn[attno],
+						  giststate->supportCollation[attno],
+						  PointerGetDatum(orig),
+						  PointerGetDatum(add),
+						  PointerGetDatum(&penalty));
+		/* disallow negative or NaN penalty */
+		if (isnan(penalty) || penalty < 0.0)
+			penalty = 0.0;
+	}
 	else if (isNullOrig && isNullAdd)
 		penalty = 0.0;
 	else
-		penalty = 1e10;			/* try to prevent to mix null and non-null
-								 * value */
+		penalty = 1e10;			/* try to prevent mixing null and non-null
+								 * values */
 
 	return penalty;
 }
@@ -677,3 +678,24 @@ gistoptions(PG_FUNCTION_ARGS)
 		PG_RETURN_BYTEA_P(result);
 	PG_RETURN_NULL();
 }
+
+/*
+ * Temporary GiST indexes are not WAL-logged, but we need LSNs to detect
+ * concurrent page splits anyway. GetXLogRecPtrForTemp() provides a fake
+ * sequence of LSNs for that purpose. Each call generates an LSN that is
+ * greater than any previous value returned by this function in the same
+ * session.
+ */
+XLogRecPtr
+GetXLogRecPtrForTemp(void)
+{
+	static XLogRecPtr fakeLSN = {0, 1};
+
+	if (++fakeLSN.xrecoff == 0)
+	{
+		/* offset wrapped around: carry into xlogid and skip offset zero */
+		fakeLSN.xlogid += 1;
+		fakeLSN.xrecoff = 1;
+	}
+	return fakeLSN;
+}
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index abd3d99956..33e6f34154 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -4,11 +4,11 @@
  *	  vacuuming routines for the postgres GiST index access method.
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.48 2010/02/08 05:17:31 tgl Exp $
+ *	  src/backend/access/gist/gistvacuum.c
  *
  *-------------------------------------------------------------------------
  */
@@ -26,13 +26,6 @@
 #include "utils/memutils.h"
 
 
-typedef struct GistBulkDeleteResult
-{
-	IndexBulkDeleteResult std;	/* common state */
-	bool		needReindex;
-} GistBulkDeleteResult;
-
-
 /*
  * VACUUM cleanup: update FSM
  */
@@ -40,13 +33,11 @@ Datum
 gistvacuumcleanup(PG_FUNCTION_ARGS)
 {
 	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
-	GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1);
+	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
 	Relation	rel = info->index;
 	BlockNumber npages,
 				blkno;
 	BlockNumber totFreePages;
-	BlockNumber lastBlock = GIST_ROOT_BLKNO,
-				lastFilledBlock = GIST_ROOT_BLKNO;
 	bool		needLock;
 
 	/* No-op in ANALYZE ONLY mode */
@@ -56,10 +47,10 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 	/* Set up all-zero stats if gistbulkdelete wasn't called */
 	if (stats == NULL)
 	{
-		stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 		/* use heap's tuple count */
-		stats->std.num_index_tuples = info->num_heap_tuples;
-		stats->std.estimated_count = info->estimated_count;
+		stats->num_index_tuples = info->num_heap_tuples;
+		stats->estimated_count = info->estimated_count;
 
 		/*
 		 * XXX the above is wrong if index is partial.	Would it be OK to just
@@ -67,11 +58,6 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 		 */
 	}
 
-	if (stats->needReindex)
-		ereport(NOTICE,
-				(errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery",
-						RelationGetRelationName(rel))));
-
 	/*
 	 * Need lock unless it's local to this backend.
 	 */
@@ -102,20 +88,17 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 			totFreePages++;
 			RecordFreeIndexPage(rel, blkno);
 		}
-		else
-			lastFilledBlock = blkno;
 		UnlockReleaseBuffer(buffer);
 	}
-	lastBlock = npages - 1;
 
 	/* Finally, vacuum the FSM */
 	IndexFreeSpaceMapVacuum(info->index);
 
 	/* return statistics */
-	stats->std.pages_free = totFreePages;
+	stats->pages_free = totFreePages;
 	if (needLock)
 		LockRelationForExtension(rel, ExclusiveLock);
-	stats->std.num_pages = RelationGetNumberOfBlocks(rel);
+	stats->num_pages = RelationGetNumberOfBlocks(rel);
 	if (needLock)
 		UnlockRelationForExtension(rel, ExclusiveLock);
 
@@ -135,7 +118,7 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 	GISTPageOpaque opaque = GistPageGetOpaque(page);
 
 	if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
-		XLByteLT(stack->parentlsn, opaque->nsn) &&
+		(GistFollowRight(page) || XLByteLT(stack->parentlsn, opaque->nsn)) &&
 		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
 	{
 		/* split page detected, install right link to the stack */
@@ -162,7 +145,7 @@ Datum
 gistbulkdelete(PG_FUNCTION_ARGS)
 {
 	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
-	GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1);
+	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
 	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
 	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
 	Relation	rel = info->index;
@@ -171,10 +154,10 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 
 	/* first time through? */
 	if (stats == NULL)
-		stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 	/* we'll re-count the tuples each time */
-	stats->std.estimated_count = false;
-	stats->std.num_index_tuples = 0;
+	stats->estimated_count = false;
+	stats->num_index_tuples = 0;
 
 	stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
 	stack->blkno = GIST_ROOT_BLKNO;
@@ -232,10 +215,10 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 				{
 					todelete[ntodelete] = i - ntodelete;
 					ntodelete++;
-					stats->std.tuples_removed += 1;
+					stats->tuples_removed += 1;
 				}
 				else
-					stats->std.num_index_tuples += 1;
+					stats->num_index_tuples += 1;
 			}
 
 			if (ntodelete)
@@ -248,27 +231,18 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 					PageIndexTupleDelete(page, todelete[i]);
 				GistMarkTuplesDeleted(page);
 
-				if (!rel->rd_istemp)
+				if (RelationNeedsWAL(rel))
 				{
-					XLogRecData *rdata;
 					XLogRecPtr	recptr;
-					gistxlogPageUpdate *xlinfo;
 
-					rdata = formUpdateRdata(rel->rd_node, buffer,
+					recptr = gistXLogUpdate(rel->rd_node, buffer,
 											todelete, ntodelete,
-											NULL, 0,
-											NULL);
-					xlinfo = (gistxlogPageUpdate *) rdata->next->data;
-
-					recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
+											NULL, 0, InvalidBuffer);
 					PageSetLSN(page, recptr);
 					PageSetTLI(page, ThisTimeLineID);
-
-					pfree(xlinfo);
-					pfree(rdata);
 				}
 				else
-					PageSetLSN(page, XLogRecPtrForTemp);
+					PageSetLSN(page, GetXLogRecPtrForTemp());
 
 				END_CRIT_SECTION();
 			}
@@ -293,7 +267,11 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 				stack->next = ptr;
 
 				if (GistTupleIsInvalid(idxtuple))
-					stats->needReindex = true;
+					ereport(LOG,
+							(errmsg("index \"%s\" contains an inner tuple marked as invalid",
+									RelationGetRelationName(rel)),
+							 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
+							 errhint("Please REINDEX it.")));
 			}
 		}
 
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 7f5dd990c8..02c4ec3a6f 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -4,11 +4,11 @@
  *	  WAL replay logic for GiST.
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *			 $PostgreSQL: pgsql/src/backend/access/gist/gistxlog.c,v 1.35 2010/01/02 16:57:34 momjian Exp $
+ *			 src/backend/access/gist/gistxlog.c
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -20,15 +20,6 @@
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
-
-typedef struct
-{
-	gistxlogPageUpdate *data;
-	int			len;
-	IndexTuple *itup;
-	OffsetNumber *todelete;
-} PageUpdateRecord;
-
 typedef struct
 {
 	gistxlogPage *header;
@@ -41,144 +32,37 @@ typedef struct
 	NewPage    *page;
 } PageSplitRecord;
 
-/* track for incomplete inserts, idea was taken from nbtxlog.c */
-
-typedef struct gistIncompleteInsert
-{
-	RelFileNode node;
-	BlockNumber origblkno;		/* for splits */
-	ItemPointerData key;
-	int			lenblk;
-	BlockNumber *blkno;
-	XLogRecPtr	lsn;
-	BlockNumber *path;
-	int			pathlen;
-} gistIncompleteInsert;
-
-
 static MemoryContext opCtx;		/* working memory for operations */
-static MemoryContext insertCtx; /* holds incomplete_inserts list */
-static List *incomplete_inserts;
-
-
-#define ItemPointerEQ(a, b) \
-	( ItemPointerGetOffsetNumber(a) == ItemPointerGetOffsetNumber(b) && \
-	  ItemPointerGetBlockNumber (a) == ItemPointerGetBlockNumber(b) )
-
 
+/*
+ * Replay the clearing of F_FOLLOW_RIGHT flag.
+ */
 static void
-pushIncompleteInsert(RelFileNode node, XLogRecPtr lsn, ItemPointerData key,
-					 BlockNumber *blkno, int lenblk,
-					 PageSplitRecord *xlinfo /* to extract blkno info */ )
+gistRedoClearFollowRight(RelFileNode node, XLogRecPtr lsn,
+						 BlockNumber leftblkno)
 {
-	MemoryContext oldCxt;
-	gistIncompleteInsert *ninsert;
+	Buffer		buffer;
 
-	if (!ItemPointerIsValid(&key))
+	buffer = XLogReadBuffer(node, leftblkno, false);
+	if (BufferIsValid(buffer))
+	{
+		Page		page = (Page) BufferGetPage(buffer);
 
 		/*
-		 * if key is null then we should not store insertion as incomplete,
-		 * because it's a vacuum operation..
+		 * Note that we still update the page even if page LSN is equal to the
+		 * LSN of this record, because the updated NSN is not included in the
+		 * full page image.
 		 */
-		return;
-
-	oldCxt = MemoryContextSwitchTo(insertCtx);
-	ninsert = (gistIncompleteInsert *) palloc(sizeof(gistIncompleteInsert));
-
-	ninsert->node = node;
-	ninsert->key = key;
-	ninsert->lsn = lsn;
-
-	if (lenblk && blkno)
-	{
-		ninsert->lenblk = lenblk;
-		ninsert->blkno = (BlockNumber *) palloc(sizeof(BlockNumber) * ninsert->lenblk);
-		memcpy(ninsert->blkno, blkno, sizeof(BlockNumber) * ninsert->lenblk);
-		ninsert->origblkno = *blkno;
-	}
-	else
-	{
-		int			i;
-
-		Assert(xlinfo);
-		ninsert->lenblk = xlinfo->data->npage;
-		ninsert->blkno = (BlockNumber *) palloc(sizeof(BlockNumber) * ninsert->lenblk);
-		for (i = 0; i < ninsert->lenblk; i++)
-			ninsert->blkno[i] = xlinfo->page[i].header->blkno;
-		ninsert->origblkno = xlinfo->data->origblkno;
-	}
-	Assert(ninsert->lenblk > 0);
-
-	/*
-	 * Stick the new incomplete insert onto the front of the list, not the
-	 * back.  This is so that gist_xlog_cleanup will process incompletions in
-	 * last-in-first-out order.
-	 */
-	incomplete_inserts = lcons(ninsert, incomplete_inserts);
-
-	MemoryContextSwitchTo(oldCxt);
-}
-
-static void
-forgetIncompleteInsert(RelFileNode node, ItemPointerData key)
-{
-	ListCell   *l;
-
-	if (!ItemPointerIsValid(&key))
-		return;
-
-	if (incomplete_inserts == NIL)
-		return;
-
-	foreach(l, incomplete_inserts)
-	{
-		gistIncompleteInsert *insert = (gistIncompleteInsert *) lfirst(l);
-
-		if (RelFileNodeEquals(node, insert->node) && ItemPointerEQ(&(insert->key), &(key)))
+		if (!XLByteLT(lsn, PageGetLSN(page)))
 		{
-			/* found */
-			incomplete_inserts = list_delete_ptr(incomplete_inserts, insert);
-			pfree(insert->blkno);
-			pfree(insert);
-			break;
-		}
-	}
-}
+			GistPageGetOpaque(page)->nsn = lsn;
+			GistClearFollowRight(page);
 
-static void
-decodePageUpdateRecord(PageUpdateRecord *decoded, XLogRecord *record)
-{
-	char	   *begin = XLogRecGetData(record),
-			   *ptr;
-	int			i = 0,
-				addpath = 0;
-
-	decoded->data = (gistxlogPageUpdate *) begin;
-
-	if (decoded->data->ntodelete)
-	{
-		decoded->todelete = (OffsetNumber *) (begin + sizeof(gistxlogPageUpdate) + addpath);
-		addpath = MAXALIGN(sizeof(OffsetNumber) * decoded->data->ntodelete);
-	}
-	else
-		decoded->todelete = NULL;
-
-	decoded->len = 0;
-	ptr = begin + sizeof(gistxlogPageUpdate) + addpath;
-	while (ptr - begin < record->xl_len)
-	{
-		decoded->len++;
-		ptr += IndexTupleSize((IndexTuple) ptr);
-	}
-
-	decoded->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * decoded->len);
-
-	ptr = begin + sizeof(gistxlogPageUpdate) + addpath;
-	while (ptr - begin < record->xl_len)
-	{
-		decoded->itup[i] = (IndexTuple) ptr;
-		ptr += IndexTupleSize(decoded->itup[i]);
-		i++;
+			PageSetLSN(page, lsn);
+			PageSetTLI(page, ThisTimeLineID);
+			MarkBufferDirty(buffer);
+		}
+		UnlockReleaseBuffer(buffer);
 	}
 }
 
@@ -186,29 +70,22 @@ decodePageUpdateRecord(PageUpdateRecord *decoded, XLogRecord *record)
  * redo any page update (except page split)
  */
 static void
-gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record, bool isnewroot)
+gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record)
 {
-	gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record);
-	PageUpdateRecord xlrec;
+	char	   *begin = XLogRecGetData(record);
+	gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin;
 	Buffer		buffer;
 	Page		page;
+	char	   *data;
 
-	/* we must fix incomplete_inserts list even if XLR_BKP_BLOCK_1 is set */
-	forgetIncompleteInsert(xldata->node, xldata->key);
+	if (BlockNumberIsValid(xldata->leftchild))
+		gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild);
 
-	if (!isnewroot && xldata->blkno != GIST_ROOT_BLKNO)
-		/* operation with root always finalizes insertion */
-		pushIncompleteInsert(xldata->node, lsn, xldata->key,
-							 &(xldata->blkno), 1,
-							 NULL);
-
-	/* nothing else to do if page was backed up (and no info to do it with) */
+	/* nothing more to do if page was backed up (and no info to do it with) */
 	if (record->xl_info & XLR_BKP_BLOCK_1)
 		return;
 
-	decodePageUpdateRecord(&xlrec, record);
-
-	buffer = XLogReadBuffer(xlrec.data->node, xlrec.data->blkno, false);
+	buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
 	if (!BufferIsValid(buffer))
 		return;
 	page = (Page) BufferGetPage(buffer);
@@ -219,28 +96,52 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record, bool isnewroot)
 		return;
 	}
 
-	if (isnewroot)
-		GISTInitBuffer(buffer, 0);
-	else if (xlrec.data->ntodelete)
+	data = begin + sizeof(gistxlogPageUpdate);
+
+	/* Delete old tuples */
+	if (xldata->ntodelete > 0)
 	{
 		int			i;
+		OffsetNumber *todelete = (OffsetNumber *) data;
+
+		data += sizeof(OffsetNumber) * xldata->ntodelete;
 
-		for (i = 0; i < xlrec.data->ntodelete; i++)
-			PageIndexTupleDelete(page, xlrec.todelete[i]);
+		for (i = 0; i < xldata->ntodelete; i++)
+			PageIndexTupleDelete(page, todelete[i]);
 		if (GistPageIsLeaf(page))
 			GistMarkTuplesDeleted(page);
 	}
 
 	/* add tuples */
-	if (xlrec.len > 0)
-		gistfillbuffer(page, xlrec.itup, xlrec.len, InvalidOffsetNumber);
+	if (data - begin < record->xl_len)
+	{
+		OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
+		OffsetNumberNext(PageGetMaxOffsetNumber(page));
 
-	/*
-	 * special case: leafpage, nothing to insert, nothing to delete, then
-	 * vacuum marks page
-	 */
-	if (GistPageIsLeaf(page) && xlrec.len == 0 && xlrec.data->ntodelete == 0)
-		GistClearTuplesDeleted(page);
+		while (data - begin < record->xl_len)
+		{
+			IndexTuple	itup = (IndexTuple) data;
+			Size		sz = IndexTupleSize(itup);
+			OffsetNumber l;
+
+			data += sz;
+
+			l = PageAddItem(page, (Item) itup, sz, off, false, false);
+			if (l == InvalidOffsetNumber)
+				elog(ERROR, "failed to add item to GiST index page, size %d bytes",
+					 (int) sz);
+			off++;
+		}
+	}
+	else
+	{
+		/*
+		 * Special case: a leaf page with nothing to insert and nothing to
+		 * delete. Vacuum writes such a record to clear the tuples-deleted flag.
+		 */
+		if (GistPageIsLeaf(page) && xldata->ntodelete == 0)
+			GistClearTuplesDeleted(page);
+	}
 
 	if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO)
 
@@ -315,41 +216,67 @@ decodePageSplitRecord(PageSplitRecord *decoded, XLogRecord *record)
 static void
 gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record)
 {
+	gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record);
 	PageSplitRecord xlrec;
 	Buffer		buffer;
 	Page		page;
 	int			i;
-	int			flags;
+	bool		isrootsplit = false;
 
+	if (BlockNumberIsValid(xldata->leftchild))
+		gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild);
 	decodePageSplitRecord(&xlrec, record);
-	flags = xlrec.data->origleaf ? F_LEAF : 0;
 
 	/* loop around all pages */
 	for (i = 0; i < xlrec.data->npage; i++)
 	{
 		NewPage    *newpage = xlrec.page + i;
+		int			flags;
+
+		if (newpage->header->blkno == GIST_ROOT_BLKNO)
+		{
+			Assert(i == 0);
+			isrootsplit = true;
+		}
 
 		buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true);
 		Assert(BufferIsValid(buffer));
 		page = (Page) BufferGetPage(buffer);
 
 		/* ok, clear buffer */
+		if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO)
+			flags = F_LEAF;
+		else
+			flags = 0;
 		GISTInitBuffer(buffer, flags);
 
 		/* and fill it */
 		gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber);
 
+		if (newpage->header->blkno == GIST_ROOT_BLKNO)
+		{
+			GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
+			GistPageGetOpaque(page)->nsn = xldata->orignsn;
+			GistClearFollowRight(page);
+		}
+		else
+		{
+			if (i < xlrec.data->npage - 1)
+				GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno;
+			else
+				GistPageGetOpaque(page)->rightlink = xldata->origrlink;
+			GistPageGetOpaque(page)->nsn = xldata->orignsn;
+			if (i < xlrec.data->npage - 1 && !isrootsplit)
+				GistMarkFollowRight(page);
+			else
+				GistClearFollowRight(page);
+		}
+
 		PageSetLSN(page, lsn);
 		PageSetTLI(page, ThisTimeLineID);
 		MarkBufferDirty(buffer);
 		UnlockReleaseBuffer(buffer);
 	}
-
-	forgetIncompleteInsert(xlrec.data->node, xlrec.data->key);
-
-	pushIncompleteInsert(xlrec.data->node, lsn, xlrec.data->key,
-						 NULL, 0,
-						 &xlrec);
 }
 
 static void
@@ -372,24 +299,6 @@ gistRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record)
 	UnlockReleaseBuffer(buffer);
 }
 
-static void
-gistRedoCompleteInsert(XLogRecPtr lsn, XLogRecord *record)
-{
-	char	   *begin = XLogRecGetData(record),
-			   *ptr;
-	gistxlogInsertComplete *xlrec;
-
-	xlrec = (gistxlogInsertComplete *) begin;
-
-	ptr = begin + sizeof(gistxlogInsertComplete);
-	while (ptr - begin < record->xl_len)
-	{
-		Assert(record->xl_len - (ptr - begin) >= sizeof(ItemPointerData));
-		forgetIncompleteInsert(xlrec->node, *((ItemPointerData *) ptr));
-		ptr += sizeof(ItemPointerData);
-	}
-}
-
 void
 gist_redo(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -397,34 +306,27 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record)
 	MemoryContext oldCxt;
 
 	/*
-	 * GIST indexes do not require any conflict processing. NB: If we ever
+	 * GiST indexes do not require any conflict processing. NB: If we ever
 	 * implement a similar optimization we have in b-tree, and remove killed
 	 * tuples outside VACUUM, we'll need to handle that here.
 	 */
-
 	RestoreBkpBlocks(lsn, record, false);
 
 	oldCxt = MemoryContextSwitchTo(opCtx);
 	switch (info)
 	{
 		case XLOG_GIST_PAGE_UPDATE:
-			gistRedoPageUpdateRecord(lsn, record, false);
+			gistRedoPageUpdateRecord(lsn, record);
 			break;
 		case XLOG_GIST_PAGE_DELETE:
 			gistRedoPageDeleteRecord(lsn, record);
 			break;
-		case XLOG_GIST_NEW_ROOT:
-			gistRedoPageUpdateRecord(lsn, record, true);
-			break;
 		case XLOG_GIST_PAGE_SPLIT:
 			gistRedoPageSplitRecord(lsn, record);
 			break;
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(lsn, record);
 			break;
-		case XLOG_GIST_INSERT_COMPLETE:
-			gistRedoCompleteInsert(lsn, record);
-			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -434,20 +336,16 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record)
 }
 
 static void
-out_target(StringInfo buf, RelFileNode node, ItemPointerData key)
+out_target(StringInfo buf, RelFileNode node)
 {
 	appendStringInfo(buf, "rel %u/%u/%u",
 					 node.spcNode, node.dbNode, node.relNode);
-	if (ItemPointerIsValid(&key))
-		appendStringInfo(buf, "; tid %u/%u",
-						 ItemPointerGetBlockNumber(&key),
-						 ItemPointerGetOffsetNumber(&key));
 }
 
 static void
 out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
 {
-	out_target(buf, xlrec->node, xlrec->key);
+	out_target(buf, xlrec->node);
 	appendStringInfo(buf, "; block number %u", xlrec->blkno);
 }
 
@@ -463,7 +361,7 @@ static void
 out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
 {
 	appendStringInfo(buf, "page_split: ");
-	out_target(buf, xlrec->node, xlrec->key);
+	out_target(buf, xlrec->node);
 	appendStringInfo(buf, "; block number %u splits to %d pages",
 					 xlrec->origblkno, xlrec->npage);
 }
@@ -482,10 +380,6 @@ gist_desc(StringInfo buf, uint8 xl_info, char *rec)
 		case XLOG_GIST_PAGE_DELETE:
 			out_gistxlogPageDelete(buf, (gistxlogPageDelete *) rec);
 			break;
-		case XLOG_GIST_NEW_ROOT:
-			appendStringInfo(buf, "new_root: ");
-			out_target(buf, ((gistxlogPageUpdate *) rec)->node, ((gistxlogPageUpdate *) rec)->key);
-			break;
 		case XLOG_GIST_PAGE_SPLIT:
 			out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec);
 			break;
@@ -495,415 +389,102 @@ gist_desc(StringInfo buf, uint8 xl_info, char *rec)
 							 ((RelFileNode *) rec)->dbNode,
 							 ((RelFileNode *) rec)->relNode);
 			break;
-		case XLOG_GIST_INSERT_COMPLETE:
-			appendStringInfo(buf, "complete_insert: rel %u/%u/%u",
-							 ((gistxlogInsertComplete *) rec)->node.spcNode,
-							 ((gistxlogInsertComplete *) rec)->node.dbNode,
-							 ((gistxlogInsertComplete *) rec)->node.relNode);
-			break;
 		default:
 			appendStringInfo(buf, "unknown gist op code %u", info);
 			break;
 	}
 }
 
-IndexTuple
-gist_form_invalid_tuple(BlockNumber blkno)
-{
-	/*
-	 * we don't alloc space for null's bitmap, this is invalid tuple, be
-	 * carefull in read and write code
-	 */
-	Size		size = IndexInfoFindDataOffset(0);
-	IndexTuple	tuple = (IndexTuple) palloc0(size);
-
-	tuple->t_info |= size;
-
-	ItemPointerSetBlockNumber(&(tuple->t_tid), blkno);
-	GistTupleSetInvalid(tuple);
-
-	return tuple;
-}
-
-
-static void
-gistxlogFindPath(Relation index, gistIncompleteInsert *insert)
-{
-	GISTInsertStack *top;
-
-	insert->pathlen = 0;
-	insert->path = NULL;
-
-	if ((top = gistFindPath(index, insert->origblkno)) != NULL)
-	{
-		int			i;
-		GISTInsertStack *ptr;
-
-		for (ptr = top; ptr; ptr = ptr->parent)
-			insert->pathlen++;
-
-		insert->path = (BlockNumber *) palloc(sizeof(BlockNumber) * insert->pathlen);
-
-		i = 0;
-		for (ptr = top; ptr; ptr = ptr->parent)
-			insert->path[i++] = ptr->blkno;
-	}
-	else
-		elog(ERROR, "lost parent for block %u", insert->origblkno);
-}
-
-static SplitedPageLayout *
-gistMakePageLayout(Buffer *buffers, int nbuffers)
-{
-	SplitedPageLayout *res = NULL,
-			   *resptr;
-
-	while (nbuffers-- > 0)
-	{
-		Page		page = BufferGetPage(buffers[nbuffers]);
-		IndexTuple *vec;
-		int			veclen;
-
-		resptr = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout));
-
-		resptr->block.blkno = BufferGetBlockNumber(buffers[nbuffers]);
-		resptr->block.num = PageGetMaxOffsetNumber(page);
-
-		vec = gistextractpage(page, &veclen);
-		resptr->list = gistfillitupvec(vec, veclen, &(resptr->lenlist));
-
-		resptr->next = res;
-		res = resptr;
-	}
-
-	return res;
-}
-
-/*
- * Continue insert after crash.  In normal situations, there aren't any
- * incomplete inserts, but if a crash occurs partway through an insertion
- * sequence, we'll need to finish making the index valid at the end of WAL
- * replay.
- *
- * Note that we assume the index is now in a valid state, except for the
- * unfinished insertion.  In particular it's safe to invoke gistFindPath();
- * there shouldn't be any garbage pages for it to run into.
- *
- * To complete insert we can't use basic insertion algorithm because
- * during insertion we can't call user-defined support functions of opclass.
- * So, we insert 'invalid' tuples without real key and do it by separate algorithm.
- * 'invalid' tuple should be updated by vacuum full.
- */
-static void
-gistContinueInsert(gistIncompleteInsert *insert)
-{
-	IndexTuple *itup;
-	int			i,
-				lenitup;
-	Relation	index;
-
-	index = CreateFakeRelcacheEntry(insert->node);
-
-	/*
-	 * needed vector itup never will be more than initial lenblkno+2, because
-	 * during this processing Indextuple can be only smaller
-	 */
-	lenitup = insert->lenblk;
-	itup = (IndexTuple *) palloc(sizeof(IndexTuple) * (lenitup + 2 /* guarantee root split */ ));
-
-	for (i = 0; i < insert->lenblk; i++)
-		itup[i] = gist_form_invalid_tuple(insert->blkno[i]);
-
-	/*
-	 * any insertion of itup[] should make LOG message about
-	 */
-
-	if (insert->origblkno == GIST_ROOT_BLKNO)
-	{
-		/*
-		 * it was split root, so we should only make new root. it can't be
-		 * simple insert into root, we should replace all content of root.
-		 */
-		Buffer		buffer = XLogReadBuffer(insert->node, GIST_ROOT_BLKNO, true);
-
-		gistnewroot(index, buffer, itup, lenitup, NULL);
-		UnlockReleaseBuffer(buffer);
-	}
-	else
-	{
-		Buffer	   *buffers;
-		Page	   *pages;
-		int			numbuffer;
-		OffsetNumber *todelete;
-
-		/* construct path */
-		gistxlogFindPath(index, insert);
-
-		Assert(insert->pathlen > 0);
-
-		buffers = (Buffer *) palloc(sizeof(Buffer) * (insert->lenblk + 2 /* guarantee root split */ ));
-		pages = (Page *) palloc(sizeof(Page) * (insert->lenblk + 2 /* guarantee root split */ ));
-		todelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (insert->lenblk + 2 /* guarantee root split */ ));
-
-		for (i = 0; i < insert->pathlen; i++)
-		{
-			int			j,
-						k,
-						pituplen = 0;
-			uint8		xlinfo;
-			XLogRecData *rdata;
-			XLogRecPtr	recptr;
-			Buffer		tempbuffer = InvalidBuffer;
-			int			ntodelete = 0;
-
-			numbuffer = 1;
-			buffers[0] = ReadBuffer(index, insert->path[i]);
-			LockBuffer(buffers[0], GIST_EXCLUSIVE);
-
-			/*
-			 * we check buffer, because we restored page earlier
-			 */
-			gistcheckpage(index, buffers[0]);
-
-			pages[0] = BufferGetPage(buffers[0]);
-			Assert(!GistPageIsLeaf(pages[0]));
-
-			pituplen = PageGetMaxOffsetNumber(pages[0]);
-
-			/* find remove old IndexTuples to remove */
-			for (j = 0; j < pituplen && ntodelete < lenitup; j++)
-			{
-				BlockNumber blkno;
-				ItemId		iid = PageGetItemId(pages[0], j + FirstOffsetNumber);
-				IndexTuple	idxtup = (IndexTuple) PageGetItem(pages[0], iid);
-
-				blkno = ItemPointerGetBlockNumber(&(idxtup->t_tid));
-
-				for (k = 0; k < lenitup; k++)
-					if (ItemPointerGetBlockNumber(&(itup[k]->t_tid)) == blkno)
-					{
-						todelete[ntodelete] = j + FirstOffsetNumber - ntodelete;
-						ntodelete++;
-						break;
-					}
-			}
-
-			if (ntodelete == 0)
-				elog(PANIC, "gistContinueInsert: cannot find pointer to page(s)");
-
-			/*
-			 * we check space with subtraction only first tuple to delete,
-			 * hope, that wiil be enough space....
-			 */
-
-			if (gistnospace(pages[0], itup, lenitup, *todelete, 0))
-			{
-
-				/* no space left on page, so we must split */
-				buffers[numbuffer] = ReadBuffer(index, P_NEW);
-				LockBuffer(buffers[numbuffer], GIST_EXCLUSIVE);
-				GISTInitBuffer(buffers[numbuffer], 0);
-				pages[numbuffer] = BufferGetPage(buffers[numbuffer]);
-				gistfillbuffer(pages[numbuffer], itup, lenitup, FirstOffsetNumber);
-				numbuffer++;
-
-				if (BufferGetBlockNumber(buffers[0]) == GIST_ROOT_BLKNO)
-				{
-					Buffer		tmp;
-
-					/*
-					 * we split root, just copy content from root to new page
-					 */
-
-					/* sanity check */
-					if (i + 1 != insert->pathlen)
-						elog(PANIC, "unexpected pathlen in index \"%s\"",
-							 RelationGetRelationName(index));
-
-					/* fill new page, root will be changed later */
-					tempbuffer = ReadBuffer(index, P_NEW);
-					LockBuffer(tempbuffer, GIST_EXCLUSIVE);
-					memcpy(BufferGetPage(tempbuffer), pages[0], BufferGetPageSize(tempbuffer));
-
-					/* swap buffers[0] (was root) and temp buffer */
-					tmp = buffers[0];
-					buffers[0] = tempbuffer;
-					tempbuffer = tmp;	/* now in tempbuffer GIST_ROOT_BLKNO,
-										 * it is still unchanged */
-
-					pages[0] = BufferGetPage(buffers[0]);
-				}
-
-				START_CRIT_SECTION();
-
-				for (j = 0; j < ntodelete; j++)
-					PageIndexTupleDelete(pages[0], todelete[j]);
-
-				xlinfo = XLOG_GIST_PAGE_SPLIT;
-				rdata = formSplitRdata(index->rd_node, insert->path[i],
-									   false, &(insert->key),
-									 gistMakePageLayout(buffers, numbuffer));
-
-			}
-			else
-			{
-				START_CRIT_SECTION();
-
-				for (j = 0; j < ntodelete; j++)
-					PageIndexTupleDelete(pages[0], todelete[j]);
-				gistfillbuffer(pages[0], itup, lenitup, InvalidOffsetNumber);
-
-				xlinfo = XLOG_GIST_PAGE_UPDATE;
-				rdata = formUpdateRdata(index->rd_node, buffers[0],
-										todelete, ntodelete,
-										itup, lenitup, &(insert->key));
-			}
-
-			/*
-			 * use insert->key as mark for completion of insert (form*Rdata()
-			 * above) for following possible replays
-			 */
-
-			/* write pages, we should mark it dirty befor XLogInsert() */
-			for (j = 0; j < numbuffer; j++)
-			{
-				GistPageGetOpaque(pages[j])->rightlink = InvalidBlockNumber;
-				MarkBufferDirty(buffers[j]);
-			}
-			recptr = XLogInsert(RM_GIST_ID, xlinfo, rdata);
-			for (j = 0; j < numbuffer; j++)
-			{
-				PageSetLSN(pages[j], recptr);
-				PageSetTLI(pages[j], ThisTimeLineID);
-			}
-
-			END_CRIT_SECTION();
-
-			lenitup = numbuffer;
-			for (j = 0; j < numbuffer; j++)
-			{
-				itup[j] = gist_form_invalid_tuple(BufferGetBlockNumber(buffers[j]));
-				UnlockReleaseBuffer(buffers[j]);
-			}
-
-			if (tempbuffer != InvalidBuffer)
-			{
-				/*
-				 * it was a root split, so fill it by new values
-				 */
-				gistnewroot(index, tempbuffer, itup, lenitup, &(insert->key));
-				UnlockReleaseBuffer(tempbuffer);
-			}
-		}
-	}
-
-	FreeFakeRelcacheEntry(index);
-
-	ereport(LOG,
-			(errmsg("index %u/%u/%u needs VACUUM FULL or REINDEX to finish crash recovery",
-			insert->node.spcNode, insert->node.dbNode, insert->node.relNode),
-		   errdetail("Incomplete insertion detected during crash replay.")));
-}
-
 void
 gist_xlog_startup(void)
 {
-	incomplete_inserts = NIL;
-	insertCtx = AllocSetContextCreate(CurrentMemoryContext,
-									  "GiST recovery temporary context",
-									  ALLOCSET_DEFAULT_MINSIZE,
-									  ALLOCSET_DEFAULT_INITSIZE,
-									  ALLOCSET_DEFAULT_MAXSIZE);
 	opCtx = createTempGistContext();
 }
 
 void
 gist_xlog_cleanup(void)
 {
-	ListCell   *l;
-	MemoryContext oldCxt;
-
-	oldCxt = MemoryContextSwitchTo(opCtx);
-
-	foreach(l, incomplete_inserts)
-	{
-		gistIncompleteInsert *insert = (gistIncompleteInsert *) lfirst(l);
-
-		gistContinueInsert(insert);
-		MemoryContextReset(opCtx);
-	}
-	MemoryContextSwitchTo(oldCxt);
-
 	MemoryContextDelete(opCtx);
-	MemoryContextDelete(insertCtx);
-}
-
-bool
-gist_safe_restartpoint(void)
-{
-	if (incomplete_inserts)
-		return false;
-	return true;
 }
 
-
-XLogRecData *
-formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
-			   ItemPointer key, SplitedPageLayout *dist)
+/*
+ * Write WAL record of a page split.
+ */
+XLogRecPtr
+gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
+			  SplitedPageLayout *dist,
+			  BlockNumber origrlink, GistNSN orignsn,
+			  Buffer leftchildbuf)
 {
 	XLogRecData *rdata;
-	gistxlogPageSplit *xlrec = (gistxlogPageSplit *) palloc(sizeof(gistxlogPageSplit));
+	gistxlogPageSplit xlrec;
 	SplitedPageLayout *ptr;
 	int			npage = 0,
-				cur = 1;
+				cur;
+	XLogRecPtr	recptr;
 
-	ptr = dist;
-	while (ptr)
-	{
+	for (ptr = dist; ptr; ptr = ptr->next)
 		npage++;
-		ptr = ptr->next;
-	}
 
 	rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (npage * 2 + 2));
 
-	xlrec->node = node;
-	xlrec->origblkno = blkno;
-	xlrec->origleaf = page_is_leaf;
-	xlrec->npage = (uint16) npage;
-	if (key)
-		xlrec->key = *key;
-	else
-		ItemPointerSetInvalid(&(xlrec->key));
+	xlrec.node = node;
+	xlrec.origblkno = blkno;
+	xlrec.origrlink = origrlink;
+	xlrec.orignsn = orignsn;
+	xlrec.origleaf = page_is_leaf;
+	xlrec.npage = (uint16) npage;
+	xlrec.leftchild =
+		BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
 
-	rdata[0].buffer = InvalidBuffer;
-	rdata[0].data = (char *) xlrec;
+	rdata[0].data = (char *) &xlrec;
 	rdata[0].len = sizeof(gistxlogPageSplit);
-	rdata[0].next = NULL;
+	rdata[0].buffer = InvalidBuffer;
+
+	cur = 1;
 
-	ptr = dist;
-	while (ptr)
+	/*
+	 * Include a full page image of the child buffer. (This is only necessary
+	 * if a checkpoint happened since the child page was split.)
+	 */
+	if (BufferIsValid(leftchildbuf))
 	{
+		rdata[cur - 1].next = &(rdata[cur]);
+		rdata[cur].data = NULL;
+		rdata[cur].len = 0;
+		rdata[cur].buffer = leftchildbuf;
+		rdata[cur].buffer_std = true;
+		cur++;
+	}
+
+	for (ptr = dist; ptr; ptr = ptr->next)
+	{
+		rdata[cur - 1].next = &(rdata[cur]);
 		rdata[cur].buffer = InvalidBuffer;
 		rdata[cur].data = (char *) &(ptr->block);
 		rdata[cur].len = sizeof(gistxlogPage);
-		rdata[cur - 1].next = &(rdata[cur]);
 		cur++;
 
+		rdata[cur - 1].next = &(rdata[cur]);
 		rdata[cur].buffer = InvalidBuffer;
 		rdata[cur].data = (char *) (ptr->list);
 		rdata[cur].len = ptr->lenlist;
-		rdata[cur - 1].next = &(rdata[cur]);
-		rdata[cur].next = NULL;
 		cur++;
-		ptr = ptr->next;
 	}
+	rdata[cur - 1].next = NULL;
+
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata);
 
-	return rdata;
+	pfree(rdata);
+	return recptr;
 }
 
 /*
- * Construct the rdata array for an XLOG record describing a page update
- * (deletion and/or insertion of tuples on a single index page).
+ * Write XLOG record describing a page update. The update can include any
+ * number of deletions and/or insertions of tuples on a single index page.
+ *
+ * If this update inserts a downlink for a split page, also record that
+ * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set.
  *
  * Note that both the todelete array and the tuples are marked as belonging
  * to the target buffer; they need not be stored in XLOG if XLogInsert decides
@@ -911,27 +492,26 @@ formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
  * at least one rdata item referencing the buffer, even when ntodelete and
  * ituplen are both zero; this ensures that XLogInsert knows about the buffer.
  */
-XLogRecData *
-formUpdateRdata(RelFileNode node, Buffer buffer,
-				OffsetNumber *todelete, int ntodelete,
-				IndexTuple *itup, int ituplen, ItemPointer key)
+XLogRecPtr
+gistXLogUpdate(RelFileNode node, Buffer buffer,
+			   OffsetNumber *todelete, int ntodelete,
+			   IndexTuple *itup, int ituplen,
+			   Buffer leftchildbuf)
 {
 	XLogRecData *rdata;
 	gistxlogPageUpdate *xlrec;
 	int			cur,
 				i;
+	XLogRecPtr	recptr;
 
-	rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (3 + ituplen));
+	rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (4 + ituplen));
 	xlrec = (gistxlogPageUpdate *) palloc(sizeof(gistxlogPageUpdate));
 
 	xlrec->node = node;
 	xlrec->blkno = BufferGetBlockNumber(buffer);
 	xlrec->ntodelete = ntodelete;
-
-	if (key)
-		xlrec->key = *key;
-	else
-		ItemPointerSetInvalid(&(xlrec->key));
+	xlrec->leftchild =
+		BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
 
 	rdata[0].buffer = buffer;
 	rdata[0].buffer_std = true;
@@ -945,13 +525,13 @@ formUpdateRdata(RelFileNode node, Buffer buffer,
 	rdata[1].next = &(rdata[2]);
 
 	rdata[2].data = (char *) todelete;
-	rdata[2].len = MAXALIGN(sizeof(OffsetNumber) * ntodelete);
+	rdata[2].len = sizeof(OffsetNumber) * ntodelete;
 	rdata[2].buffer = buffer;
 	rdata[2].buffer_std = true;
-	rdata[2].next = NULL;
 
-	/* new tuples */
 	cur = 3;
+
+	/* new tuples */
 	for (i = 0; i < ituplen; i++)
 	{
 		rdata[cur - 1].next = &(rdata[cur]);
@@ -959,38 +539,26 @@ formUpdateRdata(RelFileNode node, Buffer buffer,
 		rdata[cur].len = IndexTupleSize(itup[i]);
 		rdata[cur].buffer = buffer;
 		rdata[cur].buffer_std = true;
-		rdata[cur].next = NULL;
 		cur++;
 	}
 
-	return rdata;
-}
-
-XLogRecPtr
-gistxlogInsertCompletion(RelFileNode node, ItemPointerData *keys, int len)
-{
-	gistxlogInsertComplete xlrec;
-	XLogRecData rdata[2];
-	XLogRecPtr	recptr;
-
-	Assert(len > 0);
-	xlrec.node = node;
-
-	rdata[0].buffer = InvalidBuffer;
-	rdata[0].data = (char *) &xlrec;
-	rdata[0].len = sizeof(gistxlogInsertComplete);
-	rdata[0].next = &(rdata[1]);
-
-	rdata[1].buffer = InvalidBuffer;
-	rdata[1].data = (char *) keys;
-	rdata[1].len = sizeof(ItemPointerData) * len;
-	rdata[1].next = NULL;
-
-	START_CRIT_SECTION();
-
-	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_INSERT_COMPLETE, rdata);
+	/*
+	 * Include a full page image of the child buffer. (This is only necessary
+	 * if a checkpoint happened since the child page was split.)
+	 */
+	if (BufferIsValid(leftchildbuf))
+	{
+		rdata[cur - 1].next = &(rdata[cur]);
+		rdata[cur].data = NULL;
+		rdata[cur].len = 0;
+		rdata[cur].buffer = leftchildbuf;
+		rdata[cur].buffer_std = true;
+		cur++;
+	}
+	rdata[cur - 1].next = NULL;
 
-	END_CRIT_SECTION();
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
 
+	pfree(rdata);
 	return recptr;
 }