author      Tom Lane    2001-07-15 22:48:19 +0000
committer   Tom Lane    2001-07-15 22:48:19 +0000
commit      c8076f09d2eb82a28f27f97230be470fffe7a1e0 (patch)
tree        1e357e7e28313386f9d2e789d3905b37ce2d58f6 /src/backend/access
parent      997439f59e1d487cb2bfa1384f6479fda0c4dd4c (diff)
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in different AMs' ambuild routines has been moved out to a common routine in index.c; this means that all index types now do the right things about inserting recently-dead tuples, etc. (I also removed support for EXTEND INDEX in the ambuild routines, since that's about to go away anyway, and it cluttered the code a lot.) The retail indextuple deletion routines have been replaced by a "bulk delete" routine in which the indexscan is inside the access method. I haven't pushed this change as far as it should go yet, but it should allow considerable simplification of the internal bookkeeping for deletions. Also, add flag columns to pg_am to eliminate various hardcoded tests on AM OIDs, and remove unused pg_am columns.

Fix rtree and gist index types to not attempt to store NULLs; before this, gist usually crashed, while rtree managed not to crash but computed wacko bounding boxes for NULL entries (which might have had something to do with the performance problems we've heard about occasionally).

Add AtEOXact routines to hash, rtree, and gist, all of which have static state that needs to be reset after an error. We discovered this need long ago for btree, but missed the other guys.

Oh, one more thing: concurrent VACUUM is now the default.
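For orientation before the hunks, here is a minimal sketch of the bulk-delete interface this commit introduces. The struct fields and the index_bulk_delete() signature are taken from the indexam.c, gist.c, and hash.c hunks below; the exact header wording and typedef placement are assumptions, since the headers lie outside this diff's scope.

/* Callback: is the heap tuple identified by this TID being deleted?
 * callback_state is passed through untouched from the caller. */
typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);

/* Statistics handed back to VACUUM by an ambulkdelete routine */
typedef struct IndexBulkDeleteResult
{
    BlockNumber num_pages;          /* pages remaining in the index */
    double      tuples_removed;     /* # index tuples removed by this call */
    double      num_index_tuples;   /* # index tuples remaining */
} IndexBulkDeleteResult;

/* Generic dispatcher in indexam.c; invokes the AM's ambulkdelete proc */
extern IndexBulkDeleteResult *index_bulk_delete(Relation relation,
                                                IndexBulkDeleteCallback callback,
                                                void *callback_state);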
Diffstat (limited to 'src/backend/access')
-rw-r--r--  src/backend/access/common/indexvalid.c    37
-rw-r--r--  src/backend/access/gist/gist.c            408
-rw-r--r--  src/backend/access/gist/gistscan.c         30
-rw-r--r--  src/backend/access/hash/hash.c            317
-rw-r--r--  src/backend/access/hash/hashovfl.c         46
-rw-r--r--  src/backend/access/hash/hashpage.c         49
-rw-r--r--  src/backend/access/hash/hashscan.c         27
-rw-r--r--  src/backend/access/index/indexam.c         60
-rw-r--r--  src/backend/access/nbtree/Makefile          4
-rw-r--r--  src/backend/access/nbtree/README           16
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c      63
-rw-r--r--  src/backend/access/nbtree/nbtpage.c        41
-rw-r--r--  src/backend/access/nbtree/nbtree.c        504
-rw-r--r--  src/backend/access/nbtree/nbtscan.c       224
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c      26
-rw-r--r--  src/backend/access/rtree/rtree.c          343
-rw-r--r--  src/backend/access/rtree/rtscan.c          30
-rw-r--r--  src/backend/access/transam/xact.c          11
18 files changed, 1036 insertions, 1200 deletions
diff --git a/src/backend/access/common/indexvalid.c b/src/backend/access/common/indexvalid.c
index 6a7c08b450..94e7efd522 100644
--- a/src/backend/access/common/indexvalid.c
+++ b/src/backend/access/common/indexvalid.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/indexvalid.c,v 1.26 2001/01/24 19:42:47 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/indexvalid.c,v 1.27 2001/07/15 22:48:15 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -24,12 +24,9 @@
*/
int NIndexTupleProcessed;
+
/* ----------------
- * index_keytest
- *
- * old comments
- * May eventually combine with other tests (like timeranges)?
- * Should have Buffer buffer; as an argument and pass it to amgetattr.
+ * index_keytest - does this index tuple satisfy the scan key(s)?
* ----------------
*/
bool
@@ -38,16 +35,16 @@ index_keytest(IndexTuple tuple,
int scanKeySize,
ScanKey key)
{
- bool isNull;
- Datum datum;
- Datum test;
-
IncrIndexProcessed();
while (scanKeySize > 0)
{
+ Datum datum;
+ bool isNull;
+ Datum test;
+
datum = index_getattr(tuple,
- key[0].sk_attno,
+ key->sk_attno,
tupdesc,
&isNull);
@@ -57,25 +54,19 @@ index_keytest(IndexTuple tuple,
return false;
}
- if (key[0].sk_flags & SK_ISNULL)
+ if (key->sk_flags & SK_ISNULL)
return false;
- if (key[0].sk_flags & SK_COMMUTE)
- {
- test = FunctionCall2(&key[0].sk_func,
- key[0].sk_argument, datum);
- }
+ if (key->sk_flags & SK_COMMUTE)
+ test = FunctionCall2(&key->sk_func, key->sk_argument, datum);
else
- {
- test = FunctionCall2(&key[0].sk_func,
- datum, key[0].sk_argument);
- }
+ test = FunctionCall2(&key->sk_func, datum, key->sk_argument);
- if (DatumGetBool(test) == !!(key[0].sk_flags & SK_NEGATE))
+ if (DatumGetBool(test) == !!(key->sk_flags & SK_NEGATE))
return false;
- scanKeySize -= 1;
key++;
+ scanKeySize--;
}
return true;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 9d6e2040f6..c99c4a7e6e 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.79 2001/06/11 05:00:56 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.80 2001/07/15 22:48:15 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -43,7 +43,23 @@
#define RIGHT_ADDED 0x02
#define BOTH_ADDED ( LEFT_ADDED | RIGHT_ADDED )
+
+/* Working state for gistbuild and its callback */
+typedef struct
+{
+ GISTSTATE giststate;
+ int numindexattrs;
+ double indtuples;
+} GISTBuildState;
+
+
/* non-export function prototypes */
+static void gistbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state);
static void gistdoinsert(Relation r,
IndexTuple itup,
InsertIndexResult *res,
@@ -89,6 +105,7 @@ static void GISTInitBuffer(Buffer b, uint32 f);
static OffsetNumber gistchoose(Relation r, Page p,
IndexTuple it,
GISTSTATE *giststate);
+static void gistdelete(Relation r, ItemPointer tid);
#ifdef GIST_PAGEADDITEM
static IndexTuple gist_tuple_replacekey(Relation r,
GISTENTRY entry, IndexTuple t);
@@ -116,184 +133,36 @@ gistbuild(PG_FUNCTION_ARGS)
Relation heap = (Relation) PG_GETARG_POINTER(0);
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
- Node *oldPred = (Node *) PG_GETARG_POINTER(3);
-
-#ifdef NOT_USED
- IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4);
-
-#endif
- HeapScanDesc hscan;
- HeapTuple htup;
- IndexTuple itup;
- TupleDesc htupdesc,
- itupdesc;
- Datum attdata[INDEX_MAX_KEYS];
- char nulls[INDEX_MAX_KEYS];
- double nhtups,
- nitups;
- Node *pred = indexInfo->ii_Predicate;
-
-#ifndef OMIT_PARTIAL_INDEX
- TupleTable tupleTable;
- TupleTableSlot *slot;
-
-#endif
- ExprContext *econtext;
- GISTSTATE giststate;
- GISTENTRY tmpcentry;
- Buffer buffer = InvalidBuffer;
- bool *compvec;
- int i;
+ double reltuples;
+ GISTBuildState buildstate;
+ Buffer buffer;
/* no locking is needed */
- initGISTstate(&giststate, index);
+ initGISTstate(&buildstate.giststate, index);
/*
* We expect to be called exactly once for any index relation. If
* that's not the case, big trouble's what we have.
*/
- if (oldPred == NULL && RelationGetNumberOfBlocks(index) != 0)
- elog(ERROR, "%s already contains data", RelationGetRelationName(index));
-
- /* initialize the root page (if this is a new index) */
- if (oldPred == NULL)
- {
- buffer = ReadBuffer(index, P_NEW);
- GISTInitBuffer(buffer, F_LEAF);
- WriteBuffer(buffer);
- }
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "%s already contains data",
+ RelationGetRelationName(index));
- /* get tuple descriptors for heap and index relations */
- htupdesc = RelationGetDescr(heap);
- itupdesc = RelationGetDescr(index);
-
- /*
- * If this is a predicate (partial) index, we will need to evaluate
- * the predicate using ExecQual, which requires the current tuple to
- * be in a slot of a TupleTable. In addition, ExecQual must have an
- * ExprContext referring to that slot. Here, we initialize dummy
- * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92
- *
- * We construct the ExprContext anyway since we need a per-tuple
- * temporary memory context for function evaluation -- tgl July 00
- */
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- {
- tupleTable = ExecCreateTupleTable(1);
- slot = ExecAllocTableSlot(tupleTable);
- ExecSetSlotDescriptor(slot, htupdesc, false);
- }
- else
- {
- tupleTable = NULL;
- slot = NULL;
- }
- econtext = MakeExprContext(slot, TransactionCommandContext);
-#else
- econtext = MakeExprContext(NULL, TransactionCommandContext);
-#endif /* OMIT_PARTIAL_INDEX */
+ /* initialize the root page */
+ buffer = ReadBuffer(index, P_NEW);
+ GISTInitBuffer(buffer, F_LEAF);
+ WriteBuffer(buffer);
/* build the index */
- nhtups = nitups = 0.0;
-
- compvec = (bool *) palloc(sizeof(bool) * indexInfo->ii_NumIndexAttrs);
-
- /* start a heap scan */
- hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL);
-
- while (HeapTupleIsValid(htup = heap_getnext(hscan, 0)))
- {
- MemoryContextReset(econtext->ecxt_per_tuple_memory);
-
- nhtups += 1.0;
-
-#ifndef OMIT_PARTIAL_INDEX
-
- /*
- * If oldPred != NULL, this is an EXTEND INDEX command, so skip
- * this tuple if it was already in the existing partial index
- */
- if (oldPred != NULL)
- {
- slot->val = htup;
- if (ExecQual((List *) oldPred, econtext, false))
- {
- nitups += 1.0;
- continue;
- }
- }
-
- /*
- * Skip this tuple if it doesn't satisfy the partial-index
- * predicate
- */
- if (pred != NULL)
- {
- slot->val = htup;
- if (!ExecQual((List *) pred, econtext, false))
- continue;
- }
-#endif /* OMIT_PARTIAL_INDEX */
-
- nitups += 1.0;
-
- /*
- * For the current heap tuple, extract all the attributes we use
- * in this index, and note which are null.
- */
- FormIndexDatum(indexInfo,
- htup,
- htupdesc,
- econtext->ecxt_per_tuple_memory,
- attdata,
- nulls);
-
- /* immediately compress keys to normalize */
- for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
- {
- gistcentryinit(&giststate, i, &tmpcentry, attdata[i],
- (Relation) NULL, (Page) NULL, (OffsetNumber) 0,
- -1 /* size is currently bogus */ , TRUE);
- if (attdata[i] != tmpcentry.key &&
- !(giststate.keytypbyval))
- compvec[i] = TRUE;
- else
- compvec[i] = FALSE;
- attdata[i] = tmpcentry.key;
- }
-
- /* form an index tuple and point it at the heap tuple */
- itup = index_formtuple(itupdesc, attdata, nulls);
- itup->t_tid = htup->t_self;
-
- /*
- * Since we already have the index relation locked, we call
- * gistdoinsert directly. Normal access method calls dispatch
- * through gistinsert, which locks the relation for write. This
- * is the right thing to do if you're inserting single tups, but
- * not when you're initializing the whole index at once.
- */
- gistdoinsert(index, itup, NULL, &giststate);
+ buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
+ buildstate.indtuples = 0;
- for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
- if (compvec[i])
- pfree(DatumGetPointer(attdata[i]));
-
- pfree(itup);
- }
+ /* do the heap scan */
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo,
+ gistbuildCallback, (void *) &buildstate);
/* okay, all heap tuples are indexed */
- heap_endscan(hscan);
-
- pfree(compvec);
-
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- ExecDropTupleTable(tupleTable, true);
-#endif /* OMIT_PARTIAL_INDEX */
- FreeExprContext(econtext);
/*
* Since we just counted the tuples in the heap, we update its stats
@@ -313,14 +182,8 @@ gistbuild(PG_FUNCTION_ARGS)
heap_close(heap, NoLock);
index_close(index);
- UpdateStats(hrelid, nhtups);
- UpdateStats(irelid, nitups);
- if (oldPred != NULL)
- {
- if (nitups == nhtups)
- pred = NULL;
- UpdateIndexPredicate(irelid, oldPred, pred);
- }
+ UpdateStats(hrelid, reltuples);
+ UpdateStats(irelid, buildstate.indtuples);
}
#ifdef GISTDEBUG
@@ -331,6 +194,63 @@ gistbuild(PG_FUNCTION_ARGS)
}
/*
+ * Per-tuple callback from IndexBuildHeapScan
+ */
+static void
+gistbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state)
+{
+ GISTBuildState *buildstate = (GISTBuildState *) state;
+ IndexTuple itup;
+ bool compvec[INDEX_MAX_KEYS];
+ GISTENTRY tmpcentry;
+ int i;
+
+ /* immediately compress keys to normalize */
+ for (i = 0; i < buildstate->numindexattrs; i++)
+ {
+ gistcentryinit(&buildstate->giststate, i, &tmpcentry, attdata[i],
+ (Relation) NULL, (Page) NULL, (OffsetNumber) 0,
+ -1 /* size is currently bogus */ , TRUE);
+ if (attdata[i] != tmpcentry.key &&
+ !(buildstate->giststate.keytypbyval))
+ compvec[i] = TRUE;
+ else
+ compvec[i] = FALSE;
+ attdata[i] = tmpcentry.key;
+ }
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(RelationGetDescr(index), attdata, nulls);
+ itup->t_tid = htup->t_self;
+
+ /* GIST indexes don't index nulls, see notes in gistinsert */
+ if (! IndexTupleHasNulls(itup))
+ {
+ /*
+ * Since we already have the index relation locked, we call
+ * gistdoinsert directly. Normal access method calls dispatch
+ * through gistinsert, which locks the relation for write. This
+ * is the right thing to do if you're inserting single tups, but
+ * not when you're initializing the whole index at once.
+ */
+ gistdoinsert(index, itup, NULL, &buildstate->giststate);
+
+ buildstate->indtuples += 1;
+ }
+
+ for (i = 0; i < buildstate->numindexattrs; i++)
+ if (compvec[i])
+ pfree(DatumGetPointer(attdata[i]));
+
+ pfree(itup);
+}
+
+/*
* gistinsert -- wrapper for GiST tuple insertion.
*
* This is the public interface routine for tuple insertion in GiSTs.
@@ -343,25 +263,28 @@ gistinsert(PG_FUNCTION_ARGS)
Datum *datum = (Datum *) PG_GETARG_POINTER(1);
char *nulls = (char *) PG_GETARG_POINTER(2);
ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
-
#ifdef NOT_USED
Relation heapRel = (Relation) PG_GETARG_POINTER(4);
-
#endif
InsertIndexResult res;
IndexTuple itup;
GISTSTATE giststate;
GISTENTRY tmpentry;
int i;
- bool *compvec;
+ bool compvec[INDEX_MAX_KEYS];
+
+ /*
+ * Since GIST is not marked "amconcurrent" in pg_am, caller should
+ * have acquired exclusive lock on index relation. We need no locking
+ * here.
+ */
initGISTstate(&giststate, r);
/* immediately compress keys to normalize */
- compvec = (bool *) palloc(sizeof(bool) * r->rd_att->natts);
for (i = 0; i < r->rd_att->natts; i++)
{
- gistcentryinit(&giststate, i,&tmpentry, datum[i],
+ gistcentryinit(&giststate, i, &tmpentry, datum[i],
(Relation) NULL, (Page) NULL, (OffsetNumber) 0,
-1 /* size is currently bogus */ , TRUE);
if (datum[i] != tmpentry.key && !(giststate.keytypbyval))
@@ -374,18 +297,24 @@ gistinsert(PG_FUNCTION_ARGS)
itup->t_tid = *ht_ctid;
/*
- * Notes in ExecUtils:ExecOpenIndices()
- *
- * RelationSetLockForWrite(r);
+ * Currently, GIST indexes do not support indexing NULLs; considerable
+ * infrastructure work would have to be done to do anything reasonable
+ * with a NULL.
*/
+ if (IndexTupleHasNulls(itup))
+ {
+ res = NULL;
+ }
+ else
+ {
+ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+ gistdoinsert(r, itup, &res, &giststate);
+ }
- res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
- gistdoinsert(r, itup, &res, &giststate);
for (i = 0; i < r->rd_att->natts; i++)
if (compvec[i] == TRUE)
pfree(DatumGetPointer(datum[i]));
pfree(itup);
- pfree(compvec);
PG_RETURN_POINTER(res);
}
@@ -527,9 +456,7 @@ gistlayerinsert(Relation r, BlockNumber blkno,
/* key is modified, so old version must be deleted */
ItemPointerSet(&oldtid, blkno, child);
- DirectFunctionCall2(gistdelete,
- PointerGetDatum(r),
- PointerGetDatum(&oldtid));
+ gistdelete(r, &oldtid);
}
ret = INSERTED;
@@ -1416,29 +1343,31 @@ gistfreestack(GISTSTACK *s)
/*
-** remove an entry from a page
-*/
-Datum
-gistdelete(PG_FUNCTION_ARGS)
+ * Retail deletion of a single tuple.
+ *
+ * NB: this is no longer called externally, but is still needed by
+ * gistlayerinsert(). That dependency will have to be fixed if GIST
+ * is ever going to allow concurrent insertions.
+ */
+static void
+gistdelete(Relation r, ItemPointer tid)
{
- Relation r = (Relation) PG_GETARG_POINTER(0);
- ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1);
BlockNumber blkno;
OffsetNumber offnum;
Buffer buf;
Page page;
/*
- * Notes in ExecUtils:ExecOpenIndices() Also note that only vacuum
- * deletes index tuples now...
- *
- * RelationSetLockForWrite(r);
+ * Since GIST is not marked "amconcurrent" in pg_am, caller should
+ * have acquired exclusive lock on index relation. We need no locking
+ * here.
*/
blkno = ItemPointerGetBlockNumber(tid);
offnum = ItemPointerGetOffsetNumber(tid);
/* adjust any scans that will be affected by this deletion */
+ /* NB: this works only for scans in *this* backend! */
gistadjscans(r, GISTOP_DEL, blkno, offnum);
/* delete the index tuple */
@@ -1448,10 +1377,93 @@ gistdelete(PG_FUNCTION_ARGS)
PageIndexTupleDelete(page, offnum);
WriteBuffer(buf);
+}
- PG_RETURN_VOID();
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+Datum
+gistbulkdelete(PG_FUNCTION_ARGS)
+{
+ Relation rel = (Relation) PG_GETARG_POINTER(0);
+ IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
+ void *callback_state = (void *) PG_GETARG_POINTER(2);
+ IndexBulkDeleteResult *result;
+ BlockNumber num_pages;
+ double tuples_removed;
+ double num_index_tuples;
+ RetrieveIndexResult res;
+ IndexScanDesc iscan;
+
+ tuples_removed = 0;
+ num_index_tuples = 0;
+
+ /*
+ * Since GIST is not marked "amconcurrent" in pg_am, caller should
+ * have acquired exclusive lock on index relation. We need no locking
+ * here.
+ */
+
+ /*
+ * XXX generic implementation --- should be improved!
+ */
+
+ /* walk through the entire index */
+ iscan = index_beginscan(rel, false, 0, (ScanKey) NULL);
+
+ while ((res = index_getnext(iscan, ForwardScanDirection))
+ != (RetrieveIndexResult) NULL)
+ {
+ ItemPointer heapptr = &res->heap_iptr;
+
+ if (callback(heapptr, callback_state))
+ {
+ ItemPointer indexptr = &res->index_iptr;
+ BlockNumber blkno;
+ OffsetNumber offnum;
+ Buffer buf;
+ Page page;
+
+ blkno = ItemPointerGetBlockNumber(indexptr);
+ offnum = ItemPointerGetOffsetNumber(indexptr);
+
+ /* adjust any scans that will be affected by this deletion */
+ gistadjscans(rel, GISTOP_DEL, blkno, offnum);
+
+ /* delete the index tuple */
+ buf = ReadBuffer(rel, blkno);
+ page = BufferGetPage(buf);
+
+ PageIndexTupleDelete(page, offnum);
+
+ WriteBuffer(buf);
+
+ tuples_removed += 1;
+ }
+ else
+ num_index_tuples += 1;
+
+ pfree(res);
+ }
+
+ index_endscan(iscan);
+
+ /* return statistics */
+ num_pages = RelationGetNumberOfBlocks(rel);
+
+ result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
+ result->num_pages = num_pages;
+ result->tuples_removed = tuples_removed;
+ result->num_index_tuples = num_index_tuples;
+
+ PG_RETURN_POINTER(result);
}
+
void
initGISTstate(GISTSTATE *giststate, Relation index)
{
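The rewritten gistbuild above shows the pattern every AM now follows: IndexBuildHeapScan() in index.c owns the heap scan, partial-index predicate evaluation, and recently-dead-tuple handling, while the AM contributes only a per-tuple callback plus private build state. A hedged sketch of that contract for a hypothetical AM follows; the "myam"/MyAmBuildState names are illustrative, and everything else mirrors gistbuildCallback above.

/* Per-tuple callback contract for the new IndexBuildHeapScan().
 * The "myam" names are hypothetical; the parameter list matches the
 * gist and hash callbacks in this diff. */
typedef struct
{
    double      indtuples;          /* running count of tuples indexed */
} MyAmBuildState;

static void
myambuildCallback(Relation index,
                  HeapTuple htup,       /* current heap tuple */
                  Datum *attdata,       /* index attribute values, already extracted */
                  char *nulls,          /* 'n' marks a null attribute */
                  bool tupleIsAlive,    /* false for recently-dead tuples */
                  void *state)          /* AM-private build state */
{
    MyAmBuildState *buildstate = (MyAmBuildState *) state;
    IndexTuple  itup;

    /* form an index tuple and point it at the heap tuple */
    itup = index_formtuple(RelationGetDescr(index), attdata, nulls);
    itup->t_tid = htup->t_self;

    /* ... AM-specific insertion would go here (cf. gistdoinsert and
     * _hash_doinsert in the hunks above) ... */

    buildstate->indtuples += 1;
    pfree(itup);
}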
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index 672b121693..9358692a53 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/gist/gistscan.c,v 1.37 2001/06/28 16:00:07 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/gist/gistscan.c,v 1.38 2001/07/15 22:48:15 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -59,13 +59,8 @@ gistbeginscan(PG_FUNCTION_ARGS)
ScanKey key = (ScanKey) PG_GETARG_POINTER(3);
IndexScanDesc s;
- /*
- * Let index_beginscan does its work...
- *
- * RelationSetLockForRead(r);
- */
-
s = RelationGetIndexScan(r, fromEnd, nkeys, key);
+
gistregscan(s);
PG_RETURN_POINTER(s);
@@ -283,6 +278,27 @@ gistdropscan(IndexScanDesc s)
pfree(l);
}
+/*
+ * AtEOXact_gist() --- clean up gist subsystem at xact abort or commit.
+ *
+ * This is here because it needs to touch this module's static var GISTScans.
+ */
+void
+AtEOXact_gist(void)
+{
+ /*
+ * Note: these actions should only be necessary during xact abort; but
+ * they can't hurt during a commit.
+ */
+
+ /*
+ * Reset the active-scans list to empty. We do not need to free the
+ * list elements, because they're all palloc()'d, so they'll go away
+ * at end of transaction anyway.
+ */
+ GISTScans = NULL;
+}
+
void
gistadjscans(Relation rel, int op, BlockNumber blkno, OffsetNumber offnum)
{
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 9617fcc33a..9b0e6cf28e 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.51 2001/05/07 00:43:15 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.52 2001/07/15 22:48:15 tgl Exp $
*
* NOTES
* This file contains only the public interface routines.
@@ -21,13 +21,27 @@
#include "access/genam.h"
#include "access/hash.h"
#include "access/heapam.h"
+#include "access/xlogutils.h"
#include "catalog/index.h"
#include "executor/executor.h"
#include "miscadmin.h"
+
bool BuildingHash = false;
-#include "access/xlogutils.h"
+
+/* Working state for hashbuild and its callback */
+typedef struct
+{
+ double indtuples;
+} HashBuildState;
+
+static void hashbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state);
/*
@@ -44,161 +58,32 @@ hashbuild(PG_FUNCTION_ARGS)
Relation heap = (Relation) PG_GETARG_POINTER(0);
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
- Node *oldPred = (Node *) PG_GETARG_POINTER(3);
-
-#ifdef NOT_USED
- IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4);
-
-#endif
- HeapScanDesc hscan;
- HeapTuple htup;
- IndexTuple itup;
- TupleDesc htupdesc,
- itupdesc;
- Datum attdata[INDEX_MAX_KEYS];
- char nulls[INDEX_MAX_KEYS];
- double nhtups,
- nitups;
- HashItem hitem;
- Node *pred = indexInfo->ii_Predicate;
-
-#ifndef OMIT_PARTIAL_INDEX
- TupleTable tupleTable;
- TupleTableSlot *slot;
+ double reltuples;
+ HashBuildState buildstate;
-#endif
- ExprContext *econtext;
- InsertIndexResult res = NULL;
-
- /* note that this is a new hash */
+ /* set flag to disable locking */
BuildingHash = true;
- /* initialize the hash index metadata page (if this is a new index) */
- if (oldPred == NULL)
- _hash_metapinit(index);
-
- /* get tuple descriptors for heap and index relations */
- htupdesc = RelationGetDescr(heap);
- itupdesc = RelationGetDescr(index);
-
/*
- * If this is a predicate (partial) index, we will need to evaluate
- * the predicate using ExecQual, which requires the current tuple to
- * be in a slot of a TupleTable. In addition, ExecQual must have an
- * ExprContext referring to that slot. Here, we initialize dummy
- * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92
- *
- * We construct the ExprContext anyway since we need a per-tuple
- * temporary memory context for function evaluation -- tgl July 00
+ * We expect to be called exactly once for any index relation. If
+ * that's not the case, big trouble's what we have.
*/
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- {
- tupleTable = ExecCreateTupleTable(1);
- slot = ExecAllocTableSlot(tupleTable);
- ExecSetSlotDescriptor(slot, htupdesc, false);
- }
- else
- {
- tupleTable = NULL;
- slot = NULL;
- }
- econtext = MakeExprContext(slot, TransactionCommandContext);
-#else
- econtext = MakeExprContext(NULL, TransactionCommandContext);
-#endif /* OMIT_PARTIAL_INDEX */
-
- /* build the index */
- nhtups = nitups = 0.0;
-
- /* start a heap scan */
- hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL);
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "%s already contains data",
+ RelationGetRelationName(index));
- while (HeapTupleIsValid(htup = heap_getnext(hscan, 0)))
- {
- MemoryContextReset(econtext->ecxt_per_tuple_memory);
+ /* initialize the hash index metadata page */
+ _hash_metapinit(index);
- nhtups += 1.0;
-
-#ifndef OMIT_PARTIAL_INDEX
-
- /*
- * If oldPred != NULL, this is an EXTEND INDEX command, so skip
- * this tuple if it was already in the existing partial index
- */
- if (oldPred != NULL)
- {
- slot->val = htup;
- if (ExecQual((List *) oldPred, econtext, false))
- {
- nitups += 1.0;
- continue;
- }
- }
-
- /*
- * Skip this tuple if it doesn't satisfy the partial-index
- * predicate
- */
- if (pred != NULL)
- {
- slot->val = htup;
- if (!ExecQual((List *) pred, econtext, false))
- continue;
- }
-#endif /* OMIT_PARTIAL_INDEX */
-
- nitups += 1.0;
-
- /*
- * For the current heap tuple, extract all the attributes we use
- * in this index, and note which are null.
- */
- FormIndexDatum(indexInfo,
- htup,
- htupdesc,
- econtext->ecxt_per_tuple_memory,
- attdata,
- nulls);
-
- /* form an index tuple and point it at the heap tuple */
- itup = index_formtuple(itupdesc, attdata, nulls);
-
- /*
- * If the single index key is null, we don't insert it into the
- * index. Hash tables support scans on '='. Relational algebra
- * says that A = B returns null if either A or B is null. This
- * means that no qualification used in an index scan could ever
- * return true on a null attribute. It also means that indices
- * can't be used by ISNULL or NOTNULL scans, but that's an
- * artifact of the strategy map architecture chosen in 1986, not
- * of the way nulls are handled here.
- */
-
- if (IndexTupleHasNulls(itup))
- {
- pfree(itup);
- continue;
- }
-
- itup->t_tid = htup->t_self;
- hitem = _hash_formitem(itup);
-
- res = _hash_doinsert(index, hitem);
-
- pfree(hitem);
- pfree(itup);
- pfree(res);
- }
+ /* build the index */
+ buildstate.indtuples = 0;
- /* okay, all heap tuples are indexed */
- heap_endscan(hscan);
+ /* do the heap scan */
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo,
+ hashbuildCallback, (void *) &buildstate);
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- ExecDropTupleTable(tupleTable, true);
-#endif /* OMIT_PARTIAL_INDEX */
- FreeExprContext(econtext);
+ /* all done */
+ BuildingHash = false;
/*
* Since we just counted the tuples in the heap, we update its stats
@@ -218,23 +103,54 @@ hashbuild(PG_FUNCTION_ARGS)
heap_close(heap, NoLock);
index_close(index);
- UpdateStats(hrelid, nhtups);
- UpdateStats(irelid, nitups);
- if (oldPred != NULL)
- {
- if (nitups == nhtups)
- pred = NULL;
- UpdateIndexPredicate(irelid, oldPred, pred);
- }
+ UpdateStats(hrelid, reltuples);
+ UpdateStats(irelid, buildstate.indtuples);
}
- /* all done */
- BuildingHash = false;
-
PG_RETURN_VOID();
}
/*
+ * Per-tuple callback from IndexBuildHeapScan
+ */
+static void
+hashbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state)
+{
+ HashBuildState *buildstate = (HashBuildState *) state;
+ IndexTuple itup;
+ HashItem hitem;
+ InsertIndexResult res;
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(RelationGetDescr(index), attdata, nulls);
+ itup->t_tid = htup->t_self;
+
+ /* Hash indexes don't index nulls, see notes in hashinsert */
+ if (IndexTupleHasNulls(itup))
+ {
+ pfree(itup);
+ return;
+ }
+
+ hitem = _hash_formitem(itup);
+
+ res = _hash_doinsert(index, hitem);
+
+ if (res)
+ pfree(res);
+
+ buildstate->indtuples += 1;
+
+ pfree(hitem);
+ pfree(itup);
+}
+
+/*
* hashinsert() -- insert an index tuple into a hash table.
*
* Hash on the index tuple's key, find the appropriate location
@@ -248,10 +164,8 @@ hashinsert(PG_FUNCTION_ARGS)
Datum *datum = (Datum *) PG_GETARG_POINTER(1);
char *nulls = (char *) PG_GETARG_POINTER(2);
ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
-
#ifdef NOT_USED
Relation heapRel = (Relation) PG_GETARG_POINTER(4);
-
#endif
InsertIndexResult res;
HashItem hitem;
@@ -261,8 +175,21 @@ hashinsert(PG_FUNCTION_ARGS)
itup = index_formtuple(RelationGetDescr(rel), datum, nulls);
itup->t_tid = *ht_ctid;
+ /*
+ * If the single index key is null, we don't insert it into the
+ * index. Hash tables support scans on '='. Relational algebra
+ * says that A = B returns null if either A or B is null. This
+ * means that no qualification used in an index scan could ever
+ * return true on a null attribute. It also means that indices
+ * can't be used by ISNULL or NOTNULL scans, but that's an
+ * artifact of the strategy map architecture chosen in 1986, not
+ * of the way nulls are handled here.
+ */
if (IndexTupleHasNulls(itup))
+ {
+ pfree(itup);
PG_RETURN_POINTER((InsertIndexResult) NULL);
+ }
hitem = _hash_formitem(itup);
@@ -471,22 +398,74 @@ hashrestrpos(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
-/* stubs */
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
Datum
-hashdelete(PG_FUNCTION_ARGS)
+hashbulkdelete(PG_FUNCTION_ARGS)
{
Relation rel = (Relation) PG_GETARG_POINTER(0);
- ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1);
+ IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
+ void *callback_state = (void *) PG_GETARG_POINTER(2);
+ IndexBulkDeleteResult *result;
+ BlockNumber num_pages;
+ double tuples_removed;
+ double num_index_tuples;
+ RetrieveIndexResult res;
+ IndexScanDesc iscan;
- /* adjust any active scans that will be affected by this deletion */
- _hash_adjscans(rel, tid);
+ tuples_removed = 0;
+ num_index_tuples = 0;
- /* delete the data from the page */
- _hash_pagedel(rel, tid);
+ /*
+ * XXX generic implementation --- should be improved!
+ */
- PG_RETURN_VOID();
+ /* walk through the entire index */
+ iscan = index_beginscan(rel, false, 0, (ScanKey) NULL);
+
+ while ((res = index_getnext(iscan, ForwardScanDirection))
+ != (RetrieveIndexResult) NULL)
+ {
+ ItemPointer heapptr = &res->heap_iptr;
+
+ if (callback(heapptr, callback_state))
+ {
+ ItemPointer indexptr = &res->index_iptr;
+
+ /* adjust any active scans that will be affected by deletion */
+ /* (namely, my own scan) */
+ _hash_adjscans(rel, indexptr);
+
+ /* delete the data from the page */
+ _hash_pagedel(rel, indexptr);
+
+ tuples_removed += 1;
+ }
+ else
+ num_index_tuples += 1;
+
+ pfree(res);
+ }
+
+ index_endscan(iscan);
+
+ /* return statistics */
+ num_pages = RelationGetNumberOfBlocks(rel);
+
+ result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
+ result->num_pages = num_pages;
+ result->tuples_removed = tuples_removed;
+ result->num_index_tuples = num_index_tuples;
+
+ PG_RETURN_POINTER(result);
}
+
void
hash_redo(XLogRecPtr lsn, XLogRecord *record)
{
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 8e2ed1bb8a..c9fb065dbd 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.29 2001/03/07 21:20:26 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.30 2001/07/15 22:48:15 tgl Exp $
*
* NOTES
* Overflow pages look like ordinary relation pages.
@@ -112,14 +112,14 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp)
metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE);
- splitnum = metap->OVFL_POINT;
- max_free = metap->SPARES[splitnum];
+ splitnum = metap->hashm_ovflpoint;
+ max_free = metap->hashm_spares[splitnum];
free_page = (max_free - 1) >> (metap->hashm_bshift + BYTE_TO_BIT);
free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1);
/* Look through all the free maps to find the first free block */
- first_page = metap->LAST_FREED >> (metap->hashm_bshift + BYTE_TO_BIT);
+ first_page = metap->hashm_lastfreed >> (metap->hashm_bshift + BYTE_TO_BIT);
for (i = first_page; i <= free_page; i++)
{
Page mappage;
@@ -138,7 +138,7 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp)
if (i == first_page)
{
- bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1);
+ bit = metap->hashm_lastfreed & (BMPGSZ_BIT(metap) - 1);
j = bit / BITS_PER_MAP;
bit = bit & ~(BITS_PER_MAP - 1);
}
@@ -153,10 +153,10 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp)
}
/* No Free Page Found - have to allocate a new page */
- metap->LAST_FREED = metap->SPARES[splitnum];
- metap->SPARES[splitnum]++;
- offset = metap->SPARES[splitnum] -
- (splitnum ? metap->SPARES[splitnum - 1] : 0);
+ metap->hashm_lastfreed = metap->hashm_spares[splitnum];
+ metap->hashm_spares[splitnum]++;
+ offset = metap->hashm_spares[splitnum] -
+ (splitnum ? metap->hashm_spares[splitnum - 1] : 0);
#define OVMSG "HASH: Out of overflow pages. Out of luck.\n"
@@ -164,9 +164,9 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp)
{
if (++splitnum >= NCACHED)
elog(ERROR, OVMSG);
- metap->OVFL_POINT = splitnum;
- metap->SPARES[splitnum] = metap->SPARES[splitnum - 1];
- metap->SPARES[splitnum - 1]--;
+ metap->hashm_ovflpoint = splitnum;
+ metap->hashm_spares[splitnum] = metap->hashm_spares[splitnum - 1];
+ metap->hashm_spares[splitnum - 1]--;
offset = 0;
}
@@ -194,15 +194,15 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp)
if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset),
1, free_page))
elog(ERROR, "overflow_page: problem with _hash_initbitmap.");
- metap->SPARES[splitnum]++;
+ metap->hashm_spares[splitnum]++;
offset++;
if (offset > SPLITMASK)
{
if (++splitnum >= NCACHED)
elog(ERROR, OVMSG);
- metap->OVFL_POINT = splitnum;
- metap->SPARES[splitnum] = metap->SPARES[splitnum - 1];
- metap->SPARES[splitnum - 1]--;
+ metap->hashm_ovflpoint = splitnum;
+ metap->hashm_spares[splitnum] = metap->hashm_spares[splitnum - 1];
+ metap->hashm_spares[splitnum - 1]--;
offset = 0;
}
}
@@ -235,13 +235,13 @@ found:
*/
bit = 1 + bit + (i * BMPGSZ_BIT(metap));
- if (bit >= metap->LAST_FREED)
- metap->LAST_FREED = bit - 1;
+ if (bit >= metap->hashm_lastfreed)
+ metap->hashm_lastfreed = bit - 1;
/* Calculate the split number for this page */
- for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++)
+ for (i = 0; (i < splitnum) && (bit > metap->hashm_spares[i]); i++)
;
- offset = (i ? bit - metap->SPARES[i - 1] : bit);
+ offset = (i ? bit - metap->hashm_spares[i - 1] : bit);
if (offset >= SPLITMASK)
elog(ERROR, OVMSG);
@@ -355,10 +355,10 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
* element hashm_mapp[bitmappage].
*/
splitnum = (addr >> SPLITSHIFT);
- ovflpgno = (splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1;
+ ovflpgno = (splitnum ? metap->hashm_spares[splitnum - 1] : 0) + (addr & SPLITMASK) - 1;
- if (ovflpgno < metap->LAST_FREED)
- metap->LAST_FREED = ovflpgno;
+ if (ovflpgno < metap->hashm_lastfreed)
+ metap->hashm_lastfreed = ovflpgno;
bitmappage = (ovflpgno >> (metap->hashm_bshift + BYTE_TO_BIT));
bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1);
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index d1b3aaa232..b8c520e3c0 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.31 2001/06/27 23:31:37 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.32 2001/07/15 22:48:15 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
@@ -18,7 +18,7 @@
* address of the page if it is an overflow page.
*
* The first page in a hash relation, page zero, is special -- it stores
- * information describing the hash table; it is referred to as teh
+ * information describing the hash table; it is referred to as the
* "meta page." Pages one and higher store the actual data.
*
*-------------------------------------------------------------------------
@@ -48,6 +48,19 @@ static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket
* before the lock table is fully initialized, so we can't use it.
* Strictly speaking, this violates 2pl, but we don't do 2pl on the
* system catalogs anyway.
+ *
+ * Note that our page locks are actual lockmanager locks, not buffer
+ * locks (as are used by btree, for example). This is a good idea because
+ * the algorithms are not deadlock-free, and we'd better be able to detect
+ * and recover from deadlocks.
+ *
+ * Another important difference from btree is that a hash indexscan
+ * retains both a lock and a buffer pin on the current index page
+ * between hashgettuple() calls (btree keeps only a buffer pin).
+ * Because of this, it's safe to do item deletions with only a regular
+ * write lock on a hash page --- there cannot be an indexscan stopped on
+ * the page being deleted, other than an indexscan of our own backend,
+ * which will be taken care of by _hash_adjscans.
*/
@@ -350,6 +363,16 @@ _hash_unsetpagelock(Relation rel,
}
}
+/*
+ * Delete a hash index item.
+ *
+ * It is safe to delete an item after acquiring a regular WRITE lock on
+ * the page, because no other backend can hold a READ lock on the page,
+ * and that means no other backend currently has an indexscan stopped on
+ * any item of the item being deleted. Our own backend might have such
+ * an indexscan (in fact *will*, since that's how VACUUM found the item
+ * in the first place), but _hash_adjscans will fix the scan position.
+ */
void
_hash_pagedel(Relation rel, ItemPointer tid)
{
@@ -384,7 +407,7 @@ _hash_pagedel(Relation rel, ItemPointer tid)
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
metap = (HashMetaPage) BufferGetPage(metabuf);
_hash_checkpage((Page) metap, LH_META_PAGE);
- ++metap->hashm_nkeys;
+ metap->hashm_nkeys--;
_hash_wrtbuf(rel, metabuf);
}
@@ -402,32 +425,32 @@ _hash_expandtable(Relation rel, Buffer metabuf)
_hash_checkpage((Page) metap, LH_META_PAGE);
metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
- new_bucket = ++metap->MAX_BUCKET;
+ new_bucket = ++metap->hashm_maxbucket;
metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
- old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK);
+ old_bucket = (metap->hashm_maxbucket & metap->hashm_lowmask);
/*
- * If the split point is increasing (MAX_BUCKET's log base 2 *
+ * If the split point is increasing (hashm_maxbucket's log base 2 *
* increases), we need to copy the current contents of the spare split
* bucket to the next bucket.
*/
- spare_ndx = _hash_log2(metap->MAX_BUCKET + 1);
- if (spare_ndx > metap->OVFL_POINT)
+ spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
+ if (spare_ndx > metap->hashm_ovflpoint)
{
metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
- metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT];
- metap->OVFL_POINT = spare_ndx;
+ metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
+ metap->hashm_ovflpoint = spare_ndx;
metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
}
- if (new_bucket > metap->HIGH_MASK)
+ if (new_bucket > metap->hashm_highmask)
{
/* Starting a new doubling */
metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
- metap->LOW_MASK = metap->HIGH_MASK;
- metap->HIGH_MASK = new_bucket | metap->LOW_MASK;
+ metap->hashm_lowmask = metap->hashm_highmask;
+ metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
}
diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c
index 649e42fbeb..f4a91b5710 100644
--- a/src/backend/access/hash/hashscan.c
+++ b/src/backend/access/hash/hashscan.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.24 2001/01/24 19:42:47 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.25 2001/07/15 22:48:15 tgl Exp $
*
* NOTES
* Because we can be doing an index scan on a relation while we
@@ -45,6 +45,31 @@ typedef HashScanListData *HashScanList;
static HashScanList HashScans = (HashScanList) NULL;
+
+/*
+ * AtEOXact_hash() --- clean up hash subsystem at xact abort or commit.
+ *
+ * This is here because it needs to touch this module's static var HashScans.
+ */
+void
+AtEOXact_hash(void)
+{
+ /*
+ * Note: these actions should only be necessary during xact abort; but
+ * they can't hurt during a commit.
+ */
+
+ /*
+ * Reset the active-scans list to empty. We do not need to free the
+ * list elements, because they're all palloc()'d, so they'll go away
+ * at end of transaction anyway.
+ */
+ HashScans = NULL;
+
+ /* If we were building a hash, we ain't anymore. */
+ BuildingHash = false;
+}
+
/*
* _Hash_regscan() -- register a new scan.
*/
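AtEOXact_hash above, AtEOXact_gist earlier in this diff, and AtEOXact_nbtree in the nbtree.c hunk below all exist to be called from the transaction-cleanup path; the xact.c change listed in the diffstat is where they get wired in. A hedged sketch of the shape of that hookup follows; the wrapper name and call order are assumptions, and AtEOXact_rtree is inferred from the commit message since the rtree hunks are not shown here.

/* Sketch of end-of-transaction cleanup: reset each AM's static
 * scan-tracking state so an elog(ERROR) cannot leave stale entries.
 * The wrapper name is hypothetical; the callees are added by this commit. */
static void
AtEOXact_IndexAMs(void)
{
    AtEOXact_nbtree();
    AtEOXact_hash();
    AtEOXact_rtree();
    AtEOXact_gist();
}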
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index adeccf5cc8..2b6be06168 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.51 2001/06/22 19:16:21 wieck Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.52 2001/07/15 22:48:15 tgl Exp $
*
* INTERFACE ROUTINES
* index_open - open an index relation by relationId
@@ -18,23 +18,17 @@
* index_rescan - restart a scan of an index
* index_endscan - end a scan
* index_insert - insert an index tuple into a relation
- * index_delete - delete an item from an index relation
* index_markpos - mark a scan position
* index_restrpos - restore a scan position
* index_getnext - get the next tuple from a scan
- * ** index_fetch - retrieve tuple with tid
- * ** index_replace - replace a tuple
- * ** index_getattr - get an attribute from an index tuple
- * index_getprocid - get a support procedure id from the rel tuple
- *
- * IndexScanIsValid - check index scan
+ * index_bulk_delete - bulk deletion of index tuples
+ * index_cost_estimator - fetch amcostestimate procedure OID
+ * index_getprocid - get a support procedure OID
*
* NOTES
* This file contains the index_ routines which used
* to be a scattered collection of stuff in access/genam.
*
- * The ** routines: index_fetch, index_replace, and index_getattr
- * have not yet been implemented. They may not be needed.
*
* old comments
* Scans are implemented as follows:
@@ -211,23 +205,6 @@ index_insert(Relation relation,
}
/* ----------------
- * index_delete - delete an item from an index relation
- * ----------------
- */
-void
-index_delete(Relation relation, ItemPointer indexItem)
-{
- RegProcedure procedure;
-
- RELATION_CHECKS;
- GET_REL_PROCEDURE(delete, amdelete);
-
- OidFunctionCall2(procedure,
- PointerGetDatum(relation),
- PointerGetDatum(indexItem));
-}
-
-/* ----------------
* index_beginscan - start a scan of an index
* ----------------
*/
@@ -379,6 +356,35 @@ index_getnext(IndexScanDesc scan,
}
/* ----------------
+ * index_bulk_delete - do mass deletion of index entries
+ *
+ * callback routine tells whether a given main-heap tuple is
+ * to be deleted
+ *
+ * return value is an optional palloc'd struct of statistics
+ * ----------------
+ */
+IndexBulkDeleteResult *
+index_bulk_delete(Relation relation,
+ IndexBulkDeleteCallback callback,
+ void *callback_state)
+{
+ RegProcedure procedure;
+ IndexBulkDeleteResult *result;
+
+ RELATION_CHECKS;
+ GET_REL_PROCEDURE(bulk_delete, ambulkdelete);
+
+ result = (IndexBulkDeleteResult *)
+ DatumGetPointer(OidFunctionCall3(procedure,
+ PointerGetDatum(relation),
+ PointerGetDatum((Pointer) callback),
+ PointerGetDatum(callback_state)));
+
+ return result;
+}
+
+/* ----------------
* index_cost_estimator
*
* Fetch the amcostestimate procedure OID for an index.
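index_bulk_delete's caller (VACUUM) lives outside src/backend/access, so it does not appear in this diff. The following hedged sketch shows how such a caller might drive the new interface with a deliberately naive callback; the DeadTidList type, the linear search, and the function names are illustrative only.

/* Hypothetical caller-side sketch: hand index_bulk_delete() a callback
 * that consults a list of dead heap TIDs gathered while scanning the heap. */
typedef struct
{
    ItemPointerData *dead_tids;     /* TIDs of heap tuples being removed */
    int              num_dead_tids;
} DeadTidList;

static bool
dead_tid_callback(ItemPointer itemptr, void *state)
{
    DeadTidList *list = (DeadTidList *) state;
    int          i;

    /* naive linear search, for illustration only */
    for (i = 0; i < list->num_dead_tids; i++)
    {
        if (ItemPointerEquals(itemptr, &list->dead_tids[i]))
            return true;            /* remove the index entry */
    }
    return false;                   /* keep the index entry */
}

/* ... and, for each index of the table being vacuumed ...
 *
 *     IndexBulkDeleteResult *stats =
 *         index_bulk_delete(indexRelation, dead_tid_callback, (void *) &list);
 *
 * after which stats->tuples_removed and stats->num_index_tuples feed
 * VACUUM's statistics output.
 */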
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
index eba9bd4eef..bdc366dd0a 100644
--- a/src/backend/access/nbtree/Makefile
+++ b/src/backend/access/nbtree/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/nbtree
#
# IDENTIFICATION
-# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.10 2000/08/31 16:09:41 petere Exp $
+# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.11 2001/07/15 22:48:16 tgl Exp $
#
#-------------------------------------------------------------------------
@@ -12,7 +12,7 @@ subdir = src/backend/access/nbtree
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtscan.o nbtsearch.o \
+OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \
nbtstrat.o nbtutils.o nbtsort.o
all: SUBSYS.o
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index cff7ff0d65..d8ec739b2a 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1,4 +1,4 @@
-$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.4 2000/07/25 05:26:40 tgl Exp $
+$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.5 2001/07/15 22:48:16 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -109,15 +109,11 @@ In addition, the following things are handy to know:
is too high a price). Rebuilding corrupted indexes during restart
seems more attractive.
-+ On deletions, we need to adjust the position of active scans on
- the index. The code in nbtscan.c handles this. We don't need to
- do this for insertions or splits because _bt_restscan can find the
- new position of the previously-found item. NOTE that nbtscan.c
- only copes with deletions issued by the current backend. This
- essentially means that concurrent deletions are not supported, but
- that's true already in the Lehman and Yao algorithm. nbtscan.c
- exists only to support VACUUM and allow it to delete items while
- it's scanning the index.
++ Deletions are handled by getting a super-exclusive lock on the target
+ page, so that no other backend has a pin on the page when the deletion
+ starts. This means no scan is pointing at the page. This is OK for
+ deleting leaf items, probably not OK for deleting internal nodes;
+ will need to think harder when it's time to support index compaction.
+ "ScanKey" data structures are used in two fundamentally different ways
in this code. Searches for the initial position for a scan, as well as
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 8ffb9b9043..c91c568ed2 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.83 2001/06/22 19:16:21 wieck Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.84 2001/07/15 22:48:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -126,7 +126,7 @@ top:
if (TransactionIdIsValid(xwait))
{
/* Have to wait for the other guy ... */
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
XactLockTableWait(xwait);
/* start over... */
_bt_freestack(stack);
@@ -234,7 +234,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
if (TransactionIdIsValid(xwait))
{
if (nbuf != InvalidBuffer)
- _bt_relbuf(rel, nbuf, BT_READ);
+ _bt_relbuf(rel, nbuf);
/* Tell _bt_doinsert to wait... */
return xwait;
}
@@ -263,7 +263,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
break;
nblkno = opaque->btpo_next;
if (nbuf != InvalidBuffer)
- _bt_relbuf(rel, nbuf, BT_READ);
+ _bt_relbuf(rel, nbuf);
nbuf = _bt_getbuf(rel, nblkno, BT_READ);
page = BufferGetPage(nbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -273,7 +273,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
}
if (nbuf != InvalidBuffer)
- _bt_relbuf(rel, nbuf, BT_READ);
+ _bt_relbuf(rel, nbuf);
return NullTransactionId;
}
@@ -397,7 +397,7 @@ _bt_insertonpg(Relation rel,
/* step right one page */
BlockNumber rblkno = lpageop->btpo_next;
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, rblkno, BT_WRITE);
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1175,12 +1175,12 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
*/
if (P_RIGHTMOST(opaque))
{
- _bt_relbuf(rel, buf, access);
+ _bt_relbuf(rel, buf);
return (InvalidBuffer);
}
blkno = opaque->btpo_next;
- _bt_relbuf(rel, buf, access);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1449,7 +1449,7 @@ _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release)
&itup_off, &itup_blkno);
/* Keep lock on new "root" buffer ! */
if (buf != rootbuf)
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
buf = newbuf;
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1525,7 +1525,7 @@ _bt_fixtree(Relation rel, BlockNumber blkno)
if (P_ISROOT(opaque))
{
/* Tree is Ok now */
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
return;
}
/* Call _bt_fixroot() if there is no upper level */
@@ -1533,12 +1533,12 @@ _bt_fixtree(Relation rel, BlockNumber blkno)
{
elog(NOTICE, "bt_fixtree[%s]: fixing root page", RelationGetRelationName(rel));
buf = _bt_fixroot(rel, buf, true);
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
return;
}
/* Have to go up one level */
pblkno = opaque->btpo_parent;
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
}
blkno = pblkno;
}
@@ -1571,7 +1571,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
page = BufferGetPage(buf);
/* copy page to temp storage */
memmove(tbuf, page, PageGetPageSize(page));
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
page = (Page) tbuf;
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1682,7 +1682,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
{
if (coff[i] != P_FIRSTDATAKEY(newopaque))
elog(ERROR, "bt_fixlevel[%s]: invalid item order(3) (need to recreate index)", RelationGetRelationName(rel));
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
buf = newbuf;
page = newpage;
opaque = newopaque;
@@ -1691,7 +1691,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
continue;
}
/* unfound - need to insert on current page */
- _bt_relbuf(rel, newbuf, BT_WRITE);
+ _bt_relbuf(rel, newbuf);
}
/* insert pointer */
ritem = (BTItem) PageGetItem(cpage[i - 1],
@@ -1718,10 +1718,10 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
&itup_off, &itup_blkno);
/* what buffer we need in ? */
if (newitemonleft)
- _bt_relbuf(rel, newbuf, BT_WRITE);
+ _bt_relbuf(rel, newbuf);
else
{
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
buf = newbuf;
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1741,7 +1741,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
/* copy page with pointer to cblkno[cidx] to temp storage */
memmove(tbuf, page, PageGetPageSize(page));
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
page = (Page) tbuf;
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
@@ -1751,13 +1751,13 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
goodbye = false;
/* Pointers to child pages are Ok - right end of child level ? */
- _bt_relbuf(rel, cbuf[0], BT_READ);
- _bt_relbuf(rel, cbuf[1], BT_READ);
+ _bt_relbuf(rel, cbuf[0]);
+ _bt_relbuf(rel, cbuf[1]);
if (cidx == 1 ||
(cidx == 2 && (P_RIGHTMOST(copaque[2]) || goodbye)))
{
if (cidx == 2)
- _bt_relbuf(rel, cbuf[2], BT_READ);
+ _bt_relbuf(rel, cbuf[2]);
return;
}
if (cblkno[0] == limit || cblkno[1] == limit)
@@ -1819,7 +1819,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno,
{
if (offnum <= stack.bts_offset)
elog(ERROR, "bt_fixbranch[%s]: invalid item order (need to recreate index)", RelationGetRelationName(rel));
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
return;
}
@@ -1837,7 +1837,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno,
if (rbuf == InvalidBuffer)
elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(2) (need to recreate index)", RelationGetRelationName(rel));
rblkno = BufferGetBlockNumber(rbuf);
- _bt_relbuf(rel, rbuf, BT_READ);
+ _bt_relbuf(rel, rbuf);
/*
* If we have parent item in true_stack then go up one level and
@@ -1845,7 +1845,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno,
*/
if (true_stack)
{
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
blkno = true_stack->bts_blkno;
true_stack = true_stack->bts_parent;
continue;
@@ -1860,19 +1860,19 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno,
if (!BTreeInvalidParent(opaque))
{
blkno = opaque->btpo_parent;
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
continue;
}
/* Have to switch to excl buf lock and re-check btpo_parent */
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_WRITE);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!BTreeInvalidParent(opaque))
{
blkno = opaque->btpo_parent;
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
continue;
}
@@ -1913,7 +1913,7 @@ _bt_fixup(Relation rel, Buffer buf)
if (!BTreeInvalidParent(opaque))
{
blkno = opaque->btpo_parent;
- _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, buf);
elog(NOTICE, "bt_fixup[%s]: checking/fixing upper levels", RelationGetRelationName(rel));
_bt_fixtree(rel, blkno);
return;
@@ -1921,8 +1921,7 @@ _bt_fixup(Relation rel, Buffer buf)
if (P_LEFTMOST(opaque))
break;
blkno = opaque->btpo_prev;
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- ReleaseBuffer(buf);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_WRITE);
}
@@ -1932,9 +1931,7 @@ _bt_fixup(Relation rel, Buffer buf)
*/
elog(NOTICE, "bt_fixup[%s]: fixing root page", RelationGetRelationName(rel));
buf = _bt_fixroot(rel, buf, true);
- _bt_relbuf(rel, buf, BT_WRITE);
-
- return;
+ _bt_relbuf(rel, buf);
}
static OffsetNumber
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 67e1407b22..376274c562 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.52 2001/06/27 23:31:38 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.53 2001/07/15 22:48:16 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -138,7 +138,7 @@ _bt_getroot(Relation rel, int access)
/* If access = BT_READ, caller doesn't want us to create root yet */
if (access == BT_READ)
{
- _bt_relbuf(rel, metabuf, BT_READ);
+ _bt_relbuf(rel, metabuf);
return InvalidBuffer;
}
@@ -215,14 +215,14 @@ _bt_getroot(Relation rel, int access)
* guarantee no deadlocks, we have to release the metadata
* page and start all over again.
*/
- _bt_relbuf(rel, metabuf, BT_WRITE);
+ _bt_relbuf(rel, metabuf);
return _bt_getroot(rel, access);
}
}
else
{
rootblkno = metad->btm_root;
- _bt_relbuf(rel, metabuf, BT_READ); /* done with the meta page */
+ _bt_relbuf(rel, metabuf); /* done with the meta page */
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
}
@@ -270,8 +270,8 @@ _bt_getroot(Relation rel, int access)
goto check_parent;
}
else
-/* someone else already fixed root */
{
+ /* someone else already fixed root */
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(rootbuf, BT_READ);
}
@@ -283,7 +283,7 @@ _bt_getroot(Relation rel, int access)
* chance that parent is root page.
*/
newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ);
- _bt_relbuf(rel, rootbuf, BT_READ);
+ _bt_relbuf(rel, rootbuf);
rootbuf = newrootbuf;
rootpage = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
@@ -293,7 +293,7 @@ _bt_getroot(Relation rel, int access)
}
/* try again */
- _bt_relbuf(rel, rootbuf, BT_READ);
+ _bt_relbuf(rel, rootbuf);
return _bt_getroot(rel, access);
}
@@ -350,10 +350,12 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
/*
* _bt_relbuf() -- release a locked buffer.
*
- * Lock and pin (refcount) are both dropped.
+ * Lock and pin (refcount) are both dropped. Note that either read or
+ * write lock can be dropped this way, but if we modified the buffer,
+ * this is NOT the right way to release a write lock.
*/
void
-_bt_relbuf(Relation rel, Buffer buf, int access)
+_bt_relbuf(Relation rel, Buffer buf)
{
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buf);
@@ -449,24 +451,23 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
}
/*
- * Delete an item from a btree. It had better be a leaf item...
+ * Delete an item from a btree page.
+ *
+ * This routine assumes that the caller has pinned and locked the buffer,
+ * and will write the buffer afterwards.
*/
void
-_bt_pagedel(Relation rel, ItemPointer tid)
+_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
{
- Buffer buf;
- Page page;
- BlockNumber blkno;
+ Page page = BufferGetPage(buf);
OffsetNumber offno;
- blkno = ItemPointerGetBlockNumber(tid);
offno = ItemPointerGetOffsetNumber(tid);
- buf = _bt_getbuf(rel, blkno, BT_WRITE);
- page = BufferGetPage(buf);
-
START_CRIT_SECTION();
+
PageIndexTupleDelete(page, offno);
+
/* XLOG stuff */
{
xl_btree_delete xlrec;
@@ -490,8 +491,6 @@ _bt_pagedel(Relation rel, ItemPointer tid)
PageSetLSN(page, recptr);
PageSetSUI(page, ThisStartUpID);
}
- END_CRIT_SECTION();
- /* write the buffer and release the lock */
- _bt_wrtbuf(rel, buf);
+ END_CRIT_SECTION();
}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b714296c8f..b142645624 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.81 2001/05/18 21:24:17 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.82 2001/07/15 22:48:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -28,11 +28,27 @@
#include "storage/sinval.h"
#include "access/xlogutils.h"
-bool BuildingBtree = false; /* see comment in btbuild() */
-bool FastBuild = true; /* use sort/build instead */
- /* of insertion build */
+/* Working state for btbuild and its callback */
+typedef struct
+{
+ bool usefast;
+ bool isUnique;
+ bool haveDead;
+ Relation heapRel;
+ BTSpool *spool;
+ /*
+ * spool2 is needed only when the index is a unique index. Dead
+ * tuples are put into spool2 instead of spool, in order to avoid
+ * the uniqueness check.
+ */
+ BTSpool *spool2;
+ double indtuples;
+} BTBuildState;
+
+bool BuildingBtree = false; /* see comment in btbuild() */
+bool FastBuild = true; /* use SORT instead of insertion build */
/*
* TEMPORARY FLAG FOR TESTING NEW FIX TREE
@@ -41,6 +57,29 @@ bool FastBuild = true; /* use sort/build instead */
bool FixBTree = true;
static void _bt_restscan(IndexScanDesc scan);
+static void btbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state);
+
+
+/*
+ * AtEOXact_nbtree() --- clean up nbtree subsystem at xact abort or commit.
+ */
+void
+AtEOXact_nbtree(void)
+{
+ /*
+ * Note: these actions should only be necessary during xact abort; but
+ * they can't hurt during a commit.
+ */
+
+ /* If we were building a btree, we ain't anymore. */
+ BuildingBtree = false;
+}
+
/*
* btbuild() -- build a new btree index.
@@ -56,42 +95,10 @@ btbuild(PG_FUNCTION_ARGS)
Relation heap = (Relation) PG_GETARG_POINTER(0);
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
- Node *oldPred = (Node *) PG_GETARG_POINTER(3);
-#ifdef NOT_USED
- IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4);
-#endif
- HeapScanDesc hscan;
- HeapTuple htup;
- IndexTuple itup;
- TupleDesc htupdesc,
- itupdesc;
- Datum attdata[INDEX_MAX_KEYS];
- char nulls[INDEX_MAX_KEYS];
- double nhtups,
- nitups;
- Node *pred = indexInfo->ii_Predicate;
-#ifndef OMIT_PARTIAL_INDEX
- TupleTable tupleTable;
- TupleTableSlot *slot;
-#endif
- ExprContext *econtext;
- InsertIndexResult res = NULL;
- BTSpool *spool = NULL;
- BTItem btitem;
- bool usefast;
- Snapshot snapshot;
- TransactionId XmaxRecent;
+ double reltuples;
+ BTBuildState buildstate;
- /*
- * spool2 is needed only when the index is an unique index. Dead
- * tuples are put into spool2 instead of spool in order to avoid
- * uniqueness check.
- */
- BTSpool *spool2 = NULL;
- bool tupleIsAlive;
- int dead_count;
-
- /* note that this is a new btree */
+ /* set flag to disable locking */
BuildingBtree = true;
/*
@@ -100,220 +107,63 @@ btbuild(PG_FUNCTION_ARGS)
* look harder at this. (there is some kind of incremental processing
* going on there.) -- pma 08/29/95
*/
- usefast = (FastBuild && IsNormalProcessingMode());
+ buildstate.usefast = (FastBuild && IsNormalProcessingMode());
+ buildstate.isUnique = indexInfo->ii_Unique;
+ buildstate.haveDead = false;
+ buildstate.heapRel = heap;
+ buildstate.spool = NULL;
+ buildstate.spool2 = NULL;
+ buildstate.indtuples = 0;
#ifdef BTREE_BUILD_STATS
if (Show_btree_build_stats)
ResetUsage();
#endif /* BTREE_BUILD_STATS */
- /* initialize the btree index metadata page (if this is a new index) */
- if (oldPred == NULL)
- _bt_metapinit(index);
-
- /* get tuple descriptors for heap and index relations */
- htupdesc = RelationGetDescr(heap);
- itupdesc = RelationGetDescr(index);
-
/*
- * If this is a predicate (partial) index, we will need to evaluate
- * the predicate using ExecQual, which requires the current tuple to
- * be in a slot of a TupleTable. In addition, ExecQual must have an
- * ExprContext referring to that slot. Here, we initialize dummy
- * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92
- *
- * We construct the ExprContext anyway since we need a per-tuple
- * temporary memory context for function evaluation -- tgl July 00
+ * We expect to be called exactly once for any index relation. If
+ * that's not the case, big trouble's what we have.
*/
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- {
- tupleTable = ExecCreateTupleTable(1);
- slot = ExecAllocTableSlot(tupleTable);
- ExecSetSlotDescriptor(slot, htupdesc, false);
-
- /*
- * we never want to use sort/build if we are extending an existing
- * partial index -- it works by inserting the newly-qualifying
- * tuples into the existing index. (sort/build would overwrite the
- * existing index with one consisting of the newly-qualifying
- * tuples.)
- */
- usefast = false;
- }
- else
- {
- tupleTable = NULL;
- slot = NULL;
- }
- econtext = MakeExprContext(slot, TransactionCommandContext);
-#else
- econtext = MakeExprContext(NULL, TransactionCommandContext);
-#endif /* OMIT_PARTIAL_INDEX */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "%s already contains data",
+ RelationGetRelationName(index));
- /* build the index */
- nhtups = nitups = 0.0;
+ /* initialize the btree index metadata page */
+ _bt_metapinit(index);
- if (usefast)
+ if (buildstate.usefast)
{
- spool = _bt_spoolinit(index, indexInfo->ii_Unique);
-
+ buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique);
/*
- * Different from spool,the uniqueness isn't checked for spool2.
+ * Different from spool, the uniqueness isn't checked for spool2.
*/
if (indexInfo->ii_Unique)
- spool2 = _bt_spoolinit(index, false);
+ buildstate.spool2 = _bt_spoolinit(index, false);
}
- /* start a heap scan */
- dead_count = 0;
- snapshot = (IsBootstrapProcessingMode() ? SnapshotNow : SnapshotAny);
- hscan = heap_beginscan(heap, 0, snapshot, 0, (ScanKey) NULL);
- XmaxRecent = 0;
- if (snapshot == SnapshotAny)
- GetXmaxRecent(&XmaxRecent);
-
- while (HeapTupleIsValid(htup = heap_getnext(hscan, 0)))
- {
- if (snapshot == SnapshotAny)
- {
- tupleIsAlive = HeapTupleSatisfiesNow(htup->t_data);
- if (!tupleIsAlive)
- {
- if ((htup->t_data->t_infomask & HEAP_XMIN_INVALID) != 0)
- continue;
- if (htup->t_data->t_infomask & HEAP_XMAX_COMMITTED &&
- htup->t_data->t_xmax < XmaxRecent)
- continue;
- }
- }
- else
- tupleIsAlive = true;
-
- MemoryContextReset(econtext->ecxt_per_tuple_memory);
-
- nhtups += 1.0;
-
-#ifndef OMIT_PARTIAL_INDEX
-
- /*
- * If oldPred != NULL, this is an EXTEND INDEX command, so skip
- * this tuple if it was already in the existing partial index
- */
- if (oldPred != NULL)
- {
- slot->val = htup;
- if (ExecQual((List *) oldPred, econtext, false))
- {
- nitups += 1.0;
- continue;
- }
- }
-
- /*
- * Skip this tuple if it doesn't satisfy the partial-index
- * predicate
- */
- if (pred != NULL)
- {
- slot->val = htup;
- if (!ExecQual((List *) pred, econtext, false))
- continue;
- }
-#endif /* OMIT_PARTIAL_INDEX */
-
- nitups += 1.0;
-
- /*
- * For the current heap tuple, extract all the attributes we use
- * in this index, and note which are null.
- */
- FormIndexDatum(indexInfo,
- htup,
- htupdesc,
- econtext->ecxt_per_tuple_memory,
- attdata,
- nulls);
-
- /* form an index tuple and point it at the heap tuple */
- itup = index_formtuple(itupdesc, attdata, nulls);
-
- /*
- * If the single index key is null, we don't insert it into the
- * index. Btrees support scans on <, <=, =, >=, and >. Relational
- * algebra says that A op B (where op is one of the operators
- * above) returns null if either A or B is null. This means that
- * no qualification used in an index scan could ever return true
- * on a null attribute. It also means that indices can't be used
- * by ISNULL or NOTNULL scans, but that's an artifact of the
- * strategy map architecture chosen in 1986, not of the way nulls
- * are handled here.
- */
-
- /*
- * New comments: NULLs handling. While we can't do NULL
- * comparison, we can follow simple rule for ordering items on
- * btree pages - NULLs greater NOT_NULLs and NULL = NULL is TRUE.
- * Sure, it's just rule for placing/finding items and no more -
- * keytest'll return FALSE for a = 5 for items having 'a' isNULL.
- * Look at _bt_compare for how it works. - vadim 03/23/97
- *
- * if (itup->t_info & INDEX_NULL_MASK) { pfree(itup); continue; }
- */
-
- itup->t_tid = htup->t_self;
- btitem = _bt_formitem(itup);
-
- /*
- * if we are doing bottom-up btree build, we insert the index into
- * a spool file for subsequent processing. otherwise, we insert
- * into the btree.
- */
- if (usefast)
- {
- if (tupleIsAlive || !spool2)
- _bt_spool(btitem, spool);
- else
-/* dead tuples are put into spool2 */
- {
- dead_count++;
- _bt_spool(btitem, spool2);
- }
- }
- else
- res = _bt_doinsert(index, btitem, indexInfo->ii_Unique, heap);
-
- pfree(btitem);
- pfree(itup);
- if (res)
- pfree(res);
- }
+ /* do the heap scan */
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo,
+ btbuildCallback, (void *) &buildstate);
/* okay, all heap tuples are indexed */
- heap_endscan(hscan);
- if (spool2 && !dead_count) /* spool2 was found to be unnecessary */
+ if (buildstate.spool2 && !buildstate.haveDead)
{
- _bt_spooldestroy(spool2);
- spool2 = NULL;
+ /* spool2 turns out to be unnecessary */
+ _bt_spooldestroy(buildstate.spool2);
+ buildstate.spool2 = NULL;
}
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- ExecDropTupleTable(tupleTable, true);
-#endif /* OMIT_PARTIAL_INDEX */
- FreeExprContext(econtext);
-
/*
* if we are doing bottom-up btree build, finish the build by (1)
* completing the sort of the spool file, (2) inserting the sorted
* tuples into btree pages and (3) building the upper levels.
*/
- if (usefast)
+ if (buildstate.usefast)
{
- _bt_leafbuild(spool, spool2);
- _bt_spooldestroy(spool);
- if (spool2)
- _bt_spooldestroy(spool2);
+ _bt_leafbuild(buildstate.spool, buildstate.spool2);
+ _bt_spooldestroy(buildstate.spool);
+ if (buildstate.spool2)
+ _bt_spooldestroy(buildstate.spool2);
}
#ifdef BTREE_BUILD_STATS
@@ -325,6 +175,9 @@ btbuild(PG_FUNCTION_ARGS)
}
#endif /* BTREE_BUILD_STATS */
+ /* all done */
+ BuildingBtree = false;
+
/*
* Since we just counted the tuples in the heap, we update its stats
* in pg_class to guarantee that the planner takes advantage of the
@@ -343,20 +196,63 @@ btbuild(PG_FUNCTION_ARGS)
heap_close(heap, NoLock);
index_close(index);
- UpdateStats(hrelid, nhtups);
- UpdateStats(irelid, nitups);
- if (oldPred != NULL)
+ UpdateStats(hrelid, reltuples);
+ UpdateStats(irelid, buildstate.indtuples);
+ }
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Per-tuple callback from IndexBuildHeapScan
+ */
+static void
+btbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state)
+{
+ BTBuildState *buildstate = (BTBuildState *) state;
+ IndexTuple itup;
+ BTItem btitem;
+ InsertIndexResult res;
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(RelationGetDescr(index), attdata, nulls);
+ itup->t_tid = htup->t_self;
+
+ btitem = _bt_formitem(itup);
+
+ /*
+ * if we are doing bottom-up btree build, we insert the index into
+ * a spool file for subsequent processing. otherwise, we insert
+ * into the btree.
+ */
+ if (buildstate->usefast)
+ {
+ if (tupleIsAlive || buildstate->spool2 == NULL)
+ _bt_spool(btitem, buildstate->spool);
+ else
{
- if (nitups == nhtups)
- pred = NULL;
- UpdateIndexPredicate(irelid, oldPred, pred);
+ /* dead tuples are put into spool2 */
+ buildstate->haveDead = true;
+ _bt_spool(btitem, buildstate->spool2);
}
}
+ else
+ {
+ res = _bt_doinsert(index, btitem,
+ buildstate->isUnique, buildstate->heapRel);
+ if (res)
+ pfree(res);
+ }
- /* all done */
- BuildingBtree = false;
+ buildstate->indtuples += 1;
- PG_RETURN_VOID();
+ pfree(btitem);
+ pfree(itup);
}
/*
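
[Editor's note: the rewritten btbuild() above no longer runs its own heap scan. The shared loop now lives in IndexBuildHeapScan() in index.c, and the AM supplies a per-tuple callback plus an opaque state struct (BTBuildState). The sketch below is a minimal, self-contained C analogue of that callback-with-state pattern; all types and names (ToyTuple, toy_build_heap_scan, and so on) are invented for illustration and are not the real IndexBuildHeapScan API.]

#include <stdbool.h>
#include <stdio.h>

typedef struct ToyTuple { int key; bool alive; } ToyTuple;

typedef void (*BuildCallback)(const ToyTuple *tup, bool tupleIsAlive,
                              void *state);

/* stands in for IndexBuildHeapScan: the common heap-scan loop */
static double toy_build_heap_scan(const ToyTuple *heap, int ntuples,
                                  BuildCallback callback, void *state)
{
    double reltuples = 0;

    for (int i = 0; i < ntuples; i++)
    {
        reltuples += 1;
        callback(&heap[i], heap[i].alive, state);
    }
    return reltuples;
}

/* stands in for BTBuildState */
typedef struct { double indtuples; bool haveDead; } ToyBuildState;

/* stands in for btbuildCallback */
static void toy_callback(const ToyTuple *tup, bool tupleIsAlive, void *state)
{
    ToyBuildState *bs = (ToyBuildState *) state;

    if (!tupleIsAlive)
        bs->haveDead = true;        /* would go to spool2 in the real code */
    bs->indtuples += 1;
    printf("indexed key %d\n", tup->key);
}

int main(void)
{
    ToyTuple      heap[] = { {1, true}, {2, false}, {3, true} };
    ToyBuildState bs = { 0, false };
    double        reltuples = toy_build_heap_scan(heap, 3, toy_callback, &bs);

    printf("heap tuples %.0f, index tuples %.0f, haveDead=%d\n",
           reltuples, bs.indtuples, bs.haveDead);
    return 0;
}
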
@@ -423,8 +319,10 @@ btgettuple(PG_FUNCTION_ARGS)
/*
* Save heap TID to use it in _bt_restscan. Then release the read
- * lock on the buffer so that we aren't blocking other backends. NOTE:
- * we do keep the pin on the buffer!
+ * lock on the buffer so that we aren't blocking other backends.
+ *
+ * NOTE: we do keep the pin on the buffer! This is essential to ensure
+ * that someone else doesn't delete the index entry we are stopped on.
*/
if (res)
{
@@ -451,9 +349,6 @@ btbeginscan(PG_FUNCTION_ARGS)
/* get the scan */
scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
- /* register scan in case we change pages it's using */
- _bt_regscan(scan);
-
PG_RETURN_POINTER(scan);
}
@@ -571,8 +466,6 @@ btendscan(PG_FUNCTION_ARGS)
pfree(so->keyData);
pfree(so);
- _bt_dropscan(scan);
-
PG_RETURN_VOID();
}
@@ -640,20 +533,127 @@ btrestrpos(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
-/* stubs */
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
Datum
-btdelete(PG_FUNCTION_ARGS)
+btbulkdelete(PG_FUNCTION_ARGS)
{
Relation rel = (Relation) PG_GETARG_POINTER(0);
- ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1);
+ IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
+ void *callback_state = (void *) PG_GETARG_POINTER(2);
+ IndexBulkDeleteResult *result;
+ BlockNumber num_pages;
+ double tuples_removed;
+ double num_index_tuples;
+ RetrieveIndexResult res;
+ IndexScanDesc scan;
+ BTScanOpaque so;
+ ItemPointer current;
+
+ tuples_removed = 0;
+ num_index_tuples = 0;
+
+ /*
+ * We use a standard IndexScanDesc scan object, but to speed up the loop,
+ * we skip most of the wrapper layers of index_getnext and instead call
+ * _bt_step directly. This implies holding buffer lock on a target page
+ * throughout the loop over the page's tuples. Initially, we have a read
+ * lock acquired by _bt_step when we stepped onto the page. If we find
+ * a tuple we need to delete, we trade in the read lock for an exclusive
+ * write lock; after that, we hold the write lock until we step off the
+ * page (fortunately, _bt_relbuf doesn't care which kind of lock it's
+ * releasing). This should minimize the amount of work needed per page.
+ */
+ scan = index_beginscan(rel, false, 0, (ScanKey) NULL);
+ so = (BTScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
- /* adjust any active scans that will be affected by this deletion */
- _bt_adjscans(rel, tid);
+ /* Use _bt_first to get started, then _bt_step to remaining tuples */
+ res = _bt_first(scan, ForwardScanDirection);
- /* delete the data from the page */
- _bt_pagedel(rel, tid);
+ if (res != NULL)
+ {
+ Buffer buf;
+ BlockNumber lockedBlock = InvalidBlockNumber;
- PG_RETURN_VOID();
+ pfree(res);
+ /* we have the buffer pinned and locked */
+ buf = so->btso_curbuf;
+ Assert(BufferIsValid(buf));
+
+ do
+ {
+ Page page;
+ BlockNumber blkno;
+ OffsetNumber offnum;
+ BTItem btitem;
+ IndexTuple itup;
+ ItemPointer htup;
+
+ /* current is the next index tuple */
+ blkno = ItemPointerGetBlockNumber(current);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &btitem->bti_itup;
+ htup = &(itup->t_tid);
+
+ if (callback(htup, callback_state))
+ {
+ /*
+ * If this is the first deletion on this page, trade in the read
+ * lock for a really-exclusive write lock. Then, step back
+ * one and re-examine the item, because someone else might
+ * have inserted an item while we weren't holding the lock!
+ */
+ if (blkno != lockedBlock)
+ {
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBufferForCleanup(buf);
+ lockedBlock = blkno;
+ }
+ else
+ {
+ /* Delete the item from the page */
+ _bt_itemdel(rel, buf, current);
+
+ /* Mark buffer dirty, but keep the lock and pin */
+ WriteNoReleaseBuffer(buf);
+
+ tuples_removed += 1;
+ }
+
+ /*
+ * We need to back up the scan one item so that the next
+ * cycle will re-examine the same offnum on this page.
+ *
+ * For now, just hack the current-item index. Will need
+ * to be smarter when deletion includes removal of empty
+ * index pages.
+ */
+ current->ip_posid--;
+ }
+ else
+ num_index_tuples += 1;
+ } while (_bt_step(scan, &buf, ForwardScanDirection));
+ }
+
+ index_endscan(scan);
+
+ /* return statistics */
+ num_pages = RelationGetNumberOfBlocks(rel);
+
+ result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
+ result->num_pages = num_pages;
+ result->tuples_removed = tuples_removed;
+ result->num_index_tuples = num_index_tuples;
+
+ PG_RETURN_POINTER(result);
}
/*
@@ -676,7 +676,7 @@ _bt_restscan(IndexScanDesc scan)
/*
* Get back the read lock we were holding on the buffer. (We still
- * have a reference-count pin on it, though.)
+ * have a reference-count pin on it, so need not get that.)
*/
LockBuffer(buf, BT_READ);
@@ -729,7 +729,7 @@ _bt_restscan(IndexScanDesc scan)
"\n\tRecreate index %s.", RelationGetRelationName(rel));
blkno = opaque->btpo_next;
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
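
[Editor's note on nbtree.c: btdelete(), which removed one index entry per call, is replaced by btbulkdelete(). The scan over the index now happens inside the access method, and a caller-supplied callback decides, per heap TID, whether an entry should go. The toy program below -- invented names, a plain array instead of btree pages -- shows the overall shape: scan, ask the callback, count removals and survivors, and return the statistics that VACUUM displays.]

#include <stdbool.h>
#include <stdio.h>

typedef struct { int heap_tid; } ToyIndexEntry;

typedef bool (*BulkDeleteCallback)(int heap_tid, void *callback_state);

typedef struct
{
    double tuples_removed;
    double num_index_tuples;        /* entries left behind */
} ToyBulkDeleteResult;

static ToyBulkDeleteResult
toy_bulkdelete(ToyIndexEntry *entries, int *nentries,
               BulkDeleteCallback callback, void *callback_state)
{
    ToyBulkDeleteResult result = { 0, 0 };
    int keep = 0;

    for (int i = 0; i < *nentries; i++)
    {
        if (callback(entries[i].heap_tid, callback_state))
            result.tuples_removed += 1;         /* drop this entry */
        else
        {
            entries[keep++] = entries[i];       /* keep this entry */
            result.num_index_tuples += 1;
        }
    }
    *nentries = keep;
    return result;
}

/* callback: in VACUUM this would consult the list of dead heap TIDs */
static bool is_dead(int heap_tid, void *callback_state)
{
    int dead_above = *(int *) callback_state;

    return heap_tid > dead_above;
}

int main(void)
{
    ToyIndexEntry entries[] = { {10}, {20}, {30}, {40} };
    int n = 4;
    int threshold = 20;
    ToyBulkDeleteResult r = toy_bulkdelete(entries, &n, is_dead, &threshold);

    printf("removed %.0f, remaining %.0f\n",
           r.tuples_removed, r.num_index_tuples);
    return 0;
}
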
diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c
deleted file mode 100644
index e07914b344..0000000000
--- a/src/backend/access/nbtree/nbtscan.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * btscan.c
- * manage scans on btrees.
- *
- * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.33 2001/01/24 19:42:48 momjian Exp $
- *
- *
- * NOTES
- * Because we can be doing an index scan on a relation while we update
- * it, we need to avoid missing data that moves around in the index.
- * Insertions and page splits are no problem because _bt_restscan()
- * can figure out where the current item moved to, but if a deletion
- * happens at or before the current scan position, we'd better do
- * something to stay in sync.
- *
- * The routines in this file handle the problem for deletions issued
- * by the current backend. Currently, that's all we need, since
- * deletions are only done by VACUUM and it gets an exclusive lock.
- *
- * The scheme is to manage a list of active scans in the current backend.
- * Whenever we remove a record from an index, we check the list of active
- * scans to see if any has been affected. A scan is affected only if it
- * is on the same relation, and the same page, as the update.
- *
- *-------------------------------------------------------------------------
- */
-
-#include "postgres.h"
-
-#include "access/nbtree.h"
-
-typedef struct BTScanListData
-{
- IndexScanDesc btsl_scan;
- struct BTScanListData *btsl_next;
-} BTScanListData;
-
-typedef BTScanListData *BTScanList;
-
-static BTScanList BTScans = (BTScanList) NULL;
-
-static void _bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
-
-/*
- * AtEOXact_nbtree() --- clean up nbtree subsystem at xact abort or commit.
- *
- * This is here because it needs to touch this module's static var BTScans.
- */
-void
-AtEOXact_nbtree(void)
-{
-
- /*
- * Note: these actions should only be necessary during xact abort; but
- * they can't hurt during a commit.
- */
-
- /*
- * Reset the active-scans list to empty. We do not need to free the
- * list elements, because they're all palloc()'d, so they'll go away
- * at end of transaction anyway.
- */
- BTScans = NULL;
-
- /* If we were building a btree, we ain't anymore. */
- BuildingBtree = false;
-}
-
-/*
- * _bt_regscan() -- register a new scan.
- */
-void
-_bt_regscan(IndexScanDesc scan)
-{
- BTScanList new_el;
-
- new_el = (BTScanList) palloc(sizeof(BTScanListData));
- new_el->btsl_scan = scan;
- new_el->btsl_next = BTScans;
- BTScans = new_el;
-}
-
-/*
- * _bt_dropscan() -- drop a scan from the scan list
- */
-void
-_bt_dropscan(IndexScanDesc scan)
-{
- BTScanList chk,
- last;
-
- last = (BTScanList) NULL;
- for (chk = BTScans;
- chk != (BTScanList) NULL && chk->btsl_scan != scan;
- chk = chk->btsl_next)
- last = chk;
-
- if (chk == (BTScanList) NULL)
- elog(ERROR, "btree scan list trashed; can't find 0x%p", (void *) scan);
-
- if (last == (BTScanList) NULL)
- BTScans = chk->btsl_next;
- else
- last->btsl_next = chk->btsl_next;
-
- pfree(chk);
-}
-
-/*
- * _bt_adjscans() -- adjust all scans in the scan list to compensate
- * for a given deletion
- */
-void
-_bt_adjscans(Relation rel, ItemPointer tid)
-{
- BTScanList l;
- Oid relid;
-
- relid = RelationGetRelid(rel);
- for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next)
- {
- if (relid == RelationGetRelid(l->btsl_scan->relation))
- _bt_scandel(l->btsl_scan,
- ItemPointerGetBlockNumber(tid),
- ItemPointerGetOffsetNumber(tid));
- }
-}
-
-/*
- * _bt_scandel() -- adjust a single scan on deletion
- *
- */
-static void
-_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
-{
- ItemPointer current;
- Buffer buf;
- BTScanOpaque so;
- OffsetNumber start;
- Page page;
- BTPageOpaque opaque;
-
- so = (BTScanOpaque) scan->opaque;
- buf = so->btso_curbuf;
-
- current = &(scan->currentItemData);
- if (ItemPointerIsValid(current)
- && ItemPointerGetBlockNumber(current) == blkno
- && ItemPointerGetOffsetNumber(current) >= offno)
- {
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- start = P_FIRSTDATAKEY(opaque);
- if (ItemPointerGetOffsetNumber(current) == start)
- ItemPointerSetInvalid(&(so->curHeapIptr));
- else
- {
-
- /*
- * We have to lock buffer before _bt_step and unlock it after
- * that.
- */
- LockBuffer(buf, BT_READ);
- _bt_step(scan, &buf, BackwardScanDirection);
- if (ItemPointerIsValid(current))
- {
- Page pg = BufferGetPage(buf);
- BTItem btitem = (BTItem) PageGetItem(pg,
- PageGetItemId(pg, ItemPointerGetOffsetNumber(current)));
-
- so->curHeapIptr = btitem->bti_itup.t_tid;
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- }
- }
- }
-
- current = &(scan->currentMarkData);
- if (ItemPointerIsValid(current)
- && ItemPointerGetBlockNumber(current) == blkno
- && ItemPointerGetOffsetNumber(current) >= offno)
- {
- page = BufferGetPage(so->btso_mrkbuf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- start = P_FIRSTDATAKEY(opaque);
-
- if (ItemPointerGetOffsetNumber(current) == start)
- ItemPointerSetInvalid(&(so->mrkHeapIptr));
- else
- {
- ItemPointerData tmp;
-
- tmp = *current;
- *current = scan->currentItemData;
- scan->currentItemData = tmp;
- so->btso_curbuf = so->btso_mrkbuf;
- so->btso_mrkbuf = buf;
- buf = so->btso_curbuf;
- LockBuffer(buf, BT_READ); /* as above */
-
- _bt_step(scan, &buf, BackwardScanDirection);
-
- so->btso_curbuf = so->btso_mrkbuf;
- so->btso_mrkbuf = buf;
- tmp = *current;
- *current = scan->currentItemData;
- scan->currentItemData = tmp;
- if (ItemPointerIsValid(current))
- {
- Page pg = BufferGetPage(buf);
- BTItem btitem = (BTItem) PageGetItem(pg,
- PageGetItemId(pg, ItemPointerGetOffsetNumber(current)));
-
- so->mrkHeapIptr = btitem->bti_itup.t_tid;
- LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* as above */
- }
- }
- }
-}
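
[Editor's note: nbtscan.c can be deleted because the only remaining in-backend deleter is btbulkdelete() itself, which simply backs its own scan position up one slot after each deletion (the current->ip_posid-- hack above) rather than notifying a backend-wide list of active scans. The toy loop below, over a plain array, demonstrates that back-up-and-re-examine trick; it is not btree code.]

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    int items[] = { 1, 2, 3, 4, 5 };
    int nitems = 5;
    int pos = 0;

    while (pos < nitems)
    {
        bool kill = (items[pos] % 2 == 0);      /* delete even keys */

        if (kill)
        {
            for (int i = pos; i < nitems - 1; i++)
                items[i] = items[i + 1];        /* PageIndexTupleDelete analogue */
            nitems--;
            pos--;                              /* re-examine this offset */
        }
        pos++;                                  /* the normal step forward */
    }

    for (int i = 0; i < nitems; i++)
        printf("%d ", items[i]);                /* prints: 1 3 5 */
    printf("\n");
    return 0;
}
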
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 59bf5358e4..295387ed51 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.66 2001/03/23 04:49:51 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.67 2001/07/15 22:48:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -94,7 +94,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
new_stack->bts_parent = stack_in;
/* drop the read lock on the parent page, acquire one on the child */
- _bt_relbuf(rel, *bufP, BT_READ);
+ _bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
/*
@@ -155,7 +155,7 @@ _bt_moveright(Relation rel,
/* step right one page */
BlockNumber rblkno = opaque->btpo_next;
- _bt_relbuf(rel, buf, access);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, rblkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -406,7 +406,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
/* No more items, so close down the current-item info */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
return (RetrieveIndexResult) NULL;
}
@@ -760,7 +760,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
nomatches:
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
res = (RetrieveIndexResult) NULL;
}
@@ -815,14 +815,14 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
/* if we're at end of scan, release the buffer and return */
if (P_RIGHTMOST(opaque))
{
- _bt_relbuf(rel, *bufP, BT_READ);
+ _bt_relbuf(rel, *bufP);
ItemPointerSetInvalid(current);
*bufP = so->btso_curbuf = InvalidBuffer;
return false;
}
/* step right one page */
blkno = opaque->btpo_next;
- _bt_relbuf(rel, *bufP, BT_READ);
+ _bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -846,7 +846,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
/* if we're at end of scan, release the buffer and return */
if (P_LEFTMOST(opaque))
{
- _bt_relbuf(rel, *bufP, BT_READ);
+ _bt_relbuf(rel, *bufP);
ItemPointerSetInvalid(current);
*bufP = so->btso_curbuf = InvalidBuffer;
return false;
@@ -854,7 +854,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
/* step left */
obknum = BufferGetBlockNumber(*bufP);
blkno = opaque->btpo_prev;
- _bt_relbuf(rel, *bufP, BT_READ);
+ _bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -868,7 +868,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
while (opaque->btpo_next != obknum)
{
blkno = opaque->btpo_next;
- _bt_relbuf(rel, *bufP, BT_READ);
+ _bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -952,7 +952,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
itup = &(btitem->bti_itup);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
@@ -968,7 +968,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
do
{
blkno = opaque->btpo_next;
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1035,7 +1035,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
/* no tuples in the index match this scan key */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
res = (RetrieveIndexResult) NULL;
}
diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c
index a8c6a13ea3..21831ef5d6 100644
--- a/src/backend/access/rtree/rtree.c
+++ b/src/backend/access/rtree/rtree.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.62 2001/05/07 00:43:16 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.63 2001/07/15 22:48:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -62,7 +62,20 @@ typedef struct RTSTATE
FmgrInfo interFn; /* intersection function */
} RTSTATE;
+/* Working state for rtbuild and its callback */
+typedef struct
+{
+ RTSTATE rtState;
+ double indtuples;
+} RTBuildState;
+
/* non-export function prototypes */
+static void rtbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state);
static InsertIndexResult rtdoinsert(Relation r, IndexTuple itup,
RTSTATE *rtstate);
static void rttighten(Relation r, RTSTACK *stk, Datum datum, int att_size,
@@ -81,165 +94,44 @@ static int nospace(Page p, IndexTuple it);
static void initRtstate(RTSTATE *rtstate, Relation index);
+/*
+ * routine to build an index. Basically calls insert over and over
+ */
Datum
rtbuild(PG_FUNCTION_ARGS)
{
Relation heap = (Relation) PG_GETARG_POINTER(0);
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
- Node *oldPred = (Node *) PG_GETARG_POINTER(3);
-
-#ifdef NOT_USED
- IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4);
-
-#endif
- HeapScanDesc hscan;
- HeapTuple htup;
- IndexTuple itup;
- TupleDesc htupdesc,
- itupdesc;
- Datum attdata[INDEX_MAX_KEYS];
- char nulls[INDEX_MAX_KEYS];
- double nhtups,
- nitups;
- Node *pred = indexInfo->ii_Predicate;
-
-#ifndef OMIT_PARTIAL_INDEX
- TupleTable tupleTable;
- TupleTableSlot *slot;
+ double reltuples;
+ RTBuildState buildstate;
+ Buffer buffer;
-#endif
- ExprContext *econtext;
- InsertIndexResult res = NULL;
- Buffer buffer = InvalidBuffer;
- RTSTATE rtState;
+ /* no locking is needed */
- initRtstate(&rtState, index);
+ initRtstate(&buildstate.rtState, index);
/*
* We expect to be called exactly once for any index relation. If
* that's not the case, big trouble's what we have.
*/
- if (oldPred == NULL && RelationGetNumberOfBlocks(index) != 0)
- elog(ERROR, "%s already contains data", RelationGetRelationName(index));
-
- /* initialize the root page (if this is a new index) */
- if (oldPred == NULL)
- {
- buffer = ReadBuffer(index, P_NEW);
- RTInitBuffer(buffer, F_LEAF);
- WriteBuffer(buffer);
- }
-
- /* get tuple descriptors for heap and index relations */
- htupdesc = RelationGetDescr(heap);
- itupdesc = RelationGetDescr(index);
-
- /*
- * If this is a predicate (partial) index, we will need to evaluate
- * the predicate using ExecQual, which requires the current tuple to
- * be in a slot of a TupleTable. In addition, ExecQual must have an
- * ExprContext referring to that slot. Here, we initialize dummy
- * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92
- *
- * We construct the ExprContext anyway since we need a per-tuple
- * temporary memory context for function evaluation -- tgl July 00
- */
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- {
- tupleTable = ExecCreateTupleTable(1);
- slot = ExecAllocTableSlot(tupleTable);
- ExecSetSlotDescriptor(slot, htupdesc, false);
- }
- else
- {
- tupleTable = NULL;
- slot = NULL;
- }
- econtext = MakeExprContext(slot, TransactionCommandContext);
-#else
- econtext = MakeExprContext(NULL, TransactionCommandContext);
-#endif /* OMIT_PARTIAL_INDEX */
-
- /* count the tuples as we insert them */
- nhtups = nitups = 0.0;
-
- /* start a heap scan */
- hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL);
-
- while (HeapTupleIsValid(htup = heap_getnext(hscan, 0)))
- {
- MemoryContextReset(econtext->ecxt_per_tuple_memory);
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "%s already contains data",
+ RelationGetRelationName(index));
- nhtups += 1.0;
-
-#ifndef OMIT_PARTIAL_INDEX
-
- /*
- * If oldPred != NULL, this is an EXTEND INDEX command, so skip
- * this tuple if it was already in the existing partial index
- */
- if (oldPred != NULL)
- {
- slot->val = htup;
- if (ExecQual((List *) oldPred, econtext, false))
- {
- nitups += 1.0;
- continue;
- }
- }
-
- /*
- * Skip this tuple if it doesn't satisfy the partial-index
- * predicate
- */
- if (pred != NULL)
- {
- slot->val = htup;
- if (!ExecQual((List *) pred, econtext, false))
- continue;
- }
-#endif /* OMIT_PARTIAL_INDEX */
-
- nitups += 1.0;
-
- /*
- * For the current heap tuple, extract all the attributes we use
- * in this index, and note which are null.
- */
- FormIndexDatum(indexInfo,
- htup,
- htupdesc,
- econtext->ecxt_per_tuple_memory,
- attdata,
- nulls);
-
- /* form an index tuple and point it at the heap tuple */
- itup = index_formtuple(itupdesc, attdata, nulls);
- itup->t_tid = htup->t_self;
+ /* initialize the root page */
+ buffer = ReadBuffer(index, P_NEW);
+ RTInitBuffer(buffer, F_LEAF);
+ WriteBuffer(buffer);
- /*
- * Since we already have the index relation locked, we call
- * rtdoinsert directly. Normal access method calls dispatch
- * through rtinsert, which locks the relation for write. This is
- * the right thing to do if you're inserting single tups, but not
- * when you're initializing the whole index at once.
- */
+ /* build the index */
+ buildstate.indtuples = 0;
- res = rtdoinsert(index, itup, &rtState);
- pfree(itup);
- pfree(res);
- }
+ /* do the heap scan */
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo,
+ rtbuildCallback, (void *) &buildstate);
/* okay, all heap tuples are indexed */
- heap_endscan(hscan);
-
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- ExecDropTupleTable(tupleTable, true);
-#endif /* OMIT_PARTIAL_INDEX */
- FreeExprContext(econtext);
/*
* Since we just counted the tuples in the heap, we update its stats
@@ -259,20 +151,57 @@ rtbuild(PG_FUNCTION_ARGS)
heap_close(heap, NoLock);
index_close(index);
- UpdateStats(hrelid, nhtups);
- UpdateStats(irelid, nitups);
- if (oldPred != NULL)
- {
- if (nitups == nhtups)
- pred = NULL;
- UpdateIndexPredicate(irelid, oldPred, pred);
- }
+ UpdateStats(hrelid, reltuples);
+ UpdateStats(irelid, buildstate.indtuples);
}
PG_RETURN_VOID();
}
/*
+ * Per-tuple callback from IndexBuildHeapScan
+ */
+static void
+rtbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state)
+{
+ RTBuildState *buildstate = (RTBuildState *) state;
+ IndexTuple itup;
+ InsertIndexResult res;
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(RelationGetDescr(index), attdata, nulls);
+ itup->t_tid = htup->t_self;
+
+ /* rtree indexes don't index nulls, see notes in rtinsert */
+ if (IndexTupleHasNulls(itup))
+ {
+ pfree(itup);
+ return;
+ }
+
+ /*
+ * Since we already have the index relation locked, we call
+ * rtdoinsert directly. Normal access method calls dispatch
+ * through rtinsert, which locks the relation for write. This is
+ * the right thing to do if you're inserting single tups, but not
+ * when you're initializing the whole index at once.
+ */
+ res = rtdoinsert(index, itup, &buildstate->rtState);
+
+ if (res)
+ pfree(res);
+
+ buildstate->indtuples += 1;
+
+ pfree(itup);
+}
+
+/*
* rtinsert -- wrapper for rtree tuple insertion.
*
* This is the public interface routine for tuple insertion in rtrees.
@@ -285,10 +214,8 @@ rtinsert(PG_FUNCTION_ARGS)
Datum *datum = (Datum *) PG_GETARG_POINTER(1);
char *nulls = (char *) PG_GETARG_POINTER(2);
ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
-
#ifdef NOT_USED
Relation heapRel = (Relation) PG_GETARG_POINTER(4);
-
#endif
InsertIndexResult res;
IndexTuple itup;
@@ -297,12 +224,24 @@ rtinsert(PG_FUNCTION_ARGS)
/* generate an index tuple */
itup = index_formtuple(RelationGetDescr(r), datum, nulls);
itup->t_tid = *ht_ctid;
+
+ /*
+ * Currently, rtrees do not support indexing NULLs; considerable
+ * infrastructure work would have to be done to do anything reasonable
+ * with a NULL.
+ */
+ if (IndexTupleHasNulls(itup))
+ {
+ pfree(itup);
+ PG_RETURN_POINTER((InsertIndexResult) NULL);
+ }
+
initRtstate(&rtState, r);
/*
- * Notes in ExecUtils:ExecOpenIndices()
- *
- * RelationSetLockForWrite(r);
+ * Since rtree is not marked "amconcurrent" in pg_am, caller should
+ * have acquired exclusive lock on index relation. We need no locking
+ * here.
*/
res = rtdoinsert(r, itup, &rtState);
@@ -1104,40 +1043,92 @@ freestack(RTSTACK *s)
}
}
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
Datum
-rtdelete(PG_FUNCTION_ARGS)
+rtbulkdelete(PG_FUNCTION_ARGS)
{
- Relation r = (Relation) PG_GETARG_POINTER(0);
- ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1);
- BlockNumber blkno;
- OffsetNumber offnum;
- Buffer buf;
- Page page;
+ Relation rel = (Relation) PG_GETARG_POINTER(0);
+ IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
+ void *callback_state = (void *) PG_GETARG_POINTER(2);
+ IndexBulkDeleteResult *result;
+ BlockNumber num_pages;
+ double tuples_removed;
+ double num_index_tuples;
+ RetrieveIndexResult res;
+ IndexScanDesc iscan;
+
+ tuples_removed = 0;
+ num_index_tuples = 0;
/*
- * Notes in ExecUtils:ExecOpenIndices() Also note that only vacuum
- * deletes index tuples now...
- *
- * RelationSetLockForWrite(r);
+ * Since rtree is not marked "amconcurrent" in pg_am, caller should
+ * have acquired exclusive lock on index relation. We need no locking
+ * here.
*/
- blkno = ItemPointerGetBlockNumber(tid);
- offnum = ItemPointerGetOffsetNumber(tid);
+ /*
+ * XXX generic implementation --- should be improved!
+ */
- /* adjust any scans that will be affected by this deletion */
- rtadjscans(r, RTOP_DEL, blkno, offnum);
+ /* walk through the entire index */
+ iscan = index_beginscan(rel, false, 0, (ScanKey) NULL);
- /* delete the index tuple */
- buf = ReadBuffer(r, blkno);
- page = BufferGetPage(buf);
+ while ((res = index_getnext(iscan, ForwardScanDirection))
+ != (RetrieveIndexResult) NULL)
+ {
+ ItemPointer heapptr = &res->heap_iptr;
- PageIndexTupleDelete(page, offnum);
+ if (callback(heapptr, callback_state))
+ {
+ ItemPointer indexptr = &res->index_iptr;
+ BlockNumber blkno;
+ OffsetNumber offnum;
+ Buffer buf;
+ Page page;
- WriteBuffer(buf);
+ blkno = ItemPointerGetBlockNumber(indexptr);
+ offnum = ItemPointerGetOffsetNumber(indexptr);
- PG_RETURN_VOID();
+ /* adjust any scans that will be affected by this deletion */
+ /* (namely, my own scan) */
+ rtadjscans(rel, RTOP_DEL, blkno, offnum);
+
+ /* delete the index tuple */
+ buf = ReadBuffer(rel, blkno);
+ page = BufferGetPage(buf);
+
+ PageIndexTupleDelete(page, offnum);
+
+ WriteBuffer(buf);
+
+ tuples_removed += 1;
+ }
+ else
+ num_index_tuples += 1;
+
+ pfree(res);
+ }
+
+ index_endscan(iscan);
+
+ /* return statistics */
+ num_pages = RelationGetNumberOfBlocks(rel);
+
+ result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
+ result->num_pages = num_pages;
+ result->tuples_removed = tuples_removed;
+ result->num_index_tuples = num_index_tuples;
+
+ PG_RETURN_POINTER(result);
}
+
static void
initRtstate(RTSTATE *rtstate, Relation index)
{
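
[Editor's note on rtree.c: rtree now refuses to store NULL keys. Both rtbuildCallback() and rtinsert() drop a tuple whose index key is NULL, since there is no sensible bounding box to compute for it. Below is a tiny standalone sketch of that guard, with made-up types standing in for IndexTupleHasNulls() and rtdoinsert().]

#include <stdbool.h>
#include <stdio.h>

typedef struct { double lo, hi; bool isnull; } ToyKey;

static bool toy_tuple_has_nulls(const ToyKey *key) { return key->isnull; }

/* stands in for the guarded rtinsert/rtbuildCallback path */
static bool toy_rtree_insert(const ToyKey *key)
{
    if (toy_tuple_has_nulls(key))
        return false;               /* skip: nothing reasonable to index */
    printf("indexed [%g, %g]\n", key->lo, key->hi);
    return true;
}

int main(void)
{
    ToyKey keys[] = { {1, 2, false}, {0, 0, true}, {3, 7, false} };
    int stored = 0;

    for (int i = 0; i < 3; i++)
        if (toy_rtree_insert(&keys[i]))
            stored++;

    printf("stored %d of 3\n", stored);         /* NULL key was skipped */
    return 0;
}
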
diff --git a/src/backend/access/rtree/rtscan.c b/src/backend/access/rtree/rtscan.c
index c9f1ab7b89..1311cfdc29 100644
--- a/src/backend/access/rtree/rtscan.c
+++ b/src/backend/access/rtree/rtscan.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtscan.c,v 1.37 2001/06/09 18:16:56 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtscan.c,v 1.38 2001/07/15 22:48:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -60,13 +60,8 @@ rtbeginscan(PG_FUNCTION_ARGS)
ScanKey key = (ScanKey) PG_GETARG_POINTER(3);
IndexScanDesc s;
- /*
- * Let index_beginscan does its work...
- *
- * RelationSetLockForRead(r);
- */
-
s = RelationGetIndexScan(r, fromEnd, nkeys, key);
+
rtregscan(s);
PG_RETURN_POINTER(s);
@@ -282,6 +277,27 @@ rtdropscan(IndexScanDesc s)
pfree(l);
}
+/*
+ * AtEOXact_rtree() --- clean up rtree subsystem at xact abort or commit.
+ *
+ * This is here because it needs to touch this module's static var RTScans.
+ */
+void
+AtEOXact_rtree(void)
+{
+ /*
+ * Note: these actions should only be necessary during xact abort; but
+ * they can't hurt during a commit.
+ */
+
+ /*
+ * Reset the active-scans list to empty. We do not need to free the
+ * list elements, because they're all palloc()'d, so they'll go away
+ * at end of transaction anyway.
+ */
+ RTScans = NULL;
+}
+
void
rtadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum)
{
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 6467179231..d32a6dda97 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.106 2001/07/12 04:11:13 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.107 2001/07/15 22:48:16 tgl Exp $
*
* NOTES
* Transaction aborts can now occur two ways:
@@ -156,7 +156,10 @@
#include <sys/time.h>
+#include "access/gistscan.h"
+#include "access/hash.h"
#include "access/nbtree.h"
+#include "access/rtree.h"
#include "access/xact.h"
#include "catalog/heap.h"
#include "catalog/index.h"
@@ -1040,7 +1043,10 @@ CommitTransaction(void)
smgrDoPendingDeletes(true);
AtEOXact_SPI();
+ AtEOXact_gist();
+ AtEOXact_hash();
AtEOXact_nbtree();
+ AtEOXact_rtree();
AtCommit_Cache();
AtCommit_Locks();
AtEOXact_CatCache(true);
@@ -1147,7 +1153,10 @@ AbortTransaction(void)
smgrDoPendingDeletes(false);
AtEOXact_SPI();
+ AtEOXact_gist();
+ AtEOXact_hash();
AtEOXact_nbtree();
+ AtEOXact_rtree();
AtAbort_Cache();
AtEOXact_CatCache(false);
AtAbort_Memory();
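
[Editor's note: the xact.c hunks wire the new AtEOXact_gist/hash/nbtree/rtree() hooks into both CommitTransaction() and AbortTransaction(), so static module state such as the rtree active-scan list cannot survive an error. The sketch below shows the general reset-hook pattern with invented names; in the real code the list nodes are palloc'd and vanish with the transaction memory context, so the hook only needs to clear the pointer.]

#include <stdio.h>
#include <stdlib.h>

typedef struct ToyScan { int id; struct ToyScan *next; } ToyScan;

static ToyScan *ActiveScans = NULL;     /* static per-transaction state */

static void toy_regscan(int id)
{
    ToyScan *s = malloc(sizeof(ToyScan));

    if (s == NULL)
        return;
    s->id = id;
    s->next = ActiveScans;
    ActiveScans = s;
}

/* analogue of AtEOXact_rtree: called at commit and at abort */
static void toy_AtEOXact(void)
{
    /* here we free explicitly; the real nodes go away with the xact context */
    while (ActiveScans)
    {
        ToyScan *next = ActiveScans->next;

        free(ActiveScans);
        ActiveScans = next;
    }
}

int main(void)
{
    toy_regscan(1);
    toy_regscan(2);
    /* an elog(ERROR) here would longjmp out with the list still populated */
    toy_AtEOXact();             /* commit/abort cleanup leaves no stale state */
    printf("active scans after end of xact: %s\n",
           ActiveScans == NULL ? "none" : "stale!");
    return 0;
}
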