summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author    Tom Lane    2008-03-15 20:46:31 +0000
committer Tom Lane    2008-03-15 20:46:31 +0000
commit    a4d2e19e4eedeabb0187e3e9b3f9c6e12f10a455 (patch)
tree      1363764fa137ad1669317300bddd98655a39ff44
parent    aad05c328511b2de5dd29e9ac56f079bb496a83c (diff)
Change hash index creation so that rather than always establishing exactly
two buckets at the start, we create a number of buckets appropriate for the estimated size of the table. This avoids a lot of expensive bucket-split actions during initial index build on an already-populated table. This is one of the two core ideas of Tom Raney and Shreya Bhargava's patch to reduce hash index build time. I'm committing it separately to make it easier for people to test the effects of this separately from the effects of their other core idea (pre-sorting the index entries by bucket number).
-rw-r--r--  src/backend/access/hash/README        | 11
-rw-r--r--  src/backend/access/hash/hash.c        |  9
-rw-r--r--  src/backend/access/hash/hashpage.c    | 57
-rw-r--r--  src/backend/optimizer/util/plancat.c  |  4
-rw-r--r--  src/include/access/hash.h             |  2
-rw-r--r--  src/include/optimizer/plancat.h       |  4
6 files changed, 64 insertions(+), 23 deletions(-)
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 5737a7192c..ebc01c32f5 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -65,6 +65,11 @@ hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow
pages appearing between the bucket page groups of splitpoints N and N+1.
+(Note: the above describes what happens when filling an initially minimally
+sized hash index. In practice, we try to estimate the required index size
+and allocate a suitable number of splitpoints immediately, to avoid
+expensive re-splitting during initial index build.)
+
When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed
@@ -101,9 +106,9 @@ includes the bitmap pages, which is the reason for saying that bitmap
pages are a subset of the overflow pages. It turns out in fact that each
bitmap page's first bit represents itself --- this is not an essential
property, but falls out of the fact that we only allocate another bitmap
-page when we really need one. Bit number zero always corresponds to block
-number 3, which is the first bitmap page and is allocated during index
-creation.
+page when we really need one. Bit number zero always corresponds to the
+first bitmap page, which is allocated during index creation just after all
+the initially created buckets.
Lock definitions
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index fce7af2d92..24293d7c86 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -22,6 +22,7 @@
#include "access/hash.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
+#include "optimizer/plancat.h"
/* Working state for hashbuild and its callback */
@@ -48,6 +49,7 @@ hashbuild(PG_FUNCTION_ARGS)
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
IndexBuildResult *result;
+ BlockNumber relpages;
double reltuples;
HashBuildState buildstate;
@@ -59,8 +61,11 @@ hashbuild(PG_FUNCTION_ARGS)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
- /* initialize the hash index metadata page */
- _hash_metapinit(index);
+ /* estimate the number of rows currently present in the table */
+ estimate_rel_size(heap, NULL, &relpages, &reltuples);
+
+ /* initialize the hash index metadata page and initial buckets */
+ _hash_metapinit(index, reltuples);
/* build the index */
buildstate.indtuples = 0;
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 63c7795a49..db59d26c23 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -312,15 +312,17 @@ _hash_chgbufaccess(Relation rel,
/*
* _hash_metapinit() -- Initialize the metadata page of a hash index,
- * the two buckets that we begin with and the initial
- * bitmap page.
+ * the initial buckets, and the initial bitmap page.
+ *
+ * The initial number of buckets is dependent on num_tuples, an estimate
+ * of the number of tuples to be loaded into the index initially.
*
* We are fairly cavalier about locking here, since we know that no one else
* could be accessing this index. In particular the rule about not holding
* multiple buffer locks is ignored.
*/
void
-_hash_metapinit(Relation rel)
+_hash_metapinit(Relation rel, double num_tuples)
{
HashMetaPage metap;
HashPageOpaque pageopaque;
@@ -330,7 +332,10 @@ _hash_metapinit(Relation rel)
int32 data_width;
int32 item_width;
int32 ffactor;
- uint16 i;
+ double dnumbuckets;
+ uint32 num_buckets;
+ uint32 log2_num_buckets;
+ uint32 i;
/* safety check */
if (RelationGetNumberOfBlocks(rel) != 0)
@@ -354,7 +359,26 @@ _hash_metapinit(Relation rel)
ffactor = 10;
/*
- * We initialize the metapage, the first two bucket pages, and the first
+ * Choose the number of initial bucket pages to match the fill factor
+ * given the estimated number of tuples. We round up the result to the
+ * next power of 2, however, and always force at least 2 bucket pages.
+ * The upper limit is determined by considerations explained in
+ * _hash_expandtable().
+ */
+ dnumbuckets = num_tuples / ffactor;
+ if (dnumbuckets <= 2.0)
+ num_buckets = 2;
+ else if (dnumbuckets >= (double) 0x40000000)
+ num_buckets = 0x40000000;
+ else
+ num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+
+ log2_num_buckets = _hash_log2(num_buckets);
+ Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
+ Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+
+ /*
+ * We initialize the metapage, the first N bucket pages, and the first
* bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
* calls to occur. This ensures that the smgr level has the right idea of
* the physical index length.
@@ -398,23 +422,25 @@ _hash_metapinit(Relation rel)
metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
/*
- * We initialize the index with two buckets, 0 and 1, occupying physical
- * blocks 1 and 2. The first freespace bitmap page is in block 3.
+ * We initialize the index with N buckets, 0 .. N-1, occupying physical
+ * blocks 1 to N. The first freespace bitmap page is in block N+1.
+ * Since N is a power of 2, we can set the masks this way:
*/
- metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */
- metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */
+ metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
+ metap->hashm_highmask = (num_buckets << 1) - 1;
MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
- metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
- metap->hashm_ovflpoint = 1;
+ /* Set up mapping for one spare page after the initial splitpoints */
+ metap->hashm_spares[log2_num_buckets] = 1;
+ metap->hashm_ovflpoint = log2_num_buckets;
metap->hashm_firstfree = 0;
/*
- * Initialize the first two buckets
+ * Initialize the first N buckets
*/
- for (i = 0; i <= 1; i++)
+ for (i = 0; i < num_buckets; i++)
{
buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
pg = BufferGetPage(buf);
@@ -430,7 +456,7 @@ _hash_metapinit(Relation rel)
/*
* Initialize first bitmap page
*/
- _hash_initbitmap(rel, metap, 3);
+ _hash_initbitmap(rel, metap, num_buckets + 1);
/* all done */
_hash_wrtbuf(rel, metabuf);
@@ -511,6 +537,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
* index with 2^32 buckets would certainly overflow BlockNumber and hence
* _hash_alloc_buckets() would fail, but if we supported buckets smaller
* than a disk block then this would be an independent constraint.
+ *
+ * If you change this, see also the maximum initial number of buckets
+ * in _hash_metapinit().
*/
if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
goto fail;
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 1b3f44890a..c04a4c03e7 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -45,8 +45,6 @@ bool constraint_exclusion = false;
get_relation_info_hook_type get_relation_info_hook = NULL;
-static void estimate_rel_size(Relation rel, int32 *attr_widths,
- BlockNumber *pages, double *tuples);
static List *get_relation_constraints(Oid relationObjectId, RelOptInfo *rel,
bool include_notnull);
@@ -319,7 +317,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
* relation's attr_width[] cache; we fill this in if we have need to compute
* the attribute widths for estimation purposes.
*/
-static void
+void
estimate_rel_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples)
{
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 63fa264799..aa0ff144a2 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -298,7 +298,7 @@ extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
int to_access);
-extern void _hash_metapinit(Relation rel);
+extern void _hash_metapinit(Relation rel, double num_tuples);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h
index f6c169b7bb..4e248604df 100644
--- a/src/include/optimizer/plancat.h
+++ b/src/include/optimizer/plancat.h
@@ -15,6 +15,7 @@
#define PLANCAT_H
#include "nodes/relation.h"
+#include "utils/rel.h"
/* Hook for plugins to get control in get_relation_info() */
typedef void (*get_relation_info_hook_type) (PlannerInfo *root,
@@ -27,6 +28,9 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook;
extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
bool inhparent, RelOptInfo *rel);
+extern void estimate_rel_size(Relation rel, int32 *attr_widths,
+ BlockNumber *pages, double *tuples);
+
extern bool relation_excluded_by_constraints(RelOptInfo *rel,
RangeTblEntry *rte);