summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author    Tom Lane    2008-03-15 20:46:31 +0000
committer Tom Lane    2008-03-15 20:46:31 +0000
commit    a4d2e19e4eedeabb0187e3e9b3f9c6e12f10a455 (patch)
tree      1363764fa137ad1669317300bddd98655a39ff44
parent    aad05c328511b2de5dd29e9ac56f079bb496a83c (diff)
Change hash index creation so that rather than always establishing exactly
two buckets at the start, we create a number of buckets appropriate for the estimated size of the table. This avoids a lot of expensive bucket-split actions during initial index build on an already-populated table. This is one of the two core ideas of Tom Raney and Shreya Bhargava's patch to reduce hash index build time. I'm committing it separately to make it easier for people to test the effects of this separately from the effects of their other core idea (pre-sorting the index entries by bucket number).
-rw-r--r--  src/backend/access/hash/README        | 11
-rw-r--r--  src/backend/access/hash/hash.c        |  9
-rw-r--r--  src/backend/access/hash/hashpage.c    | 57
-rw-r--r--  src/backend/optimizer/util/plancat.c  |  4
-rw-r--r--  src/include/access/hash.h             |  2
-rw-r--r--  src/include/optimizer/plancat.h       |  4
6 files changed, 64 insertions(+), 23 deletions(-)
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 5737a7192c..ebc01c32f5 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -65,6 +65,11 @@ hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow
pages appearing between the bucket page groups of splitpoints N and N+1.
+(Note: the above describes what happens when filling an initially minimally
+sized hash index. In practice, we try to estimate the required index size
+and allocate a suitable number of splitpoints immediately, to avoid
+expensive re-splitting during initial index build.)
+
When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed
@@ -101,9 +106,9 @@ includes the bitmap pages, which is the reason for saying that bitmap
pages are a subset of the overflow pages. It turns out in fact that each
bitmap page's first bit represents itself --- this is not an essential
property, but falls out of the fact that we only allocate another bitmap
-page when we really need one. Bit number zero always corresponds to block
-number 3, which is the first bitmap page and is allocated during index
-creation.
+page when we really need one. Bit number zero always corresponds to the
+first bitmap page, which is allocated during index creation just after all
+the initially created buckets.
Lock definitions
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index fce7af2d92..24293d7c86 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -22,6 +22,7 @@
#include "access/hash.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
+#include "optimizer/plancat.h"
/* Working state for hashbuild and its callback */
@@ -48,6 +49,7 @@ hashbuild(PG_FUNCTION_ARGS)
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
IndexBuildResult *result;
+ BlockNumber relpages;
double reltuples;
HashBuildState buildstate;
@@ -59,8 +61,11 @@ hashbuild(PG_FUNCTION_ARGS)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
- /* initialize the hash index metadata page */
- _hash_metapinit(index);
+ /* estimate the number of rows currently present in the table */
+ estimate_rel_size(heap, NULL, &relpages, &reltuples);
+
+ /* initialize the hash index metadata page and initial buckets */
+ _hash_metapinit(index, reltuples);
/* build the index */
buildstate.indtuples = 0;
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 63c7795a49..db59d26c23 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -312,15 +312,17 @@ _hash_chgbufaccess(Relation rel,
/*
* _hash_metapinit() -- Initialize the metadata page of a hash index,
- * the two buckets that we begin with and the initial
- * bitmap page.
+ * the initial buckets, and the initial bitmap page.
+ *
+ * The initial number of buckets is dependent on num_tuples, an estimate
+ * of the number of tuples to be loaded into the index initially.
*
* We are fairly cavalier about locking here, since we know that no one else
* could be accessing this index. In particular the rule about not holding
* multiple buffer locks is ignored.
*/
void
-_hash_metapinit(Relation rel)
+_hash_metapinit(Relation rel, double num_tuples)
{
HashMetaPage metap;
HashPageOpaque pageopaque;
@@ -330,7 +332,10 @@ _hash_metapinit(Relation rel)
int32 data_width;
int32 item_width;
int32 ffactor;
- uint16 i;
+ double dnumbuckets;
+ uint32 num_buckets;
+ uint32 log2_num_buckets;
+ uint32 i;
/* safety check */
if (RelationGetNumberOfBlocks(rel) != 0)
@@ -354,7 +359,26 @@ _hash_metapinit(Relation rel)
ffactor = 10;
/*
- * We initialize the metapage, the first two bucket pages, and the first
+ * Choose the number of initial bucket pages to match the fill factor
+ * given the estimated number of tuples. We round up the result to the
+ * next power of 2, however, and always force at least 2 bucket pages.
+ * The upper limit is determined by considerations explained in
+ * _hash_expandtable().
+ */
+ dnumbuckets = num_tuples / ffactor;
+ if (dnumbuckets <= 2.0)
+ num_buckets = 2;
+ else if (dnumbuckets >= (double) 0x40000000)
+ num_buckets = 0x40000000;
+ else
+ num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+
+ log2_num_buckets = _hash_log2(num_buckets);
+ Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
+ Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+
+ /*
+ * We initialize the metapage, the first N bucket pages, and the first
* bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
* calls to occur. This ensures that the smgr level has the right idea of
* the physical index length.
@@ -398,23 +422,25 @@ _hash_metapinit(Relation rel)
metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
/*
- * We initialize the index with two buckets, 0 and 1, occupying physical
- * blocks 1 and 2. The first freespace bitmap page is in block 3.
+ * We initialize the index with N buckets, 0 .. N-1, occupying physical
+ * blocks 1 to N. The first freespace bitmap page is in block N+1.
+ * Since N is a power of 2, we can set the masks this way:
*/
- metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */
- metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */
+ metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
+ metap->hashm_highmask = (num_buckets << 1) - 1;
MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
- metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
- metap->hashm_ovflpoint = 1;
+ /* Set up mapping for one spare page after the initial splitpoints */
+ metap->hashm_spares[log2_num_buckets] = 1;
+ metap->hashm_ovflpoint = log2_num_buckets;
metap->hashm_firstfree = 0;
/*
- * Initialize the first two buckets
+ * Initialize the first N buckets
*/
- for (i = 0; i <= 1; i++)
+ for (i = 0; i < num_buckets; i++)
{
buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
pg = BufferGetPage(buf);
@@ -430,7 +456,7 @@ _hash_metapinit(Relation rel)
/*
* Initialize first bitmap page
*/
- _hash_initbitmap(rel, metap, 3);
+ _hash_initbitmap(rel, metap, num_buckets + 1);
/* all done */
_hash_wrtbuf(rel, metabuf);
@@ -511,6 +537,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
* index with 2^32 buckets would certainly overflow BlockNumber and hence
* _hash_alloc_buckets() would fail, but if we supported buckets smaller
* than a disk block then this would be an independent constraint.
+ *
+ * If you change this, see also the maximum initial number of buckets
+ * in _hash_metapinit().
*/
if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
goto fail;
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 1b3f44890a..c04a4c03e7 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -45,8 +45,6 @@ bool constraint_exclusion = false;
get_relation_info_hook_type get_relation_info_hook = NULL;
-static void estimate_rel_size(Relation rel, int32 *attr_widths,
- BlockNumber *pages, double *tuples);
static List *get_relation_constraints(Oid relationObjectId, RelOptInfo *rel,
bool include_notnull);
@@ -319,7 +317,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
* relation's attr_width[] cache; we fill this in if we have need to compute
* the attribute widths for estimation purposes.
*/
-static void
+void
estimate_rel_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples)
{
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 63fa264799..aa0ff144a2 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -298,7 +298,7 @@ extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
int to_access);
-extern void _hash_metapinit(Relation rel);
+extern void _hash_metapinit(Relation rel, double num_tuples);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h
index f6c169b7bb..4e248604df 100644
--- a/src/include/optimizer/plancat.h
+++ b/src/include/optimizer/plancat.h
@@ -15,6 +15,7 @@
#define PLANCAT_H
#include "nodes/relation.h"
+#include "utils/rel.h"
/* Hook for plugins to get control in get_relation_info() */
typedef void (*get_relation_info_hook_type) (PlannerInfo *root,
@@ -27,6 +28,9 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook;
extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
bool inhparent, RelOptInfo *rel);
+extern void estimate_rel_size(Relation rel, int32 *attr_widths,
+ BlockNumber *pages, double *tuples);
+
extern bool relation_excluded_by_constraints(RelOptInfo *rel,
RangeTblEntry *rte);