
Commit ea69a0d

Expand hash indexes more gradually.
Since hash indexes typically have very few overflow pages, adding a new splitpoint essentially doubles the on-disk size of the index, which can lead to large and abrupt increases in disk usage (and perhaps long delays on occasion). To mitigate this problem to some degree, divide larger splitpoints into four equal phases. This means that, for example, instead of growing from 4GB to 8GB all at once, a hash index will now grow from 4GB to 5GB to 6GB to 7GB to 8GB, which is perhaps still not as smooth as we'd like but certainly an improvement.

This changes the on-disk format of the metapage, so bump HASH_VERSION from 2 to 3. This will force a REINDEX of all existing hash indexes, but that's probably a good idea anyway. First, hash indexes from pre-10 versions of PostgreSQL could easily be corrupted, and we don't want to confuse corruption carried over from an older release with any corruption caused despite the new write-ahead logging in v10. Second, it will let us remove some backward-compatibility code added by commit 293e24e.

Mithun Cy, reviewed by Amit Kapila, Jesper Pedersen and me. Regression test outputs updated by me.

Discussion: http://postgr.es/m/CAD__OuhG6F1gQLCgMQNnMNgoCvOLQZz9zKYJQNYvYmmJoM42gA@mail.gmail.com
Discussion: http://postgr.es/m/CA+TgmoYty0jCf-pa+m+vYUJ716+AxM7nv_syvyanyf5O-L_i2A@mail.gmail.com
1 parent 334bf9c commit ea69a0d
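To make the claimed size progression concrete, here is a small standalone illustration (not code from this commit; the default 8kB block size and the 2^19-bucket starting point are assumptions chosen for the example). It prints the size of the bucket-page area after each quarter phase of one large splitpoint group:

    #include <stdio.h>
    #include <stdint.h>

    #define BLCKSZ 8192                     /* default PostgreSQL block size */

    int main(void)
    {
        /* A splitpoint group that starts at 2^19 bucket pages (~4GB) used to
         * jump straight to 2^20 pages (~8GB).  With this commit the group is
         * filled in four equal phases of 2^17 pages each. */
        uint64_t buckets = (uint64_t) 1 << 19;
        uint64_t phase_size = buckets / 4;

        printf("start: %.1f GB\n", (double) buckets * BLCKSZ / (1 << 30));
        for (int phase = 1; phase <= 4; phase++)
        {
            buckets += phase_size;
            printf("after phase %d: %.1f GB\n",
                   phase, (double) buckets * BLCKSZ / (1 << 30));
        }
        return 0;
    }

Compiled with any C99 compiler it prints 4.0, 5.0, 6.0, 7.0 and 8.0 GB, matching the growth pattern described above (overflow, bitmap and meta pages are ignored here).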

11 files changed (+218 lines, -86 lines)

contrib/pageinspect/expected/hash.out (+2 -2)

@@ -45,7 +45,7 @@ lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM
 hash_metapage_info(get_raw_page('test_hash_a_idx', 0));
 -[ RECORD 1 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 magic | 105121344
-version | 2
+version | 3
 ntuples | 1
 bsize | 8152
 bmsize | 4096
@@ -57,7 +57,7 @@ ovflpoint | 2
 firstfree | 0
 nmaps | 1
 procid | 450
-spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
 mapp | {5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
 
 SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask,

contrib/pgstattuple/expected/pgstattuple.out (+2 -2)

@@ -134,7 +134,7 @@ create index test_hashidx on test using hash (b);
 select * from pgstathashindex('test_hashidx');
  version | bucket_pages | overflow_pages | bitmap_pages | zero_pages | live_items | dead_items | free_percent
 ---------+--------------+----------------+--------------+------------+------------+------------+--------------
- 2 | 4 | 0 | 1 | 0 | 0 | 0 | 100
+ 3 | 4 | 0 | 1 | 0 | 0 | 0 | 100
 (1 row)
 
 -- these should error with the wrong type
@@ -235,7 +235,7 @@ select pgstatindex('test_partition_idx');
 select pgstathashindex('test_partition_hash_idx');
  pgstathashindex
 ---------------------
- (2,8,0,1,0,0,0,100)
+ (3,8,0,1,0,0,0,100)
 (1 row)
 
 drop table test_partitioned;

doc/src/sgml/pageinspect.sgml (+3 -3)

@@ -658,7 +658,7 @@ test=# SELECT * FROM hash_bitmap_info('con_hash_index', 2052);
 test=# SELECT * FROM hash_metapage_info(get_raw_page('con_hash_index', 0));
 -[ RECORD 1 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 magic | 105121344
-version | 2
+version | 3
 ntuples | 500500
 ffactor | 40
 bsize | 8152
@@ -667,11 +667,11 @@ bmshift | 15
 maxbucket | 12512
 highmask | 16383
 lowmask | 8191
-ovflpoint | 14
+ovflpoint | 28
 firstfree | 1204
 nmaps | 1
 procid | 450
-spares | {0,0,0,0,0,0,1,1,1,1,1,4,59,704,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+spares | {0,0,0,0,0,0,1,1,1,1,1,1,1,1,3,4,4,4,45,55,58,59,508,567,628,704,1193,1202,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
 mapp | {65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
 </screen>
 </para>

src/backend/access/hash/README (+39 -23)

@@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to
 move primary bucket pages around after they are created.
 
 Primary bucket pages (henceforth just "bucket pages") are allocated in
-power-of-2 groups, called "split points" in the code.  Buckets 0 and 1
-are created when the index is initialized.  At the first split, buckets 2
-and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
-when bucket 8 is needed, buckets 8-15 are allocated; etc.  All the bucket
-pages of a power-of-2 group appear consecutively in the index.  This
-addressing scheme allows the physical location of a bucket page to be
-computed from the bucket number relatively easily, using only a small
-amount of control information.  We take the log2() of the bucket number
-to determine which split point S the bucket belongs to, and then simply
-add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
-metapage) to compute the physical address.  hashm_spares[S] can be
-interpreted as the total number of overflow pages that have been allocated
-before the bucket pages of splitpoint S.  hashm_spares[0] is always 0,
-so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
-block numbers 1 and 2, just after the meta page.  We always have
-hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
-former.  The difference between the two represents the number of overflow
-pages appearing between the bucket page groups of splitpoints N and N+1.
+power-of-2 groups, called "split points" in the code.  That means that at
+every new splitpoint we double the existing number of buckets.  Allocating
+huge chunks of bucket pages all at once isn't optimal, since it can take a
+long time to consume them.  To avoid these abrupt jumps in index size, we
+break up the allocation of buckets at a splitpoint into 4 equal phases.  If
+(2 ^ x) is the total number of buckets to be allocated at a splitpoint (from
+now on called a splitpoint group), then we allocate one quarter of them,
+(2 ^ (x - 2)), in each phase of the splitpoint group.  The next quarter is
+allocated only once the buckets of the previous phase have been consumed.
+For the initial splitpoint groups (< 10) we allocate all of their buckets in
+a single phase, since the number of buckets involved is small.  For groups
+>= 10 the allocation is distributed among four equal phases.  At group 10 we
+allocate (2 ^ 9) buckets in 4 phases of {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, where
+the numbers in curly braces give the number of buckets allocated within each
+phase of splitpoint group 10.  For splitpoint groups 11 and 12 the allocation
+phases are {2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9}
+respectively.  So at each splitpoint group we still double the total number
+of buckets relative to the previous group, but we do it incrementally, one
+phase at a time.  The bucket pages allocated within one phase of a splitpoint
+group appear consecutively in the index.  This addressing scheme allows the
+physical location of a bucket page to be computed from the bucket number
+relatively easily, using only a small amount of control information.  In the
+function _hash_spareindex, for a given bucket number we first compute the
+splitpoint group it belongs to and then the phase within that group.  Adding
+the two gives the global splitpoint phase number S to which the bucket
+belongs; we then simply add "hashm_spares[S] + 1" (where hashm_spares[] is an
+array stored in the metapage) to the bucket number to compute its physical
+address.  hashm_spares[S] can be interpreted as the total number of overflow
+pages that have been allocated before the bucket pages of splitpoint phase S.
+hashm_spares[0] is always 0, so that buckets 0 and 1 always appear at block
+numbers 1 and 2, just after the meta page.  We always have
+hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
+former.  The difference between the two represents the number of overflow
+pages appearing between the bucket page groups of splitpoint phases N and
+N+1.
 (Note: the above describes what happens when filling an initially minimally
-sized hash index.  In practice, we try to estimate the required index size
-and allocate a suitable number of splitpoints immediately, to avoid
+sized hash index.  In practice, we try to estimate the required index size and
+allocate a suitable number of splitpoint phases immediately, to avoid
 expensive re-splitting during initial index build.)
 
 When S splitpoints exist altogether, the array entries hashm_spares[0]
 through hashm_spares[S] are valid; hashm_spares[S] records the current
 total number of overflow pages.  New overflow pages are created as needed
 at the end of the index, and recorded by incrementing hashm_spares[S].
-When it is time to create a new splitpoint's worth of bucket pages, we
+When it is time to create a new splitpoint phase's worth of bucket pages, we
 copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
 stored in the hashm_ovflpoint field of the meta page).  This has the
 effect of reserving the correct number of bucket pages at the end of the
@@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during
 an index extension we crash after allocating filesystem space and before
 updating the metapage.  Note that on filesystems that allow "holes" in
 files, it's entirely likely that pages before the logical EOF are not yet
-allocated: when we allocate a new splitpoint's worth of bucket pages, we
+allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
 physically zero the last such page to force the EOF up, and the first such
 page will be used immediately, but the intervening pages are not written
 until needed.
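The following standalone sketch models the group/phase arithmetic described in the README text above. The constants (4 phases per group, 10 single-phase groups) are taken from that text, while spare_index, total_buckets and log2_ceil are local stand-ins rather than the actual _hash_spareindex/_hash_get_totalbuckets/_hash_log2 code:

    #include <stdio.h>
    #include <stdint.h>

    #define PHASES_PER_GROUP       4    /* larger groups are split into 4 phases */
    #define GROUPS_WITH_ONE_PHASE 10    /* groups below this are not subdivided */

    /* Smallest i with (1 << i) >= n, mimicking _hash_log2's contract. */
    static uint32_t log2_ceil(uint32_t n)
    {
        uint32_t i;

        for (i = 0; ((uint32_t) 1 << i) < n; i++)
            ;
        return i;
    }

    /* Global splitpoint phase number that a given bucket count falls into. */
    static uint32_t spare_index(uint32_t num_bucket)
    {
        uint32_t group = log2_ceil(num_bucket);

        if (group < GROUPS_WITH_ONE_PHASE)
            return group;

        /* one spares slot per single-phase group, four per larger group,
         * plus the 0-based phase within the current group; each phase of
         * group g holds 2^(g-3) buckets */
        return GROUPS_WITH_ONE_PHASE
            + (group - GROUPS_WITH_ONE_PHASE) * PHASES_PER_GROUP
            + (((num_bucket - 1) >> (group - 3)) & (PHASES_PER_GROUP - 1));
    }

    /* Total number of buckets in existence once phase "p" is fully allocated. */
    static uint32_t total_buckets(uint32_t p)
    {
        uint32_t group, phase, before;

        if (p < GROUPS_WITH_ONE_PHASE)
            return (uint32_t) 1 << p;

        group = GROUPS_WITH_ONE_PHASE
            + (p - GROUPS_WITH_ONE_PHASE) / PHASES_PER_GROUP;
        phase = (p - GROUPS_WITH_ONE_PHASE) % PHASES_PER_GROUP + 1;
        before = (uint32_t) 1 << (group - 1);   /* buckets from prior groups */

        return before + (before / PHASES_PER_GROUP) * phase;
    }

    int main(void)
    {
        for (uint32_t p = 8; p <= 14; p++)
            printf("phase %2u: %u buckets total (spares index %u)\n",
                   p, total_buckets(p), spare_index(total_buckets(p)));
        return 0;
    }

Running it shows the totals growing 256, 512, 640, 768, 896, 1024, 1280 -- the quarter-sized steps once group 10 is reached -- and that the two helpers stay consistent with each other.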

src/backend/access/hash/hashovfl.c (+5 -4)

@@ -49,7 +49,7 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
     * Convert to absolute page number by adding the number of bucket pages
     * that exist before this split point.
     */
-   return (BlockNumber) ((1 << i) + ovflbitnum);
+   return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
 }
 
 /*
@@ -67,14 +67,15 @@ _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
    /* Determine the split number containing this page */
    for (i = 1; i <= splitnum; i++)
    {
-       if (ovflblkno <= (BlockNumber) (1 << i))
+       if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
            break;              /* oops */
-       bitnum = ovflblkno - (1 << i);
+       bitnum = ovflblkno - _hash_get_totalbuckets(i);
 
        /*
         * bitnum has to be greater than number of overflow page added in
         * previous split point. The overflow page at this splitnum (i) if any
-        * should start from ((2 ^ i) + metap->hashm_spares[i - 1] + 1).
+        * should start from (_hash_get_totalbuckets(i) +
+        * metap->hashm_spares[i - 1] + 1).
         */
        if (bitnum > metap->hashm_spares[i - 1] &&
            bitnum <= metap->hashm_spares[i])
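As a quick worked example of the block-number arithmetic above, using the metapage values from the pageinspect expected output earlier in this commit (the helper below is a local stand-in that is only valid for single-phase splitpoints, where _hash_get_totalbuckets(i) and 1 << i coincide):

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for _hash_get_totalbuckets(), single-phase splitpoints only. */
    static uint32_t totalbuckets(uint32_t splitpoint_phase)
    {
        return (uint32_t) 1 << splitpoint_phase;
    }

    int main(void)
    {
        /* From the hash.out metapage above: ovflpoint = 2, so 4 bucket pages
         * occupy blocks 1-4.  Overflow bit number 0 becomes 1-based page
         * number 1 and therefore lands at block 4 + 1 = 5, matching
         * mapp = {5, ...}. */
        uint32_t ovflpoint = 2;
        uint32_t bitno = 0;                     /* zero-based bit number */
        uint32_t blkno = totalbuckets(ovflpoint) + (bitno + 1);

        printf("bitmap page for bit %u is block %u\n", bitno, blkno);
        return 0;
    }

The old and new formulas only diverge once a multi-phase splitpoint group has been reached.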

src/backend/access/hash/hashpage.c (+33 -29)

@@ -502,14 +502,15 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
    Page        page;
    double      dnumbuckets;
    uint32      num_buckets;
-   uint32      log2_num_buckets;
+   uint32      spare_index;
    uint32      i;
 
    /*
     * Choose the number of initial bucket pages to match the fill factor
     * given the estimated number of tuples. We round up the result to the
-    * next power of 2, however, and always force at least 2 bucket pages. The
-    * upper limit is determined by considerations explained in
+    * total number of buckets which has to be allocated before using its
+    * _hashm_spare element. However always force at least 2 bucket pages.
+    * The upper limit is determined by considerations explained in
     * _hash_expandtable().
     */
    dnumbuckets = num_tuples / ffactor;
@@ -518,11 +519,10 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
    else if (dnumbuckets >= (double) 0x40000000)
        num_buckets = 0x40000000;
    else
-       num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+       num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
 
-   log2_num_buckets = _hash_log2(num_buckets);
-   Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
-   Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+   spare_index = _hash_spareindex(num_buckets);
+   Assert(spare_index < HASH_MAX_SPLITPOINTS);
 
    page = BufferGetPage(buf);
    if (initpage)
@@ -563,18 +563,23 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
 
    /*
     * We initialize the index with N buckets, 0 .. N-1, occupying physical
-    * blocks 1 to N. The first freespace bitmap page is in block N+1. Since
-    * N is a power of 2, we can set the masks this way:
+    * blocks 1 to N. The first freespace bitmap page is in block N+1.
     */
-   metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
-   metap->hashm_highmask = (num_buckets << 1) - 1;
+   metap->hashm_maxbucket = num_buckets - 1;
+
+   /*
+    * Set highmask as next immediate ((2 ^ x) - 1), which should be sufficient
+    * to cover num_buckets.
+    */
+   metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
+   metap->hashm_lowmask = (metap->hashm_highmask >> 1);
 
    MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
 
    /* Set up mapping for one spare page after the initial splitpoints */
-   metap->hashm_spares[log2_num_buckets] = 1;
-   metap->hashm_ovflpoint = log2_num_buckets;
+   metap->hashm_spares[spare_index] = 1;
+   metap->hashm_ovflpoint = spare_index;
    metap->hashm_firstfree = 0;
 
    /*
@@ -773,25 +778,25 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
 
    /*
-    * If the split point is increasing (hashm_maxbucket's log base 2
-    * increases), we need to allocate a new batch of bucket pages.
+    * If the split point is increasing we need to allocate a new batch of
+    * bucket pages.
     */
-   spare_ndx = _hash_log2(new_bucket + 1);
+   spare_ndx = _hash_spareindex(new_bucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
+       uint32      buckets_to_add;
+
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
 
        /*
-        * The number of buckets in the new splitpoint is equal to the total
-        * number already in existence, i.e. new_bucket. Currently this maps
-        * one-to-one to blocks required, but someday we may need a more
-        * complicated calculation here. We treat allocation of buckets as a
-        * separate WAL-logged action. Even if we fail after this operation,
-        * won't leak bucket pages; rather, the next split will consume this
-        * space. In any case, even without failure we don't use all the space
-        * in one split operation.
+        * We treat allocation of buckets as a separate WAL-logged action.
+        * Even if we fail after this operation, won't leak bucket pages;
+        * rather, the next split will consume this space. In any case, even
+        * without failure we don't use all the space in one split
+        * operation.
         */
-       if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+       buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
+       if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
        {
            /* can't split due to BlockNumber overflow */
            _hash_relbuf(rel, buf_oblkno);
@@ -836,10 +841,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    }
 
    /*
-    * If the split point is increasing (hashm_maxbucket's log base 2
-    * increases), we need to adjust the hashm_spares[] array and
-    * hashm_ovflpoint so that future overflow pages will be created beyond
-    * this new batch of bucket pages.
+    * If the split point is increasing we need to adjust the hashm_spares[]
+    * array and hashm_ovflpoint so that future overflow pages will be created
+    * beyond this new batch of bucket pages.
     */
    if (spare_ndx > metap->hashm_ovflpoint)
    {
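Because num_buckets can now be a non-power-of-2 total such as 1280, the mask setup above no longer assumes maxbucket + 1 is a power of 2. A standalone sketch of the same computation (log2_ceil is a local stand-in for _hash_log2, and 1280 is just an illustrative value):

    #include <stdio.h>
    #include <stdint.h>

    /* Local stand-in for _hash_log2(): smallest i with (1 << i) >= n. */
    static uint32_t log2_ceil(uint32_t n)
    {
        uint32_t i;

        for (i = 0; ((uint32_t) 1 << i) < n; i++)
            ;
        return i;
    }

    int main(void)
    {
        uint32_t num_buckets = 1280;    /* e.g. first phase of a larger group */

        uint32_t maxbucket = num_buckets - 1;                        /* 1279 */
        uint32_t highmask = (1 << log2_ceil(num_buckets + 1)) - 1;   /* 2047 */
        uint32_t lowmask = highmask >> 1;                            /* 1023 */

        printf("maxbucket=%u highmask=%u lowmask=%u\n",
               maxbucket, highmask, lowmask);
        return 0;
    }

Hash values masked with highmask that land beyond maxbucket are folded back with lowmask, so every key still maps to an existing bucket even though only 1280 of the 2048 possible bucket numbers exist yet.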

src/backend/access/hash/hashsort.c (+20 -7)

@@ -37,7 +37,15 @@ struct HSpool
 {
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    index;
-   uint32      hash_mask;      /* bitmask for hash codes */
+
+   /*
+    * We sort the hash keys based on the buckets they belong to. Below masks
+    * are used in _hash_hashkey2bucket to determine the bucket of given hash
+    * key.
+    */
+   uint32      high_mask;
+   uint32      low_mask;
+   uint32      max_buckets;
 };
 
 
@@ -56,11 +64,12 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
     * num_buckets buckets in the index, the appropriate mask can be computed
     * as follows.
     *
-    * Note: at present, the passed-in num_buckets is always a power of 2, so
-    * we could just compute num_buckets - 1. We prefer not to assume that
-    * here, though.
+    * NOTE : This hash mask calculation should be in sync with similar
+    * calculation in _hash_init_metabuffer.
     */
-   hspool->hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;
+   hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1;
+   hspool->low_mask = (hspool->high_mask >> 1);
+   hspool->max_buckets = num_buckets - 1;
 
    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
@@ -69,7 +78,9 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
     */
    hspool->sortstate = tuplesort_begin_index_hash(heap,
                                                   index,
-                                                  hspool->hash_mask,
+                                                  hspool->high_mask,
+                                                  hspool->low_mask,
+                                                  hspool->max_buckets,
                                                   maintenance_work_mem,
                                                   false);
 
@@ -122,7 +133,9 @@ _h_indexbuild(HSpool *hspool, Relation heapRel)
 #ifdef USE_ASSERT_CHECKING
        uint32      lasthashkey = hashkey;
 
-       hashkey = _hash_get_indextuple_hashkey(itup) & hspool->hash_mask;
+       hashkey = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
+                                      hspool->max_buckets, hspool->high_mask,
+                                      hspool->low_mask);
        Assert(hashkey >= lasthashkey);
 #endif
 
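For reference, a standalone model of the bucket assignment the sorted build now relies on: mask with high_mask first and fall back to low_mask when the result exceeds max_buckets (a sketch of _hash_hashkey2bucket's behavior; the mask values are the illustrative ones from the previous example, not something this commit prescribes):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
                                   uint32_t highmask, uint32_t lowmask)
    {
        uint32_t bucket = hashkey & highmask;

        if (bucket > maxbucket)         /* bucket doesn't exist yet: fold back */
            bucket = bucket & lowmask;
        return bucket;
    }

    int main(void)
    {
        /* Masks as _h_spoolinit would derive them for num_buckets = 1280. */
        uint32_t high_mask = 2047, low_mask = 1023, max_buckets = 1279;

        /* 0x678 = 1656 > 1279, so it folds back to 1656 & 1023 = 632 */
        printf("%u\n", hashkey2bucket(0x12345678, max_buckets, high_mask, low_mask));
        /* 0x4ff = 1279 <= 1279, so it stays at 1279 */
        printf("%u\n", hashkey2bucket(0x000004ff, max_buckets, high_mask, low_mask));
        return 0;
    }

Sorting by this bucket number, rather than by "hashkey & hash_mask", is what keeps the ordering assertion in _h_indexbuild valid now that the number of buckets need not be a power of 2.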
