
Commit ea69a0d

Expand hash indexes more gradually.
Since hash indexes typically have very few overflow pages, adding a new splitpoint essentially doubles the on-disk size of the index, which can lead to large and abrupt increases in disk usage (and perhaps long delays on occasion). To mitigate this problem to some degree, divide larger splitpoints into four equal phases. This means that, for example, instead of growing from 4GB to 8GB all at once, a hash index will now grow from 4GB to 5GB to 6GB to 7GB to 8GB, which is perhaps still not as smooth as we'd like but certainly an improvement.

This changes the on-disk format of the metapage, so bump HASH_VERSION from 2 to 3. This will force a REINDEX of all existing hash indexes, but that's probably a good idea anyway. First, hash indexes from pre-10 versions of PostgreSQL could easily be corrupted, and we don't want to confuse corruption carried over from an older release with any corruption caused despite the new write-ahead logging in v10. Second, it will let us remove some backward-compatibility code added by commit 293e24e.

Mithun Cy, reviewed by Amit Kapila, Jesper Pedersen and me. Regression test outputs updated by me.

Discussion: http://postgr.es/m/CAD__OuhG6F1gQLCgMQNnMNgoCvOLQZz9zKYJQNYvYmmJoM42gA@mail.gmail.com
Discussion: http://postgr.es/m/CA+TgmoYty0jCf-pa+m+vYUJ716+AxM7nv_syvyanyf5O-L_i2A@mail.gmail.com
1 parent 334bf9c commit ea69a0d
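To make the claimed size progression concrete, here is a small standalone illustration (not code from this commit; the default 8kB block size and the 2^19-bucket starting point are assumptions chosen for the example). It prints the size of the bucket-page area after each quarter phase of one large splitpoint group:

    #include <stdio.h>
    #include <stdint.h>

    #define BLCKSZ 8192                     /* default PostgreSQL block size */

    int main(void)
    {
        /* A splitpoint group that starts at 2^19 bucket pages (~4GB) used to
         * jump straight to 2^20 pages (~8GB).  With this commit the group is
         * filled in four equal phases of 2^17 pages each. */
        uint64_t buckets = (uint64_t) 1 << 19;
        uint64_t phase_size = buckets / 4;

        printf("start: %.1f GB\n", (double) buckets * BLCKSZ / (1 << 30));
        for (int phase = 1; phase <= 4; phase++)
        {
            buckets += phase_size;
            printf("after phase %d: %.1f GB\n",
                   phase, (double) buckets * BLCKSZ / (1 << 30));
        }
        return 0;
    }

Compiled with any C99 compiler it prints 4.0, 5.0, 6.0, 7.0 and 8.0 GB, matching the growth pattern described above (overflow, bitmap and meta pages are ignored here).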

11 files changed (+218 lines, -86 lines)

contrib/pageinspect/expected/hash.out (+2 -2)

@@ -45,7 +45,7 @@ lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM
 hash_metapage_info(get_raw_page('test_hash_a_idx', 0));
 -[ RECORD 1 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 magic | 105121344
-version | 2
+version | 3
 ntuples | 1
 bsize | 8152
 bmsize | 4096
@@ -57,7 +57,7 @@ ovflpoint | 2
 firstfree | 0
 nmaps | 1
 procid | 450
-spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
 mapp | {5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
 
 SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask,

contrib/pgstattuple/expected/pgstattuple.out (+2 -2)

@@ -134,7 +134,7 @@ create index test_hashidx on test using hash (b);
 select * from pgstathashindex('test_hashidx');
  version | bucket_pages | overflow_pages | bitmap_pages | zero_pages | live_items | dead_items | free_percent
 ---------+--------------+----------------+--------------+------------+------------+------------+--------------
- 2 | 4 | 0 | 1 | 0 | 0 | 0 | 100
+ 3 | 4 | 0 | 1 | 0 | 0 | 0 | 100
 (1 row)
 
 -- these should error with the wrong type
@@ -235,7 +235,7 @@ select pgstatindex('test_partition_idx');
 select pgstathashindex('test_partition_hash_idx');
  pgstathashindex
 ---------------------
- (2,8,0,1,0,0,0,100)
+ (3,8,0,1,0,0,0,100)
 (1 row)
 
 drop table test_partitioned;

doc/src/sgml/pageinspect.sgml (+3 -3)

@@ -658,7 +658,7 @@ test=# SELECT * FROM hash_bitmap_info('con_hash_index', 2052);
 test=# SELECT * FROM hash_metapage_info(get_raw_page('con_hash_index', 0));
 -[ RECORD 1 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 magic | 105121344
-version | 2
+version | 3
 ntuples | 500500
 ffactor | 40
 bsize | 8152
@@ -667,11 +667,11 @@ bmshift | 15
 maxbucket | 12512
 highmask | 16383
 lowmask | 8191
-ovflpoint | 14
+ovflpoint | 28
 firstfree | 1204
 nmaps | 1
 procid | 450
-spares | {0,0,0,0,0,0,1,1,1,1,1,4,59,704,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+spares | {0,0,0,0,0,0,1,1,1,1,1,1,1,1,3,4,4,4,45,55,58,59,508,567,628,704,1193,1202,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
 mapp | {65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
 </screen>
 </para>

src/backend/access/hash/README (+39 -23)

@@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to
 move primary bucket pages around after they are created.
 
 Primary bucket pages (henceforth just "bucket pages") are allocated in
-power-of-2 groups, called "split points" in the code.  Buckets 0 and 1
-are created when the index is initialized.  At the first split, buckets 2
-and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
-when bucket 8 is needed, buckets 8-15 are allocated; etc.  All the bucket
-pages of a power-of-2 group appear consecutively in the index.  This
-addressing scheme allows the physical location of a bucket page to be
-computed from the bucket number relatively easily, using only a small
-amount of control information.  We take the log2() of the bucket number
-to determine which split point S the bucket belongs to, and then simply
-add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
-metapage) to compute the physical address.  hashm_spares[S] can be
-interpreted as the total number of overflow pages that have been allocated
-before the bucket pages of splitpoint S.  hashm_spares[0] is always 0,
-so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
-block numbers 1 and 2, just after the meta page.  We always have
-hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
-former.  The difference between the two represents the number of overflow
-pages appearing between the bucket page groups of splitpoints N and N+1.
+power-of-2 groups, called "split points" in the code.  That means that at
+every new splitpoint we double the existing number of buckets.  Allocating
+huge chunks of bucket pages all at once isn't optimal, since it can take a
+long time to consume them.  To avoid these abrupt jumps in index size, we
+break up the allocation of buckets at a splitpoint into 4 equal phases.  If
+(2 ^ x) is the total number of buckets to be allocated at a splitpoint (from
+now on called a splitpoint group), then we allocate one quarter of them,
+(2 ^ (x - 2)), in each phase of the splitpoint group.  The next quarter is
+allocated only once the buckets of the previous phase have been consumed.
+For the initial splitpoint groups (< 10) we allocate all of their buckets in
+a single phase, since the number of buckets involved is small.  For groups
+>= 10 the allocation is distributed among four equal phases.  At group 10 we
+allocate (2 ^ 9) buckets in 4 phases of {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, where
+the numbers in curly braces give the number of buckets allocated within each
+phase of splitpoint group 10.  For splitpoint groups 11 and 12 the allocation
+phases are {2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9}
+respectively.  So at each splitpoint group we still double the total number
+of buckets relative to the previous group, but we do it incrementally, one
+phase at a time.  The bucket pages allocated within one phase of a splitpoint
+group appear consecutively in the index.  This addressing scheme allows the
+physical location of a bucket page to be computed from the bucket number
+relatively easily, using only a small amount of control information.  In the
+function _hash_spareindex, for a given bucket number we first compute the
+splitpoint group it belongs to and then the phase within that group.  Adding
+the two gives the global splitpoint phase number S to which the bucket
+belongs; we then simply add "hashm_spares[S] + 1" (where hashm_spares[] is an
+array stored in the metapage) to the bucket number to compute its physical
+address.  hashm_spares[S] can be interpreted as the total number of overflow
+pages that have been allocated before the bucket pages of splitpoint phase S.
+hashm_spares[0] is always 0, so that buckets 0 and 1 always appear at block
+numbers 1 and 2, just after the meta page.  We always have
+hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
+former.  The difference between the two represents the number of overflow
+pages appearing between the bucket page groups of splitpoint phases N and
+N+1.
 (Note: the above describes what happens when filling an initially minimally
-sized hash index.  In practice, we try to estimate the required index size
-and allocate a suitable number of splitpoints immediately, to avoid
+sized hash index.  In practice, we try to estimate the required index size and
+allocate a suitable number of splitpoint phases immediately, to avoid
 expensive re-splitting during initial index build.)
 
 When S splitpoints exist altogether, the array entries hashm_spares[0]
 through hashm_spares[S] are valid; hashm_spares[S] records the current
 total number of overflow pages.  New overflow pages are created as needed
 at the end of the index, and recorded by incrementing hashm_spares[S].
-When it is time to create a new splitpoint's worth of bucket pages, we
+When it is time to create a new splitpoint phase's worth of bucket pages, we
 copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
 stored in the hashm_ovflpoint field of the meta page).  This has the
 effect of reserving the correct number of bucket pages at the end of the
@@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during
 an index extension we crash after allocating filesystem space and before
 updating the metapage.  Note that on filesystems that allow "holes" in
 files, it's entirely likely that pages before the logical EOF are not yet
-allocated: when we allocate a new splitpoint's worth of bucket pages, we
+allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
 physically zero the last such page to force the EOF up, and the first such
 page will be used immediately, but the intervening pages are not written
 until needed.
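The following standalone sketch models the group/phase arithmetic described in the README text above. The constants (4 phases per group, 10 single-phase groups) are taken from that text, while spare_index, total_buckets and log2_ceil are local stand-ins rather than the actual _hash_spareindex/_hash_get_totalbuckets/_hash_log2 code:

    #include <stdio.h>
    #include <stdint.h>

    #define PHASES_PER_GROUP       4    /* larger groups are split into 4 phases */
    #define GROUPS_WITH_ONE_PHASE 10    /* groups below this are not subdivided */

    /* Smallest i with (1 << i) >= n, mimicking _hash_log2's contract. */
    static uint32_t log2_ceil(uint32_t n)
    {
        uint32_t i;

        for (i = 0; ((uint32_t) 1 << i) < n; i++)
            ;
        return i;
    }

    /* Global splitpoint phase number that a given bucket count falls into. */
    static uint32_t spare_index(uint32_t num_bucket)
    {
        uint32_t group = log2_ceil(num_bucket);

        if (group < GROUPS_WITH_ONE_PHASE)
            return group;

        /* one spares slot per single-phase group, four per larger group,
         * plus the 0-based phase within the current group; each phase of
         * group g holds 2^(g-3) buckets */
        return GROUPS_WITH_ONE_PHASE
            + (group - GROUPS_WITH_ONE_PHASE) * PHASES_PER_GROUP
            + (((num_bucket - 1) >> (group - 3)) & (PHASES_PER_GROUP - 1));
    }

    /* Total number of buckets in existence once phase "p" is fully allocated. */
    static uint32_t total_buckets(uint32_t p)
    {
        uint32_t group, phase, before;

        if (p < GROUPS_WITH_ONE_PHASE)
            return (uint32_t) 1 << p;

        group = GROUPS_WITH_ONE_PHASE
            + (p - GROUPS_WITH_ONE_PHASE) / PHASES_PER_GROUP;
        phase = (p - GROUPS_WITH_ONE_PHASE) % PHASES_PER_GROUP + 1;
        before = (uint32_t) 1 << (group - 1);   /* buckets from prior groups */

        return before + (before / PHASES_PER_GROUP) * phase;
    }

    int main(void)
    {
        for (uint32_t p = 8; p <= 14; p++)
            printf("phase %2u: %u buckets total (spares index %u)\n",
                   p, total_buckets(p), spare_index(total_buckets(p)));
        return 0;
    }

Running it shows the totals growing 256, 512, 640, 768, 896, 1024, 1280 -- the quarter-sized steps once group 10 is reached -- and that the two helpers stay consistent with each other.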

src/backend/access/hash/hashovfl.c (+5 -4)

@@ -49,7 +49,7 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
     * Convert to absolute page number by adding the number of bucket pages
     * that exist before this split point.
     */
-   return (BlockNumber) ((1 << i) + ovflbitnum);
+   return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
 }
 
 /*
@@ -67,14 +67,15 @@ _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
    /* Determine the split number containing this page */
    for (i = 1; i <= splitnum; i++)
    {
-       if (ovflblkno <= (BlockNumber) (1 << i))
+       if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
            break;              /* oops */
-       bitnum = ovflblkno - (1 << i);
+       bitnum = ovflblkno - _hash_get_totalbuckets(i);
 
        /*
         * bitnum has to be greater than number of overflow page added in
         * previous split point. The overflow page at this splitnum (i) if any
-        * should start from ((2 ^ i) + metap->hashm_spares[i - 1] + 1).
+        * should start from (_hash_get_totalbuckets(i) +
+        * metap->hashm_spares[i - 1] + 1).
         */
        if (bitnum > metap->hashm_spares[i - 1] &&
            bitnum <= metap->hashm_spares[i])
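As a quick worked example of the block-number arithmetic above, using the metapage values from the pageinspect expected output earlier in this commit (the helper below is a local stand-in that is only valid for single-phase splitpoints, where _hash_get_totalbuckets(i) and 1 << i coincide):

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for _hash_get_totalbuckets(), single-phase splitpoints only. */
    static uint32_t totalbuckets(uint32_t splitpoint_phase)
    {
        return (uint32_t) 1 << splitpoint_phase;
    }

    int main(void)
    {
        /* From the hash.out metapage above: ovflpoint = 2, so 4 bucket pages
         * occupy blocks 1-4.  Overflow bit number 0 becomes 1-based page
         * number 1 and therefore lands at block 4 + 1 = 5, matching
         * mapp = {5, ...}. */
        uint32_t ovflpoint = 2;
        uint32_t bitno = 0;                     /* zero-based bit number */
        uint32_t blkno = totalbuckets(ovflpoint) + (bitno + 1);

        printf("bitmap page for bit %u is block %u\n", bitno, blkno);
        return 0;
    }

The old and new formulas only diverge once a multi-phase splitpoint group has been reached.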

src/backend/access/hash/hashpage.c (+33 -29)

@@ -502,14 +502,15 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
    Page        page;
    double      dnumbuckets;
    uint32      num_buckets;
-   uint32      log2_num_buckets;
+   uint32      spare_index;
    uint32      i;
 
    /*
     * Choose the number of initial bucket pages to match the fill factor
     * given the estimated number of tuples. We round up the result to the
-    * next power of 2, however, and always force at least 2 bucket pages. The
-    * upper limit is determined by considerations explained in
+    * total number of buckets which has to be allocated before using its
+    * _hashm_spare element. However always force at least 2 bucket pages.
+    * The upper limit is determined by considerations explained in
     * _hash_expandtable().
     */
    dnumbuckets = num_tuples / ffactor;
@@ -518,11 +519,10 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
    else if (dnumbuckets >= (double) 0x40000000)
        num_buckets = 0x40000000;
    else
-       num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+       num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
 
-   log2_num_buckets = _hash_log2(num_buckets);
-   Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
-   Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+   spare_index = _hash_spareindex(num_buckets);
+   Assert(spare_index < HASH_MAX_SPLITPOINTS);
 
    page = BufferGetPage(buf);
    if (initpage)
@@ -563,18 +563,23 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
 
    /*
     * We initialize the index with N buckets, 0 .. N-1, occupying physical
-    * blocks 1 to N. The first freespace bitmap page is in block N+1. Since
-    * N is a power of 2, we can set the masks this way:
+    * blocks 1 to N. The first freespace bitmap page is in block N+1.
     */
-   metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
-   metap->hashm_highmask = (num_buckets << 1) - 1;
+   metap->hashm_maxbucket = num_buckets - 1;
+
+   /*
+    * Set highmask as next immediate ((2 ^ x) - 1), which should be sufficient
+    * to cover num_buckets.
+    */
+   metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
+   metap->hashm_lowmask = (metap->hashm_highmask >> 1);
 
    MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
 
    /* Set up mapping for one spare page after the initial splitpoints */
-   metap->hashm_spares[log2_num_buckets] = 1;
-   metap->hashm_ovflpoint = log2_num_buckets;
+   metap->hashm_spares[spare_index] = 1;
+   metap->hashm_ovflpoint = spare_index;
    metap->hashm_firstfree = 0;
 
    /*
@@ -773,25 +778,25 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
 
    /*
-    * If the split point is increasing (hashm_maxbucket's log base 2
-    * increases), we need to allocate a new batch of bucket pages.
+    * If the split point is increasing we need to allocate a new batch of
+    * bucket pages.
     */
-   spare_ndx = _hash_log2(new_bucket + 1);
+   spare_ndx = _hash_spareindex(new_bucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
+       uint32      buckets_to_add;
+
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
 
        /*
-        * The number of buckets in the new splitpoint is equal to the total
-        * number already in existence, i.e. new_bucket. Currently this maps
-        * one-to-one to blocks required, but someday we may need a more
-        * complicated calculation here. We treat allocation of buckets as a
-        * separate WAL-logged action. Even if we fail after this operation,
-        * won't leak bucket pages; rather, the next split will consume this
-        * space. In any case, even without failure we don't use all the space
-        * in one split operation.
+        * We treat allocation of buckets as a separate WAL-logged action.
+        * Even if we fail after this operation, won't leak bucket pages;
+        * rather, the next split will consume this space. In any case, even
+        * without failure we don't use all the space in one split
+        * operation.
         */
-       if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+       buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
+       if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
        {
            /* can't split due to BlockNumber overflow */
            _hash_relbuf(rel, buf_oblkno);
@@ -836,10 +841,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    }
 
    /*
-    * If the split point is increasing (hashm_maxbucket's log base 2
-    * increases), we need to adjust the hashm_spares[] array and
-    * hashm_ovflpoint so that future overflow pages will be created beyond
-    * this new batch of bucket pages.
+    * If the split point is increasing we need to adjust the hashm_spares[]
+    * array and hashm_ovflpoint so that future overflow pages will be created
+    * beyond this new batch of bucket pages.
     */
    if (spare_ndx > metap->hashm_ovflpoint)
    {
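Because num_buckets can now be a non-power-of-2 total such as 1280, the mask setup above no longer assumes maxbucket + 1 is a power of 2. A standalone sketch of the same computation (log2_ceil is a local stand-in for _hash_log2, and 1280 is just an illustrative value):

    #include <stdio.h>
    #include <stdint.h>

    /* Local stand-in for _hash_log2(): smallest i with (1 << i) >= n. */
    static uint32_t log2_ceil(uint32_t n)
    {
        uint32_t i;

        for (i = 0; ((uint32_t) 1 << i) < n; i++)
            ;
        return i;
    }

    int main(void)
    {
        uint32_t num_buckets = 1280;    /* e.g. first phase of a larger group */

        uint32_t maxbucket = num_buckets - 1;                        /* 1279 */
        uint32_t highmask = (1 << log2_ceil(num_buckets + 1)) - 1;   /* 2047 */
        uint32_t lowmask = highmask >> 1;                            /* 1023 */

        printf("maxbucket=%u highmask=%u lowmask=%u\n",
               maxbucket, highmask, lowmask);
        return 0;
    }

Hash values masked with highmask that land beyond maxbucket are folded back with lowmask, so every key still maps to an existing bucket even though only 1280 of the 2048 possible bucket numbers exist yet.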

src/backend/access/hash/hashsort.c (+20 -7)

@@ -37,7 +37,15 @@ struct HSpool
 {
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    index;
-   uint32      hash_mask;      /* bitmask for hash codes */
+
+   /*
+    * We sort the hash keys based on the buckets they belong to. Below masks
+    * are used in _hash_hashkey2bucket to determine the bucket of given hash
+    * key.
+    */
+   uint32      high_mask;
+   uint32      low_mask;
+   uint32      max_buckets;
 };
 
 
@@ -56,11 +64,12 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
     * num_buckets buckets in the index, the appropriate mask can be computed
     * as follows.
     *
-    * Note: at present, the passed-in num_buckets is always a power of 2, so
-    * we could just compute num_buckets - 1. We prefer not to assume that
-    * here, though.
+    * NOTE : This hash mask calculation should be in sync with similar
+    * calculation in _hash_init_metabuffer.
     */
-   hspool->hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;
+   hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1;
+   hspool->low_mask = (hspool->high_mask >> 1);
+   hspool->max_buckets = num_buckets - 1;
 
    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
@@ -69,7 +78,9 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
     */
    hspool->sortstate = tuplesort_begin_index_hash(heap,
                                                   index,
-                                                  hspool->hash_mask,
+                                                  hspool->high_mask,
+                                                  hspool->low_mask,
+                                                  hspool->max_buckets,
                                                   maintenance_work_mem,
                                                   false);
 
@@ -122,7 +133,9 @@ _h_indexbuild(HSpool *hspool, Relation heapRel)
 #ifdef USE_ASSERT_CHECKING
        uint32      lasthashkey = hashkey;
 
-       hashkey = _hash_get_indextuple_hashkey(itup) & hspool->hash_mask;
+       hashkey = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
+                                      hspool->max_buckets, hspool->high_mask,
+                                      hspool->low_mask);
        Assert(hashkey >= lasthashkey);
 #endif
 
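For reference, a standalone model of the bucket assignment the sorted build now relies on: mask with high_mask first and fall back to low_mask when the result exceeds max_buckets (a sketch of _hash_hashkey2bucket's behavior; the mask values are the illustrative ones from the previous example, not something this commit prescribes):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
                                   uint32_t highmask, uint32_t lowmask)
    {
        uint32_t bucket = hashkey & highmask;

        if (bucket > maxbucket)         /* bucket doesn't exist yet: fold back */
            bucket = bucket & lowmask;
        return bucket;
    }

    int main(void)
    {
        /* Masks as _h_spoolinit would derive them for num_buckets = 1280. */
        uint32_t high_mask = 2047, low_mask = 1023, max_buckets = 1279;

        /* 0x678 = 1656 > 1279, so it folds back to 1656 & 1023 = 632 */
        printf("%u\n", hashkey2bucket(0x12345678, max_buckets, high_mask, low_mask));
        /* 0x4ff = 1279 <= 1279, so it stays at 1279 */
        printf("%u\n", hashkey2bucket(0x000004ff, max_buckets, high_mask, low_mask));
        return 0;
    }

Sorting by this bucket number, rather than by "hashkey & hash_mask", is what keeps the ordering assertion in _h_indexbuild valid now that the number of buckets need not be a power of 2.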
