Skip to content

Commit 1542e16

Browse files
Consider outliers in split interval calculation.
Commit 0d861bb, which introduced deduplication to nbtree, added some logic to take large posting list tuples into account when choosing a split point. We subtract firstright posting list overhead from the projected new high key size when calculating leftfree/rightfree values for an affected candidate split point. Posting list tuples aren't special to nbtsplitloc.c, but taking them into account like this makes a huge difference in practice. Posting list tuples are frequently tuple size outliers. However, commit 0d861bb missed a closely related issue: split interval itself is calculated based on the assumption that tuples on the page being split are roughly equisized. That assumption was acceptable back when commit fab2502 taught the logic for choosing a split point about suffix truncation, but it's pretty questionable now that very large tuple sizes are common. This oversight led to unbalanced page splits in low cardinality multi-column indexes when deduplication was used: page splits that don't give sufficient weight to how unbalanced the split is when the interval happens to include some large posting list tuples (and when most other tuples on the page are not so large). Nail this down by calculating an initial split interval in a way that's attuned to the actual cost that we want to keep under control (not a fuzzy proxy for the cost): apply a leftfree + rightfree evenness test to each candidate split point that actually gets included in the split interval (for the default strategy). This replaces logic that used a percentage of all legal split points for the page as the basis of the initial split interval. Discussion: https://postgr.es/m/CAH2-WznJt5aT2uUB2Bs+JBLdwe0XTX67+xeLFcaNvCKxO=QBVQ@mail.gmail.com
1 parent 1c45507 commit 1542e16

File tree

1 file changed

+83
-21
lines changed

1 file changed

+83
-21
lines changed

src/backend/access/nbtree/nbtsplitloc.c

+83-21
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@
1717
#include "access/nbtree.h"
1818
#include "storage/lmgr.h"
1919

20-
/* limits on split interval (default strategy only) */
21-
#define MAX_LEAF_INTERVAL 9
22-
#define MAX_INTERNAL_INTERVAL 18
23-
2420
typedef enum
2521
{
2622
/* strategy for searching through materialized list of split points */
@@ -76,6 +72,7 @@ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
7672
static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
7773
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
7874
bool *newitemonleft, FindSplitStrat strategy);
75+
static int _bt_defaultinterval(FindSplitData *state);
7976
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
8077
SplitPoint *rightpage, FindSplitStrat *strategy);
8178
static void _bt_interval_edges(FindSplitData *state,
@@ -279,7 +276,7 @@ _bt_findsplitloc(Relation rel,
279276
* left side of the split, in order to maximize the number of trailing
280277
* attributes that can be truncated away. Only candidate split points
281278
* that imply an acceptable balance of free space on each side are
282-
* considered.
279+
* considered. See _bt_defaultinterval().
283280
*/
284281
if (!state.is_leaf)
285282
{
@@ -338,19 +335,6 @@ _bt_findsplitloc(Relation rel,
338335
fillfactormult = 0.50;
339336
}
340337

341-
/*
342-
* Set an initial limit on the split interval/number of candidate split
343-
* points as appropriate. The "Prefix B-Trees" paper refers to this as
344-
* sigma l for leaf splits and sigma b for internal ("branch") splits.
345-
* It's hard to provide a theoretical justification for the initial size
346-
* of the split interval, though it's clear that a small split interval
347-
* makes suffix truncation much more effective without noticeably
348-
* affecting space utilization over time.
349-
*/
350-
state.interval = Min(Max(1, state.nsplits * 0.05),
351-
state.is_leaf ? MAX_LEAF_INTERVAL :
352-
MAX_INTERNAL_INTERVAL);
353-
354338
/*
355339
* Save leftmost and rightmost splits for page before original ordinal
356340
* sort order is lost by delta/fillfactormult sort
@@ -361,6 +345,9 @@ _bt_findsplitloc(Relation rel,
361345
/* Give split points a fillfactormult-wise delta, and sort on deltas */
362346
_bt_deltasortsplits(&state, fillfactormult, usemult);
363347

348+
/* Determine split interval for default strategy */
349+
state.interval = _bt_defaultinterval(&state);
350+
364351
/*
365352
* Determine if default strategy/split interval will produce a
366353
* sufficiently distinguishing split, or if we should change strategies.
@@ -850,11 +837,13 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
850837
*/
851838
if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
852839
!final->newitemonleft && final->firstrightoff >= state->newitemoff &&
853-
final->firstrightoff < state->newitemoff + MAX_LEAF_INTERVAL)
840+
final->firstrightoff < state->newitemoff + 9)
854841
{
855842
/*
856843
* Avoid the problem by performing a 50:50 split when the new item is
857844
* just to the right of the would-be "many duplicates" split point.
845+
* (Note that the test used for an insert that is "just to the right"
846+
* of the split point is conservative.)
858847
*/
859848
final = &state->splits[0];
860849
}
@@ -863,6 +852,79 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
863852
return final->firstrightoff;
864853
}
865854

855+
#define LEAF_SPLIT_DISTANCE 0.050
856+
#define INTERNAL_SPLIT_DISTANCE 0.075
857+
858+
/*
859+
* Return a split interval to use for the default strategy. This is a limit
860+
* on the number of candidate split points to give further consideration to.
861+
* Only a fraction of all candidate splits points (those located at the start
862+
* of the now-sorted splits array) fall within the split interval. Split
863+
* interval is applied within _bt_bestsplitloc().
864+
*
865+
* Split interval represents an acceptable range of split points -- those that
866+
* have leftfree and rightfree values that are acceptably balanced. The final
867+
* split point chosen is the split point with the lowest "penalty" among split
868+
* points in this split interval (unless we change our entire strategy, in
869+
* which case the interval also changes -- see _bt_strategy()).
870+
*
871+
* The "Prefix B-Trees" paper calls split interval sigma l for leaf splits,
872+
* and sigma b for internal ("branch") splits. It's hard to provide a
873+
* theoretical justification for the size of the split interval, though it's
874+
* clear that a small split interval can make tuples on level L+1 much smaller
875+
* on average, without noticeably affecting space utilization on level L.
876+
* (Note that the way that we calculate split interval might need to change if
877+
* suffix truncation is taught to truncate tuples "within" the last
878+
* attribute/datum for data types like text, which is more or less how it is
879+
* assumed to work in the paper.)
880+
*/
881+
static int
882+
_bt_defaultinterval(FindSplitData *state)
883+
{
884+
SplitPoint *spaceoptimal;
885+
int16 tolerance,
886+
lowleftfree,
887+
lowrightfree,
888+
highleftfree,
889+
highrightfree;
890+
891+
/*
892+
* Determine leftfree and rightfree values that are higher and lower than
893+
* we're willing to tolerate. Note that the final split interval will be
894+
* about 10% of nsplits in the common case where all non-pivot tuples
895+
* (data items) from a leaf page are uniformly sized. We're a bit more
896+
* aggressive when splitting internal pages.
897+
*/
898+
if (state->is_leaf)
899+
tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE;
900+
else
901+
tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE;
902+
903+
/* First candidate split point is the most evenly balanced */
904+
spaceoptimal = state->splits;
905+
lowleftfree = spaceoptimal->leftfree - tolerance;
906+
lowrightfree = spaceoptimal->rightfree - tolerance;
907+
highleftfree = spaceoptimal->leftfree + tolerance;
908+
highrightfree = spaceoptimal->rightfree + tolerance;
909+
910+
/*
911+
* Iterate through split points, starting from the split immediately after
912+
* 'spaceoptimal'. Find the first split point that divides free space so
913+
* unevenly that including it in the split interval would be unacceptable.
914+
*/
915+
for (int i = 1; i < state->nsplits; i++)
916+
{
917+
SplitPoint *split = state->splits + i;
918+
919+
/* Cannot use curdelta here, since its value is often weighted */
920+
if (split->leftfree < lowleftfree || split->rightfree < lowrightfree ||
921+
split->leftfree > highleftfree || split->rightfree > highrightfree)
922+
return i;
923+
}
924+
925+
return state->nsplits;
926+
}
927+
866928
/*
867929
* Subroutine to decide whether split should use default strategy/initial
868930
* split interval, or whether it should finish splitting the page using
@@ -1097,7 +1159,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split)
10971159
}
10981160

10991161
/*
1100-
* Subroutine to get a lastleft IndexTuple for a split point from page
1162+
* Subroutine to get a lastleft IndexTuple for a split point
11011163
*/
11021164
static inline IndexTuple
11031165
_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
@@ -1113,7 +1175,7 @@ _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
11131175
}
11141176

11151177
/*
1116-
* Subroutine to get a firstright IndexTuple for a split point from page
1178+
* Subroutine to get a firstright IndexTuple for a split point
11171179
*/
11181180
static inline IndexTuple
11191181
_bt_split_firstright(FindSplitData *state, SplitPoint *split)

0 commit comments

Comments
 (0)