Consider outliers in split interval calculation.

petergeoghegan · petergeoghegan · commit 1542e16f2ca2 · 2020-04-21T09:59:24.000-07:00
Commit 0d861bb, which introduced deduplication to nbtree, added some logic to take large posting list tuples into account when choosing a split point. We subtract firstright posting list overhead from the projected new high key size when calculating leftfree/rightfree values for an affected candidate split point. Posting list tuples aren't special to nbtsplitloc.c, but taking them into account like this makes a huge difference in practice. Posting list tuples are frequently tuple size outliers. However, commit 0d861bb missed a closely related issue: split interval itself is calculated based on the assumption that tuples on the page being split are roughly equisized. That assumption was acceptable back when commit fab2502 taught the logic for choosing a split point about suffix truncation, but it's pretty questionable now that very large tuple sizes are common. This oversight led to unbalanced page splits in low cardinality multi-column indexes when deduplication was used: page splits that don't give sufficient weight to how unbalanced the split is when the interval happens to include some large posting list tuples (and when most other tuples on the page are not so large). Nail this down by calculating an initial split interval in a way that's attuned to the actual cost that we want to keep under control (not a fuzzy proxy for the cost): apply a leftfree + rightfree evenness test to each candidate split point that actually gets included in the split interval (for the default strategy). This replaces logic that used a percentage of all legal split points for the page as the basis of the initial split interval. Discussion: https://postgr.es/m/CAH2-WznJt5aT2uUB2Bs+JBLdwe0XTX67+xeLFcaNvCKxO=QBVQ@mail.gmail.com
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
@@ -17,10 +17,6 @@
 #include "access/nbtree.h"
 #include "storage/lmgr.h"
 
-/* limits on split interval (default strategy only) */
-#define MAX_LEAF_INTERVAL			9
-#define MAX_INTERNAL_INTERVAL		18
-
 typedef enum
 {
 	/* strategy for searching through materialized list of split points */
@@ -76,6 +72,7 @@ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
 static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
 									 bool *newitemonleft, FindSplitStrat strategy);
+static int _bt_defaultinterval(FindSplitData *state);
 static int	_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
 						 SplitPoint *rightpage, FindSplitStrat *strategy);
 static void _bt_interval_edges(FindSplitData *state,
@@ -279,7 +276,7 @@ _bt_findsplitloc(Relation rel,
 	 * left side of the split, in order to maximize the number of trailing
 	 * attributes that can be truncated away.  Only candidate split points
 	 * that imply an acceptable balance of free space on each side are
-	 * considered.
+	 * considered.  See _bt_defaultinterval().
 	 */
 	if (!state.is_leaf)
 	{
@@ -338,19 +335,6 @@ _bt_findsplitloc(Relation rel,
 		fillfactormult = 0.50;
 	}
 
-	/*
-	 * Set an initial limit on the split interval/number of candidate split
-	 * points as appropriate.  The "Prefix B-Trees" paper refers to this as
-	 * sigma l for leaf splits and sigma b for internal ("branch") splits.
-	 * It's hard to provide a theoretical justification for the initial size
-	 * of the split interval, though it's clear that a small split interval
-	 * makes suffix truncation much more effective without noticeably
-	 * affecting space utilization over time.
-	 */
-	state.interval = Min(Max(1, state.nsplits * 0.05),
-						 state.is_leaf ? MAX_LEAF_INTERVAL :
-						 MAX_INTERNAL_INTERVAL);
-
 	/*
 	 * Save leftmost and rightmost splits for page before original ordinal
 	 * sort order is lost by delta/fillfactormult sort
@@ -361,6 +345,9 @@ _bt_findsplitloc(Relation rel,
 	/* Give split points a fillfactormult-wise delta, and sort on deltas */
 	_bt_deltasortsplits(&state, fillfactormult, usemult);
 
+	/* Determine split interval for default strategy */
+	state.interval = _bt_defaultinterval(&state);
+
 	/*
 	 * Determine if default strategy/split interval will produce a
 	 * sufficiently distinguishing split, or if we should change strategies.
@@ -850,11 +837,13 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
 	 */
 	if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
 		!final->newitemonleft && final->firstrightoff >= state->newitemoff &&
-		final->firstrightoff < state->newitemoff + MAX_LEAF_INTERVAL)
+		final->firstrightoff < state->newitemoff + 9)
 	{
 		/*
 		 * Avoid the problem by performing a 50:50 split when the new item is
 		 * just to the right of the would-be "many duplicates" split point.
+		 * (Note that the test used for an insert that is "just to the right"
+		 * of the split point is conservative.)
 		 */
 		final = &state->splits[0];
 	}
@@ -863,6 +852,79 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
 	return final->firstrightoff;
 }
 
+#define LEAF_SPLIT_DISTANCE			0.050
+#define INTERNAL_SPLIT_DISTANCE		0.075
+
+/*
+ * Return a split interval to use for the default strategy.  This is a limit
+ * on the number of candidate split points to give further consideration to.
+ * Only a fraction of all candidate splits points (those located at the start
+ * of the now-sorted splits array) fall within the split interval.  Split
+ * interval is applied within _bt_bestsplitloc().
+ *
+ * Split interval represents an acceptable range of split points -- those that
+ * have leftfree and rightfree values that are acceptably balanced.  The final
+ * split point chosen is the split point with the lowest "penalty" among split
+ * points in this split interval (unless we change our entire strategy, in
+ * which case the interval also changes -- see _bt_strategy()).
+ *
+ * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits,
+ * and sigma b for internal ("branch") splits.  It's hard to provide a
+ * theoretical justification for the size of the split interval, though it's
+ * clear that a small split interval can make tuples on level L+1 much smaller
+ * on average, without noticeably affecting space utilization on level L.
+ * (Note that the way that we calculate split interval might need to change if
+ * suffix truncation is taught to truncate tuples "within" the last
+ * attribute/datum for data types like text, which is more or less how it is
+ * assumed to work in the paper.)
+ */
+static int
+_bt_defaultinterval(FindSplitData *state)
+{
+	SplitPoint *spaceoptimal;
+	int16		tolerance,
+				lowleftfree,
+				lowrightfree,
+				highleftfree,
+				highrightfree;
+
+	/*
+	 * Determine leftfree and rightfree values that are higher and lower than
+	 * we're willing to tolerate.  Note that the final split interval will be
+	 * about 10% of nsplits in the common case where all non-pivot tuples
+	 * (data items) from a leaf page are uniformly sized.  We're a bit more
+	 * aggressive when splitting internal pages.
+	 */
+	if (state->is_leaf)
+		tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE;
+	else
+		tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE;
+
+	/* First candidate split point is the most evenly balanced */
+	spaceoptimal = state->splits;
+	lowleftfree = spaceoptimal->leftfree - tolerance;
+	lowrightfree = spaceoptimal->rightfree - tolerance;
+	highleftfree = spaceoptimal->leftfree + tolerance;
+	highrightfree = spaceoptimal->rightfree + tolerance;
+
+	/*
+	 * Iterate through split points, starting from the split immediately after
+	 * 'spaceoptimal'.  Find the first split point that divides free space so
+	 * unevenly that including it in the split interval would be unacceptable.
+	 */
+	for (int i = 1; i < state->nsplits; i++)
+	{
+		SplitPoint *split = state->splits + i;
+
+		/* Cannot use curdelta here, since its value is often weighted */
+		if (split->leftfree < lowleftfree || split->rightfree < lowrightfree ||
+			split->leftfree > highleftfree || split->rightfree > highrightfree)
+			return i;
+	}
+
+	return state->nsplits;
+}
+
 /*
  * Subroutine to decide whether split should use default strategy/initial
  * split interval, or whether it should finish splitting the page using
@@ -1097,7 +1159,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split)
 }
 
 /*
- * Subroutine to get a lastleft IndexTuple for a split point from page
+ * Subroutine to get a lastleft IndexTuple for a split point
  */
 static inline IndexTuple
 _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
@@ -1113,7 +1175,7 @@ _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
 }
 
 /*
- * Subroutine to get a firstright IndexTuple for a split point from page
+ * Subroutine to get a firstright IndexTuple for a split point
  */
 static inline IndexTuple
 _bt_split_firstright(FindSplitData *state, SplitPoint *split)