author     Tom Lane    2008-03-09 00:32:09 +0000
committer  Tom Lane    2008-03-09 00:32:09 +0000
commit     341a4d3b830e403b0d6644c91d6c125de0b3b149 (patch)
tree       688b6bee8301026f4647380e590aa96af03f450f
parent     abb05e92762d4520b175bcbd3c988e4311feeffb (diff)
Change patternsel() so that instead of switching from a pure
pattern-examination heuristic method to purely histogram-driven
selectivity at histogram size 100, we compute both estimates and use a
weighted average.  The weight put on the heuristic estimate decreases
linearly with histogram size, dropping to zero for 100 or more histogram
entries.  Likewise in ltreeparentsel().

After a patch by Greg Stark, though I reorganized the logic a bit to
give the caller of histogram_selectivity() more control.
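The blending rule described above can be summarized in a small standalone
sketch (not part of the patch; blend_selectivity() and main() are
illustrative only, and the 10-entry minimum and [0.0001, 0.9999] clamp are
taken from the diff below):

#include <stdio.h>

/*
 * Standalone illustration of the weighted average introduced by this
 * commit: below 100 histogram entries, the histogram-based estimate and
 * the heuristic estimate are combined with a weight that grows linearly
 * with histogram size; at 100 or more entries the histogram alone is
 * trusted.  hist_sel < 0 means the histogram was missing or too small
 * (fewer than 10 entries), so only the heuristic is used.
 */
static double
blend_selectivity(double hist_sel, double heur_sel, int hist_size)
{
	double		selec;

	if (hist_sel < 0)
		selec = heur_sel;		/* no usable histogram */
	else if (hist_size < 100)
	{
		double		hist_weight = hist_size / 100.0;

		selec = hist_sel * hist_weight + heur_sel * (1.0 - hist_weight);
	}
	else
		selec = hist_sel;		/* large histogram: trust it fully */

	/* In any case, don't believe extremely small or large estimates. */
	if (selec < 0.0001)
		selec = 0.0001;
	else if (selec > 0.9999)
		selec = 0.9999;

	return selec;
}

int
main(void)
{
	/* 40-entry histogram says 0.02, heuristic says 0.10: 0.4*0.02 + 0.6*0.10 */
	printf("%f\n", blend_selectivity(0.02, 0.10, 40));
	return 0;
}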
-rw-r--r--  contrib/ltree/ltree_op.c          | 25
-rw-r--r--  src/backend/utils/adt/selfuncs.c  | 76
-rw-r--r--  src/include/utils/selfuncs.h      |  3
3 files changed, 72 insertions(+), 32 deletions(-)
diff --git a/contrib/ltree/ltree_op.c b/contrib/ltree/ltree_op.c
index 6803fefc20..8f28cf81e9 100644
--- a/contrib/ltree/ltree_op.c
+++ b/contrib/ltree/ltree_op.c
@@ -609,6 +609,7 @@ ltreeparentsel(PG_FUNCTION_ARGS)
double mcvsum;
double mcvsel;
double nullfrac;
+ int hist_size;
fmgr_info(get_opcode(operator), &contproc);
@@ -626,21 +627,31 @@ ltreeparentsel(PG_FUNCTION_ARGS)
*/
selec = histogram_selectivity(&vardata, &contproc,
constval, varonleft,
- 100, 1);
+ 10, 1, &hist_size);
if (selec < 0)
{
/* Nope, fall back on default */
selec = DEFAULT_PARENT_SEL;
}
- else
+ else if (hist_size < 100)
{
- /* Yes, but don't believe extremely small or large estimates. */
- if (selec < 0.0001)
- selec = 0.0001;
- else if (selec > 0.9999)
- selec = 0.9999;
+ /*
+ * For histogram sizes from 10 to 100, we combine the
+ * histogram and default selectivities, putting increasingly
+ * more trust in the histogram for larger sizes.
+ */
+ double hist_weight = hist_size / 100.0;
+
+ selec = selec * hist_weight +
+ DEFAULT_PARENT_SEL * (1.0 - hist_weight);
}
+ /* In any case, don't believe extremely small or large estimates. */
+ if (selec < 0.0001)
+ selec = 0.0001;
+ else if (selec > 0.9999)
+ selec = 0.9999;
+
if (HeapTupleIsValid(vardata.statsTuple))
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
else
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index a38bd5c92c..96d8d8143d 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -567,17 +567,23 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
* or not it has anything to do with the histogram sort operator. We are
* essentially using the histogram just as a representative sample. However,
* small histograms are unlikely to be all that representative, so the caller
- * should specify a minimum histogram size to use, and fall back on some
- * other approach if this routine fails.
+ * should be prepared to fall back on some other estimation approach when the
+ * histogram is missing or very small. It may also be prudent to combine this
+ * approach with another one when the histogram is small.
*
- * The caller also specifies n_skip, which causes us to ignore the first and
- * last n_skip histogram elements, on the grounds that they are outliers and
- * hence not very representative. If in doubt, min_hist_size = 100 and
- * n_skip = 1 are reasonable values.
+ * If the actual histogram size is not at least min_hist_size, we won't bother
+ * to do the calculation at all. Also, if the n_skip parameter is > 0, we
+ * ignore the first and last n_skip histogram elements, on the grounds that
+ * they are outliers and hence not very representative. Typical values for
+ * these parameters are 10 and 1.
*
* The function result is the selectivity, or -1 if there is no histogram
* or it's smaller than min_hist_size.
*
+ * The output parameter *hist_size receives the actual histogram size,
+ * or zero if no histogram. Callers may use this number to decide how
+ * much faith to put in the function result.
+ *
* Note that the result disregards both the most-common-values (if any) and
* null entries. The caller is expected to combine this result with
* statistics for those portions of the column population. It may also be
@@ -586,7 +592,8 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
double
histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
Datum constval, bool varonleft,
- int min_hist_size, int n_skip)
+ int min_hist_size, int n_skip,
+ int *hist_size)
{
double result;
Datum *values;
@@ -603,6 +610,7 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
&values, &nvalues,
NULL, NULL))
{
+ *hist_size = nvalues;
if (nvalues >= min_hist_size)
{
int nmatch = 0;
@@ -626,7 +634,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
}
else
+ {
+ *hist_size = 0;
result = -1;
+ }
return result;
}
@@ -1117,13 +1128,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
* selectivity of the fixed prefix and remainder of pattern
* separately, then combine the two to get an estimate of the
* selectivity for the part of the column population represented by
- * the histogram. We then add up data for any most-common-values
- * values; these are not in the histogram population, and we can get
- * exact answers for them by applying the pattern operator, so there's
- * no reason to approximate. (If the MCVs cover a significant part of
- * the total population, this gives us a big leg up in accuracy.)
+ * the histogram. (For small histograms, we combine these approaches.)
+ *
+ * We then add up data for any most-common-values values; these are
+ * not in the histogram population, and we can get exact answers for
+ * them by applying the pattern operator, so there's no reason to
+ * approximate. (If the MCVs cover a significant part of the total
+ * population, this gives us a big leg up in accuracy.)
*/
Selectivity selec;
+ int hist_size;
FmgrInfo opproc;
double nullfrac,
mcv_selec,
@@ -1133,10 +1147,12 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
fmgr_info(get_opcode(operator), &opproc);
selec = histogram_selectivity(&vardata, &opproc, constval, true,
- 100, 1);
- if (selec < 0)
+ 10, 1, &hist_size);
+
+ /* If not at least 100 entries, use the heuristic method */
+ if (hist_size < 100)
{
- /* Nope, so fake it with the heuristic method */
+ Selectivity heursel;
Selectivity prefixsel;
Selectivity restsel;
@@ -1146,17 +1162,29 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
else
prefixsel = 1.0;
restsel = pattern_selectivity(rest, ptype);
- selec = prefixsel * restsel;
- }
- else
- {
- /* Yes, but don't believe extremely small or large estimates. */
- if (selec < 0.0001)
- selec = 0.0001;
- else if (selec > 0.9999)
- selec = 0.9999;
+ heursel = prefixsel * restsel;
+
+ if (selec < 0) /* fewer than 10 histogram entries? */
+ selec = heursel;
+ else
+ {
+ /*
+ * For histogram sizes from 10 to 100, we combine the
+ * histogram and heuristic selectivities, putting increasingly
+ * more trust in the histogram for larger sizes.
+ */
+ double hist_weight = hist_size / 100.0;
+
+ selec = selec * hist_weight + heursel * (1.0 - hist_weight);
+ }
}
+ /* In any case, don't believe extremely small or large estimates. */
+ if (selec < 0.0001)
+ selec = 0.0001;
+ else if (selec > 0.9999)
+ selec = 0.9999;
+
/*
* If we have most-common-values info, add up the fractions of the MCV
* entries that satisfy MCV OP PATTERN. These fractions contribute
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 10e5af4269..c6d79cf376 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -112,7 +112,8 @@ extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
double *sumcommonp);
extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
Datum constval, bool varonleft,
- int min_hist_size, int n_skip);
+ int min_hist_size, int n_skip,
+ int *hist_size);
extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
Pattern_Type ptype,