diff options
Diffstat (limited to 'src/backend/optimizer/path/clausesel.c')
-rw-r--r-- | src/backend/optimizer/path/clausesel.c | 137 |
1 files changed, 113 insertions, 24 deletions
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index 02660c2ba5..758ddea4a5 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -3,7 +3,7 @@ * clausesel.c * Routines to compute clause selectivities * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -22,6 +22,7 @@ #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" +#include "statistics/statistics.h" /* @@ -40,7 +41,8 @@ typedef struct RangeQueryClause static void addRangeClause(RangeQueryClause **rqlist, Node *clause, bool varonleft, bool isLTsel, Selectivity s2); - +static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root, + List *clauses); /**************************************************************************** * ROUTINES TO COMPUTE SELECTIVITIES @@ -60,23 +62,28 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause, * subclauses. However, that's only right if the subclauses have independent * probabilities, and in reality they are often NOT independent. So, * we want to be smarter where we can. - - * Currently, the only extra smarts we have is to recognize "range queries", - * such as "x > 34 AND x < 42". Clauses are recognized as possible range - * query components if they are restriction opclauses whose operators have - * scalarltsel() or scalargtsel() as their restriction selectivity estimator. - * We pair up clauses of this form that refer to the same variable. An - * unpairable clause of this kind is simply multiplied into the selectivity - * product in the normal way. But when we find a pair, we know that the - * selectivities represent the relative positions of the low and high bounds - * within the column's range, so instead of figuring the selectivity as - * hisel * losel, we can figure it as hisel + losel - 1. (To visualize this, - * see that hisel is the fraction of the range below the high bound, while - * losel is the fraction above the low bound; so hisel can be interpreted - * directly as a 0..1 value but we need to convert losel to 1-losel before - * interpreting it as a value. Then the available range is 1-losel to hisel. - * However, this calculation double-excludes nulls, so really we need - * hisel + losel + null_frac - 1.) + * + * If the clauses taken together refer to just one relation, we'll try to + * apply selectivity estimates using any extended statistics for that rel. + * Currently we only have (soft) functional dependencies, so apply these in as + * many cases as possible, and fall back on normal estimates for remaining + * clauses. + * + * We also recognize "range queries", such as "x > 34 AND x < 42". Clauses + * are recognized as possible range query components if they are restriction + * opclauses whose operators have scalarltsel() or scalargtsel() as their + * restriction selectivity estimator. We pair up clauses of this form that + * refer to the same variable. An unpairable clause of this kind is simply + * multiplied into the selectivity product in the normal way. But when we + * find a pair, we know that the selectivities represent the relative + * positions of the low and high bounds within the column's range, so instead + * of figuring the selectivity as hisel * losel, we can figure it as hisel + + * losel - 1. (To visualize this, see that hisel is the fraction of the range + * below the high bound, while losel is the fraction above the low bound; so + * hisel can be interpreted directly as a 0..1 value but we need to convert + * losel to 1-losel before interpreting it as a value. Then the available + * range is 1-losel to hisel. However, this calculation double-excludes + * nulls, so really we need hisel + losel + null_frac - 1.) * * If either selectivity is exactly DEFAULT_INEQ_SEL, we forget this equation * and instead use DEFAULT_RANGE_INEQ_SEL. The same applies if the equation @@ -96,28 +103,67 @@ clauselist_selectivity(PlannerInfo *root, SpecialJoinInfo *sjinfo) { Selectivity s1 = 1.0; + RelOptInfo *rel; + Bitmapset *estimatedclauses = NULL; RangeQueryClause *rqlist = NULL; ListCell *l; + int listidx; /* - * If there's exactly one clause, then no use in trying to match up pairs, - * so just go directly to clause_selectivity(). + * If there's exactly one clause, just go directly to + * clause_selectivity(). None of what we might do below is relevant. */ if (list_length(clauses) == 1) return clause_selectivity(root, (Node *) linitial(clauses), varRelid, jointype, sjinfo); /* - * Initial scan over clauses. Anything that doesn't look like a potential - * rangequery clause gets multiplied into s1 and forgotten. Anything that - * does gets inserted into an rqlist entry. + * Determine if these clauses reference a single relation. If so, and if + * it has extended statistics, try to apply those. */ + rel = find_single_rel_for_clauses(root, clauses); + if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) + { + /* + * Perform selectivity estimations on any clauses found applicable by + * dependencies_clauselist_selectivity. 'estimatedclauses' will be + * filled with the 0-based list positions of clauses used that way, so + * that we can ignore them below. + */ + s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid, + jointype, sjinfo, rel, + &estimatedclauses); + + /* + * This would be the place to apply any other types of extended + * statistics selectivity estimations for remaining clauses. + */ + } + + /* + * Apply normal selectivity estimates for remaining clauses. We'll be + * careful to skip any clauses which were already estimated above. + * + * Anything that doesn't look like a potential rangequery clause gets + * multiplied into s1 and forgotten. Anything that does gets inserted into + * an rqlist entry. + */ + listidx = -1; foreach(l, clauses) { Node *clause = (Node *) lfirst(l); RestrictInfo *rinfo; Selectivity s2; + listidx++; + + /* + * Skip this clause if it's already been estimated by some other + * statistics above. + */ + if (bms_is_member(listidx, estimatedclauses)) + continue; + /* Always compute the selectivity using clause_selectivity */ s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo); @@ -373,6 +419,49 @@ addRangeClause(RangeQueryClause **rqlist, Node *clause, } /* + * find_single_rel_for_clauses + * Examine each clause in 'clauses' and determine if all clauses + * reference only a single relation. If so return that relation, + * otherwise return NULL. + */ +static RelOptInfo * +find_single_rel_for_clauses(PlannerInfo *root, List *clauses) +{ + int lastrelid = 0; + ListCell *l; + + foreach(l, clauses) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); + int relid; + + /* + * If we have a list of bare clauses rather than RestrictInfos, we + * could pull out their relids the hard way with pull_varnos(). + * However, currently the extended-stats machinery won't do anything + * with non-RestrictInfo clauses anyway, so there's no point in + * spending extra cycles; just fail if that's what we have. + */ + if (!IsA(rinfo, RestrictInfo)) + return NULL; + + if (bms_is_empty(rinfo->clause_relids)) + continue; /* we can ignore variable-free clauses */ + if (!bms_get_singleton_member(rinfo->clause_relids, &relid)) + return NULL; /* multiple relations in this clause */ + if (lastrelid == 0) + lastrelid = relid; /* first clause referencing a relation */ + else if (relid != lastrelid) + return NULL; /* relation not same as last one */ + } + + if (lastrelid != 0) + return find_base_rel(root, lastrelid); + + return NULL; /* no clauses */ +} + +/* * bms_is_subset_singleton * * Same result as bms_is_subset(s, bms_make_singleton(x)), |