@@ -142,11 +142,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
142142 FmgrInfo * opproc , bool isgt ,
143143 Datum constval , Oid consttype );
144144static double eqjoinsel_inner (Oid operator ,
145- VariableStatData * vardata1 , VariableStatData * vardata2 ,
146- RelOptInfo * rel1 , RelOptInfo * rel2 );
145+ VariableStatData * vardata1 , VariableStatData * vardata2 );
147146static double eqjoinsel_semi (Oid operator ,
148147 VariableStatData * vardata1 , VariableStatData * vardata2 ,
149- RelOptInfo * rel1 , RelOptInfo * rel2 );
148+ RelOptInfo * inner_rel );
150149static bool convert_to_scalar (Datum value , Oid valuetypid , double * scaledvalue ,
151150 Datum lobound , Datum hibound , Oid boundstypid ,
152151 double * scaledlobound , double * scaledhibound );
@@ -2011,47 +2010,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
20112010 VariableStatData vardata1 ;
20122011 VariableStatData vardata2 ;
20132012 bool join_is_reversed ;
2014- RelOptInfo * rel1 ;
2015- RelOptInfo * rel2 ;
2013+ RelOptInfo * inner_rel ;
20162014
20172015 get_join_variables (root , args , sjinfo ,
20182016 & vardata1 , & vardata2 , & join_is_reversed );
20192017
2020- /*
2021- * Identify the join's direct input relations. We use the min lefthand
2022- * and min righthand as the inputs, even though the join might actually
2023- * get done with larger input relations. The min inputs are guaranteed to
2024- * have been formed by now, though, and always using them ensures
2025- * consistency of estimates.
2026- */
2027- if (!join_is_reversed )
2028- {
2029- rel1 = find_join_input_rel (root , sjinfo -> min_lefthand );
2030- rel2 = find_join_input_rel (root , sjinfo -> min_righthand );
2031- }
2032- else
2033- {
2034- rel1 = find_join_input_rel (root , sjinfo -> min_righthand );
2035- rel2 = find_join_input_rel (root , sjinfo -> min_lefthand );
2036- }
2037-
20382018 switch (sjinfo -> jointype )
20392019 {
20402020 case JOIN_INNER :
20412021 case JOIN_LEFT :
20422022 case JOIN_FULL :
2043- selec = eqjoinsel_inner (operator , & vardata1 , & vardata2 ,
2044- rel1 , rel2 );
2023+ selec = eqjoinsel_inner (operator , & vardata1 , & vardata2 );
20452024 break ;
20462025 case JOIN_SEMI :
20472026 case JOIN_ANTI :
2027+ /*
2028+ * Look up the join's inner relation. min_righthand is sufficient
2029+ * information because neither SEMI nor ANTI joins permit any
2030+ * reassociation into or out of their RHS, so the righthand will
2031+ * always be exactly that set of rels.
2032+ */
2033+ inner_rel = find_join_input_rel (root , sjinfo -> min_righthand );
2034+
20482035 if (!join_is_reversed )
20492036 selec = eqjoinsel_semi (operator , & vardata1 , & vardata2 ,
2050- rel1 , rel2 );
2037+ inner_rel );
20512038 else
20522039 selec = eqjoinsel_semi (get_commutator (operator ),
20532040 & vardata2 , & vardata1 ,
2054- rel2 , rel1 );
2041+ inner_rel );
20552042 break ;
20562043 default :
20572044 /* other values not expected here */
@@ -2077,8 +2064,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
20772064 */
20782065static double
20792066eqjoinsel_inner (Oid operator ,
2080- VariableStatData * vardata1 , VariableStatData * vardata2 ,
2081- RelOptInfo * rel1 , RelOptInfo * rel2 )
2067+ VariableStatData * vardata1 , VariableStatData * vardata2 )
20822068{
20832069 double selec ;
20842070 double nd1 ;
@@ -2273,26 +2259,10 @@ eqjoinsel_inner(Oid operator,
22732259 * XXX Can we be smarter if we have an MCV list for just one side? It
22742260 * seems that if we assume equal distribution for the other side, we
22752261 * end up with the same answer anyway.
2276- *
2277- * An additional hack we use here is to clamp the nd1 and nd2 values
2278- * to not more than what we are estimating the input relation sizes to
2279- * be, providing a crude correction for the selectivity of restriction
2280- * clauses on those relations. (We don't do that in the other path
2281- * since there we are comparing the nd values to stats for the whole
2282- * relations.) We can apply this clamp both with respect to the base
2283- * relations from which the join variables come, and to the immediate
2284- * input relations of the current join.
22852262 */
22862263 double nullfrac1 = stats1 ? stats1 -> stanullfrac : 0.0 ;
22872264 double nullfrac2 = stats2 ? stats2 -> stanullfrac : 0.0 ;
22882265
2289- if (vardata1 -> rel )
2290- nd1 = Min (nd1 , vardata1 -> rel -> rows );
2291- nd1 = Min (nd1 , rel1 -> rows );
2292- if (vardata2 -> rel )
2293- nd2 = Min (nd2 , vardata2 -> rel -> rows );
2294- nd2 = Min (nd2 , rel2 -> rows );
2295-
22962266 selec = (1.0 - nullfrac1 ) * (1.0 - nullfrac2 );
22972267 if (nd1 > nd2 )
22982268 selec /= nd1 ;
@@ -2319,7 +2289,7 @@ eqjoinsel_inner(Oid operator,
23192289static double
23202290eqjoinsel_semi (Oid operator ,
23212291 VariableStatData * vardata1 , VariableStatData * vardata2 ,
2322- RelOptInfo * rel1 , RelOptInfo * rel2 )
2292+ RelOptInfo * inner_rel )
23232293{
23242294 double selec ;
23252295 double nd1 ;
@@ -2339,6 +2309,25 @@ eqjoinsel_semi(Oid operator,
23392309 nd1 = get_variable_numdistinct (vardata1 );
23402310 nd2 = get_variable_numdistinct (vardata2 );
23412311
2312+ /*
2313+ * We clamp nd2 to be not more than what we estimate the inner relation's
2314+ * size to be. This is intuitively somewhat reasonable since obviously
2315+ * there can't be more than that many distinct values coming from the
2316+ * inner rel. The reason for the asymmetry (ie, that we don't clamp nd1
2317+ * likewise) is that this is the only pathway by which restriction clauses
2318+ * applied to the inner rel will affect the join result size estimate,
2319+ * since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
2320+ * only the outer rel's size. If we clamped nd1 we'd be double-counting
2321+ * the selectivity of outer-rel restrictions.
2322+ *
2323+ * We can apply this clamping both with respect to the base relation from
2324+ * which the join variable comes (if there is just one), and to the
2325+ * immediate inner input relation of the current join.
2326+ */
2327+ if (vardata2 -> rel )
2328+ nd2 = Min (nd2 , vardata2 -> rel -> rows );
2329+ nd2 = Min (nd2 , inner_rel -> rows );
2330+
23422331 if (HeapTupleIsValid (vardata1 -> statsTuple ))
23432332 {
23442333 stats1 = (Form_pg_statistic ) GETSTRUCT (vardata1 -> statsTuple );
@@ -2382,11 +2371,21 @@ eqjoinsel_semi(Oid operator,
23822371 uncertainfrac ,
23832372 uncertain ;
23842373 int i ,
2385- nmatches ;
2374+ nmatches ,
2375+ clamped_nvalues2 ;
2376+
2377+ /*
2378+ * The clamping above could have resulted in nd2 being less than
2379+ * nvalues2; in which case, we assume that precisely the nd2 most
2380+ * common values in the relation will appear in the join input, and so
2381+ * compare to only the first nd2 members of the MCV list. Of course
2382+ * this is frequently wrong, but it's the best bet we can make.
2383+ */
2384+ clamped_nvalues2 = Min (nvalues2 , nd2 );
23862385
23872386 fmgr_info (get_opcode (operator ), & eqproc );
23882387 hasmatch1 = (bool * ) palloc0 (nvalues1 * sizeof (bool ));
2389- hasmatch2 = (bool * ) palloc0 (nvalues2 * sizeof (bool ));
2388+ hasmatch2 = (bool * ) palloc0 (clamped_nvalues2 * sizeof (bool ));
23902389
23912390 /*
23922391 * Note we assume that each MCV will match at most one member of the
@@ -2399,7 +2398,7 @@ eqjoinsel_semi(Oid operator,
23992398 {
24002399 int j ;
24012400
2402- for (j = 0 ; j < nvalues2 ; j ++ )
2401+ for (j = 0 ; j < clamped_nvalues2 ; j ++ )
24032402 {
24042403 if (hasmatch2 [j ])
24052404 continue ;
@@ -2444,7 +2443,7 @@ eqjoinsel_semi(Oid operator,
24442443 {
24452444 nd1 -= nmatches ;
24462445 nd2 -= nmatches ;
2447- if (nd1 <= nd2 || nd2 <= 0 )
2446+ if (nd1 <= nd2 || nd2 < 0 )
24482447 uncertainfrac = 1.0 ;
24492448 else
24502449 uncertainfrac = nd2 / nd1 ;
@@ -2465,14 +2464,7 @@ eqjoinsel_semi(Oid operator,
24652464
24662465 if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT )
24672466 {
2468- if (vardata1 -> rel )
2469- nd1 = Min (nd1 , vardata1 -> rel -> rows );
2470- nd1 = Min (nd1 , rel1 -> rows );
2471- if (vardata2 -> rel )
2472- nd2 = Min (nd2 , vardata2 -> rel -> rows );
2473- nd2 = Min (nd2 , rel2 -> rows );
2474-
2475- if (nd1 <= nd2 || nd2 <= 0 )
2467+ if (nd1 <= nd2 || nd2 < 0 )
24762468 selec = 1.0 - nullfrac1 ;
24772469 else
24782470 selec = (nd2 / nd1 ) * (1.0 - nullfrac1 );
0 commit comments