summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Tom Lane, 2016-04-04 20:48:13 +0000
committer: Tom Lane, 2016-04-04 20:48:13 +0000
commit: 391159e03a8b69dd04a1432ceb800c7c4c3d608c (patch)
tree: 5db27b427a4e4d366c768913f49eef451a3213de
parent: c9ff752a854b687fc0a05fd4aba1066028ec5495 (diff)
Partially revert commit 3d3bf62f30200500637b24fdb7b992a99f9704c3.
On reflection, the pre-existing logic in ANALYZE is specifically meant to compare the frequency of a candidate MCV against the estimated frequency of a random distinct value across the whole table. The change to compare it against the average frequency of values actually seen in the sample doesn't seem very principled, and if anything it would make us less likely, not more likely, to consider a value an MCV. So revert that, but keep the aspect of considering only nonnull values, which definitely is correct. In passing, rename the local variables in these stanzas to "ndistinct_table", to avoid confusion with the "ndistinct" that appears at an outer scope in compute_scalar_stats.
-rw-r--r-- src/backend/commands/analyze.c 16
1 files changed, 10 insertions, 6 deletions
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 44a4b3ff1e..1283de0334 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2133,13 +2133,15 @@ compute_distinct_stats(VacAttrStatsP stats,
}
else
{
- /* d here is the same as d in the Haas-Stokes formula */
- int d = nonnull_cnt - summultiple + nmultiple;
+ double ndistinct_table = stats->stadistinct;
double avgcount,
mincount;
+ /* Re-extract estimate of # distinct nonnull values in table */
+ if (ndistinct_table < 0)
+ ndistinct_table = -ndistinct_table * totalrows;
/* estimate # occurrences in sample of a typical nonnull value */
- avgcount = (double) nonnull_cnt / (double) d;
+ avgcount = (double) nonnull_cnt / ndistinct_table;
/* set minimum threshold count to store a value */
mincount = avgcount * 1.25;
if (mincount < 2)
@@ -2493,14 +2495,16 @@ compute_scalar_stats(VacAttrStatsP stats,
}
else
{
- /* d here is the same as d in the Haas-Stokes formula */
- int d = ndistinct + toowide_cnt;
+ double ndistinct_table = stats->stadistinct;
double avgcount,
mincount,
maxmincount;
+ /* Re-extract estimate of # distinct nonnull values in table */
+ if (ndistinct_table < 0)
+ ndistinct_table = -ndistinct_table * totalrows;
/* estimate # occurrences in sample of a typical nonnull value */
- avgcount = (double) values_cnt / (double) d;
+ avgcount = (double) nonnull_cnt / ndistinct_table;
/* set minimum threshold count to store a value */
mincount = avgcount * 1.25;
if (mincount < 2)