 
 
 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
 static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
                       int mcvsToUse);
 static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -117,6 +118,7 @@ MultiExecHash(HashState *node)
                 /* It's a skew tuple, so put it into that hash table */
                 ExecHashSkewTableInsert(hashtable, slot, hashvalue,
                                         bucketNumber);
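+                /* count skew tuples separately; they live in the skew hash table, not the main buckets */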
+                hashtable->skewTuples += 1;
             }
             else
             {
@@ -127,6 +129,25 @@ MultiExecHash(HashState *node)
         }
     }
 
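+    /*
+     * nbuckets_optimal is only tracked while tuples are inserted; the bucket
+     * array itself is resized here, after the inner relation has been fully
+     * loaded, so the buckets are rebuilt at most once.
+     */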
+    /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
+    if (hashtable->nbuckets != hashtable->nbuckets_optimal)
+    {
+        /* We never decrease the number of buckets. */
+        Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+#ifdef HJDEBUG
+        printf("Increasing nbuckets %d => %d\n",
+               hashtable->nbuckets, hashtable->nbuckets_optimal);
+#endif
+
+        ExecHashIncreaseNumBuckets(hashtable);
+    }
+
+    /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
+    hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
+    if (hashtable->spaceUsed > hashtable->spacePeak)
+        hashtable->spacePeak = hashtable->spaceUsed;
+
     /* must provide our own instrumentation support */
     if (node->ps.instrument)
         InstrStopNode(node->ps.instrument, hashtable->totalTuples);
@@ -272,7 +293,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
      */
     hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
     hashtable->nbuckets = nbuckets;
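+    /* remember the initial bucket count; nbuckets_optimal grows as tuples arrive */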
+    hashtable->nbuckets_original = nbuckets;
+    hashtable->nbuckets_optimal = nbuckets;
     hashtable->log2_nbuckets = log2_nbuckets;
+    hashtable->log2_nbuckets_optimal = log2_nbuckets;
     hashtable->buckets = NULL;
     hashtable->keepNulls = keepNulls;
     hashtable->skewEnabled = false;
@@ -286,6 +310,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
     hashtable->nbatch_outstart = nbatch;
     hashtable->growEnabled = true;
     hashtable->totalTuples = 0;
+    hashtable->skewTuples = 0;
     hashtable->innerBatchFile = NULL;
     hashtable->outerBatchFile = NULL;
     hashtable->spaceUsed = 0;
@@ -620,6 +645,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
      */
     ninmemory = nfreed = 0;
 
+    /* If we know we need to resize nbuckets, we can do it while rebatching. */
+    if (hashtable->nbuckets_optimal != hashtable->nbuckets)
+    {
+        /* we never decrease the number of buckets */
+        Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+        hashtable->nbuckets = hashtable->nbuckets_optimal;
+        hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
+
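+        /*
+         * The bucket array is only enlarged here; re-linking the tuples is
+         * left to the chunk scan below, which resets the buckets anyway.
+         */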
+        hashtable->buckets = repalloc(hashtable->buckets,
+                                      sizeof(HashJoinTuple) * hashtable->nbuckets);
+    }
+
     /*
      * We will scan through the chunks directly, so that we can reset the
      * buckets now and not have to keep track which tuples in the buckets have
@@ -703,6 +741,78 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
     }
 }
 
+/*
+ * ExecHashIncreaseNumBuckets
+ *        increase the original number of buckets in order to reduce
+ *        number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+    HashMemoryChunk chunk;
+
+    /* do nothing if not an increase (it's called increase for a reason) */
+    if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
+        return;
+
+    /*
+     * We already know the optimal number of buckets, so let's just
+     * compute the log2_nbuckets for it.
+     */
+    hashtable->nbuckets = hashtable->nbuckets_optimal;
+    hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal);
+
+    Assert(hashtable->nbuckets > 1);
+    Assert(hashtable->nbuckets <= (INT_MAX / 2));
+    Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+    printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+    /*
+     * Just reallocate the proper number of buckets - we don't need to
+     * walk through them - we can walk the dense-allocated chunks
+     * (just like in ExecHashIncreaseNumBatches, but without all the
+     * copying into new chunks)
+     */
+    hashtable->buckets =
+        (HashJoinTuple *) repalloc(hashtable->buckets,
+                                   hashtable->nbuckets * sizeof(HashJoinTuple));
+
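+    /* zero the enlarged bucket array; every tuple is re-linked into it below */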
+    memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets);
+
+    /* scan through all tuples in all chunks to rebuild the hash table */
+    for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
+    {
+        /* process all tuples stored in this chunk */
+        size_t idx = 0;
+        while (idx < chunk->used)
+        {
+            HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
+            int bucketno;
+            int batchno;
+
+            ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+                                      &bucketno, &batchno);
+
+            /* add the tuple to the proper bucket */
+            hashTuple->next = hashtable->buckets[bucketno];
+            hashtable->buckets[bucketno] = hashTuple;
+
+            /* advance index past the tuple */
+            idx += MAXALIGN(HJTUPLE_OVERHEAD +
+                            HJTUPLE_MINTUPLE(hashTuple)->t_len);
+        }
+    }
+
+#ifdef HJDEBUG
+    printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+           hashtable->nbuckets,
+           (hashtable->totalTuples - hashtable->skewTuples) / hashtable->nbuckets);
+#endif
+}
+
+
 /*
  * ExecHashTableInsert
  *        insert a tuple into the hash table depending on the hash value
@@ -736,6 +846,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
          */
         HashJoinTuple hashTuple;
         int hashTupleSize;
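+        /* tuples in the main hash table so far, excluding skew tuples */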
+        double ntuples = (hashtable->totalTuples - hashtable->skewTuples);
 
         /* Create the HashJoinTuple */
         hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
@@ -756,11 +867,24 @@ ExecHashTableInsert(HashJoinTable hashtable,
         hashTuple->next = hashtable->buckets[bucketno];
         hashtable->buckets[bucketno] = hashTuple;
 
+        /*
+         * Increase the (optimal) number of buckets if we just exceeded the
+         * NTUP_PER_BUCKET threshold, but only when there's still a single batch.
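+         * Only the target count is bumped here; the bucket array itself is
+         * resized later, in MultiExecHash or while increasing the number of
+         * batches.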
+         */
+        if ((hashtable->nbatch == 1) &&
+            (hashtable->nbuckets_optimal <= INT_MAX / 2) &&    /* overflow protection */
+            (ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)))
+        {
+            hashtable->nbuckets_optimal *= 2;
+            hashtable->log2_nbuckets_optimal += 1;
+        }
+
         /* Account for space used, and back off if we've used too much */
         hashtable->spaceUsed += hashTupleSize;
         if (hashtable->spaceUsed > hashtable->spacePeak)
             hashtable->spacePeak = hashtable->spaceUsed;
-        if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple)
+        if (hashtable->spaceUsed +
+            hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
             > hashtable->spaceAllowed)
             ExecHashIncreaseNumBatches(hashtable);
     }
@@ -885,7 +1009,10 @@ ExecHashGetHashValue(HashJoinTable hashtable,
  * functions are good about randomizing all their output bits, else we are
  * likely to have very skewed bucket or batch occupancy.)
  *
- * nbuckets doesn't change over the course of the join.
+ * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic
+ * bucket count growth.  Once we start batching, the value is fixed and does
+ * not change over the course of the join (making it possible to compute batch
+ * number the way we do here).
  *
  * nbatch is always a power of 2; we increase it only by doubling it.  This
  * effectively adds one more bit to the top of the batchno.