1515 * to hash_create. This prevents any attempt to split buckets on-the-fly.
1616 * Therefore, each hash bucket chain operates independently, and no fields
1717 * of the hash header change after init except nentries and freeList.
18- * A partitioned table uses a spinlock to guard changes of those two fields.
18+ * A partitioned table uses spinlocks to guard changes of those fields.
1919 * This lets any subset of the hash buckets be treated as a separately
2020 * lockable partition. We expect callers to use the low-order bits of a
2121 * lookup key's hash value as a partition number --- this will work because
111111#define DEF_DIRSIZE 256
112112#define DEF_FFACTOR 1 /* default fill factor */
113113
114+ /* Number of freelists to be used for a partitioned hash table. */
115+ #define NUM_FREELISTS 32
114116
115117/* A hash bucket is a linked list of HASHELEMENTs */
116118typedef HASHELEMENT * HASHBUCKET ;
117119
118120/* A hash segment is an array of bucket headers */
119121typedef HASHBUCKET * HASHSEGMENT ;
120122
123+ /*
124+  * Using an array of FreeListData structs, instead of separate arrays of
125+  * mutexes, nentries and freeLists, helps to prevent (at least partially)
126+  * sharing one cache line between different mutexes (see below).
127+ */
128+ typedef struct
129+ {
130+ slock_t mutex ; /* spinlock */
131+ long nentries ; /* number of entries */
132+ HASHELEMENT * freeList ; /* list of free elements */
133+ } FreeListData ;
134+
121135/*
122136 * Header structure for a hash table --- contains all changeable info
123137 *
@@ -128,12 +142,15 @@ typedef HASHBUCKET *HASHSEGMENT;
128142 */
129143struct HASHHDR
130144{
131- /* In a partitioned table, take this lock to touch nentries or freeList */
132- slock_t mutex ; /* unused if not partitioned table */
133-
134- /* These fields change during entry addition/deletion */
135- long nentries ; /* number of entries in hash table */
136- HASHELEMENT * freeList ; /* linked list of free elements */
145+ /*
146+ * The freelist can become a point of contention on high-concurrency hash
147+  * tables, so we use an array of freelists, each with its own mutex and
148+ * nentries count, instead of just a single one.
149+ *
150+  * If the hash table is not partitioned, only freeList[0] is used and the
151+  * spinlocks are not used at all.
152+ */
153+ FreeListData freeList [NUM_FREELISTS ];
137154
138155 /* These fields can change, but not in a partitioned table */
139156 /* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +183,9 @@ struct HASHHDR
166183
167184#define IS_PARTITIONED (hctl ) ((hctl)->num_partitions != 0)
168185
186+ #define FREELIST_IDX (hctl , hashcode ) \
187+ (IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
188+
169189/*
170190 * Top control structure for a hashtable --- in a shared table, each backend
171191 * has its own copy (OK since no fields change at runtime)
@@ -219,10 +239,10 @@ static long hash_accesses,
219239 */
220240static void * DynaHashAlloc (Size size );
221241static HASHSEGMENT seg_alloc (HTAB * hashp );
222- static bool element_alloc (HTAB * hashp , int nelem );
242+ static bool element_alloc (HTAB * hashp , int nelem , int freelist_idx );
223243static bool dir_realloc (HTAB * hashp );
224244static bool expand_table (HTAB * hashp );
225- static HASHBUCKET get_hash_entry (HTAB * hashp );
245+ static HASHBUCKET get_hash_entry (HTAB * hashp , int freelist_idx );
226246static void hdefault (HTAB * hashp );
227247static int choose_nelem_alloc (Size entrysize );
228248static bool init_htab (HTAB * hashp , long nelem );
@@ -482,10 +502,40 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
482502 if ((flags & HASH_SHARED_MEM ) ||
483503 nelem < hctl -> nelem_alloc )
484504 {
485- if (!element_alloc (hashp , (int ) nelem ))
486- ereport (ERROR ,
487- (errcode (ERRCODE_OUT_OF_MEMORY ),
488- errmsg ("out of memory" )));
505+ int i ,
506+ freelist_partitions ,
507+ nelem_alloc ,
508+ nelem_alloc_first ;
509+
510+ /*
511+  * If the hash table is partitioned, all freeLists have an equal number
512+  * of elements. Otherwise only freeList[0] is used.
513+ */
514+ if (IS_PARTITIONED (hashp -> hctl ))
515+ freelist_partitions = NUM_FREELISTS ;
516+ else
517+ freelist_partitions = 1 ;
518+
519+ nelem_alloc = nelem / freelist_partitions ;
520+ if (nelem_alloc == 0 )
521+ nelem_alloc = 1 ;
522+
523+ /* Make sure all memory will be used */
524+ if (nelem_alloc * freelist_partitions < nelem )
525+ nelem_alloc_first =
526+ nelem - nelem_alloc * (freelist_partitions - 1 );
527+ else
528+ nelem_alloc_first = nelem_alloc ;
529+
530+ for (i = 0 ; i < freelist_partitions ; i ++ )
531+ {
532+ int temp = (i == 0 ) ? nelem_alloc_first : nelem_alloc ;
533+
534+ if (!element_alloc (hashp , temp , i ))
535+ ereport (ERROR ,
536+ (errcode (ERRCODE_OUT_OF_MEMORY ),
537+ errmsg ("out of memory" )));
538+ }
489539 }
490540
491541 if (flags & HASH_FIXED_SIZE )
@@ -503,9 +553,6 @@ hdefault(HTAB *hashp)
503553
504554 MemSet (hctl , 0 , sizeof (HASHHDR ));
505555
506- hctl -> nentries = 0 ;
507- hctl -> freeList = NULL ;
508-
509556 hctl -> dsize = DEF_DIRSIZE ;
510557 hctl -> nsegs = 0 ;
511558
@@ -572,12 +619,14 @@ init_htab(HTAB *hashp, long nelem)
572619 HASHSEGMENT * segp ;
573620 int nbuckets ;
574621 int nsegs ;
622+ int i ;
575623
576624 /*
577625 * initialize mutex if it's a partitioned table
578626 */
579627 if (IS_PARTITIONED (hctl ))
580- SpinLockInit (& hctl -> mutex );
628+ for (i = 0 ; i < NUM_FREELISTS ; i ++ )
629+ SpinLockInit (& (hctl -> freeList [i ].mutex ));
581630
582631 /*
583632 * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +697,7 @@ init_htab(HTAB *hashp, long nelem)
648697 "HIGH MASK " , hctl -> high_mask ,
649698 "LOW MASK " , hctl -> low_mask ,
650699 "NSEGS " , hctl -> nsegs ,
651- "NENTRIES " , hctl -> nentries );
700+ "NENTRIES " , hash_get_num_entries ( hctl ) );
652701#endif
653702 return true;
654703}
@@ -769,7 +818,7 @@ hash_stats(const char *where, HTAB *hashp)
769818 where , hashp -> hctl -> accesses , hashp -> hctl -> collisions );
770819
771820 fprintf (stderr , "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n" ,
772- hashp -> hctl -> nentries , (long ) hashp -> hctl -> keysize ,
821+ hash_get_num_entries ( hashp ) , (long ) hashp -> hctl -> keysize ,
773822 hashp -> hctl -> max_bucket , hashp -> hctl -> nsegs );
774823 fprintf (stderr , "%s: total accesses %ld total collisions %ld\n" ,
775824 where , hash_accesses , hash_collisions );
@@ -863,6 +912,7 @@ hash_search_with_hash_value(HTAB *hashp,
863912 HASHBUCKET currBucket ;
864913 HASHBUCKET * prevBucketPtr ;
865914 HashCompareFunc match ;
915+ int freelist_idx = FREELIST_IDX (hctl , hashvalue );
866916
867917#if HASH_STATISTICS
868918 hash_accesses ++ ;
@@ -885,7 +935,7 @@ hash_search_with_hash_value(HTAB *hashp,
885935 * order of these tests is to try to check cheaper conditions first.
886936 */
887937 if (!IS_PARTITIONED (hctl ) && !hashp -> frozen &&
888- hctl -> nentries / (long ) (hctl -> max_bucket + 1 ) >= hctl -> ffactor &&
938+ hctl -> freeList [ 0 ]. nentries / (long ) (hctl -> max_bucket + 1 ) >= hctl -> ffactor &&
889939 !has_seq_scans (hashp ))
890940 (void ) expand_table (hashp );
891941 }
@@ -943,20 +993,20 @@ hash_search_with_hash_value(HTAB *hashp,
943993 {
944994 /* if partitioned, must lock to touch nentries and freeList */
945995 if (IS_PARTITIONED (hctl ))
946- SpinLockAcquire (& hctl -> mutex );
996+ SpinLockAcquire (& ( hctl -> freeList [ freelist_idx ]. mutex ) );
947997
948- Assert (hctl -> nentries > 0 );
949- hctl -> nentries -- ;
998+ Assert (hctl -> freeList [ freelist_idx ]. nentries > 0 );
999+ hctl -> freeList [ freelist_idx ]. nentries -- ;
9501000
9511001 /* remove record from hash bucket's chain. */
9521002 * prevBucketPtr = currBucket -> link ;
9531003
9541004 /* add the record to the freelist for this table. */
955- currBucket -> link = hctl -> freeList ;
956- hctl -> freeList = currBucket ;
1005+ currBucket -> link = hctl -> freeList [ freelist_idx ]. freeList ;
1006+ hctl -> freeList [ freelist_idx ]. freeList = currBucket ;
9571007
9581008 if (IS_PARTITIONED (hctl ))
959- SpinLockRelease (& hctl -> mutex );
1009+ SpinLockRelease (& hctl -> freeList [ freelist_idx ]. mutex );
9601010
9611011 /*
9621012 * better hope the caller is synchronizing access to this
@@ -982,7 +1032,7 @@ hash_search_with_hash_value(HTAB *hashp,
9821032 elog (ERROR , "cannot insert into frozen hashtable \"%s\"" ,
9831033 hashp -> tabname );
9841034
985- currBucket = get_hash_entry (hashp );
1035+ currBucket = get_hash_entry (hashp , freelist_idx );
9861036 if (currBucket == NULL )
9871037 {
9881038 /* out of memory */
@@ -1175,39 +1225,69 @@ hash_update_hash_key(HTAB *hashp,
11751225 * create a new entry if possible
11761226 */
11771227static HASHBUCKET
1178- get_hash_entry (HTAB * hashp )
1228+ get_hash_entry (HTAB * hashp , int freelist_idx )
11791229{
1180- HASHHDR * hctl = hashp -> hctl ;
1230+ HASHHDR * hctl = hashp -> hctl ;
11811231 HASHBUCKET newElement ;
1232+ int borrow_from_idx ;
11821233
11831234 for (;;)
11841235 {
11851236 /* if partitioned, must lock to touch nentries and freeList */
11861237 if (IS_PARTITIONED (hctl ))
1187- SpinLockAcquire (& hctl -> mutex );
1238+ SpinLockAcquire (& hctl -> freeList [ freelist_idx ]. mutex );
11881239
11891240 /* try to get an entry from the freelist */
1190- newElement = hctl -> freeList ;
1241+ newElement = hctl -> freeList [freelist_idx ].freeList ;
1242+
11911243 if (newElement != NULL )
11921244 break ;
11931245
1194- /* no free elements. allocate another chunk of buckets */
11951246 if (IS_PARTITIONED (hctl ))
1196- SpinLockRelease (& hctl -> mutex );
1247+ SpinLockRelease (& hctl -> freeList [ freelist_idx ]. mutex );
11971248
1198- if (!element_alloc (hashp , hctl -> nelem_alloc ))
1249+ /* no free elements. allocate another chunk of buckets */
1250+ if (!element_alloc (hashp , hctl -> nelem_alloc , freelist_idx ))
11991251 {
1200- /* out of memory */
1201- return NULL ;
1252+ if (!IS_PARTITIONED (hctl ))
1253+ return NULL ; /* out of memory */
1254+
1255+ /* try to borrow element from another partition */
1256+ borrow_from_idx = freelist_idx ;
1257+ for (;;)
1258+ {
1259+ borrow_from_idx = (borrow_from_idx + 1 ) % NUM_FREELISTS ;
1260+ if (borrow_from_idx == freelist_idx )
1261+ break ;
1262+
1263+ SpinLockAcquire (& (hctl -> freeList [borrow_from_idx ].mutex ));
1264+ newElement = hctl -> freeList [borrow_from_idx ].freeList ;
1265+
1266+ if (newElement != NULL )
1267+ {
1268+ hctl -> freeList [borrow_from_idx ].freeList = newElement -> link ;
1269+ SpinLockRelease (& (hctl -> freeList [borrow_from_idx ].mutex ));
1270+
1271+ SpinLockAcquire (& hctl -> freeList [freelist_idx ].mutex );
1272+ hctl -> freeList [freelist_idx ].nentries ++ ;
1273+ SpinLockRelease (& hctl -> freeList [freelist_idx ].mutex );
1274+
1275+ break ;
1276+ }
1277+
1278+ SpinLockRelease (& (hctl -> freeList [borrow_from_idx ].mutex ));
1279+ }
1280+
1281+ return newElement ;
12021282 }
12031283 }
12041284
12051285 /* remove entry from freelist, bump nentries */
1206- hctl -> freeList = newElement -> link ;
1207- hctl -> nentries ++ ;
1286+ hctl -> freeList [ freelist_idx ]. freeList = newElement -> link ;
1287+ hctl -> freeList [ freelist_idx ]. nentries ++ ;
12081288
12091289 if (IS_PARTITIONED (hctl ))
1210- SpinLockRelease (& hctl -> mutex );
1290+ SpinLockRelease (& hctl -> freeList [ freelist_idx ]. mutex );
12111291
12121292 return newElement ;
12131293}
@@ -1218,11 +1298,21 @@ get_hash_entry(HTAB *hashp)
12181298long
12191299hash_get_num_entries (HTAB * hashp )
12201300{
1301+ int i ;
1302+ long sum = hashp -> hctl -> freeList [0 ].nentries ;
1303+
12211304 /*
12221305 * We currently don't bother with the mutex; it's only sensible to call
12231306 * this function if you've got lock on all partitions of the table.
12241307 */
1225- return hashp -> hctl -> nentries ;
1308+
1309+ if (!IS_PARTITIONED (hashp -> hctl ))
1310+ return sum ;
1311+
1312+ for (i = 1 ; i < NUM_FREELISTS ; i ++ )
1313+ sum += hashp -> hctl -> freeList [i ].nentries ;
1314+
1315+ return sum ;
12261316}
12271317
12281318/*
@@ -1527,12 +1617,12 @@ seg_alloc(HTAB *hashp)
15271617}
15281618
15291619/*
1530- * allocate some new elements and link them into the free list
1620+ * allocate some new elements and link them into the indicated free list
15311621 */
15321622static bool
1533- element_alloc (HTAB * hashp , int nelem )
1623+ element_alloc (HTAB * hashp , int nelem , int freelist_idx )
15341624{
1535- HASHHDR * hctl = hashp -> hctl ;
1625+ HASHHDR * hctl = hashp -> hctl ;
15361626 Size elementSize ;
15371627 HASHELEMENT * firstElement ;
15381628 HASHELEMENT * tmpElement ;
@@ -1563,14 +1653,14 @@ element_alloc(HTAB *hashp, int nelem)
15631653
15641654 /* if partitioned, must lock to touch freeList */
15651655 if (IS_PARTITIONED (hctl ))
1566- SpinLockAcquire (& hctl -> mutex );
1656+ SpinLockAcquire (& hctl -> freeList [ freelist_idx ]. mutex );
15671657
15681658 /* freelist could be nonempty if two backends did this concurrently */
1569- firstElement -> link = hctl -> freeList ;
1570- hctl -> freeList = prevElement ;
1659+ firstElement -> link = hctl -> freeList [ freelist_idx ]. freeList ;
1660+ hctl -> freeList [ freelist_idx ]. freeList = prevElement ;
15711661
15721662 if (IS_PARTITIONED (hctl ))
1573- SpinLockRelease (& hctl -> mutex );
1663+ SpinLockRelease (& hctl -> freeList [ freelist_idx ]. mutex );
15741664
15751665 return true;
15761666}
0 commit comments