Fix dynahash.c to suppress hash bucket splits while a hash_seq_search() scan is in progress on the same hashtable.

author Tom Lane <[email protected]>

Thu, 26 Apr 2007 23:25:48 +0000 (23:25 +0000)

committer Tom Lane <[email protected]>

Thu, 26 Apr 2007 23:25:48 +0000 (23:25 +0000)
author Tom Lane <[email protected]>
Thu, 26 Apr 2007 23:25:48 +0000 (23:25 +0000)
committer Tom Lane <[email protected]>
Thu, 26 Apr 2007 23:25:48 +0000 (23:25 +0000)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c

index e5c112526fcc1fdff2f61dd1fb210561c9861b9c..a7ced751ab07df0878d88c4cbdc63c7c6eba8ba7 100644 (file)
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1053,6 +1053,7 @@ CommitTransaction(void)
         AtEOXact_Namespace(true);
         AtEOXact_CatCache(true);
         AtEOXact_Files();
+       AtEOXact_HashTables(true);
         pgstat_count_xact_commit();
         AtCommit_Memory();
  
@@ -1168,6 +1169,7 @@ AbortTransaction(void)
         AtEOXact_Namespace(false);
         AtEOXact_CatCache(false);
         AtEOXact_Files();
+       AtEOXact_HashTables(false);
         pgstat_count_xact_rollback();
  
         /*
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c

index 18eba0662ee3de1b5477dc78cfa613c001fffa86..996606fb3866919e93a01e42b60378f30145fce6 100644 (file)
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -71,6 +71,9 @@ static bool expand_table(HTAB *hashp);
  static bool hdefault(HTAB *hashp);
  static bool init_htab(HTAB *hashp, long nelem);
  static void hash_corrupted(HTAB *hashp);
+static void register_seq_scan(HTAB *hashp);
+static void deregister_seq_scan(HTAB *hashp);
+static bool has_seq_scans(HTAB *hashp);
  
  
  /*
@@ -166,6 +169,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
                         return NULL;
         }
  
+       hashp->frozen = false;
+
         if (!hdefault(hashp))
                 return NULL;
  
@@ -623,6 +628,10 @@ hash_search(HTAB *hashp,
                         if (currBucket != NULL)
                                 return (void *) ELEMENTKEY(currBucket);
  
+                       /* disallow inserts if frozen */
+                       if (hashp->frozen)
+                               elog(ERROR, "cannot insert into a frozen hashtable");
+
                         /* get the next free element */
                         currBucket = hctl->freeList;
                         if (currBucket == NULL)
@@ -645,8 +654,12 @@ hash_search(HTAB *hashp,
  
                         /* caller is expected to fill the data field on return */
  
-                       /* Check if it is time to split the segment */
-                       if (++hctl->nentries / (long) (hctl->max_bucket + 1) > hctl->ffactor)
+                       /*
+                        * Check if it is time to split a bucket.  Can't split if table
+                        * is the subject of any active hash_seq_search scans.
+                        */
+                       if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+                               !has_seq_scans(hashp))
                         {
                                 /*
                                  * NOTE: failure to expand table is not a fatal error, it
@@ -665,15 +678,25 @@ hash_search(HTAB *hashp,
  }
  
  /*
- * hash_seq_init/_search
+ * hash_seq_init/_search/_term
   *                     Sequentially search through hash table and return
   *                     all the elements one by one, return NULL when no more.
   *
+ * hash_seq_term should be called if and only if the scan is abandoned before
+ * completion; if hash_seq_search returns NULL then it has already done the
+ * end-of-scan cleanup.
+ *
   * NOTE: caller may delete the returned element before continuing the scan.
   * However, deleting any other element while the scan is in progress is
   * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
   * if elements are added to the table while the scan is in progress, it is
   * unspecified whether they will be visited by the scan or not.
+ *
+ * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
+ * worry about hash_seq_term cleanup, if the hashtable is first locked against
+ * further insertions by calling hash_freeze.  This is used by nodeAgg.c,
+ * wherein it is inconvenient to track whether a scan is still open, and
+ * there's no possibility of further insertions after readout has begun.
   */
  void
  hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
@@ -681,6 +704,8 @@ hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
         status->hashp = hashp;
         status->curBucket = 0;
         status->curEntry = NULL;
+       if (!hashp->frozen)
+               register_seq_scan(hashp);
  }
  
  void *
@@ -734,9 +759,40 @@ hash_seq_search(HASH_SEQ_STATUS *status)
                         ++status->curBucket;
         }
  
+       hash_seq_term(status);
         return NULL;                            /* out of buckets */
  }
  
+void
+hash_seq_term(HASH_SEQ_STATUS *status)
+{
+       if (!status->hashp->frozen)
+               deregister_seq_scan(status->hashp);
+}
+
+/*
+ * hash_freeze
+ *                     Freeze a hashtable against future insertions (deletions are
+ *                     still allowed)
+ *
+ * The reason for doing this is that by preventing any more bucket splits,
+ * we no longer need to worry about registering hash_seq_search scans,
+ * and thus caller need not be careful about ensuring hash_seq_term gets
+ * called at the right times.
+ *
+ * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
+ * with active scans (since hash_seq_term would then do the wrong thing).
+ */
+void
+hash_freeze(HTAB *hashp)
+{
+       if (hashp->isshared)
+               elog(ERROR, "cannot freeze shared hashtable");
+       if (!hashp->frozen && has_seq_scans(hashp))
+               elog(ERROR, "cannot freeze hashtable with active scans");
+       hashp->frozen = true;
+}
+
  
  /********************************* UTILITIES ************************/
  
@@ -948,3 +1004,108 @@ my_log2(long num)
                 ;
         return i;
  }
+
+
+/************************* SEQ SCAN TRACKING ************************/
+
+/*
+ * We track active hash_seq_search scans here.  The need for this mechanism
+ * comes from the fact that a scan will get confused if a bucket split occurs
+ * while it's in progress: it might visit entries twice, or even miss some
+ * entirely (if it's partway through the same bucket that splits).  Hence
+ * we want to inhibit bucket splits if there are any active scans on the
+ * table being inserted into.  This is a fairly rare case in current usage,
+ * so just postponing the split until the next insertion seems sufficient.
+ *
+ * Given present usages of the function, only a few scans are likely to be
+ * open concurrently; so a finite-size stack of open scans seems sufficient,
+ * and we don't worry that linear search is too slow.  Note that we do
+ * allow multiple scans of the same hashtable to be open concurrently.
+ *
+ * This mechanism can support concurrent scan and insertion in a shared
+ * hashtable if it's the same backend doing both.  It would fail otherwise,
+ * but locking reasons seem to preclude any such scenario anyway, so we don't
+ * worry.
+ *
+ * This arrangement is reasonably robust if a transient hashtable is deleted
+ * without notifying us.  The absolute worst case is we might inhibit splits
+ * in another table created later at exactly the same address.  We will give
+ * a warning at transaction end for reference leaks, so any bugs leading to
+ * lack of notification should be easy to catch.
+ */
+
+#define MAX_SEQ_SCANS 100
+
+static HTAB *seq_scan_tables[MAX_SEQ_SCANS];   /* tables being scanned */
+static int     num_seq_scans = 0;
+
+
+/* Register a table as having an active hash_seq_search scan */
+static void
+register_seq_scan(HTAB *hashp)
+{
+       if (num_seq_scans >= MAX_SEQ_SCANS)
+               elog(ERROR, "too many active hash_seq_search scans");
+       seq_scan_tables[num_seq_scans] = hashp;
+       num_seq_scans++;
+}
+
+/* Deregister an active scan */
+static void
+deregister_seq_scan(HTAB *hashp)
+{
+       int             i;
+
+       /* Search backward since it's most likely at the stack top */
+       for (i = num_seq_scans - 1; i >= 0; i--)
+       {
+               if (seq_scan_tables[i] == hashp)
+               {
+                       seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+                       num_seq_scans--;
+                       return;
+               }
+       }
+       elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
+                hashp->tabname);
+}
+
+/* Check if a table has any active scan */
+static bool
+has_seq_scans(HTAB *hashp)
+{
+       int             i;
+
+       for (i = 0; i < num_seq_scans; i++)
+       {
+               if (seq_scan_tables[i] == hashp)
+                       return true;
+       }
+       return false;
+}
+
+/* Clean up any open scans at end of transaction */
+void
+AtEOXact_HashTables(bool isCommit)
+{
+       /*
+        * During abort cleanup, open scans are expected; just silently clean 'em
+        * out.  An open scan at commit means someone forgot a hash_seq_term()
+        * call, so complain.
+        *
+        * Note: it's tempting to try to print the tabname here, but refrain for
+        * fear of touching deallocated memory.  This isn't a user-facing message
+        * anyway, so it needn't be pretty.
+        */
+       if (isCommit)
+       {
+               int             i;
+
+               for (i = 0; i < num_seq_scans; i++)
+               {
+                       elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+                                seq_scan_tables[i]);
+               }
+       }
+       num_seq_scans = 0;
+}
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h

index 782f3ae4cca13d8a192add4ad394060b60b7bb5e..52268922135d3dc41a986e3e95934a247388a0a0 100644 (file)
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -89,6 +89,8 @@ typedef struct HTAB
                                                                  * used */
         char       *tabname;            /* table name (for error messages) */
         bool            isshared;               /* true if table is in shared memory */
+       /* freezing a shared table isn't allowed, so we can keep state here */
+       bool            frozen;                 /* true = no more inserts allowed */
  } HTAB;
  
  /* Parameter data structure for hash_create */
@@ -155,8 +157,11 @@ extern void *hash_search(HTAB *hashp, void *keyPtr, HASHACTION action,
                         bool *foundPtr);
  extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
  extern void *hash_seq_search(HASH_SEQ_STATUS *status);
+extern void hash_seq_term(HASH_SEQ_STATUS *status);
+extern void hash_freeze(HTAB *hashp);
  extern long hash_estimate_size(long num_entries, long entrysize);
  extern long hash_select_dirsize(long num_entries);
+extern void AtEOXact_HashTables(bool isCommit);
  
  /*
   * prototypes for functions in hashfn.c
author	Tom Lane <[email protected]>
	Thu, 26 Apr 2007 23:25:48 +0000 (23:25 +0000)
committer	Tom Lane <[email protected]>
	Thu, 26 Apr 2007 23:25:48 +0000 (23:25 +0000)
src/backend/access/transam/xact.c		patch \| blob \| blame \| history
src/backend/utils/hash/dynahash.c		patch \| blob \| blame \| history
src/include/utils/hsearch.h		patch \| blob \| blame \| history