freespace: Don't return blocks past the end of the main fork.

author Noah Misch <[email protected]>

Sat, 13 Apr 2024 15:34:20 +0000 (08:34 -0700)

committer Noah Misch <[email protected]>

Sat, 13 Apr 2024 15:34:20 +0000 (08:34 -0700)
author Noah Misch <[email protected]>
Sat, 13 Apr 2024 15:34:20 +0000 (08:34 -0700)
committer Noah Misch <[email protected]>
Sat, 13 Apr 2024 15:34:20 +0000 (08:34 -0700)
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README

index e7ff23b76f79463388bd94afbbb792cf51b1c0dc..dc2a63a137bdcc7859fb74c0887097b9964bcad4 100644 (file)
--- a/src/backend/storage/freespace/README
+++ b/src/backend/storage/freespace/README
@@ -169,9 +169,7 @@ Recovery
  --------
  
  The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
-self-correcting measures to repair possible corruption. As a result when
-we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint()
-rather than MarkBufferDirty().
+self-correcting measures to repair possible corruption.
  
  First of all, whenever a value is set on an FSM page, the root node of the
  page is compared against the new value after bubbling up the change is
@@ -188,6 +186,18 @@ goes through fsm_set_avail(), so that the upper nodes on those pages are
  immediately updated.  Periodically, VACUUM calls FreeSpaceMapVacuum[Range]
  to propagate the new free-space info into the upper pages of the FSM tree.
  
+As a result when we write to the FSM we treat that as a hint and thus use
+MarkBufferDirtyHint() rather than MarkBufferDirty().  Every read here uses
+RBM_ZERO_ON_ERROR to bypass checksum mismatches and other verification
+failures.  We'd operate correctly without the full page images that
+MarkBufferDirtyHint() provides, but they do decrease the chance of losing slot
+knowledge to RBM_ZERO_ON_ERROR.
+
+Relation extension is not WAL-logged.  Hence, after WAL replay, an on-disk FSM
+slot may indicate free space in PageIsNew() blocks that never reached disk.
+We detect this case by comparing against the actual relation size, and we mark
+the block as full in that case.
+
  TODO
  ----
  
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c

index bcdb182193870ce71f6dee945cc5818dc1a4edf3..b1262ac3b69ddf6127684bf6c39874764335afb5 100644 (file)
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -112,6 +112,7 @@ static BlockNumber fsm_search(Relation rel, uint8 min_cat);
  static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
                              BlockNumber start, BlockNumber end,
                              bool *eof_p);
+static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
  
  
  /******** Public API ********/
@@ -128,6 +129,9 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
   * amount of free space available on that page and then try again (see
   * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
   * extend the relation.
+ *
+ * This can trigger FSM updates if any FSM entry is found to point to a block
+ * past the end of the relation.
   */
  BlockNumber
  GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
@@ -166,9 +170,17 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
      * Otherwise, search as usual.
      */
     if (search_slot != -1)
-       return fsm_get_heap_blk(addr, search_slot);
-   else
-       return fsm_search(rel, search_cat);
+   {
+       BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
+
+       /*
+        * Check that the blknum is actually in the relation. Don't try to
+        * update the FSM in that case, just fall back to the other case
+        */
+       if (fsm_does_block_exist(rel, blknum))
+           return blknum;
+   }
+   return fsm_search(rel, search_cat);
  }
  
  /*
@@ -297,14 +309,25 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
         fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
  
         /*
-        * Truncation of a relation is WAL-logged at a higher-level, and we
-        * will be called at WAL replay. But if checksums are enabled, we need
-        * to still write a WAL record to protect against a torn page, if the
-        * page is flushed to disk before the truncation WAL record. We cannot
-        * use MarkBufferDirtyHint here, because that will not dirty the page
-        * during recovery.
+        * This change is non-critical, because fsm_does_block_exist() would
+        * stop us from returning a truncated-away block.  However, since this
+        * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
+        * of that many fsm_does_block_exist() rejections.  Use a full
+        * MarkBufferDirty(), not MarkBufferDirtyHint().
          */
         MarkBufferDirty(buf);
+
+       /*
+        * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
+        * differing from the rest of the file in this respect.  This is
+        * optional; see README mention of full page images.  XXX consider
+        * XLogSaveBufferForHint() for even closer similarity.
+        *
+        * A higher-level operation calls us at WAL replay.  If we crash
+        * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
+        * not changed, and our fork remains valid.  If we crash after that
+        * flush, redo will return here.
+        */
         if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
             log_newpage_buffer(buf, false);
  
@@ -674,8 +697,15 @@ fsm_search(Relation rel, uint8 min_cat)
                                     (addr.level == FSM_BOTTOM_LEVEL),
                                     false);
             if (slot == -1)
+           {
                 max_avail = fsm_get_max_avail(BufferGetPage(buf));
-           UnlockReleaseBuffer(buf);
+               UnlockReleaseBuffer(buf);
+           }
+           else
+           {
+               /* Keep the pin for possible update below */
+               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+           }
         }
         else
             slot = -1;
@@ -687,8 +717,37 @@ fsm_search(Relation rel, uint8 min_cat)
              * bottom.
              */
             if (addr.level == FSM_BOTTOM_LEVEL)
-               return fsm_get_heap_blk(addr, slot);
-
+           {
+               BlockNumber blkno = fsm_get_heap_blk(addr, slot);
+               Page        page;
+
+               if (fsm_does_block_exist(rel, blkno))
+               {
+                   ReleaseBuffer(buf);
+                   return blkno;
+               }
+
+               /*
+                * Block is past the end of the relation.  Update FSM, and
+                * restart from root.  The usual "advancenext" behavior is
+                * pessimal for this rare scenario, since every later slot is
+                * unusable in the same way.  We could zero all affected slots
+                * on the same FSM page, but don't bet on the benefits of that
+                * optimization justifying its compiled code bulk.
+                */
+               page = BufferGetPage(buf);
+               LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+               fsm_set_avail(page, slot, 0);
+               MarkBufferDirtyHint(buf, false);
+               UnlockReleaseBuffer(buf);
+               if (restarts++ > 10000) /* same rationale as below */
+                   return InvalidBlockNumber;
+               addr = FSM_ROOT_ADDRESS;
+           }
+           else
+           {
+               ReleaseBuffer(buf);
+           }
             addr = fsm_get_child(addr, slot);
         }
         else if (addr.level == FSM_ROOT_LEVEL)
@@ -856,3 +915,26 @@ fsm_vacuum_page(Relation rel, FSMAddress addr,
  
     return max_avail;
  }
+
+
+/*
+ * Check whether a block number is past the end of the relation.  This can
+ * happen after WAL replay, if the FSM reached disk but newly-extended pages
+ * it refers to did not.
+ */
+static bool
+fsm_does_block_exist(Relation rel, BlockNumber blknumber)
+{
+   SMgrRelation smgr = RelationGetSmgr(rel);
+
+   /*
+    * If below the cached nblocks, the block surely exists.  Otherwise, we
+    * face a trade-off.  We opt to compare to a fresh nblocks, incurring
+    * lseek() overhead.  The alternative would be to assume the block does
+    * not exist, but that would cause FSM to set zero space available for
+    * blocks that main fork extension just recorded.
+    */
+   return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
+            blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+           blknumber < RelationGetNumberOfBlocks(rel));
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c

index 62226d5dca71d82989fca7bfd4d14ecbe89535b3..100f6454e535e300bfd37a9ad5424616aa18f3bb 100644 (file)
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -679,8 +679,9 @@ BlockNumber
  smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
  {
     /*
-    * For now, we only use cached values in recovery due to lack of a shared
-    * invalidation mechanism for changes in file size.
+    * For now, this function uses cached values only in recovery due to lack
+    * of a shared invalidation mechanism for changes in file size.  Code
+    * elsewhere reads smgr_cached_nblocks and copes with stale data.
      */
     if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
         return reln->smgr_cached_nblocks[forknum];
diff --git a/src/test/recovery/t/008_fsm_truncation.pl b/src/test/recovery/t/008_fsm_truncation.pl

index fcf35fea0386d692f8aa8080a53835b0ae2051cf..4412fe28962992da7c375816bde225172c41de86 100644 (file)
--- a/src/test/recovery/t/008_fsm_truncation.pl
+++ b/src/test/recovery/t/008_fsm_truncation.pl
@@ -1,9 +1,8 @@
  
  # Copyright (c) 2021-2024, PostgreSQL Global Development Group
  
-# Test WAL replay of FSM changes.
-#
-# FSM changes don't normally need to be WAL-logged, except for truncation.
+# Test FSM-driven INSERT just after truncation clears FSM slots indicating
+# free space in removed blocks.
  # The FSM mustn't return a page that doesn't exist (anymore).
  use strict;
  use warnings FATAL => 'all';
author	Noah Misch <[email protected]>
	Sat, 13 Apr 2024 15:34:20 +0000 (08:34 -0700)
committer	Noah Misch <[email protected]>
	Sat, 13 Apr 2024 15:34:20 +0000 (08:34 -0700)
src/backend/storage/freespace/README		patch \| blob \| blame \| history
src/backend/storage/freespace/freespace.c		patch \| blob \| blame \| history
src/backend/storage/smgr/smgr.c		patch \| blob \| blame \| history
src/test/recovery/t/008_fsm_truncation.pl		patch \| blob \| blame \| history