Fix MVCC bug with prepared xact with subxacts on standby
author Heikki Linnakangas <[email protected]>
Thu, 27 Jun 2024 18:06:32 +0000 (21:06 +0300)
committer Heikki Linnakangas <[email protected]>
Thu, 27 Jun 2024 18:09:58 +0000 (21:09 +0300)
We did not recover the subtransaction IDs of prepared transactions
when starting a hot standby from a shutdown checkpoint. As a result,
such subtransactions were considered as aborted, rather than
in-progress. That would lead to hint bits being set incorrectly, and
the subtransactions suddenly becoming visible to old snapshots when
the prepared transaction was committed.

To fix, update pg_subtrans with prepared transactions' subxids when
starting hot standby from a shutdown checkpoint. The snapshots taken
from that state need to be marked as "suboverflowed", so that we also
check the pg_subtrans.

Backport to all supported versions.

Discussion: https://www.postgresql.org/message-id/6b852e98-2d49-4ca1-9e95-db419a2696e0@iki.fi

src/backend/access/transam/twophase.c
src/backend/access/transam/xlog.c
src/backend/storage/ipc/procarray.c
src/backend/storage/ipc/standby.c
src/include/storage/standby.h
src/test/recovery/t/009_twophase.pl
src/tools/pgindent/typedefs.list

index bf451d42ffb5dd76f23f53e1561def475199f3f4..9a8257fcafbb1a8dab57614ca1cdc79723bda5f5 100644 (file)
@@ -2035,9 +2035,8 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
  * This is never called at the end of recovery - we use
  * RecoverPreparedTransactions() at that point.
  *
- * The lack of calls to SubTransSetParent() calls here is by design;
- * those calls are made by RecoverPreparedTransactions() at the end of recovery
- * for those xacts that need this.
+ * This updates pg_subtrans, so that any subtransactions will be correctly
+ * seen as in-progress in snapshots taken during recovery.
  */
 void
 StandbyRecoverPreparedTransactions(void)
@@ -2057,7 +2056,7 @@ StandbyRecoverPreparedTransactions(void)
 
        buf = ProcessTwoPhaseBuffer(xid,
                                    gxact->prepare_start_lsn,
-                                   gxact->ondisk, false, false);
+                                   gxact->ondisk, true, false);
        if (buf != NULL)
            pfree(buf);
    }
index 8dcdf5a764647287f96b045e3308ed9232368663..a69337f2d4be114f9088a69da0053270dc9052df 100644 (file)
@@ -5777,6 +5777,9 @@ StartupXLOG(void)
                RunningTransactionsData running;
                TransactionId latestCompletedXid;
 
+               /* Update pg_subtrans entries for any prepared transactions */
+               StandbyRecoverPreparedTransactions();
+
                /*
                 * Construct a RunningTransactions snapshot representing a
                 * shut down server, with only prepared transactions still
@@ -5785,7 +5788,7 @@ StartupXLOG(void)
                 */
                running.xcnt = nxids;
                running.subxcnt = 0;
-               running.subxid_overflow = false;
+               running.subxid_status = SUBXIDS_IN_SUBTRANS;
                running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
                running.oldestRunningXid = oldestActiveXID;
                latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
@@ -5795,8 +5798,6 @@ StartupXLOG(void)
                running.xids = xids;
 
                ProcArrayApplyRecoveryInfo(&running);
-
-               StandbyRecoverPreparedTransactions();
            }
        }
 
@@ -8244,6 +8245,9 @@ xlog_redo(XLogReaderState *record)
 
            oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
 
+           /* Update pg_subtrans entries for any prepared transactions */
+           StandbyRecoverPreparedTransactions();
+
            /*
             * Construct a RunningTransactions snapshot representing a shut
             * down server, with only prepared transactions still alive. We're
@@ -8252,7 +8256,7 @@ xlog_redo(XLogReaderState *record)
             */
            running.xcnt = nxids;
            running.subxcnt = 0;
-           running.subxid_overflow = false;
+           running.subxid_status = SUBXIDS_IN_SUBTRANS;
            running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
            running.oldestRunningXid = oldestActiveXID;
            latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
@@ -8262,8 +8266,6 @@ xlog_redo(XLogReaderState *record)
            running.xids = xids;
 
            ProcArrayApplyRecoveryInfo(&running);
-
-           StandbyRecoverPreparedTransactions();
        }
 
        /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
index d5165aa0d9f4e05557665a47b68b4b177b1ec2bc..387b4a405b0bb7ad1b4278ee399a496d9f345e66 100644 (file)
@@ -1106,7 +1106,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
         * If the snapshot isn't overflowed or if its empty we can reset our
         * pending state and use this snapshot instead.
         */
-       if (!running->subxid_overflow || running->xcnt == 0)
+       if (running->subxid_status != SUBXIDS_MISSING || running->xcnt == 0)
        {
            /*
             * If we have already collected known assigned xids, we need to
@@ -1258,7 +1258,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
     * missing, so conservatively assume the last one is latestObservedXid.
     * ----------
     */
-   if (running->subxid_overflow)
+   if (running->subxid_status == SUBXIDS_MISSING)
    {
        standbyState = STANDBY_SNAPSHOT_PENDING;
 
@@ -1270,6 +1270,18 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
        standbyState = STANDBY_SNAPSHOT_READY;
 
        standbySnapshotPendingXmin = InvalidTransactionId;
+
+       /*
+        * If the 'xids' array didn't include all subtransactions, we have to
+        * mark any snapshots taken as overflowed.
+        */
+       if (running->subxid_status == SUBXIDS_IN_SUBTRANS)
+           procArray->lastOverflowedXid = latestObservedXid;
+       else
+       {
+           Assert(running->subxid_status == SUBXIDS_IN_ARRAY);
+           procArray->lastOverflowedXid = InvalidTransactionId;
+       }
    }
 
    /*
@@ -2833,7 +2845,7 @@ GetRunningTransactionData(void)
 
    CurrentRunningXacts->xcnt = count - subcount;
    CurrentRunningXacts->subxcnt = subcount;
-   CurrentRunningXacts->subxid_overflow = suboverflowed;
+   CurrentRunningXacts->subxid_status = suboverflowed ? SUBXIDS_IN_SUBTRANS : SUBXIDS_IN_ARRAY;
    CurrentRunningXacts->nextXid = XidFromFullTransactionId(TransamVariables->nextXid);
    CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
    CurrentRunningXacts->oldestDatabaseRunningXid = oldestDatabaseRunningXid;
index 87b04e51b3683823d6db7c4c9b32fb722d572f37..872679ca44788cff113840cc0ab62f6bfb691b2c 100644 (file)
@@ -1184,7 +1184,7 @@ standby_redo(XLogReaderState *record)
 
        running.xcnt = xlrec->xcnt;
        running.subxcnt = xlrec->subxcnt;
-       running.subxid_overflow = xlrec->subxid_overflow;
+       running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
        running.nextXid = xlrec->nextXid;
        running.latestCompletedXid = xlrec->latestCompletedXid;
        running.oldestRunningXid = xlrec->oldestRunningXid;
@@ -1349,7 +1349,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
 
    xlrec.xcnt = CurrRunningXacts->xcnt;
    xlrec.subxcnt = CurrRunningXacts->subxcnt;
-   xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
+   xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
    xlrec.nextXid = CurrRunningXacts->nextXid;
    xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
    xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
@@ -1366,7 +1366,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
 
    recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
 
-   if (CurrRunningXacts->subxid_overflow)
+   if (xlrec.subxid_overflow)
        elog(DEBUG2,
             "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
             CurrRunningXacts->xcnt,
index 0fc0804e2660e5e92b7d2dfd1c33e3c3d8c2e9a1..cce0bc521e7582e8fb9042a8d917c32f33caaaf4 100644 (file)
@@ -75,11 +75,19 @@ extern void StandbyReleaseOldLocks(TransactionId oldxid);
  * almost immediately see the data we need to begin executing queries.
  */
 
+typedef enum
+{
+   SUBXIDS_IN_ARRAY,           /* xids array includes all running subxids */
+   SUBXIDS_MISSING,            /* snapshot overflowed, subxids are missing */
+   SUBXIDS_IN_SUBTRANS,        /* subxids are not included in 'xids', but
+                                * pg_subtrans is fully up-to-date */
+} subxids_array_status;
+
 typedef struct RunningTransactionsData
 {
    int         xcnt;           /* # of xact ids in xids[] */
    int         subxcnt;        /* # of subxact ids in xids[] */
-   bool        subxid_overflow;    /* snapshot overflowed, subxids missing */
+   subxids_array_status subxid_status;
    TransactionId nextXid;      /* xid from TransamVariables->nextXid */
    TransactionId oldestRunningXid; /* *not* oldestXmin */
    TransactionId oldestDatabaseRunningXid; /* same as above, but within the
index 701f9cc20f8411dc49e456dcc8dc3075cf78eb24..21a65cd298b8721de1172f16d8f0e9632226ff53 100644 (file)
@@ -312,6 +312,52 @@ $cur_standby->start;
 
 $cur_primary->psql('postgres', "COMMIT PREPARED 'xact_009_12'");
 
+###############################################################################
+# Check visibility of prepared transactions in standby after a restart while
+# primary is down.
+###############################################################################
+
+$cur_primary->psql(
+   'postgres', "
+   CREATE TABLE t_009_tbl_standby_mvcc (id int, msg text);
+   BEGIN;
+   INSERT INTO t_009_tbl_standby_mvcc VALUES (1, 'issued to ${cur_primary_name}');
+   SAVEPOINT s1;
+   INSERT INTO t_009_tbl_standby_mvcc VALUES (2, 'issued to ${cur_primary_name}');
+   PREPARE TRANSACTION 'xact_009_standby_mvcc';
+   ");
+$cur_primary->stop;
+$cur_standby->restart;
+
+# Acquire a snapshot in standby, before we commit the prepared transaction
+my $standby_session = $cur_standby->background_psql('postgres', on_error_die => 1);
+$standby_session->query_safe("BEGIN ISOLATION LEVEL REPEATABLE READ");
+$psql_out = $standby_session->query_safe(
+   "SELECT count(*) FROM t_009_tbl_standby_mvcc");
+is($psql_out, '0',
+   "Prepared transaction not visible in standby before commit");
+
+# Commit the transaction in primary
+$cur_primary->start;
+$cur_primary->psql('postgres', "
+SET synchronous_commit='remote_apply'; -- To ensure the standby is caught up
+COMMIT PREPARED 'xact_009_standby_mvcc';
+");
+
+# Still not visible to the old snapshot
+$psql_out = $standby_session->query_safe(
+   "SELECT count(*) FROM t_009_tbl_standby_mvcc");
+is($psql_out, '0',
+   "Committed prepared transaction not visible to old snapshot in standby");
+
+# Is visible to a new snapshot
+$standby_session->query_safe("COMMIT");
+$psql_out = $standby_session->query_safe(
+   "SELECT count(*) FROM t_009_tbl_standby_mvcc");
+is($psql_out, '2',
+   "Committed prepared transaction is visible to new snapshot in standby");
+$standby_session->quit;
+
 ###############################################################################
 # Check for a lock conflict between prepared transaction with DDL inside and
 # replay of XLOG_STANDBY_LOCK wal record.
index 61ad417cde664a59e52bbde8ba4708894c070f05..d90982466c5c9e7028cd381023221e91bc859a4c 100644 (file)
@@ -3931,6 +3931,7 @@ string
 substitute_actual_parameters_context
 substitute_actual_srf_parameters_context
 substitute_phv_relids_context
+subxids_array_status
 symbol
 tablespaceinfo
 td_entry