From 0fa4195738a160d4ed5b123670ad91ff3367e1df Mon Sep 17 00:00:00 2001
From: nkey
Date: Sat, 23 Nov 2024 13:25:11 +0100
Subject: [PATCH 1/2] Add isolation test to reproduce dirty snapshot scan issue

This commit introduces an isolation test that reliably reproduces the issue
where non-MVCC index scans using SnapshotDirty can miss tuples because of
concurrent modifications, which can lead to incorrect results.  To facilitate
the test, a new injection point is added in index_getnext_slot().

Changes include:
* Added an injection point in src/backend/access/index/indexam.c.
* Updated Makefile and meson.build to include the new dirty_index_scan
  isolation test.
* Created a new isolation spec, dirty_index_scan.spec, and its expected
  output to define and verify the test steps.

This test complements the accompanying fix by demonstrating the issue and
verifying that the fix effectively addresses it.
---
 src/backend/access/index/indexam.c            |  8 ++++
 src/backend/executor/execIndexing.c           |  3 ++
 src/test/modules/injection_points/Makefile    |  2 +-
 .../expected/dirty_index_scan.out             | 27 ++++++++++++++
 src/test/modules/injection_points/meson.build |  1 +
 .../specs/dirty_index_scan.spec               | 37 +++++++++++++++++++
 6 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 src/test/modules/injection_points/expected/dirty_index_scan.out
 create mode 100644 src/test/modules/injection_points/specs/dirty_index_scan.spec

diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 219df1971da6..9184f14e0142 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -57,6 +57,7 @@
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
+#include "utils/injection_point.h"
 
 
 /* ----------------------------------------------------------------
@@ -741,6 +742,13 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *
 		 * the index.
 		 */
 		Assert(ItemPointerIsValid(&scan->xs_heaptid));
+#ifdef USE_INJECTION_POINTS
+		if (!IsCatalogRelationOid(scan->indexRelation->rd_id))
+		{
+			INJECTION_POINT("index_getnext_slot_before_fetch");
+		}
+#endif
+
 		if (index_fetch_heap(scan, slot))
 			return true;
 	}
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index bdf862b24062..eaf7e8434bbd 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -117,6 +117,7 @@
 #include "utils/multirangetypes.h"
 #include "utils/rangetypes.h"
 #include "utils/snapmgr.h"
+#include "utils/injection_point.h"
 
 /* waitMode argument to check_exclusion_or_unique_constraint() */
 typedef enum
@@ -943,6 +944,8 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index,
 
 	ExecDropSingleTupleTableSlot(existing_slot);
 
+	if (!conflict)
+		INJECTION_POINT("check_exclusion_or_unique_constraint_no_conflict");
 	return !conflict;
 }
 
diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile
index e680991f8d4f..b73f8ac80f2b 100644
--- a/src/test/modules/injection_points/Makefile
+++ b/src/test/modules/injection_points/Makefile
@@ -14,7 +14,7 @@ PGFILEDESC = "injection_points - facility for injection points"
 REGRESS = injection_points hashagg reindex_conc
 REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 
-ISOLATION = basic inplace syscache-update-pruned
+ISOLATION = basic inplace syscache-update-pruned dirty_index_scan
 
 TAP_TESTS = 1
 
diff --git a/src/test/modules/injection_points/expected/dirty_index_scan.out b/src/test/modules/injection_points/expected/dirty_index_scan.out
new file mode 100644
index 000000000000..c286a9fd5b63
--- /dev/null
+++ b/src/test/modules/injection_points/expected/dirty_index_scan.out
@@ -0,0 +1,27 @@
+Parsed test spec with 3 sessions
+
+starting permutation: s1_s1 s2_s1 s3_s1
+injection_points_attach
+-----------------------
+
+(1 row)
+
+step s1_s1: INSERT INTO test.tbl VALUES(42, 1) on conflict(i) do update set n = EXCLUDED.n + 1;
+step s2_s1: UPDATE test.tbl SET n = n + 1 WHERE i = 42;
+step s3_s1:
+	SELECT injection_points_detach('index_getnext_slot_before_fetch');
+	SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
+
+step s1_s1: <... completed>
+step s2_s1: <... completed>
+step s3_s1: <... completed>
+injection_points_detach
+-----------------------
+
+(1 row)
+
+injection_points_wakeup
+-----------------------
+
+(1 row)
+
diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build
index d61149712fd7..bb3869f9a750 100644
--- a/src/test/modules/injection_points/meson.build
+++ b/src/test/modules/injection_points/meson.build
@@ -47,6 +47,7 @@ tests += {
       'basic',
       'inplace',
       'syscache-update-pruned',
+      'dirty_index_scan',
     ],
     'runningcheck': false, # see syscache-update-pruned
   },
diff --git a/src/test/modules/injection_points/specs/dirty_index_scan.spec b/src/test/modules/injection_points/specs/dirty_index_scan.spec
new file mode 100644
index 000000000000..373bcaf4929f
--- /dev/null
+++ b/src/test/modules/injection_points/specs/dirty_index_scan.spec
@@ -0,0 +1,37 @@
+setup
+{
+	CREATE EXTENSION injection_points;
+	CREATE SCHEMA test;
+	CREATE UNLOGGED TABLE test.tbl(i int primary key, n int);
+	CREATE INDEX tbl_n_idx ON test.tbl(n);
+	INSERT INTO test.tbl VALUES(42,1);
+}
+
+teardown
+{
+	DROP SCHEMA test CASCADE;
+	DROP EXTENSION injection_points;
+}
+
+session s1
+setup {
+	SELECT injection_points_set_local();
+	SELECT injection_points_attach('check_exclusion_or_unique_constraint_no_conflict', 'error');
+	SELECT injection_points_attach('index_getnext_slot_before_fetch', 'wait');
+}
+
+step s1_s1 { INSERT INTO test.tbl VALUES(42, 1) on conflict(i) do update set n = EXCLUDED.n + 1; }
+
+session s2
+step s2_s1 { UPDATE test.tbl SET n = n + 1 WHERE i = 42; }
+
+session s3
+step s3_s1 {
+	SELECT injection_points_detach('index_getnext_slot_before_fetch');
+	SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
+}
+
+permutation
+	s1_s1
+	s2_s1(*)
+	s3_s1(s1_s1)
\ No newline at end of file
From 9b376f1742811ea71cfbd291e257fdefdf6b6ec3 Mon Sep 17 00:00:00 2001
From: nkey
Date: Wed, 12 Mar 2025 00:53:02 +0100
Subject: [PATCH 2/2] Fix btree index scan concurrency issues with dirty snapshots

This patch addresses an issue where non-MVCC index scans using SnapshotDirty
or SnapshotSelf could miss tuples due to concurrent modifications.  The fix
retains the read lock on a page for these special snapshot types until the
scan is done with the page's tuples, preventing concurrent modifications from
causing inconsistent results.

The nbtree README is updated to document this special case in the btree
locking mechanism.
---
 src/backend/access/nbtree/README      | 13 +++++++++-
 src/backend/access/nbtree/nbtree.c    | 17 +++++++++++++
 src/backend/access/nbtree/nbtsearch.c | 35 +++++++++++++++++++++++----
 src/backend/access/nbtree/nbtutils.c  |  8 +++++-
 src/include/access/nbtree.h           |  3 +++
 5 files changed, 69 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 53d4a61dc3f1..a9280415633d 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -85,7 +85,8 @@ move right until we find a page whose right-link matches the page we came
 from.  (Actually, it's even harder than that; see page deletion discussion
 below.)
 
-Page read locks are held only for as long as a scan is examining a page.
+Page read locks are held only for as long as a scan is examining a page
+(with an exception for SnapshotDirty and SnapshotSelf scans - see below).
 To minimize lock/unlock traffic, an index scan always searches a leaf page
 to identify all the matching items at once, copying their heap tuple IDs
 into backend-local storage.  The heap tuple IDs are then processed while
@@ -103,6 +104,16 @@ We also remember the left-link, and follow it when the scan moves backwards
 (though this requires extra handling to account for concurrent splits of the
 left sibling; see detailed move-left algorithm below).
 
+Despite these mechanisms, inconsistent results can still occur during
+non-MVCC scans (SnapshotDirty and SnapshotSelf).  This can happen when a
+concurrent transaction deletes a tuple and inserts a new tuple with a new
+TID into the same page.  If the scan has already visited the page and cached
+its contents in backend-local storage, it might skip the deleted tuple and
+also miss the new tuple, since the page is not re-read.  To address this,
+SnapshotDirty and SnapshotSelf scans retain the read lock on the page until
+they are completely done processing all the tuples from that page, preventing
+concurrent modifications that could lead to inconsistent results.
+
 In most cases we release our lock and pin on a page before attempting to
 acquire pin and lock on the page we are moving to.  In a few places it is
 necessary to lock the next page before releasing the current one.
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af7..8239076c5182 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -389,6 +389,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 		BTScanPosInvalidate(so->currPos);
 	}
@@ -445,6 +451,12 @@ btendscan(IndexScanDesc scan)
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
 
@@ -525,6 +537,11 @@ btrestrpos(IndexScanDesc scan)
 			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
+			if (so->currPos.extra_unlock)
+			{
+				_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+				so->currPos.extra_unlock = false;
+			}
 			BTScanPosUnpinIfPinned(so->currPos);
 		}
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 77264ddeecb5..1e1d24b32433 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -61,11 +61,22 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * This will prevent vacuum from stalling in a blocked state trying to read a
  * page when a cursor is sitting on it.
  *
+ * For SnapshotDirty and SnapshotSelf scans, we don't actually unlock the
+ * buffer here.  Instead we set extra_unlock to indicate that the lock should
+ * be held until we're completely done with this page, preventing concurrent
+ * modifications from causing inconsistent results during non-MVCC scans.
+ *
  * See nbtree/README section on making concurrent TID recycling safe.
  */
 static void
 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
+	if (scan->xs_snapshot->snapshot_type == SNAPSHOT_DIRTY ||
+		scan->xs_snapshot->snapshot_type == SNAPSHOT_SELF)
+	{
+		sp->extra_unlock = true;
+		return;
+	}
 	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
@@ -1527,7 +1538,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
  *	_bt_next() -- Get the next item in a scan.
  *
  *		On entry, so->currPos describes the current page, which may be pinned
- *		but is not locked, and so->currPos.itemIndex identifies which item was
+ *		but is not locked (except for SnapshotDirty and SnapshotSelf scans, where
+ *		the page remains locked), and so->currPos.itemIndex identifies which item was
  *		previously returned.
  *
  *		On success exit, so->currPos is updated as needed, and _bt_returnitem
@@ -2101,10 +2113,11 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on.  The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked,
+ * except for SnapshotDirty and SnapshotSelf scans where the buffer remains locked
+ * until we're done with all tuples from the page.  If there's no pin held, it's
+ * because _bt_drop_lock_and_maybe_pin dropped the pin eagerly earlier on.
+ * The scan must have so->currPos.currPage set to a valid block, in any case.
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2163,8 +2176,20 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 
 		/* mark/restore not supported by parallel scans */
 		Assert(!scan->parallel_scan);
+		Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_DIRTY);
+		Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_SELF);
 	}
 
+	/*
+	 * For SnapshotDirty/Self scans, we kept the read lock after processing
+	 * the page's tuples (see _bt_drop_lock_and_maybe_pin).  Now that we're
+	 * moving to another page, we need to explicitly release that lock.
+	 */
+	if (so->currPos.extra_unlock)
+	{
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+		so->currPos.extra_unlock = false;
+	}
 	BTScanPosUnpinIfPinned(so->currPos);
 
 	/* Walk to the next page with data */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 11802a4c2151..22b9c9d232b6 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3347,13 +3347,17 @@ _bt_killitems(IndexScanDesc scan)
 		 * LSN.
 		 */
 		droppedpin = false;
-		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
+		/* For SnapshotDirty/Self scans, the buffer is already locked */
+		if (!so->currPos.extra_unlock)
+			_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
 	}
 	else
 	{
 		Buffer		buf;
 
+		/* extra_unlock should never be set without a valid buffer pin */
+		Assert(!so->currPos.extra_unlock);
 		droppedpin = true;
 		/* Attempt to re-read the buffer, getting pin and lock. */
@@ -3490,6 +3494,8 @@ _bt_killitems(IndexScanDesc scan)
 	}
 
 	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	/* Reset the extra_unlock flag since we've now released the lock */
+	so->currPos.extra_unlock = false;
 }
 
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ebca02588d3e..2c2485f34bda 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -962,6 +962,7 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 typedef struct BTScanPosData
 {
 	Buffer		buf;			/* currPage buf (invalid means unpinned) */
+	bool		extra_unlock;	/* for SnapshotDirty/Self, read lock is held even after _bt_drop_lock_and_maybe_pin */
 
 	/* page details as of the saved position's call to _bt_readpage */
 	BlockNumber currPage;		/* page referenced by items array */
@@ -1009,6 +1010,7 @@ typedef BTScanPosData *BTScanPos;
 )
 #define BTScanPosUnpin(scanpos) \
 	do { \
+		Assert(!(scanpos).extra_unlock); \
 		ReleaseBuffer((scanpos).buf); \
 		(scanpos).buf = InvalidBuffer; \
 	} while (0)
@@ -1028,6 +1030,7 @@ typedef BTScanPosData *BTScanPos;
 	do { \
 		(scanpos).buf = InvalidBuffer; \
 		(scanpos).currPage = InvalidBlockNumber; \
+		(scanpos).extra_unlock = false; \
 	} while (0)
 
 /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
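
The kind of scan this fix protects is a non-MVCC index scan, such as the
dirty-snapshot scan used for unique and exclusion constraint checking.  Below
is a minimal sketch of how such a scan is typically driven, loosely modeled on
check_exclusion_or_unique_constraint() in execIndexing.c; "heap", "index",
"scankeys" and "nkeys" are placeholders for the caller's relations and keys,
and the exact index_beginscan() signature varies across releases, so treat
this as an outline rather than a verbatim excerpt:

	SnapshotData	DirtySnapshot;
	IndexScanDesc	scan;
	TupleTableSlot *slot = table_slot_create(heap, NULL);

	/* A dirty snapshot also sees uncommitted insertions and deletions. */
	InitDirtySnapshot(DirtySnapshot);

	scan = index_beginscan(heap, index, &DirtySnapshot, nkeys, 0);
	index_rescan(scan, scankeys, nkeys, NULL, 0);

	/*
	 * With this fix, the btree AM keeps the leaf page read-locked for the
	 * non-MVCC snapshot until all of the page's matches have been returned,
	 * so a concurrent UPDATE that moves a matching row to a new TID on the
	 * same page cannot make the scan miss both tuple versions.
	 */
	while (index_getnext_slot(scan, ForwardScanDirection, slot))
	{
		/* examine the fetched heap tuple, e.g. check it for a conflict */
	}

	index_endscan(scan);
	ExecDropSingleTupleTableSlot(slot);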