35
35
#include "utils/snapmgr.h"
36
36
37
37
static BTMetaPageData * _bt_getmeta (Relation rel , Buffer metabuf );
38
- static bool _bt_mark_page_halfdead (Relation rel , Buffer buf , BTStack stack );
38
+ static bool _bt_mark_page_halfdead (Relation rel , Buffer leafbuf ,
39
+ BTStack stack );
39
40
static bool _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf ,
40
- bool * rightsib_empty );
41
+ bool * rightsib_empty ,
42
+ TransactionId * oldestBtpoXact );
41
43
static TransactionId _bt_xid_horizon (Relation rel , Relation heapRel , Page page ,
42
44
OffsetNumber * deletable , int ndeletable );
43
45
static bool _bt_lock_branch_parent (Relation rel , BlockNumber child ,
@@ -1470,27 +1472,35 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
1470
1472
}
1471
1473
1472
1474
/*
1473
- * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
1475
+ * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1474
1476
*
1475
- * This action unlinks the page from the b-tree structure, removing all
1477
+ * This action unlinks the leaf page from the b-tree structure, removing all
1476
1478
* pointers leading to it --- but not touching its own left and right links.
1477
1479
* The page cannot be physically reclaimed right away, since other processes
1478
1480
* may currently be trying to follow links leading to the page; they have to
1479
1481
* be allowed to use its right-link to recover. See nbtree/README.
1480
1482
*
1481
1483
* On entry, the target buffer must be pinned and locked (either read or write
1482
- * lock is OK). This lock and pin will be dropped before exiting.
1484
+ * lock is OK). The page must be an empty leaf page, which may be half-dead
1485
+ * already (a half-dead page should only be passed to us when an earlier
1486
+ * VACUUM operation was interrupted, though). Note in particular that caller
1487
+ * should never pass a buffer containing an existing deleted page here. The
1488
+ * lock and pin on caller's buffer will be dropped before we return.
1483
1489
*
1484
1490
* Returns the number of pages successfully deleted (zero if page cannot
1485
- * be deleted now; could be more than one if parent or sibling pages were
1486
- * deleted too).
1491
+ * be deleted now; could be more than one if parent or right sibling pages
1492
+ * were deleted too).
1493
+ *
1494
+ * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
1495
+ * responsible for maintaining *oldestBtpoXact in the case of pages that were
1496
+ * deleted by a previous VACUUM.
1487
1497
*
1488
1498
* NOTE: this leaks memory. Rather than trying to clean up everything
1489
1499
* carefully, it's better to run it in a temp context that can be reset
1490
1500
* frequently.
1491
1501
*/
1492
1502
int
1493
- _bt_pagedel (Relation rel , Buffer buf )
1503
+ _bt_pagedel (Relation rel , Buffer leafbuf , TransactionId * oldestBtpoXact )
1494
1504
{
1495
1505
int ndeleted = 0 ;
1496
1506
BlockNumber rightsib ;
@@ -1511,14 +1521,21 @@ _bt_pagedel(Relation rel, Buffer buf)
1511
1521
1512
1522
for (;;)
1513
1523
{
1514
- page = BufferGetPage (buf );
1524
+ page = BufferGetPage (leafbuf );
1515
1525
opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
1516
1526
1517
1527
/*
1518
1528
* Internal pages are never deleted directly, only as part of deleting
1519
1529
* the whole branch all the way down to leaf level.
1530
+ *
1531
+ * Also check for deleted pages here. Caller never passes us a fully
1532
+ * deleted page. Only VACUUM can delete pages, so there can't have
1533
+ * been a concurrent deletion. Assume that we reached any deleted
1534
+ * page encountered here by following a sibling link, and that the
1535
+ * index is corrupt.
1520
1536
*/
1521
- if (!P_ISLEAF (opaque ))
1537
+ Assert (!P_ISDELETED (opaque ));
1538
+ if (!P_ISLEAF (opaque ) || P_ISDELETED (opaque ))
1522
1539
{
1523
1540
/*
1524
1541
* Pre-9.4 page deletion only marked internal pages as half-dead,
@@ -1537,13 +1554,22 @@ _bt_pagedel(Relation rel, Buffer buf)
1537
1554
errmsg ("index \"%s\" contains a half-dead internal page" ,
1538
1555
RelationGetRelationName (rel )),
1539
1556
errhint ("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it." )));
1540
- _bt_relbuf (rel , buf );
1557
+
1558
+ if (P_ISDELETED (opaque ))
1559
+ ereport (LOG ,
1560
+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1561
+ errmsg_internal ("found deleted block %u while following right link in index \"%s\"" ,
1562
+ BufferGetBlockNumber (leafbuf ),
1563
+ RelationGetRelationName (rel ))));
1564
+
1565
+ _bt_relbuf (rel , leafbuf );
1541
1566
return ndeleted ;
1542
1567
}
1543
1568
1544
1569
/*
1545
1570
* We can never delete rightmost pages nor root pages. While at it,
1546
- * check that page is not already deleted and is empty.
1571
+ * check that page is empty, since it's possible that the leafbuf page
1572
+ * was empty a moment ago, but has since had some inserts.
1547
1573
*
1548
1574
* To keep the algorithm simple, we also never delete an incompletely
1549
1575
* split page (they should be rare enough that this doesn't make any
@@ -1558,14 +1584,14 @@ _bt_pagedel(Relation rel, Buffer buf)
1558
1584
* to. On subsequent iterations, we know we stepped right from a page
1559
1585
* that passed these tests, so it's OK.
1560
1586
*/
1561
- if (P_RIGHTMOST (opaque ) || P_ISROOT (opaque ) || P_ISDELETED ( opaque ) ||
1587
+ if (P_RIGHTMOST (opaque ) || P_ISROOT (opaque ) ||
1562
1588
P_FIRSTDATAKEY (opaque ) <= PageGetMaxOffsetNumber (page ) ||
1563
1589
P_INCOMPLETE_SPLIT (opaque ))
1564
1590
{
1565
1591
/* Should never fail to delete a half-dead page */
1566
1592
Assert (!P_ISHALFDEAD (opaque ));
1567
1593
1568
- _bt_relbuf (rel , buf );
1594
+ _bt_relbuf (rel , leafbuf );
1569
1595
return ndeleted ;
1570
1596
}
1571
1597
@@ -1603,7 +1629,7 @@ _bt_pagedel(Relation rel, Buffer buf)
1603
1629
* To avoid deadlocks, we'd better drop the leaf page lock
1604
1630
* before going further.
1605
1631
*/
1606
- LockBuffer (buf , BUFFER_LOCK_UNLOCK );
1632
+ LockBuffer (leafbuf , BUFFER_LOCK_UNLOCK );
1607
1633
1608
1634
/*
1609
1635
* Fetch the left sibling, to check that it's not marked with
@@ -1627,10 +1653,10 @@ _bt_pagedel(Relation rel, Buffer buf)
1627
1653
* incompletely-split page to be split again. So we don't
1628
1654
* need to walk right here.
1629
1655
*/
1630
- if (lopaque -> btpo_next == BufferGetBlockNumber (buf ) &&
1656
+ if (lopaque -> btpo_next == BufferGetBlockNumber (leafbuf ) &&
1631
1657
P_INCOMPLETE_SPLIT (lopaque ))
1632
1658
{
1633
- ReleaseBuffer (buf );
1659
+ ReleaseBuffer (leafbuf );
1634
1660
_bt_relbuf (rel , lbuf );
1635
1661
return ndeleted ;
1636
1662
}
@@ -1646,40 +1672,59 @@ _bt_pagedel(Relation rel, Buffer buf)
1646
1672
_bt_relbuf (rel , lbuf );
1647
1673
1648
1674
/*
1649
- * Re-lock the leaf page, and start over, to re-check that the
1650
- * page can still be deleted.
1675
+ * Re-lock the leaf page, and start over to use our stack
1676
+ * within _bt_mark_page_halfdead. We must do it that way
1677
+ * because it's possible that leafbuf can no longer be
1678
+ * deleted. We need to recheck.
1651
1679
*/
1652
- LockBuffer (buf , BT_WRITE );
1680
+ LockBuffer (leafbuf , BT_WRITE );
1653
1681
continue ;
1654
1682
}
1655
1683
1656
- if (!_bt_mark_page_halfdead (rel , buf , stack ))
1684
+ /*
1685
+ * See if it's safe to delete the leaf page, and determine how
1686
+ * many parent/internal pages above the leaf level will be
1687
+ * deleted. If it's safe then _bt_mark_page_halfdead will also
1688
+ * perform the first phase of deletion, which includes marking the
1689
+ * leafbuf page half-dead.
1690
+ */
1691
+ Assert (P_ISLEAF (opaque ) && !P_IGNORE (opaque ));
1692
+ if (!_bt_mark_page_halfdead (rel , leafbuf , stack ))
1657
1693
{
1658
- _bt_relbuf (rel , buf );
1694
+ _bt_relbuf (rel , leafbuf );
1659
1695
return ndeleted ;
1660
1696
}
1661
1697
}
1662
1698
1663
1699
/*
1664
1700
* Then unlink it from its siblings. Each call to
1665
1701
* _bt_unlink_halfdead_page unlinks the topmost page from the branch,
1666
- * making it shallower. Iterate until the leaf page is gone.
1702
+ * making it shallower. Iterate until the leafbuf page is deleted.
1703
+ *
1704
+ * _bt_unlink_halfdead_page should never fail, since we established
1705
+ * that deletion is generally safe in _bt_mark_page_halfdead.
1667
1706
*/
1668
1707
rightsib_empty = false;
1708
+ Assert (P_ISLEAF (opaque ) && P_ISHALFDEAD (opaque ));
1669
1709
while (P_ISHALFDEAD (opaque ))
1670
1710
{
1671
- /* will check for interrupts, once lock is released */
1672
- if (!_bt_unlink_halfdead_page (rel , buf , & rightsib_empty ))
1711
+ /* _bt_unlink_halfdead_page will check for interrupts */
1712
+ if (!_bt_unlink_halfdead_page (rel , leafbuf , & rightsib_empty ,
1713
+ oldestBtpoXact ))
1673
1714
{
1674
- /* _bt_unlink_halfdead_page already released buffer */
1715
+ /* _bt_unlink_halfdead_page failed, and already released buffer */
1675
1716
return ndeleted ;
1676
1717
}
1677
1718
ndeleted ++ ;
1678
1719
}
1679
1720
1721
+ Assert (P_ISLEAF (opaque ) && P_ISDELETED (opaque ));
1722
+ Assert (TransactionIdFollowsOrEquals (opaque -> btpo .xact ,
1723
+ * oldestBtpoXact ));
1724
+
1680
1725
rightsib = opaque -> btpo_next ;
1681
1726
1682
- _bt_relbuf (rel , buf );
1727
+ _bt_relbuf (rel , leafbuf );
1683
1728
1684
1729
/*
1685
1730
* Check here, as calling loops will have locks held, preventing
@@ -1705,7 +1750,7 @@ _bt_pagedel(Relation rel, Buffer buf)
1705
1750
if (!rightsib_empty )
1706
1751
break ;
1707
1752
1708
- buf = _bt_getbuf (rel , rightsib , BT_WRITE );
1753
+ leafbuf = _bt_getbuf (rel , rightsib , BT_WRITE );
1709
1754
}
1710
1755
1711
1756
return ndeleted ;
@@ -1909,17 +1954,28 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1909
1954
* of the whole branch, including the leaf page itself, iterate until the
1910
1955
* leaf page is deleted.
1911
1956
*
1912
- * Returns 'false' if the page could not be unlinked (shouldn't happen).
1913
- * If the (current) right sibling of the page is empty, *rightsib_empty is
1914
- * set to true.
1957
+ * Returns 'false' if the page could not be unlinked (shouldn't happen). If
1958
+ * the right sibling of the current target page is empty, *rightsib_empty is
1959
+ * set to true, allowing caller to delete the target's right sibling page in
1960
+ * passing. Note that *rightsib_empty is only actually used by caller when
1961
+ * target page is leafbuf, following last call here for leafbuf/the subtree
1962
+ * containing leafbuf. (We always set *rightsib_empty for caller, just to be
1963
+ * consistent.)
1964
+ *
1965
+ * We maintain *oldestBtpoXact for pages that are deleted by the current
1966
+ * VACUUM operation here. This must be handled here because we conservatively
1967
+ * assume that there needs to be a new call to ReadNewTransactionId() each
1968
+ * time a page gets deleted. See comments about the underlying assumption
1969
+ * below.
1915
1970
*
1916
1971
* Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
1917
1972
* On success exit, we'll be holding pin and write lock. On failure exit,
1918
1973
* we'll release both pin and lock before returning (we define it that way
1919
1974
* to avoid having to reacquire a lock we already released).
1920
1975
*/
1921
1976
static bool
1922
- _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf , bool * rightsib_empty )
1977
+ _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf , bool * rightsib_empty ,
1978
+ TransactionId * oldestBtpoXact )
1923
1979
{
1924
1980
BlockNumber leafblkno = BufferGetBlockNumber (leafbuf );
1925
1981
BlockNumber leafleftsib ;
@@ -2057,9 +2113,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
2057
2113
lbuf = InvalidBuffer ;
2058
2114
2059
2115
/*
2060
- * Next write-lock the target page itself. It should be okay to take just
2061
- * a write lock not a superexclusive lock, since no scans would stop on an
2062
- * empty page.
2116
+ * Next write-lock the target page itself. It's okay to take a write lock
2117
+ * rather than a superexclusive lock, since no scan will stop on an empty
2118
+ * page.
2063
2119
*/
2064
2120
LockBuffer (buf , BT_WRITE );
2065
2121
page = BufferGetPage (buf );
@@ -2204,6 +2260,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
2204
2260
*/
2205
2261
page = BufferGetPage (buf );
2206
2262
opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
2263
+ Assert (P_ISHALFDEAD (opaque ) || !P_ISLEAF (opaque ));
2207
2264
opaque -> btpo_flags &= ~BTP_HALF_DEAD ;
2208
2265
opaque -> btpo_flags |= BTP_DELETED ;
2209
2266
opaque -> btpo .xact = ReadNewTransactionId ();
@@ -2309,6 +2366,10 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
2309
2366
_bt_relbuf (rel , lbuf );
2310
2367
_bt_relbuf (rel , rbuf );
2311
2368
2369
+ if (!TransactionIdIsValid (* oldestBtpoXact ) ||
2370
+ TransactionIdPrecedes (opaque -> btpo .xact , * oldestBtpoXact ))
2371
+ * oldestBtpoXact = opaque -> btpo .xact ;
2372
+
2312
2373
/*
2313
2374
* Release the target, if it was not the leaf block. The leaf is always
2314
2375
* kept locked.
0 commit comments