From 9f00e6e57f0c4899570b323949a240d40c6366fb Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 25 Apr 2020 18:58:37 -0700 Subject: [PATCH v1 1/2] Fix bug in nbtree VACUUM "skip full scan" feature. Commit 857f9c36cda (which taught nbtree to skip VACUUM's full index scan during cleanup of nbtree indexes in some situations) made nbtree VACUUM store the oldest btpo_xact in the nbtree metapage for the index as a whole. This was based on the oldest observed value across every deleted index page. However, it failed to correctly handle pages that the current VACUUM operation deletes. The most immediate problem with the handling of pages deleted by the current VACUUM operation was that the special area of the page was examined without a buffer pin. More fundamentally, the handling failed to account for the full range of _bt_pagedel() behaviors. For example, _bt_pagedel() sometimes deletes internal pages in passing, as part of deleting an entire subtree with btvacuumpage() caller's page as the leaf level page. To fix, push down the logic for maintaining the oldest btpo_xact to _bt_pagedel(). btvacuumpage() is strictly responsible for pages that were fully deleted by a previous VACUUM operation, while _bt_pagedel() is strictly responsible for pages that are deleted by the current VACUUM operation (which includes half-dead pages from a previous interrupted VACUUM operation). Discussion: https://postgr.es/m/CAH2-Wz=mWPBLZ2cr95cBV=kzPav8OOBtkHTfg+-+AUiozANK0w@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkLgyN3zBvRZ1pkNJThC=xi_0gpWRUb_45eexLH1+k2_Q@mail.gmail.com Backpatch: 11-, where the "skip full scan" feature was introduced. 
--- src/include/access/nbtree.h | 3 +- src/backend/access/nbtree/nbtpage.c | 133 ++++++++++++++++++++-------- src/backend/access/nbtree/nbtree.c | 21 ++--- 3 files changed, 111 insertions(+), 46 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9a3acd26b7..7bccd7ecc3 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1080,7 +1080,8 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf, extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel); -extern int _bt_pagedel(Relation rel, Buffer buf); +extern int _bt_pagedel(Relation rel, Buffer leafbuf, + TransactionId *oldestBtpoXact); /* * prototypes for functions in nbtsearch.c diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index f5962f6163..55175eb5c7 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -35,9 +35,11 @@ #include "utils/snapmgr.h" static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); -static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack); +static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, + BTStack stack); static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, - bool *rightsib_empty); + bool *rightsib_empty, + TransactionId *oldestBtpoXact); static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page, OffsetNumber *deletable, int ndeletable); static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, @@ -1470,27 +1472,37 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, } /* - * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so. + * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so. 
* - * This action unlinks the page from the b-tree structure, removing all + * This action unlinks the leaf page from the b-tree structure, removing all * pointers leading to it --- but not touching its own left and right links. * The page cannot be physically reclaimed right away, since other processes * may currently be trying to follow links leading to the page; they have to * be allowed to use its right-link to recover. See nbtree/README. * * On entry, the target buffer must be pinned and locked (either read or write - * lock is OK). This lock and pin will be dropped before exiting. + * lock is OK). The page must be an empty leaf-page, which may be half-dead + * already (a half-dead page should only be passed to us when an earlier + * VACUUM operation was interrupted, though). Note in particular that caller + * should never pass a buffer containing an existing deleted page here. The + * lock and pin on caller's buffer will be dropped before we return. * * Returns the number of pages successfully deleted (zero if page cannot - * be deleted now; could be more than one if parent or sibling pages were - * deleted too). + * be deleted now; could be more than one if parent or right sibling pages + * were deleted too). + * + * Maintains *oldestBtpoXact for caller for any pages that are deleted here + * (including caller's target page and any other internal pages or right + * sibling pages that we manage to delete). Caller is responsible for + * maintaining *oldestBtpoXact in the case of pages that were deleted by a + * previous VACUUM. * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. 
*/ int -_bt_pagedel(Relation rel, Buffer buf) +_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) { int ndeleted = 0; BlockNumber rightsib; @@ -1511,14 +1523,21 @@ _bt_pagedel(Relation rel, Buffer buf) for (;;) { - page = BufferGetPage(buf); + page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * Internal pages are never deleted directly, only as part of deleting * the whole branch all the way down to leaf level. + * + * Also check for deleted pages here. Caller never passes us a fully + * deleted page. Only VACUUM can delete pages, so there can't have + * been a concurrent deletion. Assume that we reached any deleted + * page encountered here by following a sibling link, and that the + * index is corrupt. */ - if (!P_ISLEAF(opaque)) + Assert(!P_ISDELETED(opaque)); + if (!P_ISLEAF(opaque) || P_ISDELETED(opaque)) { /* * Pre-9.4 page deletion only marked internal pages as half-dead, @@ -1537,13 +1556,21 @@ _bt_pagedel(Relation rel, Buffer buf) errmsg("index \"%s\" contains a half-dead internal page", RelationGetRelationName(rel)), errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); - _bt_relbuf(rel, buf); + + if (P_ISDELETED(opaque)) + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains a deleted page with a live sibling link", + RelationGetRelationName(rel)))); + + _bt_relbuf(rel, leafbuf); return ndeleted; } /* * We can never delete rightmost pages nor root pages. While at it, - * check that page is not already deleted and is empty. + * check that page is empty, since it's possible that the page was + * empty a moment ago, but has since had some inserts. * * To keep the algorithm simple, we also never delete an incompletely * split page (they should be rare enough that this doesn't make any @@ -1558,14 +1585,14 @@ _bt_pagedel(Relation rel, Buffer buf) * to. 
On subsequent iterations, we know we stepped right from a page * that passed these tests, so it's OK. */ - if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || + if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || P_INCOMPLETE_SPLIT(opaque)) { /* Should never fail to delete a half-dead page */ Assert(!P_ISHALFDEAD(opaque)); - _bt_relbuf(rel, buf); + _bt_relbuf(rel, leafbuf); return ndeleted; } @@ -1603,7 +1630,7 @@ _bt_pagedel(Relation rel, Buffer buf) * To avoid deadlocks, we'd better drop the leaf page lock * before going further. */ - LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK); /* * Fetch the left sibling, to check that it's not marked with @@ -1627,10 +1654,10 @@ _bt_pagedel(Relation rel, Buffer buf) * incompletely-split page to be split again. So we don't * need to walk right here. */ - if (lopaque->btpo_next == BufferGetBlockNumber(buf) && + if (lopaque->btpo_next == BufferGetBlockNumber(leafbuf) && P_INCOMPLETE_SPLIT(lopaque)) { - ReleaseBuffer(buf); + ReleaseBuffer(leafbuf); _bt_relbuf(rel, lbuf); return ndeleted; } @@ -1646,16 +1673,26 @@ _bt_pagedel(Relation rel, Buffer buf) _bt_relbuf(rel, lbuf); /* - * Re-lock the leaf page, and start over, to re-check that the - * page can still be deleted. + * Re-lock the leaf page, and start over to use our stack + * within _bt_mark_page_halfdead(). We must do it that way + * because it's possible that page can no longer be deleted. + * We need to recheck. */ - LockBuffer(buf, BT_WRITE); + LockBuffer(leafbuf, BT_WRITE); continue; } - if (!_bt_mark_page_halfdead(rel, buf, stack)) + /* + * See if it's safe to delete the leaf page, and determine how + * many parent/internal pages above the leaf level will be + * deleted. If it's safe then _bt_mark_page_halfdead will also + * perform the first phase of deletion, including marking the leaf + * page half-dead. 
+ */ + Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque)); + if (!_bt_mark_page_halfdead(rel, leafbuf, stack)) { - _bt_relbuf(rel, buf); + _bt_relbuf(rel, leafbuf); return ndeleted; } } @@ -1663,23 +1700,32 @@ _bt_pagedel(Relation rel, Buffer buf) /* * Then unlink it from its siblings. Each call to * _bt_unlink_halfdead_page unlinks the topmost page from the branch, - * making it shallower. Iterate until the leaf page is gone. + * making it shallower. Iterate until the leaf page is deleted. + * + * _bt_unlink_halfdead_page should never fail, since we established + * that deletion is generally safe in _bt_mark_page_halfdead. */ rightsib_empty = false; + Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)); while (P_ISHALFDEAD(opaque)) { - /* will check for interrupts, once lock is released */ - if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty)) + /* Check for interrupts in _bt_unlink_halfdead_page */ + if (!_bt_unlink_halfdead_page(rel, leafbuf, &rightsib_empty, + oldestBtpoXact)) { - /* _bt_unlink_halfdead_page already released buffer */ + /* _bt_unlink_halfdead_page failed, released buffer */ return ndeleted; } ndeleted++; } + Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque)); + Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact, + *oldestBtpoXact)); + rightsib = opaque->btpo_next; - _bt_relbuf(rel, buf); + _bt_relbuf(rel, leafbuf); /* * Check here, as calling loops will have locks held, preventing @@ -1705,7 +1751,7 @@ _bt_pagedel(Relation rel, Buffer buf) if (!rightsib_empty) break; - buf = _bt_getbuf(rel, rightsib, BT_WRITE); + leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE); } return ndeleted; @@ -1909,9 +1955,19 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) * of the whole branch, including the leaf page itself, iterate until the * leaf page is deleted. * - * Returns 'false' if the page could not be unlinked (shouldn't happen). - * If the (current) right sibling of the page is empty, *rightsib_empty is - * set to true. 
+ * Returns 'false' if the page could not be unlinked (shouldn't happen). If + * the right sibling of the current target page is empty, *rightsib_empty is + * set to true, allowing caller to delete the target's right sibling page in + * passing. Note that *rightsib_empty is only actually used by caller when + * target page is leafbuf, following last call here for leafbuf/the subtree + * containing leafbuf. (We always set *rightsib_empty for caller, just to be + * consistent.) + * + * We maintain *oldestBtpoXact for VACUUM operation as a whole here. This is + * necessary because we conservatively assume that there must be a call to + * ReadNewTransactionId() for each and every page that is deleted, including + * when we delete a series of pages that are all part of the same empty + * subtree -- see comments about the underlying assumption below. * * Must hold pin and lock on leafbuf at entry (read or write doesn't matter). * On success exit, we'll be holding pin and write lock. On failure exit, @@ -1919,7 +1975,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) * to avoid having to reacquire a lock we already released). */ static bool -_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) +_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty, + TransactionId *oldestBtpoXact) { BlockNumber leafblkno = BufferGetBlockNumber(leafbuf); BlockNumber leafleftsib; @@ -2057,9 +2114,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) lbuf = InvalidBuffer; /* - * Next write-lock the target page itself. It should be okay to take just - * a write lock not a superexclusive lock, since no scans would stop on an - * empty page. + * Next write-lock the target page itself. It's okay to take a write lock + * rather than a superexclusive lock, since no scan will stop on an empty + * page. 
*/ LockBuffer(buf, BT_WRITE); page = BufferGetPage(buf); @@ -2204,6 +2261,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque)); opaque->btpo_flags &= ~BTP_HALF_DEAD; opaque->btpo_flags |= BTP_DELETED; opaque->btpo.xact = ReadNewTransactionId(); @@ -2309,6 +2367,11 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) _bt_relbuf(rel, lbuf); _bt_relbuf(rel, rbuf); + /* maintain oldestBtpoXact */ + if (!TransactionIdIsValid(*oldestBtpoXact) || + TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact)) + *oldestBtpoXact = opaque->btpo.xact; + /* * Release the target, if it was not the leaf block. The leaf is always * kept locked. diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 36294789f3..8ecd632446 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1137,24 +1137,30 @@ restart: /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { - /* Okay to recycle this page */ + /* Okay to recycle this page (which could be leaf or internal) */ RecordFreeIndexPage(rel, blkno); vstate->totFreePages++; stats->pages_deleted++; } else if (P_ISDELETED(opaque)) { - /* Already deleted, but can't recycle yet */ + /* + * Already deleted page (which could be leaf or internal). Can't + * recycle yet. + */ stats->pages_deleted++; - /* Update the oldest btpo.xact */ + /* Maintain the oldest btpo.xact */ if (!TransactionIdIsValid(vstate->oldestBtpoXact) || TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) vstate->oldestBtpoXact = opaque->btpo.xact; } else if (P_ISHALFDEAD(opaque)) { - /* Half-dead, try to delete */ + /* + * Half-dead leaf page. Try to delete now. Might update + * oldestBtpoXact and pages_deleted below. 
+ */ delete_now = true; } else if (P_ISLEAF(opaque)) @@ -1357,16 +1363,11 @@ restart: MemoryContextReset(vstate->pagedelcontext); oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); - ndel = _bt_pagedel(rel, buf); + ndel = _bt_pagedel(rel, buf, &vstate->oldestBtpoXact); /* count only this page, else may double-count parent */ if (ndel) - { stats->pages_deleted++; - if (!TransactionIdIsValid(vstate->oldestBtpoXact) || - TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) - vstate->oldestBtpoXact = opaque->btpo.xact; - } MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ -- 2.25.1