gist microvacuum doesn't appear to care about hot standby?
Hi,
Am I missing something or did
commit 013ebc0a7b7ea9c1b1ab7a3d4dd75ea121ea8ba7
Author: Teodor Sigaev <teodor@sigaev.ru>
Date: 2015-09-09 18:43:37 +0300
Microvacuum for GIST
Mark index tuple as dead if it's pointed by kill_prior_tuple during
ordinary (search) scan and remove it during insert process if there is no
enough space for new tuple to insert. This improves select performance
because index will not return tuple marked as dead and improves insert
performance because it reduces number of page split.
Anastasia Lubennikova <a.lubennikova@postgrespro.ru> with
minor editorialization by me
entirely disregard recovery conflict handling? The index entries it
removes could very well be visible to a snapshot on the standby. That's
why the equivalent nbtree (and hash) code does:
    /*
     * If we have any conflict processing to do, it must happen before we
     * update the page.
     *
     * Btree delete records can conflict with standby queries.  You might
     * think that vacuum records would conflict as well, but we've handled
     * that already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
     * cleaned by the vacuum of the heap and so we can resolve any conflicts
     * just once when that arrives.  After that we know that no conflicts
     * exist from individual btree vacuum records on that index.
     */
    if (InHotStandby)
    {
        TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record);
        RelFileNode rnode;

        XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
        ResolveRecoveryConflictWithSnapshot(latestRemovedXid,
                                            xlrec->onCatalogTable, rnode);
    }
Is there any reason something like that isn't necessary for gist? If so,
where's that documented? If not, uh, isn't that a somewhat serious bug
in gist?
Greetings,
Andres Freund
On Thu, Dec 13, 2018 at 1:45 AM Andres Freund <andres@anarazel.de> wrote:
> Is there any reason something like that isn't necessary for gist? If so,
> where's that documented? If not, uh, isn't that a somewhat serious bug
> in gist?
Thank you for pointing this out! This looks like a bug to me too. I'm going
to investigate this further and provide a fix in the next couple of days.
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
On Thu, Dec 13, 2018 at 7:28 AM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:
> On Thu, Dec 13, 2018 at 1:45 AM Andres Freund <andres@anarazel.de> wrote:
> > Is there any reason something like that isn't necessary for gist? If so,
> > where's that documented? If not, uh, isn't that a somewhat serious bug
> > in gist?
>
> Thank you for pointing this out! This looks like a bug to me too. I'm going
> to investigate this further and provide a fix in the next couple of days.
Sorry for the delay. The attached patch implements conflict handling for GiST
microvacuum like btree and hash. I'm going to push it if there are no
objections.

Note that it implements a new WAL record type, so new WAL can't be replayed
on an old minor release. I'm not sure whether we claim that's normally
possible. Should we state something explicitly for this case?
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
Attachments:
gist-microvacuum-conflict-handling.patch
commit b09d5f257d7e909a2d2724aaa2f06b8bde1e7ce0
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date: Sun Dec 16 23:54:38 2018 +0300
Check for conflicting queries during replay of gistvacuumpage()
013ebc0a7b implements so-called GiST microvacuum. That is, gistgettuple() marks
index tuples as dead when kill_prior_tuple is set. Later, when new tuple
insertion claims page space, those dead index tuples are physically deleted
from the page. When this deletion is replayed on a standby, it might conflict
with read-only queries. But 013ebc0a7b doesn't handle this. That may lead to
disappearance of some tuples from read-only snapshots on the standby.
This commit implements resolution of conflicts between replay of GiST
microvacuum and standby queries. Note that we have to implement a new WAL
record type, XLOG_GIST_DELETE, which carries the necessary information. After
an upgrade, old WAL records will still be applied without proper conflict
handling, as before. New WAL records will have proper conflict handling, but
can't be applied on previous minor versions.
Reported-by: Andres Freund
Diagnosed-by: Andres Freund
Discussion: https://postgr.es/m/20181212224524.scafnlyjindmrbe6%40alap3.anarazel.de
Author: Alexander Korotkov
Backpatch-through: 9.6
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7a..7dc149c5d00 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -38,7 +38,8 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
bool unlockbuf, bool unlockleftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
+static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+ Relation heapRel);
#define ROTATEDIST(d) do { \
@@ -172,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull,
values, isnull, true /* size is currently bogus */ );
itup->t_tid = *ht_ctid;
- gistdoinsert(r, itup, 0, giststate);
+ gistdoinsert(r, itup, 0, giststate, heapRel);
/* cleanup */
MemoryContextSwitchTo(oldCxt);
@@ -218,7 +219,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markfollowright)
+ bool markfollowright,
+ Relation heapRel)
{
BlockNumber blkno = BufferGetBlockNumber(buffer);
Page page = BufferGetPage(buffer);
@@ -259,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
*/
if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
{
- gistvacuumpage(rel, page, buffer);
+ gistvacuumpage(rel, page, buffer, heapRel);
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
}
@@ -604,7 +606,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
* so it does not bother releasing palloc'd allocations.
*/
void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+ GISTSTATE *giststate, Relation heapRel)
{
ItemId iid;
IndexTuple idxtuple;
@@ -616,6 +619,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
memset(&state, 0, sizeof(GISTInsertState));
state.freespace = freespace;
state.r = r;
+ state.heapRel = heapRel;
/* Start from the root */
firststack.blkno = GIST_ROOT_BLKNO;
@@ -1232,7 +1236,8 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
oldoffnum, NULL,
leftchild,
&splitinfo,
- true);
+ true,
+ state->heapRel);
/*
* Before recursing up in case the page was split, release locks on the
@@ -1543,7 +1548,7 @@ freeGISTstate(GISTSTATE *giststate)
* Function assumes that buffer is exclusively locked.
*/
static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer)
+gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
@@ -1589,9 +1594,9 @@ gistvacuumpage(Relation rel, Page page, Buffer buffer)
{
XLogRecPtr recptr;
- recptr = gistXLogUpdate(buffer,
+ recptr = gistXLogDelete(buffer,
deletable, ndeletable,
- NULL, 0, InvalidBuffer);
+ heapRel->rd_node);
PageSetLSN(page, recptr);
}
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 434f15f0148..b9c4e27e1a5 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -56,6 +56,7 @@ typedef enum
typedef struct
{
Relation indexrel;
+ Relation heaprel;
GISTSTATE *giststate;
int64 indtuples; /* number of tuples indexed */
@@ -122,6 +123,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
int fillfactor;
buildstate.indexrel = index;
+ buildstate.heaprel = heap;
if (index->rd_options)
{
/* Get buffering mode from the options string */
@@ -484,7 +486,7 @@ gistBuildCallback(Relation index,
* locked, we call gistdoinsert directly.
*/
gistdoinsert(index, itup, buildstate->freespace,
- buildstate->giststate);
+ buildstate->giststate, buildstate->heaprel);
}
/* Update tuple count and total size. */
@@ -690,7 +692,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
itup, ntup, oldoffnum, &placed_to_blk,
InvalidBuffer,
&splitinfo,
- false);
+ false,
+ buildstate->heaprel);
/*
* If this is a root split, update the root path item kept in memory. This
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e091269785..79358b3e56a 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -16,8 +16,12 @@
#include "access/bufmask.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
#include "utils/memutils.h"
static MemoryContext opCtx; /* working memory for operations */
@@ -160,6 +164,210 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
UnlockReleaseBuffer(buffer);
}
+/*
+ * Get the latestRemovedXid from the heap pages pointed at by the index
+ * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
+ * on which this function is based.
+ */
+static TransactionId
+gistRedoDeleteRecordGetLatestRemovedXid(XLogReaderState *record)
+{
+ gistxlogDelete *xlrec = (gistxlogDelete *) XLogRecGetData(record);
+ OffsetNumber *todelete;
+ Buffer ibuffer,
+ hbuffer;
+ Page ipage,
+ hpage;
+ RelFileNode rnode;
+ BlockNumber blkno;
+ ItemId iitemid,
+ hitemid;
+ IndexTuple itup;
+ HeapTupleHeader htuphdr;
+ BlockNumber hblkno;
+ OffsetNumber hoffnum;
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ int i;
+
+ /*
+ * If there's nothing running on the standby we don't need to derive a
+ * full latestRemovedXid value, so use a fast path out of here. This
+ * returns InvalidTransactionId, and so will conflict with all HS
+ * transactions; but since we just worked out that that's zero people,
+ * it's OK.
+ *
+ * XXX There is a race condition here, which is that a new backend might
+ * start just after we look. If so, it cannot need to conflict, but this
+ * coding will result in throwing a conflict anyway.
+ */
+ if (CountDBBackends(InvalidOid) == 0)
+ return latestRemovedXid;
+
+ /*
+ * In what follows, we have to examine the previous state of the index
+ * page, as well as the heap page(s) it points to. This is only valid if
+ * WAL replay has reached a consistent database state; which means that
+ * the preceding check is not just an optimization, but is *necessary*. We
+ * won't have let in any user sessions before we reach consistency.
+ */
+ if (!reachedConsistency)
+ elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data");
+
+ /*
+ * Get index page. If the DB is consistent, this should not fail, nor
+ * should any of the heap page fetches below. If one does, we return
+ * InvalidTransactionId to cancel all HS transactions. That's probably
+ * overkill, but it's safe, and certainly better than panicking here.
+ */
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
+ ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
+ if (!BufferIsValid(ibuffer))
+ return InvalidTransactionId;
+ LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
+ ipage = (Page) BufferGetPage(ibuffer);
+
+ /*
+ * Loop through the deleted index items to obtain the TransactionId from
+ * the heap items they point to.
+ */
+ todelete = (OffsetNumber *) ((char *) xlrec + SizeOfGistxlogDelete);
+
+ for (i = 0; i < xlrec->ntodelete; i++)
+ {
+ /*
+ * Identify the index tuple about to be deleted
+ */
+ iitemid = PageGetItemId(ipage, todelete[i]);
+ itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+ /*
+ * Locate the heap page that the index tuple points at
+ */
+ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
+ if (!BufferIsValid(hbuffer))
+ {
+ UnlockReleaseBuffer(ibuffer);
+ return InvalidTransactionId;
+ }
+ LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
+ hpage = (Page) BufferGetPage(hbuffer);
+
+ /*
+ * Look up the heap tuple header that the index tuple points at by
+ * using the heap node supplied with the xlrec. We can't use
+ * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+ * Note that we are not looking at tuple data here, just headers.
+ */
+ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ hitemid = PageGetItemId(hpage, hoffnum);
+
+ /*
+ * Follow any redirections until we find something useful.
+ */
+ while (ItemIdIsRedirected(hitemid))
+ {
+ hoffnum = ItemIdGetRedirect(hitemid);
+ hitemid = PageGetItemId(hpage, hoffnum);
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If the heap item has storage, then read the header and use that to
+ * set latestRemovedXid.
+ *
+ * Some LP_DEAD items may not be accessible, so we ignore them.
+ */
+ if (ItemIdHasStorage(hitemid))
+ {
+ htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+ HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+ }
+ else if (ItemIdIsDead(hitemid))
+ {
+ /*
+ * Conjecture: if hitemid is dead then it had xids before the xids
+ * marked on LP_NORMAL items. So we just ignore this item and move
+ * onto the next, for the purposes of calculating
+ * latestRemovedxids.
+ */
+ }
+ else
+ Assert(!ItemIdIsUsed(hitemid));
+
+ UnlockReleaseBuffer(hbuffer);
+ }
+
+ UnlockReleaseBuffer(ibuffer);
+
+ /*
+ * If all heap tuples were LP_DEAD then we will be returning
+ * InvalidTransactionId here, which avoids conflicts. This matches
+ * existing logic which assumes that LP_DEAD tuples must already be older
+ * than the latestRemovedXid on the cleanup record that set them as
+ * LP_DEAD, hence must already have generated a conflict.
+ */
+ return latestRemovedXid;
+}
+
+/*
+ * redo delete on gist index page to remove tuples marked as DEAD during index
+ * tuple insertion
+ */
+static void
+gistRedoDeleteRecord(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page.
+ *
+ * GiST delete records can conflict with standby queries. You might
+ * think that vacuum records would conflict as well, but we've handled
+ * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+ * cleaned by the vacuum of the heap and so we can resolve any conflicts
+ * just once when that arrives. After that we know that no conflicts
+ * exist from individual gist vacuum records on that index.
+ */
+ if (InHotStandby)
+ {
+ TransactionId latestRemovedXid = gistRedoDeleteRecordGetLatestRemovedXid(record);
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+ }
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete)
+ {
+ OffsetNumber *todelete;
+
+ todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete);
+
+ PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+ }
+
+ GistClearPageHasGarbage(page);
+ GistMarkTuplesDeleted(page);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
/*
* Returns an array of index pointers.
*/
@@ -318,6 +526,9 @@ gist_redo(XLogReaderState *record)
case XLOG_GIST_PAGE_UPDATE:
gistRedoPageUpdateRecord(record);
break;
+ case XLOG_GIST_DELETE:
+ gistRedoDeleteRecord(record);
+ break;
case XLOG_GIST_PAGE_SPLIT:
gistRedoPageSplitRecord(record);
break;
@@ -487,3 +698,36 @@ gistXLogUpdate(Buffer buffer,
return recptr;
}
+
+/*
+ * Write XLOG record describing a delete of leaf index tuples marked as DEAD
+ * during new tuple insertion. One may think that this case is already covered
+ * by gistXLogUpdate(). But deletion of index tuples might conflict with
+ * standby queries and needs special handling.
+ */
+XLogRecPtr
+gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
+ RelFileNode hnode)
+{
+ gistxlogDelete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.hnode = hnode;
+ xlrec.ntodelete = ntodelete;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete);
+
+ /*
+ * We need the target-offsets array whether or not we store the whole
+ * buffer, to allow us to find the latestRemovedXid on a standby
+ * server.
+ */
+ XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber));
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE);
+
+ return recptr;
+}
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index e5e925e0c5a..b79ed1dfdc8 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -23,6 +23,11 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
{
}
+static void
+out_gistxlogDelete(StringInfo buf, gistxlogPageUpdate *xlrec)
+{
+}
+
static void
out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
{
@@ -41,6 +46,9 @@ gist_desc(StringInfo buf, XLogReaderState *record)
case XLOG_GIST_PAGE_UPDATE:
out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec);
break;
+ case XLOG_GIST_DELETE:
+ out_gistxlogDelete(buf, (gistxlogPageUpdate *) rec);
+ break;
case XLOG_GIST_PAGE_SPLIT:
out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec);
break;
@@ -59,6 +67,9 @@ gist_identify(uint8 info)
case XLOG_GIST_PAGE_UPDATE:
id = "PAGE_UPDATE";
break;
+ case XLOG_GIST_DELETE:
+ id = "DELETE";
+ break;
case XLOG_GIST_PAGE_SPLIT:
id = "PAGE_SPLIT";
break;
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba0..a73716d6eaa 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -240,6 +240,7 @@ typedef struct GistSplitVector
typedef struct
{
Relation r;
+ Relation heapRel;
Size freespace; /* free space to be left */
GISTInsertStack *stack;
@@ -389,7 +390,8 @@ extern void freeGISTstate(GISTSTATE *giststate);
extern void gistdoinsert(Relation r,
IndexTuple itup,
Size freespace,
- GISTSTATE *GISTstate);
+ GISTSTATE *GISTstate,
+ Relation heapRel);
/* A List of these is returned from gistplacetopage() in *splitinfo */
typedef struct
@@ -404,7 +406,8 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
OffsetNumber oldoffnum, BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markleftchild);
+ bool markleftchild,
+ Relation heapRel);
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
@@ -414,6 +417,9 @@ extern XLogRecPtr gistXLogUpdate(Buffer buffer,
IndexTuple *itup, int ntup,
Buffer leftchild);
+XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+ int ntodelete, RelFileNode hnode);
+
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN oldnsn,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 1a2b9496d0d..b67c7100500 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -18,6 +18,7 @@
#include "lib/stringinfo.h"
#define XLOG_GIST_PAGE_UPDATE 0x00
+#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a page */
/* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */
#define XLOG_GIST_PAGE_SPLIT 0x30
/* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
@@ -40,6 +41,22 @@ typedef struct gistxlogPageUpdate
*/
} gistxlogPageUpdate;
+/*
+ * Backup Blk 0: Leaf page, whose index tuples are deleted.
+ */
+typedef struct gistxlogDelete
+{
+ RelFileNode hnode; /* RelFileNode of the heap the index currently
+ * points at */
+ uint16 ntodelete; /* number of deleted offsets */
+
+ /*
+ * In payload of blk 0 : todelete OffsetNumbers
+ */
+} gistxlogDelete;
+
+#define SizeOfGistxlogDelete (offsetof(gistxlogDelete, ntodelete) + sizeof(uint16))
+
/*
* Backup Blk 0: If this operation completes a page split, by inserting a
* downlink for the split page, the left half of the split
Hi,
On 2018-12-17 01:03:52 +0300, Alexander Korotkov wrote:
> Sorry for the delay. The attached patch implements conflict handling for GiST
> microvacuum like btree and hash. I'm going to push it if there are no
> objections.
>
> Note that it implements a new WAL record type, so new WAL can't be replayed
> on an old minor release. I'm not sure whether we claim that's normally
> possible. Should we state something explicitly for this case?
Please hold off committing for a bit. Adding new WAL records in a minor
release ought to be very well considered and a measure of last resort.
Couldn't we determine the xid horizon on the primary, and reuse an
existing WAL record to trigger the conflict? Or something along those
lines?
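
For illustration, a rough sketch of that idea (hypothetical helper name, not
from any posted patch): mirror the page-visiting logic of
btree_xlog_delete_get_latestRemovedXid(), but run it on the primary before
the deletion is WAL-logged, so the standby never has to read pages itself:

    /*
     * Hypothetical sketch only: compute the xid horizon on the primary by
     * visiting the heap tuples that the to-be-deleted index entries point
     * to.  The result could then be carried in the deletion's WAL record so
     * the standby can call ResolveRecoveryConflictWithSnapshot() directly.
     */
    static TransactionId
    gist_compute_delete_horizon(Relation heapRel, Page ipage,
                                OffsetNumber *deletable, int ndeletable)
    {
        TransactionId latestRemovedXid = InvalidTransactionId;
        int         i;

        for (i = 0; i < ndeletable; i++)
        {
            ItemId      iid = PageGetItemId(ipage, deletable[i]);
            IndexTuple  itup = (IndexTuple) PageGetItem(ipage, iid);
            BlockNumber hblkno = ItemPointerGetBlockNumber(&itup->t_tid);
            OffsetNumber hoffnum = ItemPointerGetOffsetNumber(&itup->t_tid);
            Buffer      hbuf = ReadBuffer(heapRel, hblkno);
            Page        hpage;
            ItemId      hitemid;

            LockBuffer(hbuf, BUFFER_LOCK_SHARE);
            hpage = BufferGetPage(hbuf);
            hitemid = PageGetItemId(hpage, hoffnum);

            /* follow HOT-chain redirects, like the standby-side code does */
            while (ItemIdIsRedirected(hitemid))
                hitemid = PageGetItemId(hpage, ItemIdGetRedirect(hitemid));

            /* only items with storage carry xids we need to consider */
            if (ItemIdHasStorage(hitemid))
                HeapTupleHeaderAdvanceLatestRemovedXid(
                    (HeapTupleHeader) PageGetItem(hpage, hitemid),
                    &latestRemovedXid);

            UnlockReleaseBuffer(hbuf);
        }

        return latestRemovedXid;
    }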
Greetings,
Andres Freund
On Mon, Dec 17, 2018 at 1:25 AM Andres Freund <andres@anarazel.de> wrote:
> Please hold off committing for a bit. Adding new WAL records in a minor
> release ought to be very well considered and a measure of last resort.
>
> Couldn't we determine the xid horizon on the primary, and reuse an
> existing WAL record to trigger the conflict? Or something along those
> lines?
I thought about that, but decided it's better to mimic the B-tree and hash
behavior rather than invent new logic in a minor release. But given
that a new WAL record in a minor release is a substantial problem, that
argument doesn't matter.

Yes, it seems to be possible. We can determine the xid horizon on the primary
in the same way you proposed for B-tree and hash [1] and use an
XLOG_HEAP2_CLEANUP_INFO record to trigger the conflict. Would you like me
to make such a patch for GiST based on your patch?
1. /messages/by-id/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de
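
For illustration, once the horizon is known on the primary, emitting that
record is cheap; a minimal sketch, mirroring the existing
log_heap_cleanup_info() in heapam.c (heapRel and latestRemovedXid are assumed
to be computed by the caller):

    /*
     * Sketch: reuse the existing cleanup-info record, whose replay already
     * calls ResolveRecoveryConflictWithSnapshot() on the standby.
     */
    xl_heap_cleanup_info xlrec;

    xlrec.node = heapRel->rd_node;
    xlrec.latestRemovedXid = latestRemovedXid;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo);
    (void) XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO);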
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
On Mon, Dec 17, 2018 at 3:40 AM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:
> Yes, it seems to be possible. We can determine the xid horizon on the primary
> in the same way you proposed for B-tree and hash [1] and use an
> XLOG_HEAP2_CLEANUP_INFO record to trigger the conflict. Would you like me
> to make such a patch for GiST based on your patch?
Got another tricky idea. Now, deleted offset numbers are written to the
buffer data. We can also append them to the record data. Then, based on the
record length, we can resolve conflicts when offsets are provided in the
record data. An unpatched version will just ignore the extra record data
tail. That would cost us somewhat bigger WAL records, but it solves the
other problems. Any thoughts?
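
In code terms, the idea is roughly the following (the backpatch version
attached later in this thread implements exactly this; hnode and todelete are
the heap relfilenode and the array of dead offsets):

    /* primary: append conflict info after the fixed-size main data */
    XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate));
    XLogRegisterData((char *) hnode, sizeof(RelFileNode));
    XLogRegisterData((char *) todelete, sizeof(OffsetNumber) * ntodelete);

    /*
     * standby: old binaries never read past sizeof(gistxlogPageUpdate),
     * so they simply ignore the appended tail
     */
    if (InHotStandby && XLogRecGetDataLen(record) > sizeof(gistxlogPageUpdate))
    {
        /* conflict processing using the appended hnode and offsets */
    }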
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
On Sun, Dec 16, 2018 at 4:41 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:
> I thought about that, but decided it's better to mimic the B-tree and hash
> behavior rather than invent new logic in a minor release. But given
> that a new WAL record in a minor release is a substantial problem, that
> argument doesn't matter.
It might be better to mimic B-Tree and hash on the master branch.
--
Peter Geoghegan
On Mon, Dec 17, 2018 at 3:35 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:
> Got another tricky idea. Now, deleted offset numbers are written to the
> buffer data. We can also append them to the record data. Then, based on the
> record length, we can resolve conflicts when offsets are provided in the
> record data. An unpatched version will just ignore the extra record data
> tail. That would cost us somewhat bigger WAL records, but it solves the
> other problems. Any thoughts?
Please find attached a backpatch version of the patch implementing this
approach. I found it more attractive than moving the xid horizon calculation
to the primary, because xid horizon calculation on the primary is
substantially new behavior, which is unwanted for backpatching. I've not yet
tested this patch.

I'm going to test this patch, including WAL compatibility. If everything
is OK, I'll commit it.
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
Attachments:
gist-microvacuum-conflict-handling-backpatch.patch
commit 21d0f25df959c66e4d0e2241f58982ee1bbd5d51
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date: Tue Dec 18 01:58:29 2018 +0300
Check for conflicting queries during replay of gistvacuumpage()
013ebc0a7b implements so-called GiST microvacuum. That is, gistgettuple() marks
index tuples as dead when kill_prior_tuple is set. Later, when new tuple
insertion claims page space, those dead index tuples are physically deleted
from the page. When this deletion is replayed on a standby, it might conflict
with read-only queries. But 013ebc0a7b doesn't handle this. That may lead to
disappearance of some tuples from read-only snapshots on the standby.
This commit implements resolution of conflicts between replay of GiST
microvacuum and standby queries. On master we implement a new WAL record type,
XLOG_GIST_DELETE, which carries the necessary information. On stable releases
we have to be tricky to keep WAL compatibility. Information required for
conflict processing is just appended to the data of the XLOG_GIST_PAGE_UPDATE
record, so a PostgreSQL version that doesn't know about conflict processing
will just ignore it.
Reported-by: Andres Freund
Diagnosed-by: Andres Freund
Discussion: https://postgr.es/m/20181212224524.scafnlyjindmrbe6%40alap3.anarazel.de
Author: Alexander Korotkov
Backpatch-through: 9.6
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7a..699dd207fe3 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -38,7 +38,8 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
bool unlockbuf, bool unlockleftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
+static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+ Relation heapRel);
#define ROTATEDIST(d) do { \
@@ -172,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull,
values, isnull, true /* size is currently bogus */ );
itup->t_tid = *ht_ctid;
- gistdoinsert(r, itup, 0, giststate);
+ gistdoinsert(r, itup, 0, giststate, heapRel);
/* cleanup */
MemoryContextSwitchTo(oldCxt);
@@ -218,7 +219,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markfollowright)
+ bool markfollowright,
+ Relation heapRel)
{
BlockNumber blkno = BufferGetBlockNumber(buffer);
Page page = BufferGetPage(buffer);
@@ -259,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
*/
if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
{
- gistvacuumpage(rel, page, buffer);
+ gistvacuumpage(rel, page, buffer, heapRel);
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
}
@@ -556,7 +558,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
recptr = gistXLogUpdate(buffer,
deloffs, ndeloffs, itup, ntup,
- leftchildbuf);
+ leftchildbuf, NULL);
PageSetLSN(page, recptr);
}
@@ -604,7 +606,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
* so it does not bother releasing palloc'd allocations.
*/
void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+ GISTSTATE *giststate, Relation heapRel)
{
ItemId iid;
IndexTuple idxtuple;
@@ -616,6 +619,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
memset(&state, 0, sizeof(GISTInsertState));
state.freespace = freespace;
state.r = r;
+ state.heapRel = heapRel;
/* Start from the root */
firststack.blkno = GIST_ROOT_BLKNO;
@@ -1232,7 +1236,8 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
oldoffnum, NULL,
leftchild,
&splitinfo,
- true);
+ true,
+ state->heapRel);
/*
* Before recursing up in case the page was split, release locks on the
@@ -1543,7 +1548,7 @@ freeGISTstate(GISTSTATE *giststate)
* Function assumes that buffer is exclusively locked.
*/
static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer)
+gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
@@ -1591,7 +1596,8 @@ gistvacuumpage(Relation rel, Page page, Buffer buffer)
recptr = gistXLogUpdate(buffer,
deletable, ndeletable,
- NULL, 0, InvalidBuffer);
+ NULL, 0, InvalidBuffer,
+ &heapRel->rd_node);
PageSetLSN(page, recptr);
}
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 434f15f0148..b9c4e27e1a5 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -56,6 +56,7 @@ typedef enum
typedef struct
{
Relation indexrel;
+ Relation heaprel;
GISTSTATE *giststate;
int64 indtuples; /* number of tuples indexed */
@@ -122,6 +123,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
int fillfactor;
buildstate.indexrel = index;
+ buildstate.heaprel = heap;
if (index->rd_options)
{
/* Get buffering mode from the options string */
@@ -484,7 +486,7 @@ gistBuildCallback(Relation index,
* locked, we call gistdoinsert directly.
*/
gistdoinsert(index, itup, buildstate->freespace,
- buildstate->giststate);
+ buildstate->giststate, buildstate->heaprel);
}
/* Update tuple count and total size. */
@@ -690,7 +692,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
itup, ntup, oldoffnum, &placed_to_blk,
InvalidBuffer,
&splitinfo,
- false);
+ false,
+ buildstate->heaprel);
/*
* If this is a root split, update the root path item kept in memory. This
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 5948218c779..a396179df95 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -224,7 +224,8 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
recptr = gistXLogUpdate(buffer,
todelete, ntodelete,
- NULL, 0, InvalidBuffer);
+ NULL, 0, InvalidBuffer,
+ NULL);
PageSetLSN(page, recptr);
}
else
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e091269785..eefe8b4953a 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -16,8 +16,12 @@
#include "access/bufmask.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
#include "utils/memutils.h"
static MemoryContext opCtx; /* working memory for operations */
@@ -60,6 +64,155 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
UnlockReleaseBuffer(buffer);
}
+/*
+ * Get the latestRemovedXid from the heap pages pointed at by the index
+ * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
+ * on which this function is based.
+ */
+static TransactionId
+gistRedoPageUpdateRecordGetLatestRemovedXid(XLogReaderState *record)
+{
+ gistxlogPageUpdate *xlrec = (gistxlogPageUpdate *) XLogRecGetData(record);
+ OffsetNumber *todelete;
+ Buffer ibuffer,
+ hbuffer;
+ Page ipage,
+ hpage;
+ RelFileNode rnode,
+ *hnode;
+ BlockNumber blkno;
+ ItemId iitemid,
+ hitemid;
+ IndexTuple itup;
+ HeapTupleHeader htuphdr;
+ BlockNumber hblkno;
+ OffsetNumber hoffnum;
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ int i;
+
+ /*
+ * If there's nothing running on the standby we don't need to derive a
+ * full latestRemovedXid value, so use a fast path out of here. This
+ * returns InvalidTransactionId, and so will conflict with all HS
+ * transactions; but since we just worked out that that's zero people,
+ * it's OK.
+ *
+ * XXX There is a race condition here, which is that a new backend might
+ * start just after we look. If so, it cannot need to conflict, but this
+ * coding will result in throwing a conflict anyway.
+ */
+ if (CountDBBackends(InvalidOid) == 0)
+ return latestRemovedXid;
+
+ /*
+ * In what follows, we have to examine the previous state of the index
+ * page, as well as the heap page(s) it points to. This is only valid if
+ * WAL replay has reached a consistent database state; which means that
+ * the preceding check is not just an optimization, but is *necessary*. We
+ * won't have let in any user sessions before we reach consistency.
+ */
+ if (!reachedConsistency)
+ elog(PANIC, "gistRedoPageUpdateRecordGetLatestRemovedXid: cannot operate with inconsistent data");
+
+ /*
+ * Get index page. If the DB is consistent, this should not fail, nor
+ * should any of the heap page fetches below. If one does, we return
+ * InvalidTransactionId to cancel all HS transactions. That's probably
+ * overkill, but it's safe, and certainly better than panicking here.
+ */
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
+ ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
+ if (!BufferIsValid(ibuffer))
+ return InvalidTransactionId;
+ LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
+ ipage = (Page) BufferGetPage(ibuffer);
+
+ /*
+ * Loop through the deleted index items to obtain the TransactionId from
+ * the heap items they point to.
+ */
+ hnode = (RelFileNode *) ((char *) xlrec + sizeof(gistxlogPageUpdate));
+ todelete = (OffsetNumber *) ((char *) hnode + sizeof(RelFileNode));
+
+ for (i = 0; i < xlrec->ntodelete; i++)
+ {
+ /*
+ * Identify the index tuple about to be deleted
+ */
+ iitemid = PageGetItemId(ipage, todelete[i]);
+ itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+ /*
+ * Locate the heap page that the index tuple points at
+ */
+ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ hbuffer = XLogReadBufferExtended(*hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
+ if (!BufferIsValid(hbuffer))
+ {
+ UnlockReleaseBuffer(ibuffer);
+ return InvalidTransactionId;
+ }
+ LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
+ hpage = (Page) BufferGetPage(hbuffer);
+
+ /*
+ * Look up the heap tuple header that the index tuple points at by
+ * using the heap node supplied with the xlrec. We can't use
+ * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+ * Note that we are not looking at tuple data here, just headers.
+ */
+ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ hitemid = PageGetItemId(hpage, hoffnum);
+
+ /*
+ * Follow any redirections until we find something useful.
+ */
+ while (ItemIdIsRedirected(hitemid))
+ {
+ hoffnum = ItemIdGetRedirect(hitemid);
+ hitemid = PageGetItemId(hpage, hoffnum);
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If the heap item has storage, then read the header and use that to
+ * set latestRemovedXid.
+ *
+ * Some LP_DEAD items may not be accessible, so we ignore them.
+ */
+ if (ItemIdHasStorage(hitemid))
+ {
+ htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+ HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+ }
+ else if (ItemIdIsDead(hitemid))
+ {
+ /*
+ * Conjecture: if hitemid is dead then it had xids before the xids
+ * marked on LP_NORMAL items. So we just ignore this item and move
+ * onto the next, for the purposes of calculating
+ * latestRemovedxids.
+ */
+ }
+ else
+ Assert(!ItemIdIsUsed(hitemid));
+
+ UnlockReleaseBuffer(hbuffer);
+ }
+
+ UnlockReleaseBuffer(ibuffer);
+
+ /*
+ * If all heap tuples were LP_DEAD then we will be returning
+ * InvalidTransactionId here, which avoids conflicts. This matches
+ * existing logic which assumes that LP_DEAD tuples must already be older
+ * than the latestRemovedXid on the cleanup record that set them as
+ * LP_DEAD, hence must already have generated a conflict.
+ */
+ return latestRemovedXid;
+}
+
/*
* redo any page update (except page split)
*/
@@ -71,6 +224,34 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
Buffer buffer;
Page page;
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page.
+ *
+ * Support for conflict processing in GiST has been backpatched. This is
+ * why we have to use a tricky way of preserving WAL compatibility between
+ * minor versions. Information required for conflict processing is just
+ * appended to the data of the XLOG_GIST_PAGE_UPDATE record, so a
+ * PostgreSQL version that doesn't know about conflict processing will
+ * just ignore it.
+ *
+ * GiST delete records can conflict with standby queries. You might
+ * think that vacuum records would conflict as well, but we've handled
+ * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+ * cleaned by the vacuum of the heap and so we can resolve any conflicts
+ * just once when that arrives. After that we know that no conflicts
+ * exist from individual gist vacuum records on that index.
+ */
+ if (InHotStandby && XLogRecGetDataLen(record) > sizeof(gistxlogPageUpdate))
+ {
+ TransactionId latestRemovedXid = gistRedoPageUpdateRecordGetLatestRemovedXid(record);
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+ }
+
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
{
char *begin;
@@ -457,7 +638,7 @@ XLogRecPtr
gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
IndexTuple *itup, int ituplen,
- Buffer leftchildbuf)
+ Buffer leftchildbuf, RelFileNode *hnode)
{
gistxlogPageUpdate xlrec;
int i;
@@ -469,6 +650,16 @@ gistXLogUpdate(Buffer buffer,
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate));
+ /*
+ * Append the information required for standby conflict processing if it is
+ * provided by caller.
+ */
+ if (hnode)
+ {
+ XLogRegisterData((char *) hnode, sizeof(RelFileNode));
+ XLogRegisterData((char *) todelete, sizeof(OffsetNumber) * ntodelete);
+ }
+
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete);
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba0..ed0fb634a52 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -240,6 +240,7 @@ typedef struct GistSplitVector
typedef struct
{
Relation r;
+ Relation heapRel;
Size freespace; /* free space to be left */
GISTInsertStack *stack;
@@ -389,7 +390,8 @@ extern void freeGISTstate(GISTSTATE *giststate);
extern void gistdoinsert(Relation r,
IndexTuple itup,
Size freespace,
- GISTSTATE *GISTstate);
+ GISTSTATE *GISTstate,
+ Relation heapRel);
/* A List of these is returned from gistplacetopage() in *splitinfo */
typedef struct
@@ -404,7 +406,8 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
OffsetNumber oldoffnum, BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markleftchild);
+ bool markleftchild,
+ Relation heapRel);
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
@@ -412,7 +415,7 @@ extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
IndexTuple *itup, int ntup,
- Buffer leftchild);
+ Buffer leftchild, RelFileNode *hnode);
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
SplitedPageLayout *dist,
On Tue, Dec 18, 2018 at 2:04 AM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:
> Please find attached a backpatch version of the patch implementing this
> approach. I found it more attractive than moving the xid horizon
> calculation to the primary, because xid horizon calculation on the primary
> is substantially new behavior, which is unwanted for backpatching. I've
> not yet tested this patch.
>
> I'm going to test this patch, including WAL compatibility. If everything
> is OK, I'll commit it.
I've managed to reproduce the problem and test my backpatch solution.
primary (patched):

drop table if exists test;
create table test (p point) with (fillfactor = 50, autovacuum_enabled = false);
insert into test (select point(i % 100, i / 100) from
generate_series(0,9999) i);
vacuum test;
create index test_gist_idx on test using gist (p);
alter table test set (fillfactor = 100);

standby 1 (patched):

begin isolation level repeatable read;
select count(*) from test where p <@ box(point(0,0),point(99,99));
 count
-------
 10000
(1 row)

standby 2 (unpatched):

begin isolation level repeatable read;
select count(*) from test where p <@ box(point(0,0),point(99,99));
 count
-------
 10000
(1 row)

primary (patched):

delete from test where p[0]::int % 10 = 0 and p[1]::int % 10 = 0;
set enable_seqscan = off;
set enable_bitmapscan = off;
set enable_indexonlyscan = off;
select count(*) from test where p <@ box(point(0,0),point(99,99));
insert into test (select point(i % 100, i / 100) from
generate_series(0,9999) i);

standby 1 (patched):

select count(*) from test where p <@ box(point(0,0),point(99,99));
 count
-------
 10000
(1 row)

standby 2 (unpatched):

select count(*) from test where p <@ box(point(0,0),point(99,99));
 count
-------
  9961
(1 row)

standby 1 (patched):

select count(*) from test where p <@ box(point(0,0),point(99,99));
FATAL: terminating connection due to conflict with recovery
DETAIL: User query might have needed to see row versions that
must be removed.
HINT: In a moment you should be able to reconnect to the database
and repeat your command.
server closed the connection unexpectedly
        This probably means the server terminated abnormally
        before or while processing the request.
The connection to the server was lost. Attempting reset: Succeeded.
So, two standbys were reading the same WAL generated by the patched
primary. The patched standby got a conflict: it gave the correct query
answer and then had its transaction cancelled. The unpatched standby
replayed the WAL stream without a conflict, so it gave a wrong query
answer, as if it were reading WAL from an unpatched primary.

When experimenting with an unpatched primary, both standbys give wrong
query answers without a conflict.
Please find attached the two patches I'm going to commit: one for master and
one for backpatching.
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
Attachments:
gist-microvacuum-conflict-handling-2.patch
commit b8dafd869891b3c1e84d6d346099f1dd4617f5f3
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date: Thu Dec 20 01:30:32 2018 +0300
Check for conflicting queries during replay of gistvacuumpage()
013ebc0a7b implements so-called GiST microvacuum. That is, gistgettuple() marks
index tuples as dead when kill_prior_tuple is set. Later, when new tuple
insertion claims page space, those dead index tuples are physically deleted
from the page. When this deletion is replayed on a standby, it might conflict
with read-only queries. But 013ebc0a7b doesn't handle this. That may lead to
disappearance of some tuples from read-only snapshots on the standby.
This commit implements resolution of conflicts between replay of GiST
microvacuum and standby queries. On master we implement a new WAL record type,
XLOG_GIST_DELETE, which carries the necessary information. On stable releases
we have to be tricky to keep WAL compatibility. Information required for
conflict processing is just appended to the data of the XLOG_GIST_PAGE_UPDATE
record, so a PostgreSQL version that doesn't know about conflict processing
will just ignore it.
Reported-by: Andres Freund
Diagnosed-by: Andres Freund
Discussion: https://postgr.es/m/20181212224524.scafnlyjindmrbe6%40alap3.anarazel.de
Author: Alexander Korotkov
Backpatch-through: 9.6
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7a..a2cb84800e8 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -38,7 +38,8 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
bool unlockbuf, bool unlockleftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
+static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+ Relation heapRel);
#define ROTATEDIST(d) do { \
@@ -172,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull,
values, isnull, true /* size is currently bogus */ );
itup->t_tid = *ht_ctid;
- gistdoinsert(r, itup, 0, giststate);
+ gistdoinsert(r, itup, 0, giststate, heapRel);
/* cleanup */
MemoryContextSwitchTo(oldCxt);
@@ -218,7 +219,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markfollowright)
+ bool markfollowright,
+ Relation heapRel)
{
BlockNumber blkno = BufferGetBlockNumber(buffer);
Page page = BufferGetPage(buffer);
@@ -259,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
*/
if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
{
- gistvacuumpage(rel, page, buffer);
+ gistvacuumpage(rel, page, buffer, heapRel);
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
}
@@ -604,7 +606,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
* so it does not bother releasing palloc'd allocations.
*/
void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+ GISTSTATE *giststate, Relation heapRel)
{
ItemId iid;
IndexTuple idxtuple;
@@ -616,6 +619,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
memset(&state, 0, sizeof(GISTInsertState));
state.freespace = freespace;
state.r = r;
+ state.heapRel = heapRel;
/* Start from the root */
firststack.blkno = GIST_ROOT_BLKNO;
@@ -1232,7 +1236,8 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
oldoffnum, NULL,
leftchild,
&splitinfo,
- true);
+ true,
+ state->heapRel);
/*
* Before recursing up in case the page was split, release locks on the
@@ -1543,7 +1548,7 @@ freeGISTstate(GISTSTATE *giststate)
* Function assumes that buffer is exclusively locked.
*/
static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer)
+gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
@@ -1589,9 +1594,9 @@ gistvacuumpage(Relation rel, Page page, Buffer buffer)
{
XLogRecPtr recptr;
- recptr = gistXLogUpdate(buffer,
+ recptr = gistXLogDelete(buffer,
deletable, ndeletable,
- NULL, 0, InvalidBuffer);
+ heapRel->rd_node);
PageSetLSN(page, recptr);
}
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 434f15f0148..b9c4e27e1a5 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -56,6 +56,7 @@ typedef enum
typedef struct
{
Relation indexrel;
+ Relation heaprel;
GISTSTATE *giststate;
int64 indtuples; /* number of tuples indexed */
@@ -122,6 +123,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
int fillfactor;
buildstate.indexrel = index;
+ buildstate.heaprel = heap;
if (index->rd_options)
{
/* Get buffering mode from the options string */
@@ -484,7 +486,7 @@ gistBuildCallback(Relation index,
* locked, we call gistdoinsert directly.
*/
gistdoinsert(index, itup, buildstate->freespace,
- buildstate->giststate);
+ buildstate->giststate, buildstate->heaprel);
}
/* Update tuple count and total size. */
@@ -690,7 +692,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
itup, ntup, oldoffnum, &placed_to_blk,
InvalidBuffer,
&splitinfo,
- false);
+ false,
+ buildstate->heaprel);
/*
* If this is a root split, update the root path item kept in memory. This
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e091269785..01e025d5fdb 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -16,8 +16,12 @@
#include "access/bufmask.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
#include "utils/memutils.h"
static MemoryContext opCtx; /* working memory for operations */
@@ -160,6 +164,210 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
UnlockReleaseBuffer(buffer);
}
+/*
+ * Get the latestRemovedXid from the heap pages pointed at by the index
+ * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
+ * on which this function is based.
+ */
+static TransactionId
+gistRedoDeleteRecordGetLatestRemovedXid(XLogReaderState *record)
+{
+ gistxlogDelete *xlrec = (gistxlogDelete *) XLogRecGetData(record);
+ OffsetNumber *todelete;
+ Buffer ibuffer,
+ hbuffer;
+ Page ipage,
+ hpage;
+ RelFileNode rnode;
+ BlockNumber blkno;
+ ItemId iitemid,
+ hitemid;
+ IndexTuple itup;
+ HeapTupleHeader htuphdr;
+ BlockNumber hblkno;
+ OffsetNumber hoffnum;
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ int i;
+
+ /*
+ * If there's nothing running on the standby we don't need to derive a
+ * full latestRemovedXid value, so use a fast path out of here. This
+ * returns InvalidTransactionId, and so will conflict with all HS
+ * transactions; but since we just worked out that that's zero people,
+ * it's OK.
+ *
+ * XXX There is a race condition here, which is that a new backend might
+ * start just after we look. If so, it cannot need to conflict, but this
+ * coding will result in throwing a conflict anyway.
+ */
+ if (CountDBBackends(InvalidOid) == 0)
+ return latestRemovedXid;
+
+ /*
+ * In what follows, we have to examine the previous state of the index
+ * page, as well as the heap page(s) it points to. This is only valid if
+ * WAL replay has reached a consistent database state; which means that
+ * the preceding check is not just an optimization, but is *necessary*. We
+ * won't have let in any user sessions before we reach consistency.
+ */
+ if (!reachedConsistency)
+ elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data");
+
+ /*
+ * Get index page. If the DB is consistent, this should not fail, nor
+ * should any of the heap page fetches below. If one does, we return
+ * InvalidTransactionId to cancel all HS transactions. That's probably
+ * overkill, but it's safe, and certainly better than panicking here.
+ */
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
+ ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
+ if (!BufferIsValid(ibuffer))
+ return InvalidTransactionId;
+ LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
+ ipage = (Page) BufferGetPage(ibuffer);
+
+ /*
+ * Loop through the deleted index items to obtain the TransactionId from
+ * the heap items they point to.
+ */
+ todelete = (OffsetNumber *) ((char *) xlrec + SizeOfGistxlogDelete);
+
+ for (i = 0; i < xlrec->ntodelete; i++)
+ {
+ /*
+ * Identify the index tuple about to be deleted
+ */
+ iitemid = PageGetItemId(ipage, todelete[i]);
+ itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+ /*
+ * Locate the heap page that the index tuple points at
+ */
+ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
+ if (!BufferIsValid(hbuffer))
+ {
+ UnlockReleaseBuffer(ibuffer);
+ return InvalidTransactionId;
+ }
+ LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
+ hpage = (Page) BufferGetPage(hbuffer);
+
+ /*
+ * Look up the heap tuple header that the index tuple points at by
+ * using the heap node supplied with the xlrec. We can't use
+ * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+ * Note that we are not looking at tuple data here, just headers.
+ */
+ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ hitemid = PageGetItemId(hpage, hoffnum);
+
+ /*
+ * Follow any redirections until we find something useful.
+ */
+ while (ItemIdIsRedirected(hitemid))
+ {
+ hoffnum = ItemIdGetRedirect(hitemid);
+ hitemid = PageGetItemId(hpage, hoffnum);
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If the heap item has storage, then read the header and use that to
+ * set latestRemovedXid.
+ *
+ * Some LP_DEAD items may not be accessible, so we ignore them.
+ */
+ if (ItemIdHasStorage(hitemid))
+ {
+ htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+ HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+ }
+ else if (ItemIdIsDead(hitemid))
+ {
+ /*
+ * Conjecture: if hitemid is dead then it had xids before the xids
+ * marked on LP_NORMAL items. So we just ignore this item and move
+ * onto the next, for the purposes of calculating
+ * latestRemovedxids.
+ */
+ }
+ else
+ Assert(!ItemIdIsUsed(hitemid));
+
+ UnlockReleaseBuffer(hbuffer);
+ }
+
+ UnlockReleaseBuffer(ibuffer);
+
+ /*
+ * If all heap tuples were LP_DEAD then we will be returning
+ * InvalidTransactionId here, which avoids conflicts. This matches
+ * existing logic which assumes that LP_DEAD tuples must already be older
+ * than the latestRemovedXid on the cleanup record that set them as
+ * LP_DEAD, hence must already have generated a conflict.
+ */
+ return latestRemovedXid;
+}
+
+/*
+ * redo delete on gist index page to remove tuples marked as DEAD during index
+ * tuple insertion
+ */
+static void
+gistRedoDeleteRecord(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page.
+ *
+ * GiST delete records can conflict with standby queries. You might think
+ * that vacuum records would conflict as well, but we've handled that
+ * already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+ * cleaned by the vacuum of the heap and so we can resolve any conflicts
+ * just once when that arrives. After that we know that no conflicts
+ * exist from individual gist vacuum records on that index.
+ */
+ if (InHotStandby)
+ {
+ TransactionId latestRemovedXid = gistRedoDeleteRecordGetLatestRemovedXid(record);
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+ }
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete)
+ {
+ OffsetNumber *todelete;
+
+ todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete);
+
+ PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+ }
+
+ GistClearPageHasGarbage(page);
+ GistMarkTuplesDeleted(page);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
/*
* Returns an array of index pointers.
*/
@@ -318,6 +526,9 @@ gist_redo(XLogReaderState *record)
case XLOG_GIST_PAGE_UPDATE:
gistRedoPageUpdateRecord(record);
break;
+ case XLOG_GIST_DELETE:
+ gistRedoDeleteRecord(record);
+ break;
case XLOG_GIST_PAGE_SPLIT:
gistRedoPageSplitRecord(record);
break;
@@ -487,3 +698,35 @@ gistXLogUpdate(Buffer buffer,
return recptr;
}
+
+/*
+ * Write XLOG record describing a delete of leaf index tuples marked as DEAD
+ * during new tuple insertion. One may think that this case is already covered
+ * by gistXLogUpdate(). But deletion of index tuples might conflict with
+ * standby queries and needs special handling.
+ */
+XLogRecPtr
+gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
+ RelFileNode hnode)
+{
+ gistxlogDelete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.hnode = hnode;
+ xlrec.ntodelete = ntodelete;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete);
+
+ /*
+ * We need the target-offsets array whether or not we store the whole
+ * buffer, to allow us to find the latestRemovedXid on a standby server.
+ */
+ XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber));
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE);
+
+ return recptr;
+}
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index e5e925e0c5a..b79ed1dfdc8 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -23,6 +23,11 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
{
}
+static void
+out_gistxlogDelete(StringInfo buf, gistxlogPageUpdate *xlrec)
+{
+}
+
static void
out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
{
@@ -41,6 +46,9 @@ gist_desc(StringInfo buf, XLogReaderState *record)
case XLOG_GIST_PAGE_UPDATE:
out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec);
break;
+ case XLOG_GIST_DELETE:
+ out_gistxlogDelete(buf, (gistxlogPageUpdate *) rec);
+ break;
case XLOG_GIST_PAGE_SPLIT:
out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec);
break;
@@ -59,6 +67,9 @@ gist_identify(uint8 info)
case XLOG_GIST_PAGE_UPDATE:
id = "PAGE_UPDATE";
break;
+ case XLOG_GIST_DELETE:
+ id = "DELETE";
+ break;
case XLOG_GIST_PAGE_SPLIT:
id = "PAGE_SPLIT";
break;
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba0..a73716d6eaa 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -240,6 +240,7 @@ typedef struct GistSplitVector
typedef struct
{
Relation r;
+ Relation heapRel;
Size freespace; /* free space to be left */
GISTInsertStack *stack;
@@ -389,7 +390,8 @@ extern void freeGISTstate(GISTSTATE *giststate);
extern void gistdoinsert(Relation r,
IndexTuple itup,
Size freespace,
- GISTSTATE *GISTstate);
+ GISTSTATE *GISTstate,
+ Relation heapRel);
/* A List of these is returned from gistplacetopage() in *splitinfo */
typedef struct
@@ -404,7 +406,8 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
OffsetNumber oldoffnum, BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markleftchild);
+ bool markleftchild,
+ Relation heapRel);
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
@@ -414,6 +417,9 @@ extern XLogRecPtr gistXLogUpdate(Buffer buffer,
IndexTuple *itup, int ntup,
Buffer leftchild);
+XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+ int ntodelete, RelFileNode hnode);
+
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN oldnsn,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 1a2b9496d0d..b67c7100500 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -18,6 +18,7 @@
#include "lib/stringinfo.h"
#define XLOG_GIST_PAGE_UPDATE 0x00
+#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a page */
/* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */
#define XLOG_GIST_PAGE_SPLIT 0x30
/* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
@@ -40,6 +41,22 @@ typedef struct gistxlogPageUpdate
*/
} gistxlogPageUpdate;
+/*
+ * Backup Blk 0: Leaf page, whose index tuples are deleted.
+ */
+typedef struct gistxlogDelete
+{
+ RelFileNode hnode; /* RelFileNode of the heap the index currently
+ * points at */
+ uint16 ntodelete; /* number of deleted offsets */
+
+ /*
+ * In payload of blk 0 : todelete OffsetNumbers
+ */
+} gistxlogDelete;
+
+#define SizeOfGistxlogDelete (offsetof(gistxlogDelete, ntodelete) + sizeof(uint16))
+
/*
* Backup Blk 0: If this operation completes a page split, by inserting a
* downlink for the split page, the left half of the split
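To put the pieces together: on the primary, gistvacuumpage() (changed in an
earlier part of this patch) now emits the new XLOG_GIST_DELETE record instead
of a plain PAGE_UPDATE. A minimal sketch of that call site, assuming the
usual critical-section handling around the page modification (an
illustration, not a verbatim hunk):

/*
 * Sketch: primary-side microvacuum.  "rel" is the GiST index, "heapRel"
 * the table it indexes; deletable[] holds the offsets of LP_DEAD index
 * tuples collected from the leaf page in "buffer"/"page".
 */
if (ndeletable > 0)
{
	START_CRIT_SECTION();

	PageIndexMultiDelete(page, deletable, ndeletable);

	/* F_HAS_GARBAGE is only a hint, so it is safe to clear it here. */
	GistClearPageHasGarbage(page);

	MarkBufferDirty(buffer);

	if (RelationNeedsWAL(rel))
	{
		/*
		 * The heap node tells the standby which relation to consult when
		 * deriving latestRemovedXid during redo.
		 */
		XLogRecPtr	recptr = gistXLogDelete(buffer, deletable, ndeletable,
											heapRel->rd_node);

		PageSetLSN(page, recptr);
	}
	else
		PageSetLSN(page, gistGetFakeLSN(rel));

	END_CRIT_SECTION();
}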
gist-microvacuum-conflict-handling-2-backpatch.patch (application/octet-stream)
commit f6f21679c3f64991a78a42b0a311ae3f448dd134
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date: Thu Dec 20 01:29:25 2018 +0300
Check for conflicting queries during replay of gistvacuumpage()
013ebc0a7b implements so-called GiST microvacuum. That is, gistgettuple() marks
index tuples as dead when kill_prior_tuple is set. Later, when a new tuple
insertion claims page space, those dead index tuples are physically deleted
from the page. When this deletion is replayed on a standby, it might conflict
with read-only queries. But 013ebc0a7b doesn't handle this, which may lead to
the disappearance of some tuples from read-only snapshots on the standby.
This commit implements resolution of conflicts between replay of GiST
microvacuum and standby queries. On master we introduce a new WAL record type,
XLOG_GIST_DELETE, which comprises the necessary information. On stable
releases we have to be tricky to keep WAL compatibility: the information
required for conflict processing is simply appended to the data of the
XLOG_GIST_PAGE_UPDATE record, so a PostgreSQL version that doesn't know about
conflict processing will just ignore it.
Reported-by: Andres Freund
Diagnosed-by: Andres Freund
Discussion: https://postgr.es/m/20181212224524.scafnlyjindmrbe6%40alap3.anarazel.de
Author: Alexander Korotkov
Backpatch-through: 9.6
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7a..4b49d1222c7 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -38,7 +38,8 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
bool unlockbuf, bool unlockleftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
+static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+ Relation heapRel);
#define ROTATEDIST(d) do { \
@@ -172,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull,
values, isnull, true /* size is currently bogus */ );
itup->t_tid = *ht_ctid;
- gistdoinsert(r, itup, 0, giststate);
+ gistdoinsert(r, itup, 0, giststate, heapRel);
/* cleanup */
MemoryContextSwitchTo(oldCxt);
@@ -218,7 +219,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markfollowright)
+ bool markfollowright,
+ Relation heapRel)
{
BlockNumber blkno = BufferGetBlockNumber(buffer);
Page page = BufferGetPage(buffer);
@@ -259,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
*/
if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
{
- gistvacuumpage(rel, page, buffer);
+ gistvacuumpage(rel, page, buffer, heapRel);
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
}
@@ -556,7 +558,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
recptr = gistXLogUpdate(buffer,
deloffs, ndeloffs, itup, ntup,
- leftchildbuf);
+ leftchildbuf, NULL);
PageSetLSN(page, recptr);
}
@@ -604,7 +606,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
* so it does not bother releasing palloc'd allocations.
*/
void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+ GISTSTATE *giststate, Relation heapRel)
{
ItemId iid;
IndexTuple idxtuple;
@@ -616,6 +619,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
memset(&state, 0, sizeof(GISTInsertState));
state.freespace = freespace;
state.r = r;
+ state.heapRel = heapRel;
/* Start from the root */
firststack.blkno = GIST_ROOT_BLKNO;
@@ -1232,7 +1236,8 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
oldoffnum, NULL,
leftchild,
&splitinfo,
- true);
+ true,
+ state->heapRel);
/*
* Before recursing up in case the page was split, release locks on the
@@ -1543,7 +1548,7 @@ freeGISTstate(GISTSTATE *giststate)
* Function assumes that buffer is exclusively locked.
*/
static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer)
+gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
@@ -1591,7 +1596,8 @@ gistvacuumpage(Relation rel, Page page, Buffer buffer)
recptr = gistXLogUpdate(buffer,
deletable, ndeletable,
- NULL, 0, InvalidBuffer);
+ NULL, 0, InvalidBuffer,
+ &heapRel->rd_node);
PageSetLSN(page, recptr);
}
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 434f15f0148..b9c4e27e1a5 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -56,6 +56,7 @@ typedef enum
typedef struct
{
Relation indexrel;
+ Relation heaprel;
GISTSTATE *giststate;
int64 indtuples; /* number of tuples indexed */
@@ -122,6 +123,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
int fillfactor;
buildstate.indexrel = index;
+ buildstate.heaprel = heap;
if (index->rd_options)
{
/* Get buffering mode from the options string */
@@ -484,7 +486,7 @@ gistBuildCallback(Relation index,
* locked, we call gistdoinsert directly.
*/
gistdoinsert(index, itup, buildstate->freespace,
- buildstate->giststate);
+ buildstate->giststate, buildstate->heaprel);
}
/* Update tuple count and total size. */
@@ -690,7 +692,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
itup, ntup, oldoffnum, &placed_to_blk,
InvalidBuffer,
&splitinfo,
- false);
+ false,
+ buildstate->heaprel);
/*
* If this is a root split, update the root path item kept in memory. This
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 5948218c779..a396179df95 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -224,7 +224,8 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
recptr = gistXLogUpdate(buffer,
todelete, ntodelete,
- NULL, 0, InvalidBuffer);
+ NULL, 0, InvalidBuffer,
+ NULL);
PageSetLSN(page, recptr);
}
else
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e091269785..17e213967b9 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -16,8 +16,12 @@
#include "access/bufmask.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
#include "utils/memutils.h"
static MemoryContext opCtx; /* working memory for operations */
@@ -60,6 +64,155 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
UnlockReleaseBuffer(buffer);
}
+/*
+ * Get the latestRemovedXid from the heap pages pointed at by the index
+ * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
+ * on which this function is based.
+ */
+static TransactionId
+gistRedoPageUpdateRecordGetLatestRemovedXid(XLogReaderState *record)
+{
+ gistxlogPageUpdate *xlrec = (gistxlogPageUpdate *) XLogRecGetData(record);
+ OffsetNumber *todelete;
+ Buffer ibuffer,
+ hbuffer;
+ Page ipage,
+ hpage;
+ RelFileNode rnode,
+ *hnode;
+ BlockNumber blkno;
+ ItemId iitemid,
+ hitemid;
+ IndexTuple itup;
+ HeapTupleHeader htuphdr;
+ BlockNumber hblkno;
+ OffsetNumber hoffnum;
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ int i;
+
+ /*
+ * If there's nothing running on the standby we don't need to derive a
+ * full latestRemovedXid value, so use a fast path out of here. This
+ * returns InvalidTransactionId, and so will conflict with all HS
+ * transactions; but since we just worked out that that's zero people,
+ * it's OK.
+ *
+ * XXX There is a race condition here, which is that a new backend might
+ * start just after we look. If so, it cannot need to conflict, but this
+ * coding will result in throwing a conflict anyway.
+ */
+ if (CountDBBackends(InvalidOid) == 0)
+ return latestRemovedXid;
+
+ /*
+ * In what follows, we have to examine the previous state of the index
+ * page, as well as the heap page(s) it points to. This is only valid if
+ * WAL replay has reached a consistent database state; which means that
+ * the preceding check is not just an optimization, but is *necessary*. We
+ * won't have let in any user sessions before we reach consistency.
+ */
+ if (!reachedConsistency)
+ elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data");
+
+ /*
+ * Get index page. If the DB is consistent, this should not fail, nor
+ * should any of the heap page fetches below. If one does, we return
+ * InvalidTransactionId to cancel all HS transactions. That's probably
+ * overkill, but it's safe, and certainly better than panicking here.
+ */
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
+ ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
+ if (!BufferIsValid(ibuffer))
+ return InvalidTransactionId;
+ LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
+ ipage = (Page) BufferGetPage(ibuffer);
+
+ /*
+ * Loop through the deleted index items to obtain the TransactionId from
+ * the heap items they point to.
+ */
+ hnode = (RelFileNode *) ((char *) xlrec + sizeof(gistxlogPageUpdate));
+ todelete = (OffsetNumber *) ((char *) hnode + sizeof(RelFileNode));
+
+ for (i = 0; i < xlrec->ntodelete; i++)
+ {
+ /*
+ * Identify the index tuple about to be deleted
+ */
+ iitemid = PageGetItemId(ipage, todelete[i]);
+ itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+ /*
+ * Locate the heap page that the index tuple points at
+ */
+ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ hbuffer = XLogReadBufferExtended(*hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
+ if (!BufferIsValid(hbuffer))
+ {
+ UnlockReleaseBuffer(ibuffer);
+ return InvalidTransactionId;
+ }
+ LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
+ hpage = (Page) BufferGetPage(hbuffer);
+
+ /*
+ * Look up the heap tuple header that the index tuple points at by
+ * using the heap node supplied with the xlrec. We can't use
+ * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+ * Note that we are not looking at tuple data here, just headers.
+ */
+ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ hitemid = PageGetItemId(hpage, hoffnum);
+
+ /*
+ * Follow any redirections until we find something useful.
+ */
+ while (ItemIdIsRedirected(hitemid))
+ {
+ hoffnum = ItemIdGetRedirect(hitemid);
+ hitemid = PageGetItemId(hpage, hoffnum);
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If the heap item has storage, then read the header and use that to
+ * set latestRemovedXid.
+ *
+ * Some LP_DEAD items may not be accessible, so we ignore them.
+ */
+ if (ItemIdHasStorage(hitemid))
+ {
+ htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+ HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+ }
+ else if (ItemIdIsDead(hitemid))
+ {
+ /*
+ * Conjecture: if hitemid is dead then it had xids before the xids
+ * marked on LP_NORMAL items. So we just ignore this item and move
+ * onto the next, for the purposes of calculating
+ * latestRemovedxids.
+ */
+ }
+ else
+ Assert(!ItemIdIsUsed(hitemid));
+
+ UnlockReleaseBuffer(hbuffer);
+ }
+
+ UnlockReleaseBuffer(ibuffer);
+
+ /*
+ * If all heap tuples were LP_DEAD then we will be returning
+ * InvalidTransactionId here, which avoids conflicts. This matches
+ * existing logic which assumes that LP_DEAD tuples must already be older
+ * than the latestRemovedXid on the cleanup record that set them as
+ * LP_DEAD, hence must already have generated a conflict.
+ */
+ return latestRemovedXid;
+}
+
/*
* redo any page update (except page split)
*/
@@ -71,6 +224,34 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
Buffer buffer;
Page page;
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page.
+ *
+ * Support for conflict processing in GiST has been backpatched, which is
+ * why we have to use a tricky way of preserving WAL compatibility between
+ * minor versions: the information required for conflict processing is
+ * simply appended to the data of the XLOG_GIST_PAGE_UPDATE record, so a
+ * PostgreSQL version that doesn't know about conflict processing will
+ * just ignore it.
+ *
+ * GiST delete records can conflict with standby queries. You might think
+ * that vacuum records would conflict as well, but we've handled that
+ * already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+ * cleaned by the vacuum of the heap and so we can resolve any conflicts
+ * just once when that arrives. After that we know that no conflicts
+ * exist from individual gist vacuum records on that index.
+ */
+ if (InHotStandby && XLogRecGetDataLen(record) > sizeof(gistxlogPageUpdate))
+ {
+ TransactionId latestRemovedXid = gistRedoPageUpdateRecordGetLatestRemovedXid(record);
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+ }
+
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
{
char *begin;
@@ -457,7 +638,7 @@ XLogRecPtr
gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
IndexTuple *itup, int ituplen,
- Buffer leftchildbuf)
+ Buffer leftchildbuf, RelFileNode *hnode)
{
gistxlogPageUpdate xlrec;
int i;
@@ -469,6 +650,16 @@ gistXLogUpdate(Buffer buffer,
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate));
+ /*
+ * Append the information required for standby conflict processing if it
+ * is provided by caller.
+ */
+ if (hnode)
+ {
+ XLogRegisterData((char *) hnode, sizeof(RelFileNode));
+ XLogRegisterData((char *) todelete, sizeof(OffsetNumber) * ntodelete);
+ }
+
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete);
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba0..ed0fb634a52 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -240,6 +240,7 @@ typedef struct GistSplitVector
typedef struct
{
Relation r;
+ Relation heapRel;
Size freespace; /* free space to be left */
GISTInsertStack *stack;
@@ -389,7 +390,8 @@ extern void freeGISTstate(GISTSTATE *giststate);
extern void gistdoinsert(Relation r,
IndexTuple itup,
Size freespace,
- GISTSTATE *GISTstate);
+ GISTSTATE *GISTstate,
+ Relation heapRel);
/* A List of these is returned from gistplacetopage() in *splitinfo */
typedef struct
@@ -404,7 +406,8 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
OffsetNumber oldoffnum, BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markleftchild);
+ bool markleftchild,
+ Relation heapRel);
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
@@ -412,7 +415,7 @@ extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
IndexTuple *itup, int ntup,
- Buffer leftchild);
+ Buffer leftchild, RelFileNode *hnode);
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
SplitedPageLayout *dist,
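Condensing the compatibility trick from the hunks above into one view (a
sketch with names as in the patch, not a verbatim hunk):

/*
 * Primary, in gistXLogUpdate(): conflict information is appended after
 * the fixed-size main data only when the caller passes a heap node, so
 * the record layout that old binaries expect is untouched.
 */
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate));
if (hnode)
{
	/* Extra payload: the heap RelFileNode, then the deleted offsets. */
	XLogRegisterData((char *) hnode, sizeof(RelFileNode));
	XLogRegisterData((char *) todelete, sizeof(OffsetNumber) * ntodelete);
}

/*
 * Standby, in gistRedoPageUpdateRecord(): the payload is detected purely
 * by main-data length.  A binary that predates the fix never looks past
 * sizeof(gistxlogPageUpdate) and replays the record exactly as before.
 */
if (InHotStandby && XLogRecGetDataLen(record) > sizeof(gistxlogPageUpdate))
{
	TransactionId latestRemovedXid =
		gistRedoPageUpdateRecordGetLatestRemovedXid(record);
	RelFileNode rnode;

	XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
	ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
}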
On Thu, Dec 20, 2018 at 1:41 AM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:
Please find attached two patches I'm going to commit: for master and
for backpatching.
So, pushed.
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
Hi,
On 2018-12-21 02:40:18 +0300, Alexander Korotkov wrote:
On Thu, Dec 20, 2018 at 1:41 AM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:Please, find attached two patches I'm going to commit: for master and
for backpatching.So, pushed.
I noticed that I didn't adapt this in
commit 558a9165e081d1936573e5a7d576f5febd7fb55a
Author: Andres Freund <andres@anarazel.de>
Date: 2019-03-26 14:41:46 -0700
Compute XID horizon for page level index vacuum on primary.
Attached you can thus find the conversion of gist to the new logic for
computing the horizon. Any comments?
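In short (a condensed sketch from the attached patch, not the patch itself):
gistprunepage() now computes the horizon on the primary, before deleting the
items, and stores it in the WAL record, so redo no longer has to chase heap
pages:

TransactionId latestRemovedXid = InvalidTransactionId;

/* Only pay for the horizon when standbys might care. */
if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
	latestRemovedXid =
		index_compute_xid_horizon_for_tuples(rel, heapRel, buffer,
											 deletable, ndeletable);

/* ... PageIndexMultiDelete() etc. as before ... */

recptr = gistXLogDelete(buffer, deletable, ndeletable,
						latestRemovedXid);

On the standby, gistRedoDeleteRecord() then uses xldata->latestRemovedXid
directly in ResolveRecoveryConflictWithSnapshot() instead of calling the
now-removed gistRedoDeleteRecordGetLatestRemovedXid().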
Greetings,
Andres Freund
Attachments:
0001-Convert-gist-to-compute-page-level-xid-horizon-on-pr.patch (text/x-diff; charset=us-ascii)
From f12b1105e4a0c356083a8dbae81e6a2c11da7bbe Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 5 Apr 2019 22:01:51 -0700
Subject: [PATCH] Convert gist to compute page level xid horizon on primary.
Author:
Reviewed-By:
Discussion: https://postgr.es/m/
Backpatch:
---
src/backend/access/gist/gist.c | 8 +-
src/backend/access/gist/gistxlog.c | 153 +------------------------
src/backend/access/rmgrdesc/gistdesc.c | 7 +-
src/include/access/gist_private.h | 2 +-
src/include/access/gistxlog.h | 3 +-
src/include/access/xlog_internal.h | 2 +-
6 files changed, 18 insertions(+), 157 deletions(-)
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 2db790c840c..769aca42e3b 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -1616,6 +1616,7 @@ gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel)
int ndeletable = 0;
OffsetNumber offnum,
maxoff;
+ TransactionId latestRemovedXid = InvalidTransactionId;
Assert(GistPageIsLeaf(page));
@@ -1634,6 +1635,11 @@ gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel)
deletable[ndeletable++] = offnum;
}
+ if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+ latestRemovedXid =
+ index_compute_xid_horizon_for_tuples(rel, heapRel, buffer,
+ deletable, ndeletable);
+
if (ndeletable > 0)
{
START_CRIT_SECTION();
@@ -1658,7 +1664,7 @@ gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel)
recptr = gistXLogDelete(buffer,
deletable, ndeletable,
- heapRel->rd_node);
+ latestRemovedXid);
PageSetLSN(page, recptr);
}
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 4fb1855e890..503db34d863 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -165,152 +165,6 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
UnlockReleaseBuffer(buffer);
}
-/*
- * Get the latestRemovedXid from the heap pages pointed at by the index
- * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
- * on which this function is based.
- */
-static TransactionId
-gistRedoDeleteRecordGetLatestRemovedXid(XLogReaderState *record)
-{
- gistxlogDelete *xlrec = (gistxlogDelete *) XLogRecGetData(record);
- OffsetNumber *todelete;
- Buffer ibuffer,
- hbuffer;
- Page ipage,
- hpage;
- RelFileNode rnode;
- BlockNumber blkno;
- ItemId iitemid,
- hitemid;
- IndexTuple itup;
- HeapTupleHeader htuphdr;
- BlockNumber hblkno;
- OffsetNumber hoffnum;
- TransactionId latestRemovedXid = InvalidTransactionId;
- int i;
-
- /*
- * If there's nothing running on the standby we don't need to derive a
- * full latestRemovedXid value, so use a fast path out of here. This
- * returns InvalidTransactionId, and so will conflict with all HS
- * transactions; but since we just worked out that that's zero people,
- * it's OK.
- *
- * XXX There is a race condition here, which is that a new backend might
- * start just after we look. If so, it cannot need to conflict, but this
- * coding will result in throwing a conflict anyway.
- */
- if (CountDBBackends(InvalidOid) == 0)
- return latestRemovedXid;
-
- /*
- * In what follows, we have to examine the previous state of the index
- * page, as well as the heap page(s) it points to. This is only valid if
- * WAL replay has reached a consistent database state; which means that
- * the preceding check is not just an optimization, but is *necessary*. We
- * won't have let in any user sessions before we reach consistency.
- */
- if (!reachedConsistency)
- elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data");
-
- /*
- * Get index page. If the DB is consistent, this should not fail, nor
- * should any of the heap page fetches below. If one does, we return
- * InvalidTransactionId to cancel all HS transactions. That's probably
- * overkill, but it's safe, and certainly better than panicking here.
- */
- XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
- ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
- if (!BufferIsValid(ibuffer))
- return InvalidTransactionId;
- LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
- ipage = (Page) BufferGetPage(ibuffer);
-
- /*
- * Loop through the deleted index items to obtain the TransactionId from
- * the heap items they point to.
- */
- todelete = (OffsetNumber *) ((char *) xlrec + SizeOfGistxlogDelete);
-
- for (i = 0; i < xlrec->ntodelete; i++)
- {
- /*
- * Identify the index tuple about to be deleted
- */
- iitemid = PageGetItemId(ipage, todelete[i]);
- itup = (IndexTuple) PageGetItem(ipage, iitemid);
-
- /*
- * Locate the heap page that the index tuple points at
- */
- hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
- hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
- if (!BufferIsValid(hbuffer))
- {
- UnlockReleaseBuffer(ibuffer);
- return InvalidTransactionId;
- }
- LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
- hpage = (Page) BufferGetPage(hbuffer);
-
- /*
- * Look up the heap tuple header that the index tuple points at by
- * using the heap node supplied with the xlrec. We can't use
- * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
- * Note that we are not looking at tuple data here, just headers.
- */
- hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
- hitemid = PageGetItemId(hpage, hoffnum);
-
- /*
- * Follow any redirections until we find something useful.
- */
- while (ItemIdIsRedirected(hitemid))
- {
- hoffnum = ItemIdGetRedirect(hitemid);
- hitemid = PageGetItemId(hpage, hoffnum);
- CHECK_FOR_INTERRUPTS();
- }
-
- /*
- * If the heap item has storage, then read the header and use that to
- * set latestRemovedXid.
- *
- * Some LP_DEAD items may not be accessible, so we ignore them.
- */
- if (ItemIdHasStorage(hitemid))
- {
- htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
-
- HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
- }
- else if (ItemIdIsDead(hitemid))
- {
- /*
- * Conjecture: if hitemid is dead then it had xids before the xids
- * marked on LP_NORMAL items. So we just ignore this item and move
- * onto the next, for the purposes of calculating
- * latestRemovedxids.
- */
- }
- else
- Assert(!ItemIdIsUsed(hitemid));
-
- UnlockReleaseBuffer(hbuffer);
- }
-
- UnlockReleaseBuffer(ibuffer);
-
- /*
- * If all heap tuples were LP_DEAD then we will be returning
- * InvalidTransactionId here, which avoids conflicts. This matches
- * existing logic which assumes that LP_DEAD tuples must already be older
- * than the latestRemovedXid on the cleanup record that set them as
- * LP_DEAD, hence must already have generated a conflict.
- */
- return latestRemovedXid;
-}
/*
* redo delete on gist index page to remove tuples marked as DEAD during index
@@ -337,12 +191,11 @@ gistRedoDeleteRecord(XLogReaderState *record)
*/
if (InHotStandby)
{
- TransactionId latestRemovedXid = gistRedoDeleteRecordGetLatestRemovedXid(record);
RelFileNode rnode;
XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
- ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+ ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode);
}
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
@@ -800,12 +653,12 @@ gistXLogUpdate(Buffer buffer,
*/
XLogRecPtr
gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
- RelFileNode hnode)
+ TransactionId latestRemovedXid)
{
gistxlogDelete xlrec;
XLogRecPtr recptr;
- xlrec.hnode = hnode;
+ xlrec.latestRemovedXid = latestRemovedXid;
xlrec.ntodelete = ntodelete;
XLogBeginInsert();
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index eb308c72d6b..767864b58e6 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -33,8 +33,11 @@ out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec)
}
static void
-out_gistxlogDelete(StringInfo buf, gistxlogPageUpdate *xlrec)
+out_gistxlogDelete(StringInfo buf, gistxlogDelete *xlrec)
{
+ appendStringInfo(buf, "delete: latestRemovedXid %u, nitems: %u",
+ xlrec->latestRemovedXid, xlrec->ntodelete);
+
}
static void
@@ -66,7 +69,7 @@ gist_desc(StringInfo buf, XLogReaderState *record)
out_gistxlogPageReuse(buf, (gistxlogPageReuse *) rec);
break;
case XLOG_GIST_DELETE:
- out_gistxlogDelete(buf, (gistxlogPageUpdate *) rec);
+ out_gistxlogDelete(buf, (gistxlogDelete *) rec);
break;
case XLOG_GIST_PAGE_SPLIT:
out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec);
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 78e2e3fb312..ccf050cd62d 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -431,7 +431,7 @@ extern XLogRecPtr gistXLogUpdate(Buffer buffer,
Buffer leftchild);
extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
- int ntodelete, RelFileNode hnode);
+ int ntodelete, TransactionId latestRemovedXid);
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
SplitedPageLayout *dist,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 9990d97cbd3..e66b034d7b8 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -47,8 +47,7 @@ typedef struct gistxlogPageUpdate
*/
typedef struct gistxlogDelete
{
- RelFileNode hnode; /* RelFileNode of the heap the index currently
- * points at */
+ TransactionId latestRemovedXid;
uint16 ntodelete; /* number of deleted offsets */
/*
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 8b1348c36db..39a474c4996 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD100 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD101 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
--
2.21.0.dirty