brininsert optimization opportunity

Started by Soumyadeep Chakraborty over 2 years ago · 38 messages
#1Soumyadeep Chakraborty
soumyadeep2007@gmail.com
2 attachment(s)

Hello hackers,

My colleague, Ashwin, pointed out to me that brininsert's per-tuple init
of the revmap access struct can have non-trivial overhead.

Turns out he is right. We are saving 24 bytes of memory per-call for
the access struct, and a bit on buffer/locking overhead, with the
attached patch.

The implementation ties the revmap cleanup as a MemoryContext callback
to the IndexInfo struct's MemoryContext, as there is no teardown
function provided by the index AM for end-of-insert-command.

Test setup (local Ubuntu workstation):

# Drop caches and restart between each run:
sudo sh -c "sync; echo 3 > /proc/sys/vm/drop_caches;"
pg_ctl -D /usr/local/pgsql/data/ -l /tmp/logfile restart

\timing
DROP TABLE heap;
CREATE TABLE heap(i int);
CREATE INDEX ON heap USING brin(i) WITH (pages_per_range=1);
INSERT INTO heap SELECT 1 FROM generate_series(1, 200000000);

Results:
We see an improvement for 100M tuples and an even bigger improvement for
200M tuples.

Master (29cf61ade3f245aa40f427a1d6345287ef77e622):

test=# INSERT INTO heap SELECT 1 FROM generate_series(1, 100000000);
INSERT 0 100000000
Time: 222762.159 ms (03:42.762)

-- 3 runs
test=# INSERT INTO heap SELECT 1 FROM generate_series(1, 200000000);
INSERT 0 200000000
Time: 471168.181 ms (07:51.168)
Time: 457071.883 ms (07:37.072)
Time: 486969.205 ms (08:06.969)

Branch:

test2=# INSERT INTO heap SELECT 1 FROM generate_series(1, 100000000);
INSERT 0 100000000
Time: 200046.519 ms (03:20.047)

-- 3 runs
test2=# INSERT INTO heap SELECT 1 FROM generate_series(1, 200000000);
INSERT 0 200000000
Time: 369041.832 ms (06:09.042)
Time: 365483.382 ms (06:05.483)
Time: 375506.144 ms (06:15.506)

# Profiled backend running INSERT of 100000000 rows
sudo perf record -p 11951 --call-graph fp sleep 180

Please see attached perf diff between master and branch. We see that we
save on a bit of overhead from brinRevmapInitialize(),
brinRevmapTerminate() and lock routines.

Regards,
Soumyadeep (VMware)

Attachments:

perf_diff.outapplication/octet-stream; name=perf_diff.outDownload
v1-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchtext/x-patch; charset=US-ASCII; name=v1-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchDownload
From 5d11219e413fe6806e00dd738b051c3948dffcab Mon Sep 17 00:00:00 2001
From: Soumyadeep Chakraborty <soumyadeep2007@gmail.com>
Date: Mon, 3 Jul 2023 10:47:48 -0700
Subject: [PATCH v1 1/1] Reuse revmap and brin desc in brininsert

brininsert() used to have code that performed per-tuple initialization
of the revmap. That had some overhead.
---
 src/backend/access/brin/brin.c | 70 ++++++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 11 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 3c6a956eaa3..e27bee980f1 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -58,6 +58,17 @@ typedef struct BrinBuildState
 	BrinMemTuple *bs_dtuple;
 } BrinBuildState;
 
+/*
+ * We use a BrinInsertState to capture running state spanning multiple
+ * brininsert invocations, within the same command.
+ */
+typedef struct BrinInsertState
+{
+	BrinRevmap 	*bs_rmAccess;
+	BrinDesc   	*bs_desc;
+	BlockNumber bs_pages_per_range;
+} BrinInsertState;
+
 /*
  * Struct used as "opaque" during index scans
  */
@@ -72,6 +83,7 @@ typedef struct BrinOpaque
 
 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
 												  BrinRevmap *revmap, BlockNumber pagesPerRange);
+static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
 static void terminate_brin_buildstate(BrinBuildState *state);
 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
 						  bool include_partial, double *numSummarized, double *numExisting);
@@ -140,6 +152,42 @@ brinhandler(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(amroutine);
 }
 
+static void
+brininsertCleanupCallback(void *arg)
+{
+	BrinInsertState *bistate = (BrinInsertState *) arg;
+	/*
+	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
+	 * as part of its own memory context.
+	 */
+	brinRevmapTerminate(bistate->bs_rmAccess);
+}
+
+static BrinInsertState *
+initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
+{
+	BrinInsertState 		*bistate;
+	MemoryContextCallback 	*cb;
+	MemoryContext 			oldcxt;
+
+	oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
+	bistate = MemoryContextAllocZero(indexInfo->ii_Context,
+									 sizeof(BrinInsertState));
+
+	bistate->bs_desc = brin_build_desc(idxRel);
+	cb = palloc(sizeof(MemoryContextCallback));
+	cb->arg = bistate;
+	cb->func = brininsertCleanupCallback;
+	MemoryContextRegisterResetCallback(indexInfo->ii_Context, cb);
+	MemoryContextSwitchTo(oldcxt);
+
+	bistate->bs_rmAccess = brinRevmapInitialize(idxRel,
+												&bistate->bs_pages_per_range,
+												NULL);
+
+	return bistate;
+}
+
 /*
  * A tuple in the heap is being inserted.  To keep a brin index up to date,
  * we need to obtain the relevant index tuple and compare its stored values
@@ -162,14 +210,23 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	BlockNumber pagesPerRange;
 	BlockNumber origHeapBlk;
 	BlockNumber heapBlk;
-	BrinDesc   *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
 	BrinRevmap *revmap;
+	BrinDesc *bdesc;
 	Buffer		buf = InvalidBuffer;
 	MemoryContext tupcxt = NULL;
 	MemoryContext oldcxt = CurrentMemoryContext;
 	bool		autosummarize = BrinGetAutoSummarize(idxRel);
 
-	revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
+	if (!bistate)
+	{
+		/* First time through in this statement? */
+		bistate = initialize_brin_insertstate(idxRel, indexInfo);
+		indexInfo->ii_AmCache = (void *) bistate;
+	}
+	revmap = bistate->bs_rmAccess;
+	bdesc = bistate->bs_desc;
+	pagesPerRange = bistate->bs_pages_per_range;
 
 	/*
 	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
@@ -228,14 +285,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		if (!brtup)
 			break;
 
-		/* First time through in this statement? */
-		if (bdesc == NULL)
-		{
-			MemoryContextSwitchTo(indexInfo->ii_Context);
-			bdesc = brin_build_desc(idxRel);
-			indexInfo->ii_AmCache = (void *) bdesc;
-			MemoryContextSwitchTo(oldcxt);
-		}
 		/* First time through in this brininsert call? */
 		if (tupcxt == NULL)
 		{
@@ -306,7 +355,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		break;
 	}
 
-	brinRevmapTerminate(revmap);
 	if (BufferIsValid(buf))
 		ReleaseBuffer(buf);
 	MemoryContextSwitchTo(oldcxt);
-- 
2.34.1

#2Alvaro Herrera
alvherre@alvh.no-ip.org
In reply to: Soumyadeep Chakraborty (#1)
Re: brininsert optimization opportunity

On 2023-Jul-03, Soumyadeep Chakraborty wrote:

My colleague, Ashwin, pointed out to me that brininsert's per-tuple init
of the revmap access struct can have non-trivial overhead.

Turns out he is right. We are saving 24 bytes of memory per-call for
the access struct, and a bit on buffer/locking overhead, with the
attached patch.

Hmm, yeah, I remember being a bit bothered by this repeated
initialization. Your patch looks reasonable to me. I would set
bistate->bs_rmAccess to NULL in the cleanup callback, just to be sure.
Also, please add comments atop these two new functions, to explain what
they are.

Nice results.

--
Álvaro Herrera Breisgau, Deutschland — https://www.EnterpriseDB.com/

#3Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Alvaro Herrera (#2)
Re: brininsert optimization opportunity

On 7/4/23 13:23, Alvaro Herrera wrote:

On 2023-Jul-03, Soumyadeep Chakraborty wrote:

My colleague, Ashwin, pointed out to me that brininsert's per-tuple init
of the revmap access struct can have non-trivial overhead.

Turns out he is right. We are saving 24 bytes of memory per-call for
the access struct, and a bit on buffer/locking overhead, with the
attached patch.

Hmm, yeah, I remember being bit bothered by this repeated
initialization. Your patch looks reasonable to me. I would set
bistate->bs_rmAccess to NULL in the cleanup callback, just to be sure.
Also, please add comments atop these two new functions, to explain what
they are.

Nice results.

Yeah. I wonder how much of that runtime is the generate_series(),
though. What's the speedup if that part is subtracted. It's guaranteed
to be even more significant, but by how much?

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#4Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Tomas Vondra (#3)
2 attachment(s)
Re: brininsert optimization opportunity

Thank you both for reviewing!

On Tue, Jul 4, 2023 at 4:24AM Alvaro Herrera <alvherre@alvh.no-ip.org> wrote:

Hmm, yeah, I remember being bit bothered by this repeated
initialization. Your patch looks reasonable to me. I would set
bistate->bs_rmAccess to NULL in the cleanup callback, just to be sure.
Also, please add comments atop these two new functions, to explain what
they are.

Done. Set bistate->bs_desc = NULL; as well. Added comments.

On Tue, Jul 4, 2023 at 4:59AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:

Yeah. I wonder how much of that runtime is the generate_series(),
though. What's the speedup if that part is subtracted. It's guaranteed
to be even more significant, but by how much?

When trying COPY, I got tripped by the following:

We get a buffer leak WARNING for the meta page and a revmap page.

WARNING: buffer refcount leak: [094] (rel=base/156912/206068,
blockNum=1, flags=0x83000000, refcount=1 1)
WARNING: buffer refcount leak: [093] (rel=base/156912/206068,
blockNum=0, flags=0x83000000, refcount=1 1)

PrintBufferLeakWarning bufmgr.c:3240
ResourceOwnerReleaseInternal resowner.c:554
ResourceOwnerRelease resowner.c:494
PortalDrop portalmem.c:563
exec_simple_query postgres.c:1284

We release the buffer during this resowner release and then we crash
with:

TRAP: failed Assert("bufnum <= NBuffers"), File:
"../../../../src/include/storage/bufmgr.h", Line: 305, PID: 86833
postgres: pivotal test4 [local] COPY(ExceptionalCondition+0xbb)[0x5572b55bcc79]
postgres: pivotal test4 [local] COPY(+0x61ccfc)[0x5572b537dcfc]
postgres: pivotal test4 [local] COPY(ReleaseBuffer+0x19)[0x5572b5384db2]
postgres: pivotal test4 [local] COPY(brinRevmapTerminate+0x1e)[0x5572b4e3fd39]
postgres: pivotal test4 [local] COPY(+0xcfc44)[0x5572b4e30c44]
postgres: pivotal test4 [local] COPY(+0x89e7f2)[0x5572b55ff7f2]
postgres: pivotal test4 [local] COPY(MemoryContextDelete+0xd7)[0x5572b55ff683]
postgres: pivotal test4 [local] COPY(PortalDrop+0x374)[0x5572b5602dc7]

Unfortunately, when we do COPY, the MemoryContext where makeIndexInfo
gets called is PortalContext and that is what is set in ii_Context.
Furthermore, we clean up the resource owner stuff before we can clean
up the MemoryContexts in PortalDrop().

The CurrentMemoryContext when initialize_brin_insertstate() is called
depends. For CopyMultiInsertBufferFlush() -> ExecInsertIndexTuples()
it is PortalContext, and for CopyFrom() -> ExecInsertIndexTuples() it is
ExecutorState/ExprContext. We can't rely on it to register the callback
either.

What we can do is create a new MemoryContext for holding the
BrinInsertState, and we tie the callback to that so that cleanup is not
affected by all of these variables. See v2 patch attached. Passes make
installcheck-world and make installcheck -C src/test/modules/brin.

However, we do still have 1 issue with the v2 patch:
When we try to cancel (Ctrl-c) a running COPY command:
ERROR: buffer 151 is not owned by resource owner TopTransaction

#4 0x0000559cbc54a934 in ResourceOwnerForgetBuffer
(owner=0x559cbd6fcf28, buffer=143) at resowner.c:997
#5 0x0000559cbc2c45e7 in UnpinBuffer (buf=0x7f8d4a8f3f80) at bufmgr.c:2390
#6 0x0000559cbc2c7e49 in ReleaseBuffer (buffer=143) at bufmgr.c:4488
#7 0x0000559cbbd82d53 in brinRevmapTerminate (revmap=0x559cbd7a03b8)
at brin_revmap.c:105
#8 0x0000559cbbd73c44 in brininsertCleanupCallback
(arg=0x559cbd7a5b68) at brin.c:168
#9 0x0000559cbc54280c in MemoryContextCallResetCallbacks
(context=0x559cbd7a5a50) at mcxt.c:506
#10 0x0000559cbc54269d in MemoryContextDelete (context=0x559cbd7a5a50)
at mcxt.c:421
#11 0x0000559cbc54273e in MemoryContextDeleteChildren
(context=0x559cbd69ae90) at mcxt.c:457
#12 0x0000559cbc54625c in AtAbort_Portals () at portalmem.c:850

Haven't found a way to fix this ^ yet.

Maybe there is a better way of doing our cleanup? I'm not sure. Would
love your input!

The other alternative for all this is to introduce new AM callbacks for
insert_begin and insert_end. That might be a tougher sell?

Now, to finally answer your question about the speedup without
generate_series(). We do see an even higher speedup!

seq 1 200000000 > /tmp/data.csv
\timing
DROP TABLE heap;
CREATE TABLE heap(i int);
CREATE INDEX ON heap USING brin(i) WITH (pages_per_range=1);
COPY heap FROM '/tmp/data.csv';

-- 3 runs (master 29cf61ade3f245aa40f427a1d6345287ef77e622)
COPY 200000000
Time: 205072.444 ms (03:25.072)
Time: 215380.369 ms (03:35.380)
Time: 203492.347 ms (03:23.492)

-- 3 runs (branch v2)

COPY 200000000
Time: 135052.752 ms (02:15.053)
Time: 135093.131 ms (02:15.093)
Time: 138737.048 ms (02:18.737)

Regards,
Soumyadeep (VMware)

Attachments:

v2-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchtext/x-patch; charset=US-ASCII; name=v2-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchDownload
From 54ba134f4afe9c4bac19e7d8fde31b9768dc23cd Mon Sep 17 00:00:00 2001
From: Soumyadeep Chakraborty <soumyadeep2007@gmail.com>
Date: Tue, 4 Jul 2023 11:50:35 -0700
Subject: [PATCH v2 1/1] Reuse revmap and brin desc in brininsert

brininsert() used to have code that performed per-tuple initialization
of the revmap. That had some overhead.
---
 src/backend/access/brin/brin.c | 89 +++++++++++++++++++++++++++++-----
 1 file changed, 78 insertions(+), 11 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 3c6a956eaa3..32b588af4da 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -58,6 +58,17 @@ typedef struct BrinBuildState
 	BrinMemTuple *bs_dtuple;
 } BrinBuildState;
 
+/*
+ * We use a BrinInsertState to capture running state spanning multiple
+ * brininsert invocations, within the same command.
+ */
+typedef struct BrinInsertState
+{
+	BrinRevmap *bs_rmAccess;
+	BrinDesc   *bs_desc;
+	BlockNumber bs_pages_per_range;
+}			BrinInsertState;
+
 /*
  * Struct used as "opaque" during index scans
  */
@@ -72,6 +83,8 @@ typedef struct BrinOpaque
 
 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
 												  BrinRevmap *revmap, BlockNumber pagesPerRange);
+static void brininsertCleanupCallback(void *arg);
+static BrinInsertState *initialize_brin_insertstate(Relation idxRel);
 static void terminate_brin_buildstate(BrinBuildState *state);
 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
 						  bool include_partial, double *numSummarized, double *numExisting);
@@ -140,6 +153,60 @@ brinhandler(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(amroutine);
 }
 
+/*
+ * Callback to clean up the BrinInsertState once all tuple inserts are done.
+ */
+static void
+brininsertCleanupCallback(void *arg)
+{
+	BrinInsertState *bistate = (BrinInsertState *) arg;
+
+	/*
+	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
+	 * as part of its own memory context.
+	 */
+	brinRevmapTerminate(bistate->bs_rmAccess);
+	bistate->bs_rmAccess = NULL;
+	bistate->bs_desc = NULL;
+}
+
+/*
+ * Initialize a BrinInsertState to maintain state to be used across multiple
+ * tuple inserts, within the same command.
+ */
+static BrinInsertState *
+initialize_brin_insertstate(Relation idxRel)
+{
+	BrinInsertState *bistate;
+	MemoryContextCallback *cb;
+	MemoryContext cxt;
+	MemoryContext oldcxt;
+
+	/*
+	 * Create private context for holding the BrinInsertState to ensure that we
+	 * clean up the revmap safely in the callback.
+	 */
+	cxt = AllocSetContextCreate(CurrentMemoryContext,
+								"brin insert cxt",
+								ALLOCSET_SMALL_SIZES);
+	oldcxt = MemoryContextSwitchTo(cxt);
+
+	bistate = palloc0(sizeof(BrinInsertState));
+	bistate->bs_desc = brin_build_desc(idxRel);
+	cb = palloc(sizeof(MemoryContextCallback));
+	cb->arg = bistate;
+	cb->func = brininsertCleanupCallback;
+	MemoryContextRegisterResetCallback(CurrentMemoryContext, cb);
+
+	MemoryContextSwitchTo(oldcxt);
+
+	bistate->bs_rmAccess = brinRevmapInitialize(idxRel,
+												&bistate->bs_pages_per_range,
+												NULL);
+
+	return bistate;
+}
+
 /*
  * A tuple in the heap is being inserted.  To keep a brin index up to date,
  * we need to obtain the relevant index tuple and compare its stored values
@@ -162,14 +229,23 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	BlockNumber pagesPerRange;
 	BlockNumber origHeapBlk;
 	BlockNumber heapBlk;
-	BrinDesc   *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
 	BrinRevmap *revmap;
+	BrinDesc   *bdesc;
 	Buffer		buf = InvalidBuffer;
 	MemoryContext tupcxt = NULL;
 	MemoryContext oldcxt = CurrentMemoryContext;
 	bool		autosummarize = BrinGetAutoSummarize(idxRel);
 
-	revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
+	if (!bistate)
+	{
+		/* First time through in this statement? */
+		bistate = initialize_brin_insertstate(idxRel);
+		indexInfo->ii_AmCache = (void *) bistate;
+	}
+	revmap = bistate->bs_rmAccess;
+	bdesc = bistate->bs_desc;
+	pagesPerRange = bistate->bs_pages_per_range;
 
 	/*
 	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
@@ -228,14 +304,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		if (!brtup)
 			break;
 
-		/* First time through in this statement? */
-		if (bdesc == NULL)
-		{
-			MemoryContextSwitchTo(indexInfo->ii_Context);
-			bdesc = brin_build_desc(idxRel);
-			indexInfo->ii_AmCache = (void *) bdesc;
-			MemoryContextSwitchTo(oldcxt);
-		}
 		/* First time through in this brininsert call? */
 		if (tupcxt == NULL)
 		{
@@ -306,7 +374,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		break;
 	}
 
-	brinRevmapTerminate(revmap);
 	if (BufferIsValid(buf))
 		ReleaseBuffer(buf);
 	MemoryContextSwitchTo(oldcxt);
-- 
2.34.1

perf_diff_v2.outapplication/octet-stream; name=perf_diff_v2.outDownload
#5Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Soumyadeep Chakraborty (#4)
Re: brininsert optimization opportunity

On 7/4/23 21:25, Soumyadeep Chakraborty wrote:

Thank you both for reviewing!

On Tue, Jul 4, 2023 at 4:24AM Alvaro Herrera <alvherre@alvh.no-ip.org> wrote:

Hmm, yeah, I remember being bit bothered by this repeated
initialization. Your patch looks reasonable to me. I would set
bistate->bs_rmAccess to NULL in the cleanup callback, just to be sure.
Also, please add comments atop these two new functions, to explain what
they are.

Done. Set bistate->bs_desc = NULL; as well. Added comments.

On Tue, Jul 4, 2023 at 4:59AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:

Yeah. I wonder how much of that runtime is the generate_series(),
though. What's the speedup if that part is subtracted. It's guaranteed
to be even more significant, but by how much?

When trying COPY, I got tripped by the following:

We get a buffer leak WARNING for the meta page and a revmap page.

WARNING: buffer refcount leak: [094] (rel=base/156912/206068,
blockNum=1, flags=0x83000000, refcount=1 1)
WARNING: buffer refcount leak: [093] (rel=base/156912/206068,
blockNum=0, flags=0x83000000, refcount=1 1)

PrintBufferLeakWarning bufmgr.c:3240
ResourceOwnerReleaseInternal resowner.c:554
ResourceOwnerRelease resowner.c:494
PortalDrop portalmem.c:563
exec_simple_query postgres.c:1284

We release the buffer during this resowner release and then we crash
with:

TRAP: failed Assert("bufnum <= NBuffers"), File:
"../../../../src/include/storage/bufmgr.h", Line: 305, PID: 86833
postgres: pivotal test4 [local] COPY(ExceptionalCondition+0xbb)[0x5572b55bcc79]
postgres: pivotal test4 [local] COPY(+0x61ccfc)[0x5572b537dcfc]
postgres: pivotal test4 [local] COPY(ReleaseBuffer+0x19)[0x5572b5384db2]
postgres: pivotal test4 [local] COPY(brinRevmapTerminate+0x1e)[0x5572b4e3fd39]
postgres: pivotal test4 [local] COPY(+0xcfc44)[0x5572b4e30c44]
postgres: pivotal test4 [local] COPY(+0x89e7f2)[0x5572b55ff7f2]
postgres: pivotal test4 [local] COPY(MemoryContextDelete+0xd7)[0x5572b55ff683]
postgres: pivotal test4 [local] COPY(PortalDrop+0x374)[0x5572b5602dc7]

Unfortunately, when we do COPY, the MemoryContext where makeIndexInfo
gets called is PortalContext and that is what is set in ii_Context.
Furthermore, we clean up the resource owner stuff before we can clean
up the MemoryContexts in PortalDrop().

The CurrentMemoryContext when initialize_brin_insertstate() is called
depends. For CopyMultiInsertBufferFlush() -> ExecInsertIndexTuples()
it is PortalContext, and for CopyFrom() -> ExecInsertIndexTuples() it is
ExecutorState/ExprContext. We can't rely on it to register the callback
neither.

What we can do is create a new MemoryContext for holding the

BrinInsertState, and we tie the callback to that so that cleanup is not
affected by all of these variables. See v2 patch attached. Passes make
installcheck-world and make installcheck -C src/test/modules/brin.

However, we do still have 1 issue with the v2 patch:

When we try to cancel (Ctrl-c) a running COPY command:
ERROR: buffer 151 is not owned by resource owner TopTransaction

I'm not sure if memory context callbacks are the right way to rely on
for this purpose. The primary purpose of memory contexts is to track
memory, so using them for this seems a bit weird.

There are cases that do something similar, like expandendrecord.c where
we track refcounted tuple slot, but IMHO there's a big difference
between tracking one slot allocated right there, and unknown number of
buffers allocated much later.

The fact that even with the extra context is still doesn't handle query
cancellations is another argument against that approach (I wonder how
expandedrecord.c handles that, but I haven't checked).

Maybe there is a better way of doing our cleanup? I'm not sure. Would
love your input!

The other alternative for all this is to introduce new AM callbacks for
insert_begin and insert_end. That might be a tougher sell?

That's the approach I wanted to suggest, more or less - to do the
cleanup from ExecCloseIndices() before index_close(). I wonder if it's
even correct to do that later, once we release the locks etc.

I don't think ii_AmCache was intended for stuff like this - GIN and GiST
only use it to cache stuff that can be just pfree-d, but for buffers
that's not enough. It's not surprising we need to improve this.

FWIW while debugging this (breakpoint on MemoryContextDelete) I was
rather annoyed the COPY keeps dropping and recreating the two BRIN
contexts - brininsert cxt / brin dtuple. I wonder if we could keep and
reuse those too, but I don't know how much it'd help.

Now, to finally answer your question about the speedup without
generate_series(). We do see an even higher speedup!

seq 1 200000000 > /tmp/data.csv
\timing
DROP TABLE heap;
CREATE TABLE heap(i int);
CREATE INDEX ON heap USING brin(i) WITH (pages_per_range=1);
COPY heap FROM '/tmp/data.csv';

-- 3 runs (master 29cf61ade3f245aa40f427a1d6345287ef77e622)
COPY 200000000
Time: 205072.444 ms (03:25.072)
Time: 215380.369 ms (03:35.380)
Time: 203492.347 ms (03:23.492)

-- 3 runs (branch v2)

COPY 200000000
Time: 135052.752 ms (02:15.053)
Time: 135093.131 ms (02:15.093)
Time: 138737.048 ms (02:18.737)

That's nice, but it still doesn't say how much of that is reading the
data. If you do just copy into a table without any indexes, how long
does it take?

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#6Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Tomas Vondra (#5)
1 attachment(s)
Re: brininsert optimization opportunity

On Tue, Jul 4, 2023 at 2:54 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:

I'm not sure if memory context callbacks are the right way to rely on
for this purpose. The primary purpose of memory contexts is to track
memory, so using them for this seems a bit weird.

Yeah, this just kept getting dirtier and dirtier.

There are cases that do something similar, like expandendrecord.c where
we track refcounted tuple slot, but IMHO there's a big difference
between tracking one slot allocated right there, and unknown number of
buffers allocated much later.

Yeah, the following code in ER_mc_callback is there I think to prevent double
freeing the tupdesc (since it might be freed in ResourceOwnerReleaseInternal())
(The part about: /* Ditto for tupdesc references */).

if (tupdesc->tdrefcount > 0)
{
if (--tupdesc->tdrefcount == 0)
FreeTupleDesc(tupdesc);
}
Plus the above code doesn't try anything with Resource owner stuff, whereas
releasing a buffer means:
ReleaseBuffer() -> UnpinBuffer() ->
ResourceOwnerForgetBuffer(CurrentResourceOwner, b);

The fact that even with the extra context is still doesn't handle query
cancellations is another argument against that approach (I wonder how
expandedrecord.c handles that, but I haven't checked).

Maybe there is a better way of doing our cleanup? I'm not sure. Would
love your input!

The other alternative for all this is to introduce new AM callbacks for
insert_begin and insert_end. That might be a tougher sell?

That's the approach I wanted to suggest, more or less - to do the
cleanup from ExecCloseIndices() before index_close(). I wonder if it's
even correct to do that later, once we release the locks etc.

I'll try this out and introduce a couple of new index AM callbacks. I
think it's best to do it before releasing the locks - otherwise it
might be weird
to manipulate buffers of an index relation, without having some sort of lock on
it. I'll think about it some more.

I don't think ii_AmCache was intended for stuff like this - GIN and GiST
only use it to cache stuff that can be just pfree-d, but for buffers
that's not enough. It's not surprising we need to improve this.

Hmmm, yes, although the docs state:
"If the index AM wishes to cache data across successive index insertions within
an SQL statement, it can allocate space in indexInfo->ii_Context and
store a pointer
to the data in indexInfo->ii_AmCache (which will be NULL initially)."
they don't mention anything about buffer usage. Well we will fix it!

PS: It should be possible to make GIN and GiST use the new index AM APIs
as well.

FWIW while debugging this (breakpoint on MemoryContextDelete) I was
rather annoyed the COPY keeps dropping and recreating the two BRIN
contexts - brininsert cxt / brin dtuple. I wonder if we could keep and
reuse those too, but I don't know how much it'd help.

Interesting, I will investigate that.

Now, to finally answer your question about the speedup without
generate_series(). We do see an even higher speedup!

seq 1 200000000 > /tmp/data.csv
\timing
DROP TABLE heap;
CREATE TABLE heap(i int);
CREATE INDEX ON heap USING brin(i) WITH (pages_per_range=1);
COPY heap FROM '/tmp/data.csv';

-- 3 runs (master 29cf61ade3f245aa40f427a1d6345287ef77e622)
COPY 200000000
Time: 205072.444 ms (03:25.072)
Time: 215380.369 ms (03:35.380)
Time: 203492.347 ms (03:23.492)

-- 3 runs (branch v2)

COPY 200000000
Time: 135052.752 ms (02:15.053)
Time: 135093.131 ms (02:15.093)
Time: 138737.048 ms (02:18.737)

That's nice, but it still doesn't say how much of that is reading the
data. If you do just copy into a table without any indexes, how long
does it take?

So, I loaded the same heap table without any indexes and at the same
scale. I got:

postgres=# COPY heap FROM '/tmp/data.csv';
COPY 200000000
Time: 116161.545 ms (01:56.162)
Time: 114182.745 ms (01:54.183)
Time: 114975.368 ms (01:54.975)

perf diff also attached between the three: w/ no indexes (baseline),
master and v2.

Regards,
Soumyadeep (VMware)

Attachments:

perf_diff_3way.outapplication/octet-stream; name=perf_diff_3way.outDownload
#7Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Soumyadeep Chakraborty (#6)
Re: brininsert optimization opportunity

On 7/5/23 02:35, Soumyadeep Chakraborty wrote:

On Tue, Jul 4, 2023 at 2:54 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:

I'm not sure if memory context callbacks are the right way to rely on
for this purpose. The primary purpose of memory contexts is to track
memory, so using them for this seems a bit weird.

Yeah, this just kept getting dirtier and dirtier.

There are cases that do something similar, like expandendrecord.c where
we track refcounted tuple slot, but IMHO there's a big difference
between tracking one slot allocated right there, and unknown number of
buffers allocated much later.

Yeah, the following code in ER_mc_callbackis is there I think to prevent double
freeing the tupdesc (since it might be freed in ResourceOwnerReleaseInternal())
(The part about: /* Ditto for tupdesc references */).

if (tupdesc->tdrefcount > 0)
{
if (--tupdesc->tdrefcount == 0)
FreeTupleDesc(tupdesc);
}
Plus the above code doesn't try anything with Resource owner stuff, whereas
releasing a buffer means:
ReleaseBuffer() -> UnpinBuffer() ->
ResourceOwnerForgetBuffer(CurrentResourceOwner, b);

Well, my understanding is the expandedrecord.c code increments the
refcount exactly to prevent the resource owner to release the slot too
early. My assumption is we'd need to do something similar for the revmap
buffers by calling IncrBufferRefCount or something. But that's going to
be messy, because the buffers are read elsewhere.

The fact that even with the extra context is still doesn't handle query
cancellations is another argument against that approach (I wonder how
expandedrecord.c handles that, but I haven't checked).

Maybe there is a better way of doing our cleanup? I'm not sure. Would
love your input!

The other alternative for all this is to introduce new AM callbacks for
insert_begin and insert_end. That might be a tougher sell?

That's the approach I wanted to suggest, more or less - to do the
cleanup from ExecCloseIndices() before index_close(). I wonder if it's
even correct to do that later, once we release the locks etc.

I'll try this out and introduce a couple of new index AM callbacks. I
think it's best to do it before releasing the locks - otherwise it
might be weird
to manipulate buffers of an index relation, without having some sort of lock on
it. I'll think about it some more.

I don't understand why would this need more than just a callback to
release the cache.

I don't think ii_AmCache was intended for stuff like this - GIN and GiST
only use it to cache stuff that can be just pfree-d, but for buffers
that's no enough. It's not surprising we need to improve this.

Hmmm, yes, although the docs state:
"If the index AM wishes to cache data across successive index insertions within
an SQL statement, it can allocate space in indexInfo->ii_Context and
store a pointer
to the data in indexInfo->ii_AmCache (which will be NULL initially)."
they don't mention anything about buffer usage. Well we will fix it!

PS: It should be possible to make GIN and GiST use the new index AM APIs
as well.

Why should GIN/GiST use the new API? I think it's perfectly sensible to
only require the "cleanup callback" when just pfree() is not enough.

FWIW while debugging this (breakpoint on MemoryContextDelete) I was
rather annoyed the COPY keeps dropping and recreating the two BRIN
contexts - brininsert cxt / brin dtuple. I wonder if we could keep and
reuse those too, but I don't know how much it'd help.

Interesting, I will investigate that.

Now, to finally answer your question about the speedup without
generate_series(). We do see an even higher speedup!

seq 1 200000000 > /tmp/data.csv
\timing
DROP TABLE heap;
CREATE TABLE heap(i int);
CREATE INDEX ON heap USING brin(i) WITH (pages_per_range=1);
COPY heap FROM '/tmp/data.csv';

-- 3 runs (master 29cf61ade3f245aa40f427a1d6345287ef77e622)
COPY 200000000
Time: 205072.444 ms (03:25.072)
Time: 215380.369 ms (03:35.380)
Time: 203492.347 ms (03:23.492)

-- 3 runs (branch v2)

COPY 200000000
Time: 135052.752 ms (02:15.053)
Time: 135093.131 ms (02:15.093)
Time: 138737.048 ms (02:18.737)

That's nice, but it still doesn't say how much of that is reading the
data. If you do just copy into a table without any indexes, how long
does it take?

So, I loaded the same heap table without any indexes and at the same
scale. I got:

postgres=# COPY heap FROM '/tmp/data.csv';
COPY 200000000
Time: 116161.545 ms (01:56.162)
Time: 114182.745 ms (01:54.183)
Time: 114975.368 ms (01:54.975)

OK, so the baseline is 115 seconds. With the current code, an index
means +100 seconds. With the optimization, it's just +20. Nice.

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#8Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Tomas Vondra (#7)
1 attachment(s)
Re: brininsert optimization opportunity

On Wed, Jul 5, 2023 at 3:16 AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:

I'll try this out and introduce a couple of new index AM callbacks. I
think it's best to do it before releasing the locks - otherwise it
might be weird
to manipulate buffers of an index relation, without having some sort of lock on
it. I'll think about it some more.

I don't understand why would this need more than just a callback to
release the cache.

We wouldn't. I thought that it would be slightly cleaner and slightly more
performant if we moved the (if !state) branches out of the XXXinsert()
functions.
But I guess, let's minimize the changes here. One cleanup callback is enough.

PS: It should be possible to make GIN and GiST use the new index AM APIs
as well.

Why should GIN/GiST use the new API? I think it's perfectly sensible to
only require the "cleanup callback" when just pfree() is not enough.

Yeah no need.

Attached v3 of the patch w/ a single index AM callback.

Regards,
Soumyadeep (VMware)

Attachments:

v3-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchtext/x-patch; charset=US-ASCII; name=v3-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchDownload
From 563b905da5c2602113044689e37f8385cbfbde64 Mon Sep 17 00:00:00 2001
From: Soumyadeep Chakraborty <soumyadeep2007@gmail.com>
Date: Wed, 5 Jul 2023 10:59:11 -0700
Subject: [PATCH v3 1/1] Reuse revmap and brin desc in brininsert

brininsert() used to have code that performed per-tuple initialization
of the revmap. That had some overhead.

To mitigate, we introduce a new AM callback to clean up state at the end
of all inserts, and perform the revmap cleanup there.
---
 contrib/bloom/blutils.c              |  1 +
 doc/src/sgml/indexam.sgml            | 14 +++++-
 src/backend/access/brin/brin.c       | 74 +++++++++++++++++++++++-----
 src/backend/access/gin/ginutil.c     |  1 +
 src/backend/access/gist/gist.c       |  1 +
 src/backend/access/hash/hash.c       |  1 +
 src/backend/access/index/indexam.c   | 15 ++++++
 src/backend/access/nbtree/nbtree.c   |  1 +
 src/backend/access/spgist/spgutils.c |  1 +
 src/backend/executor/execIndexing.c  |  5 ++
 src/include/access/amapi.h           |  3 ++
 src/include/access/brin_internal.h   |  1 +
 src/include/access/genam.h           |  2 +
 13 files changed, 108 insertions(+), 12 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index d935ed8fbdf..9720f69de09 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -131,6 +131,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = blbuild;
 	amroutine->ambuildempty = blbuildempty;
 	amroutine->aminsert = blinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index e813e2b620a..17f7d6597f1 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -139,6 +139,7 @@ typedef struct IndexAmRoutine
     ambuild_function ambuild;
     ambuildempty_function ambuildempty;
     aminsert_function aminsert;
+    aminsertcleanup_function aminsertcleanup;
     ambulkdelete_function ambulkdelete;
     amvacuumcleanup_function amvacuumcleanup;
     amcanreturn_function amcanreturn;   /* can be NULL */
@@ -355,11 +356,22 @@ aminsert (Relation indexRelation,
    within an SQL statement, it can allocate space
    in <literal>indexInfo-&gt;ii_Context</literal> and store a pointer to the
    data in <literal>indexInfo-&gt;ii_AmCache</literal> (which will be NULL
-   initially).
+   initially). If the cached data contains structures that can be simply pfree'd,
+   they will implicitly be pfree'd. But, if it contains more complex data, such
+   as Buffers or TupleDescs, additional cleanup is necessary. This additional
+   cleanup can be performed in <literal>aminsertcleanup</literal>.
   </para>
 
   <para>
 <programlisting>
+void
+aminsertcleanup (IndexInfo *indexInfo);
+</programlisting>
+   Clean up state that was maintained across successive inserts in
+   <literal>indexInfo-&gt;ii_AmCache</literal>. This is useful if the data
+   contained is complex - like Buffers or TupleDescs which need additional
+   cleanup, unlike simple structures that will be implicitly pfree'd.
+<programlisting>
 IndexBulkDeleteResult *
 ambulkdelete (IndexVacuumInfo *info,
               IndexBulkDeleteResult *stats,
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 3c6a956eaa3..2da59f2ab03 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -58,6 +58,17 @@ typedef struct BrinBuildState
 	BrinMemTuple *bs_dtuple;
 } BrinBuildState;
 
+/*
+ * We use a BrinInsertState to capture running state spanning multiple
+ * brininsert invocations, within the same command.
+ */
+typedef struct BrinInsertState
+{
+	BrinRevmap *bs_rmAccess;
+	BrinDesc   *bs_desc;
+	BlockNumber bs_pages_per_range;
+}			BrinInsertState;
+
 /*
  * Struct used as "opaque" during index scans
  */
@@ -72,6 +83,7 @@ typedef struct BrinOpaque
 
 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
 												  BrinRevmap *revmap, BlockNumber pagesPerRange);
+static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
 static void terminate_brin_buildstate(BrinBuildState *state);
 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
 						  bool include_partial, double *numSummarized, double *numExisting);
@@ -117,6 +129,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = brinbuild;
 	amroutine->ambuildempty = brinbuildempty;
 	amroutine->aminsert = brininsert;
+	amroutine->aminsertcleanup = brininsertcleanup;
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
@@ -140,6 +153,28 @@ brinhandler(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(amroutine);
 }
 
+/*
+ * Initialize a BrinInsertState to maintain state to be used across multiple
+ * tuple inserts, within the same command.
+ */
+static BrinInsertState *
+initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
+{
+	BrinInsertState *bistate;
+	MemoryContext oldcxt;
+
+	oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
+	bistate = palloc0(sizeof(BrinInsertState));
+	bistate->bs_desc = brin_build_desc(idxRel);
+	bistate->bs_rmAccess = brinRevmapInitialize(idxRel,
+												&bistate->bs_pages_per_range,
+												NULL);
+	indexInfo->ii_AmCache = bistate;
+	MemoryContextSwitchTo(oldcxt);
+
+	return bistate;
+}
+
 /*
  * A tuple in the heap is being inserted.  To keep a brin index up to date,
  * we need to obtain the relevant index tuple and compare its stored values
@@ -162,14 +197,22 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	BlockNumber pagesPerRange;
 	BlockNumber origHeapBlk;
 	BlockNumber heapBlk;
-	BrinDesc   *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
 	BrinRevmap *revmap;
+	BrinDesc   *bdesc;
 	Buffer		buf = InvalidBuffer;
 	MemoryContext tupcxt = NULL;
 	MemoryContext oldcxt = CurrentMemoryContext;
 	bool		autosummarize = BrinGetAutoSummarize(idxRel);
 
-	revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
+	if (!bistate)
+	{
+		/* First time through in this statement? */
+		bistate = initialize_brin_insertstate(idxRel, indexInfo);
+	}
+	revmap = bistate->bs_rmAccess;
+	bdesc = bistate->bs_desc;
+	pagesPerRange = bistate->bs_pages_per_range;
 
 	/*
 	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
@@ -228,14 +271,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		if (!brtup)
 			break;
 
-		/* First time through in this statement? */
-		if (bdesc == NULL)
-		{
-			MemoryContextSwitchTo(indexInfo->ii_Context);
-			bdesc = brin_build_desc(idxRel);
-			indexInfo->ii_AmCache = (void *) bdesc;
-			MemoryContextSwitchTo(oldcxt);
-		}
 		/* First time through in this brininsert call? */
 		if (tupcxt == NULL)
 		{
@@ -306,7 +341,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		break;
 	}
 
-	brinRevmapTerminate(revmap);
 	if (BufferIsValid(buf))
 		ReleaseBuffer(buf);
 	MemoryContextSwitchTo(oldcxt);
@@ -316,6 +350,24 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	return false;
 }
 
+/*
+ * Callback to clean up the BrinInsertState once all tuple inserts are done.
+ */
+void
+brininsertcleanup(IndexInfo *indexInfo)
+{
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
+
+	Assert(bistate);
+	/*
+	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
+	 * as part of its own memory context.
+	 */
+	brinRevmapTerminate(bistate->bs_rmAccess);
+	bistate->bs_rmAccess = NULL;
+	bistate->bs_desc = NULL;
+}
+
 /*
  * Initialize state for a BRIN index scan.
  *
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 437f24753c0..56223305442 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -64,6 +64,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = ginbuild;
 	amroutine->ambuildempty = ginbuildempty;
 	amroutine->aminsert = gininsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 516465f8b7d..c42746130cc 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -86,6 +86,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = gistbuild;
 	amroutine->ambuildempty = gistbuildempty;
 	amroutine->aminsert = gistinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index fc5d97f606e..cba15cde0d7 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -83,6 +83,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = hashbuild;
 	amroutine->ambuildempty = hashbuildempty;
 	amroutine->aminsert = hashinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index b25b03f7abc..55b9c18369a 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -196,6 +196,21 @@ index_insert(Relation indexRelation,
 											 indexInfo);
 }
 
+/* -------------------------
+ *		index_insert_cleanup - clean up after all index inserts are done.
+ * -------------------------
+ */
+void
+index_insert_cleanup(Relation indexRelation,
+					 IndexInfo *indexInfo)
+{
+	RELATION_CHECKS;
+	Assert(indexInfo);
+
+	if (indexRelation->rd_indam->aminsertcleanup)
+		indexRelation->rd_indam->aminsertcleanup(indexInfo);
+}
+
 /*
  * index_beginscan - start a scan of an index with amgettuple
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 4553aaee531..2753566437f 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -122,6 +122,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = btbuild;
 	amroutine->ambuildempty = btbuildempty;
 	amroutine->aminsert = btinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = btbulkdelete;
 	amroutine->amvacuumcleanup = btvacuumcleanup;
 	amroutine->amcanreturn = btcanreturn;
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 190e4f76a9e..b8efcf5cfb2 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -70,6 +70,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = spgbuild;
 	amroutine->ambuildempty = spgbuildempty;
 	amroutine->aminsert = spginsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 1d82b64b897..63699fdd93d 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -233,15 +233,20 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
 	int			i;
 	int			numIndices;
 	RelationPtr indexDescs;
+	IndexInfo **indexInfos;
 
 	numIndices = resultRelInfo->ri_NumIndices;
 	indexDescs = resultRelInfo->ri_IndexRelationDescs;
+	indexInfos = resultRelInfo->ri_IndexRelationInfo;
 
 	for (i = 0; i < numIndices; i++)
 	{
 		if (indexDescs[i] == NULL)
 			continue;			/* shouldn't happen? */
 
+		/* Give the index a chance to do some post-insert cleanup */
+		index_insert_cleanup(indexDescs[i], indexInfos[i]);
+
 		/* Drop lock acquired by ExecOpenIndices */
 		index_close(indexDescs[i], RowExclusiveLock);
 	}
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 4476ff7fba1..d29721eaf1e 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -113,6 +113,8 @@ typedef bool (*aminsert_function) (Relation indexRelation,
 								   bool indexUnchanged,
 								   struct IndexInfo *indexInfo);
 
+typedef void (*aminsertcleanup_function) (struct IndexInfo *indexInfo);
+
 /* bulk delete */
 typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info,
 														 IndexBulkDeleteResult *stats,
@@ -261,6 +263,7 @@ typedef struct IndexAmRoutine
 	ambuild_function ambuild;
 	ambuildempty_function ambuildempty;
 	aminsert_function aminsert;
+	aminsertcleanup_function aminsertcleanup;
 	ambulkdelete_function ambulkdelete;
 	amvacuumcleanup_function amvacuumcleanup;
 	amcanreturn_function amcanreturn;	/* can be NULL */
diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h
index 97ddc925b27..ed231e208eb 100644
--- a/src/include/access/brin_internal.h
+++ b/src/include/access/brin_internal.h
@@ -96,6 +96,7 @@ extern bool brininsert(Relation idxRel, Datum *values, bool *nulls,
 					   IndexUniqueCheck checkUnique,
 					   bool indexUnchanged,
 					   struct IndexInfo *indexInfo);
+extern void brininsertcleanup(struct IndexInfo *indexInfo);
 extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys);
 extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index a3087956654..2cf9dc5dca9 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -148,6 +148,8 @@ extern bool index_insert(Relation indexRelation,
 						 IndexUniqueCheck checkUnique,
 						 bool indexUnchanged,
 						 struct IndexInfo *indexInfo);
+extern void index_insert_cleanup(Relation indexRelation,
+								 struct IndexInfo *indexInfo);
 
 extern IndexScanDesc index_beginscan(Relation heapRelation,
 									 Relation indexRelation,
-- 
2.34.1

#9Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Soumyadeep Chakraborty (#8)
1 attachment(s)
Re: brininsert optimization opportunity

Attached v4 of the patch, rebased against latest HEAD.

Regards,
Soumyadeep (VMware)

Attachments:

v4-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchtext/x-patch; charset=US-ASCII; name=v4-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchDownload
From b5fb12fbb8b0c1b2963a05a2877b5063bbc75f58 Mon Sep 17 00:00:00 2001
From: Soumyadeep Chakraborty <soumyadeep2007@gmail.com>
Date: Sat, 29 Jul 2023 09:17:49 -0700
Subject: [PATCH v4 1/1] Reuse revmap and brin desc in brininsert

brininsert() used to have code that performed per-tuple initialization
of the revmap. That had some overhead.

To mitigate, we introduce a new AM callback to clean up state at the end
of all inserts, and perform the revmap cleanup there.
---
 contrib/bloom/blutils.c              |  1 +
 doc/src/sgml/indexam.sgml            | 14 +++++-
 src/backend/access/brin/brin.c       | 74 +++++++++++++++++++++++-----
 src/backend/access/gin/ginutil.c     |  1 +
 src/backend/access/gist/gist.c       |  1 +
 src/backend/access/hash/hash.c       |  1 +
 src/backend/access/index/indexam.c   | 15 ++++++
 src/backend/access/nbtree/nbtree.c   |  1 +
 src/backend/access/spgist/spgutils.c |  1 +
 src/backend/executor/execIndexing.c  |  5 ++
 src/include/access/amapi.h           |  3 ++
 src/include/access/brin_internal.h   |  1 +
 src/include/access/genam.h           |  2 +
 13 files changed, 108 insertions(+), 12 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index d935ed8fbdf..9720f69de09 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -131,6 +131,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = blbuild;
 	amroutine->ambuildempty = blbuildempty;
 	amroutine->aminsert = blinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index e813e2b620a..17f7d6597f1 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -139,6 +139,7 @@ typedef struct IndexAmRoutine
     ambuild_function ambuild;
     ambuildempty_function ambuildempty;
     aminsert_function aminsert;
+    aminsertcleanup_function aminsertcleanup;
     ambulkdelete_function ambulkdelete;
     amvacuumcleanup_function amvacuumcleanup;
     amcanreturn_function amcanreturn;   /* can be NULL */
@@ -355,11 +356,22 @@ aminsert (Relation indexRelation,
    within an SQL statement, it can allocate space
    in <literal>indexInfo-&gt;ii_Context</literal> and store a pointer to the
    data in <literal>indexInfo-&gt;ii_AmCache</literal> (which will be NULL
-   initially).
+   initially). If the cached data contains structures that can be simply pfree'd,
+   they will implicitly be pfree'd. But, if it contains more complex data, such
+   as Buffers or TupleDescs, additional cleanup is necessary. This additional
+   cleanup can be performed in <literal>aminsertcleanup</literal>.
   </para>
 
   <para>
 <programlisting>
+void
+aminsertcleanup (IndexInfo *indexInfo);
+</programlisting>
+   Clean up state that was maintained across successive inserts in
+   <literal>indexInfo-&gt;ii_AmCache</literal>. This is useful if the data
+   contained is complex - like Buffers or TupleDescs which need additional
+   cleanup, unlike simple structures that will be implicitly pfree'd.
+<programlisting>
 IndexBulkDeleteResult *
 ambulkdelete (IndexVacuumInfo *info,
               IndexBulkDeleteResult *stats,
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 3c6a956eaa3..2da59f2ab03 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -58,6 +58,17 @@ typedef struct BrinBuildState
 	BrinMemTuple *bs_dtuple;
 } BrinBuildState;
 
+/*
+ * We use a BrinInsertState to capture running state spanning multiple
+ * brininsert invocations, within the same command.
+ */
+typedef struct BrinInsertState
+{
+	BrinRevmap *bs_rmAccess;
+	BrinDesc   *bs_desc;
+	BlockNumber bs_pages_per_range;
+}			BrinInsertState;
+
 /*
  * Struct used as "opaque" during index scans
  */
@@ -72,6 +83,7 @@ typedef struct BrinOpaque
 
 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
 												  BrinRevmap *revmap, BlockNumber pagesPerRange);
+static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
 static void terminate_brin_buildstate(BrinBuildState *state);
 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
 						  bool include_partial, double *numSummarized, double *numExisting);
@@ -117,6 +129,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = brinbuild;
 	amroutine->ambuildempty = brinbuildempty;
 	amroutine->aminsert = brininsert;
+	amroutine->aminsertcleanup = brininsertcleanup;
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
@@ -140,6 +153,28 @@ brinhandler(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(amroutine);
 }
 
+/*
+ * Initialize a BrinInsertState to maintain state to be used across multiple
+ * tuple inserts, within the same command.
+ */
+static BrinInsertState *
+initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
+{
+	BrinInsertState *bistate;
+	MemoryContext oldcxt;
+
+	oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
+	bistate = palloc0(sizeof(BrinInsertState));
+	bistate->bs_desc = brin_build_desc(idxRel);
+	bistate->bs_rmAccess = brinRevmapInitialize(idxRel,
+												&bistate->bs_pages_per_range,
+												NULL);
+	indexInfo->ii_AmCache = bistate;
+	MemoryContextSwitchTo(oldcxt);
+
+	return bistate;
+}
+
 /*
  * A tuple in the heap is being inserted.  To keep a brin index up to date,
  * we need to obtain the relevant index tuple and compare its stored values
@@ -162,14 +197,22 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	BlockNumber pagesPerRange;
 	BlockNumber origHeapBlk;
 	BlockNumber heapBlk;
-	BrinDesc   *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
 	BrinRevmap *revmap;
+	BrinDesc   *bdesc;
 	Buffer		buf = InvalidBuffer;
 	MemoryContext tupcxt = NULL;
 	MemoryContext oldcxt = CurrentMemoryContext;
 	bool		autosummarize = BrinGetAutoSummarize(idxRel);
 
-	revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
+	if (!bistate)
+	{
+		/* First time through in this statement? */
+		bistate = initialize_brin_insertstate(idxRel, indexInfo);
+	}
+	revmap = bistate->bs_rmAccess;
+	bdesc = bistate->bs_desc;
+	pagesPerRange = bistate->bs_pages_per_range;
 
 	/*
 	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
@@ -228,14 +271,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		if (!brtup)
 			break;
 
-		/* First time through in this statement? */
-		if (bdesc == NULL)
-		{
-			MemoryContextSwitchTo(indexInfo->ii_Context);
-			bdesc = brin_build_desc(idxRel);
-			indexInfo->ii_AmCache = (void *) bdesc;
-			MemoryContextSwitchTo(oldcxt);
-		}
 		/* First time through in this brininsert call? */
 		if (tupcxt == NULL)
 		{
@@ -306,7 +341,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		break;
 	}
 
-	brinRevmapTerminate(revmap);
 	if (BufferIsValid(buf))
 		ReleaseBuffer(buf);
 	MemoryContextSwitchTo(oldcxt);
@@ -316,6 +350,24 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	return false;
 }
 
+/*
+ * Callback to clean up the BrinInsertState once all tuple inserts are done.
+ */
+void
+brininsertcleanup(IndexInfo *indexInfo)
+{
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
+
+	Assert(bistate);
+	/*
+	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
+	 * as part of its own memory context.
+	 */
+	brinRevmapTerminate(bistate->bs_rmAccess);
+	bistate->bs_rmAccess = NULL;
+	bistate->bs_desc = NULL;
+}
+
 /*
  * Initialize state for a BRIN index scan.
  *
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 437f24753c0..56223305442 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -64,6 +64,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = ginbuild;
 	amroutine->ambuildempty = ginbuildempty;
 	amroutine->aminsert = gininsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 516465f8b7d..c42746130cc 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -86,6 +86,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = gistbuild;
 	amroutine->ambuildempty = gistbuildempty;
 	amroutine->aminsert = gistinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index fc5d97f606e..cba15cde0d7 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -83,6 +83,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = hashbuild;
 	amroutine->ambuildempty = hashbuildempty;
 	amroutine->aminsert = hashinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index b25b03f7abc..55b9c18369a 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -196,6 +196,21 @@ index_insert(Relation indexRelation,
 											 indexInfo);
 }
 
+/* -------------------------
+ *		index_insert_cleanup - clean up after all index inserts are done.
+ * -------------------------
+ */
+void
+index_insert_cleanup(Relation indexRelation,
+					 IndexInfo *indexInfo)
+{
+	RELATION_CHECKS;
+	Assert(indexInfo);
+
+	if (indexRelation->rd_indam->aminsertcleanup)
+		indexRelation->rd_indam->aminsertcleanup(indexInfo);
+}
+
 /*
  * index_beginscan - start a scan of an index with amgettuple
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 4553aaee531..2753566437f 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -122,6 +122,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = btbuild;
 	amroutine->ambuildempty = btbuildempty;
 	amroutine->aminsert = btinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = btbulkdelete;
 	amroutine->amvacuumcleanup = btvacuumcleanup;
 	amroutine->amcanreturn = btcanreturn;
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 190e4f76a9e..b8efcf5cfb2 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -70,6 +70,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = spgbuild;
 	amroutine->ambuildempty = spgbuildempty;
 	amroutine->aminsert = spginsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 1d82b64b897..63699fdd93d 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -233,15 +233,20 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
 	int			i;
 	int			numIndices;
 	RelationPtr indexDescs;
+	IndexInfo **indexInfos;
 
 	numIndices = resultRelInfo->ri_NumIndices;
 	indexDescs = resultRelInfo->ri_IndexRelationDescs;
+	indexInfos = resultRelInfo->ri_IndexRelationInfo;
 
 	for (i = 0; i < numIndices; i++)
 	{
 		if (indexDescs[i] == NULL)
 			continue;			/* shouldn't happen? */
 
+		/* Give the index a chance to do some post-insert cleanup */
+		index_insert_cleanup(indexDescs[i], indexInfos[i]);
+
 		/* Drop lock acquired by ExecOpenIndices */
 		index_close(indexDescs[i], RowExclusiveLock);
 	}
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 4476ff7fba1..d29721eaf1e 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -113,6 +113,8 @@ typedef bool (*aminsert_function) (Relation indexRelation,
 								   bool indexUnchanged,
 								   struct IndexInfo *indexInfo);
 
+typedef void (*aminsertcleanup_function) (struct IndexInfo *indexInfo);
+
 /* bulk delete */
 typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info,
 														 IndexBulkDeleteResult *stats,
@@ -261,6 +263,7 @@ typedef struct IndexAmRoutine
 	ambuild_function ambuild;
 	ambuildempty_function ambuildempty;
 	aminsert_function aminsert;
+	aminsertcleanup_function aminsertcleanup;
 	ambulkdelete_function ambulkdelete;
 	amvacuumcleanup_function amvacuumcleanup;
 	amcanreturn_function amcanreturn;	/* can be NULL */
diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h
index 97ddc925b27..ed231e208eb 100644
--- a/src/include/access/brin_internal.h
+++ b/src/include/access/brin_internal.h
@@ -96,6 +96,7 @@ extern bool brininsert(Relation idxRel, Datum *values, bool *nulls,
 					   IndexUniqueCheck checkUnique,
 					   bool indexUnchanged,
 					   struct IndexInfo *indexInfo);
+extern void brininsertcleanup(struct IndexInfo *indexInfo);
 extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys);
 extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index a3087956654..2cf9dc5dca9 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -148,6 +148,8 @@ extern bool index_insert(Relation indexRelation,
 						 IndexUniqueCheck checkUnique,
 						 bool indexUnchanged,
 						 struct IndexInfo *indexInfo);
+extern void index_insert_cleanup(Relation indexRelation,
+								 struct IndexInfo *indexInfo);
 
 extern IndexScanDesc index_beginscan(Relation heapRelation,
 									 Relation indexRelation,
-- 
2.34.1

#10Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Soumyadeep Chakraborty (#9)
Re: brininsert optimization opportunity

Created an entry for the Sep CF: https://commitfest.postgresql.org/44/4484/

Regards,
Soumyadeep (VMware)

On Sat, Jul 29, 2023 at 9:28 AM Soumyadeep Chakraborty
<soumyadeep2007@gmail.com> wrote:

Show quoted text

Attached v4 of the patch, rebased against latest HEAD.

Regards,
Soumyadeep (VMware)

#11Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Soumyadeep Chakraborty (#10)
1 attachment(s)
Re: brininsert optimization opportunity

Rebased against latest HEAD.

Regards,
Soumyadeep (VMware)

Attachments:

v5-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchtext/x-patch; charset=US-ASCII; name=v5-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchDownload
From 94a8acd3125aa4a613c238e435ed78dba9f40625 Mon Sep 17 00:00:00 2001
From: Soumyadeep Chakraborty <soumyadeep2007@gmail.com>
Date: Sat, 29 Jul 2023 09:17:49 -0700
Subject: [PATCH v5 1/1] Reuse revmap and brin desc in brininsert

brininsert() used to have code that performed per-tuple initialization
of the revmap. That had some overhead.

To mitigate, we introduce a new AM callback to clean up state at the end
of all inserts, and perform the revmap cleanup there.
---
 contrib/bloom/blutils.c              |  1 +
 doc/src/sgml/indexam.sgml            | 14 +++++-
 src/backend/access/brin/brin.c       | 74 +++++++++++++++++++++++-----
 src/backend/access/gin/ginutil.c     |  1 +
 src/backend/access/gist/gist.c       |  1 +
 src/backend/access/hash/hash.c       |  1 +
 src/backend/access/index/indexam.c   | 15 ++++++
 src/backend/access/nbtree/nbtree.c   |  1 +
 src/backend/access/spgist/spgutils.c |  1 +
 src/backend/executor/execIndexing.c  |  5 ++
 src/include/access/amapi.h           |  3 ++
 src/include/access/brin_internal.h   |  1 +
 src/include/access/genam.h           |  2 +
 13 files changed, 108 insertions(+), 12 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index f23fbb1d9e0..4830cb3fee6 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -131,6 +131,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = blbuild;
 	amroutine->ambuildempty = blbuildempty;
 	amroutine->aminsert = blinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index e813e2b620a..17f7d6597f1 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -139,6 +139,7 @@ typedef struct IndexAmRoutine
     ambuild_function ambuild;
     ambuildempty_function ambuildempty;
     aminsert_function aminsert;
+    aminsertcleanup_function aminsertcleanup;
     ambulkdelete_function ambulkdelete;
     amvacuumcleanup_function amvacuumcleanup;
     amcanreturn_function amcanreturn;   /* can be NULL */
@@ -355,11 +356,22 @@ aminsert (Relation indexRelation,
    within an SQL statement, it can allocate space
    in <literal>indexInfo-&gt;ii_Context</literal> and store a pointer to the
    data in <literal>indexInfo-&gt;ii_AmCache</literal> (which will be NULL
-   initially).
+   initially). If the data cached contains structures that can be simply pfree'd,
+   they will implicitly be pfree'd. But, if it contains more complex data, such
+   as Buffers or TupleDescs, additional cleanup is necessary. This additional
+   cleanup can be performed in <literal>indexinsertcleanup</literal>.
   </para>
 
   <para>
 <programlisting>
+bool
+aminsertcleanup (IndexInfo *indexInfo);
+</programlisting>
+   Clean up state that was maintained across successive inserts in
+   <literal>indexInfo-&gt;ii_AmCache</literal>. This is useful if the data
+   contained is complex - like Buffers or TupleDescs which need additional
+   cleanup, unlike simple structures that will be implicitly pfree'd.
+<programlisting>
 IndexBulkDeleteResult *
 ambulkdelete (IndexVacuumInfo *info,
               IndexBulkDeleteResult *stats,
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index d4fec654bb6..76927beb0ec 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -58,6 +58,17 @@ typedef struct BrinBuildState
 	BrinMemTuple *bs_dtuple;
 } BrinBuildState;
 
+/*
+ * We use a BrinInsertState to capture running state spanning multiple
+ * brininsert invocations, within the same command.
+ */
+typedef struct BrinInsertState
+{
+	BrinRevmap *bs_rmAccess;
+	BrinDesc   *bs_desc;
+	BlockNumber bs_pages_per_range;
+}			BrinInsertState;
+
 /*
  * Struct used as "opaque" during index scans
  */
@@ -72,6 +83,7 @@ typedef struct BrinOpaque
 
 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
 												  BrinRevmap *revmap, BlockNumber pagesPerRange);
+static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
 static void terminate_brin_buildstate(BrinBuildState *state);
 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
 						  bool include_partial, double *numSummarized, double *numExisting);
@@ -117,6 +129,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = brinbuild;
 	amroutine->ambuildempty = brinbuildempty;
 	amroutine->aminsert = brininsert;
+	amroutine->aminsertcleanup = brininsertcleanup;
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
@@ -140,6 +153,28 @@ brinhandler(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(amroutine);
 }
 
+/*
+ * Initialize a BrinInsertState to maintain state to be used across multiple
+ * tuple inserts, within the same command.
+ */
+static BrinInsertState *
+initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
+{
+	BrinInsertState *bistate;
+	MemoryContext oldcxt;
+
+	oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
+	bistate = palloc0(sizeof(BrinInsertState));
+	bistate->bs_desc = brin_build_desc(idxRel);
+	bistate->bs_rmAccess = brinRevmapInitialize(idxRel,
+												&bistate->bs_pages_per_range,
+												NULL);
+	indexInfo->ii_AmCache = bistate;
+	MemoryContextSwitchTo(oldcxt);
+
+	return bistate;
+}
+
 /*
  * A tuple in the heap is being inserted.  To keep a brin index up to date,
  * we need to obtain the relevant index tuple and compare its stored values
@@ -162,14 +197,22 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	BlockNumber pagesPerRange;
 	BlockNumber origHeapBlk;
 	BlockNumber heapBlk;
-	BrinDesc   *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
 	BrinRevmap *revmap;
+	BrinDesc   *bdesc;
 	Buffer		buf = InvalidBuffer;
 	MemoryContext tupcxt = NULL;
 	MemoryContext oldcxt = CurrentMemoryContext;
 	bool		autosummarize = BrinGetAutoSummarize(idxRel);
 
-	revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
+	if (!bistate)
+	{
+		/* First time through in this statement? */
+		bistate = initialize_brin_insertstate(idxRel, indexInfo);
+	}
+	revmap = bistate->bs_rmAccess;
+	bdesc = bistate->bs_desc;
+	pagesPerRange = bistate->bs_pages_per_range;
 
 	/*
 	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
@@ -228,14 +271,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		if (!brtup)
 			break;
 
-		/* First time through in this statement? */
-		if (bdesc == NULL)
-		{
-			MemoryContextSwitchTo(indexInfo->ii_Context);
-			bdesc = brin_build_desc(idxRel);
-			indexInfo->ii_AmCache = (void *) bdesc;
-			MemoryContextSwitchTo(oldcxt);
-		}
 		/* First time through in this brininsert call? */
 		if (tupcxt == NULL)
 		{
@@ -306,7 +341,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		break;
 	}
 
-	brinRevmapTerminate(revmap);
 	if (BufferIsValid(buf))
 		ReleaseBuffer(buf);
 	MemoryContextSwitchTo(oldcxt);
@@ -316,6 +350,24 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	return false;
 }
 
+/*
+ * Callback to clean up the BrinInsertState once all tuple inserts are done.
+ */
+void
+brininsertcleanup(IndexInfo *indexInfo)
+{
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
+
+	Assert(bistate);
+	/*
+	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
+	 * as part of its own memory context.
+	 */
+	brinRevmapTerminate(bistate->bs_rmAccess);
+	bistate->bs_rmAccess = NULL;
+	bistate->bs_desc = NULL;
+}
+
 /*
  * Initialize state for a BRIN index scan.
  *
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 7a4cd93f301..a875c5d3d7a 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -64,6 +64,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = ginbuild;
 	amroutine->ambuildempty = ginbuildempty;
 	amroutine->aminsert = gininsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index b1f19f6a8e6..bfdd8fef118 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -86,6 +86,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = gistbuild;
 	amroutine->ambuildempty = gistbuildempty;
 	amroutine->aminsert = gistinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index fc5d97f606e..cba15cde0d7 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -83,6 +83,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = hashbuild;
 	amroutine->ambuildempty = hashbuildempty;
 	amroutine->aminsert = hashinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index b25b03f7abc..55b9c18369a 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -196,6 +196,21 @@ index_insert(Relation indexRelation,
 											 indexInfo);
 }
 
+/* -------------------------
+ *		index_insert_cleanup - clean up after all index inserts are done.
+ * -------------------------
+ */
+void
+index_insert_cleanup(Relation indexRelation,
+					 IndexInfo *indexInfo)
+{
+	RELATION_CHECKS;
+	Assert(indexInfo);
+
+	if (indexRelation->rd_indam->aminsertcleanup)
+		indexRelation->rd_indam->aminsertcleanup(indexInfo);
+}
+
 /*
  * index_beginscan - start a scan of an index with amgettuple
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 62bc9917f13..bb60f97270c 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -122,6 +122,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = btbuild;
 	amroutine->ambuildempty = btbuildempty;
 	amroutine->aminsert = btinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = btbulkdelete;
 	amroutine->amvacuumcleanup = btvacuumcleanup;
 	amroutine->amcanreturn = btcanreturn;
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 8f32e46fb83..5bd5dcac9db 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -70,6 +70,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = spgbuild;
 	amroutine->ambuildempty = spgbuildempty;
 	amroutine->aminsert = spginsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 1d82b64b897..63699fdd93d 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -233,15 +233,20 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
 	int			i;
 	int			numIndices;
 	RelationPtr indexDescs;
+	IndexInfo **indexInfos;
 
 	numIndices = resultRelInfo->ri_NumIndices;
 	indexDescs = resultRelInfo->ri_IndexRelationDescs;
+	indexInfos = resultRelInfo->ri_IndexRelationInfo;
 
 	for (i = 0; i < numIndices; i++)
 	{
 		if (indexDescs[i] == NULL)
 			continue;			/* shouldn't happen? */
 
+		/* Give the index a chance to do some post-insert cleanup */
+		index_insert_cleanup(indexDescs[i], indexInfos[i]);
+
 		/* Drop lock acquired by ExecOpenIndices */
 		index_close(indexDescs[i], RowExclusiveLock);
 	}
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 4476ff7fba1..d29721eaf1e 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -113,6 +113,8 @@ typedef bool (*aminsert_function) (Relation indexRelation,
 								   bool indexUnchanged,
 								   struct IndexInfo *indexInfo);
 
+typedef void (*aminsertcleanup_function) (struct IndexInfo *indexInfo);
+
 /* bulk delete */
 typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info,
 														 IndexBulkDeleteResult *stats,
@@ -261,6 +263,7 @@ typedef struct IndexAmRoutine
 	ambuild_function ambuild;
 	ambuildempty_function ambuildempty;
 	aminsert_function aminsert;
+	aminsertcleanup_function aminsertcleanup;
 	ambulkdelete_function ambulkdelete;
 	amvacuumcleanup_function amvacuumcleanup;
 	amcanreturn_function amcanreturn;	/* can be NULL */
diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h
index 97ddc925b27..ed231e208eb 100644
--- a/src/include/access/brin_internal.h
+++ b/src/include/access/brin_internal.h
@@ -96,6 +96,7 @@ extern bool brininsert(Relation idxRel, Datum *values, bool *nulls,
 					   IndexUniqueCheck checkUnique,
 					   bool indexUnchanged,
 					   struct IndexInfo *indexInfo);
+extern void brininsertcleanup(struct IndexInfo *indexInfo);
 extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys);
 extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index a3087956654..2cf9dc5dca9 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -148,6 +148,8 @@ extern bool index_insert(Relation indexRelation,
 						 IndexUniqueCheck checkUnique,
 						 bool indexUnchanged,
 						 struct IndexInfo *indexInfo);
+extern void index_insert_cleanup(Relation indexRelation,
+								 struct IndexInfo *indexInfo);
 
 extern IndexScanDesc index_beginscan(Relation heapRelation,
 									 Relation indexRelation,
-- 
2.34.1

#12Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Soumyadeep Chakraborty (#11)
1 attachment(s)
Re: brininsert optimization opportunity

Hi,

I took a look at this patch today. I had to rebase the patch, due to
some minor bitrot related to 9f0602539d (but nothing major). I also did
a couple tiny cosmetic tweaks, but other than that the patch seems OK.
See the attached v6.

I did some simple performance tests too, similar to those in the initial
message:

CREATE UNLOGGED TABLE heap (i int);
CREATE INDEX ON heap USING brin(i) WITH (pages_per_range=1);
--
TRUNCATE heap;
INSERT INTO heap SELECT 1 FROM generate_series(1, 20000000);

And the results look like this (5 runs each):

master: 16448.338 16066.473 16039.166 16067.420 16080.066
patched: 13260.065 13229.800 13254.454 13265.479 13273.693

So that's a nice improvement, even though enabling WAL will make the
relative speedup somewhat smaller.

The one thing I'm not entirely sure about is adding new stuff to the
IndexAmRoutine. I don't think we want to end up with too many callbacks
that all AMs have to initialize etc. I can't think of a different/better
way to do this, though.

Barring objections, I'll try to push this early next week, after another
round of cleanup.

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

Attachments:

v6-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchtext/x-patch; charset=UTF-8; name=v6-0001-Reuse-revmap-and-brin-desc-in-brininsert.patchDownload
From b1c47527132f0e8c39ff221b6b5adcc9678ba6ce Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Fri, 3 Nov 2023 19:17:21 +0100
Subject: [PATCH v6] Reuse revmap and brin desc in brininsert

brininsert() used to have code that performed per-tuple initialization
of the revmap. That had some overhead.

To mitigate, we introduce a new AM callback to clean up state at the end
of all inserts, and perform the revmap cleanup there.
---
 contrib/bloom/blutils.c              |  1 +
 doc/src/sgml/indexam.sgml            | 14 +++++-
 src/backend/access/brin/brin.c       | 74 +++++++++++++++++++++++-----
 src/backend/access/gin/ginutil.c     |  1 +
 src/backend/access/gist/gist.c       |  1 +
 src/backend/access/hash/hash.c       |  1 +
 src/backend/access/index/indexam.c   | 15 ++++++
 src/backend/access/nbtree/nbtree.c   |  1 +
 src/backend/access/spgist/spgutils.c |  1 +
 src/backend/executor/execIndexing.c  |  5 ++
 src/include/access/amapi.h           |  4 ++
 src/include/access/brin_internal.h   |  1 +
 src/include/access/genam.h           |  2 +
 13 files changed, 109 insertions(+), 12 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index f23fbb1d9e0..4830cb3fee6 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -131,6 +131,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = blbuild;
 	amroutine->ambuildempty = blbuildempty;
 	amroutine->aminsert = blinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index 30eda37afa8..a36d2eaebe7 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -139,6 +139,7 @@ typedef struct IndexAmRoutine
     ambuild_function ambuild;
     ambuildempty_function ambuildempty;
     aminsert_function aminsert;
+    aminsertcleanup_function aminsertcleanup;
     ambulkdelete_function ambulkdelete;
     amvacuumcleanup_function amvacuumcleanup;
     amcanreturn_function amcanreturn;   /* can be NULL */
@@ -359,11 +360,22 @@ aminsert (Relation indexRelation,
    within an SQL statement, it can allocate space
    in <literal>indexInfo-&gt;ii_Context</literal> and store a pointer to the
    data in <literal>indexInfo-&gt;ii_AmCache</literal> (which will be NULL
-   initially).
+   initially). If the data cached contains structures that can be simply pfree'd,
+   they will implicitly be pfree'd. But, if it contains more complex data, such
+   as Buffers or TupleDescs, additional cleanup is necessary. This additional
+   cleanup can be performed in <literal>indexinsertcleanup</literal>.
   </para>
 
   <para>
 <programlisting>
+bool
+aminsertcleanup (IndexInfo *indexInfo);
+</programlisting>
+   Clean up state that was maintained across successive inserts in
+   <literal>indexInfo-&gt;ii_AmCache</literal>. This is useful if the data
+   contained is complex - like Buffers or TupleDescs which need additional
+   cleanup, unlike simple structures that will be implicitly pfree'd.
+<programlisting>
 IndexBulkDeleteResult *
 ambulkdelete (IndexVacuumInfo *info,
               IndexBulkDeleteResult *stats,
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 25338a90e29..fd99b6eb11e 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -58,6 +58,17 @@ typedef struct BrinBuildState
 	BrinMemTuple *bs_dtuple;
 } BrinBuildState;
 
+/*
+ * We use a BrinInsertState to capture running state spanning multiple
+ * brininsert invocations, within the same command.
+ */
+typedef struct BrinInsertState
+{
+	BrinRevmap *bs_rmAccess;
+	BrinDesc   *bs_desc;
+	BlockNumber bs_pages_per_range;
+}			BrinInsertState;
+
 /*
  * Struct used as "opaque" during index scans
  */
@@ -72,6 +83,7 @@ typedef struct BrinOpaque
 
 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
 												  BrinRevmap *revmap, BlockNumber pagesPerRange);
+static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
 static void terminate_brin_buildstate(BrinBuildState *state);
 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
 						  bool include_partial, double *numSummarized, double *numExisting);
@@ -117,6 +129,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = brinbuild;
 	amroutine->ambuildempty = brinbuildempty;
 	amroutine->aminsert = brininsert;
+	amroutine->aminsertcleanup = brininsertcleanup;
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
@@ -140,6 +153,27 @@ brinhandler(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(amroutine);
 }
 
+/*
+ * Initialize a BrinInsertState to maintain state to be used across multiple
+ * tuple inserts, within the same command.
+ */
+static BrinInsertState *
+initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
+{
+	BrinInsertState *bistate;
+	MemoryContext oldcxt;
+
+	oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
+	bistate = palloc0(sizeof(BrinInsertState));
+	bistate->bs_desc = brin_build_desc(idxRel);
+	bistate->bs_rmAccess = brinRevmapInitialize(idxRel,
+												&bistate->bs_pages_per_range);
+	indexInfo->ii_AmCache = bistate;
+	MemoryContextSwitchTo(oldcxt);
+
+	return bistate;
+}
+
 /*
  * A tuple in the heap is being inserted.  To keep a brin index up to date,
  * we need to obtain the relevant index tuple and compare its stored values
@@ -162,14 +196,23 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	BlockNumber pagesPerRange;
 	BlockNumber origHeapBlk;
 	BlockNumber heapBlk;
-	BrinDesc   *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
 	BrinRevmap *revmap;
+	BrinDesc   *bdesc;
 	Buffer		buf = InvalidBuffer;
 	MemoryContext tupcxt = NULL;
 	MemoryContext oldcxt = CurrentMemoryContext;
 	bool		autosummarize = BrinGetAutoSummarize(idxRel);
 
-	revmap = brinRevmapInitialize(idxRel, &pagesPerRange);
+	if (!bistate)
+	{
+		/* First time through in this statement? */
+		bistate = initialize_brin_insertstate(idxRel, indexInfo);
+	}
+
+	revmap = bistate->bs_rmAccess;
+	bdesc = bistate->bs_desc;
+	pagesPerRange = bistate->bs_pages_per_range;
 
 	/*
 	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
@@ -228,14 +271,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		if (!brtup)
 			break;
 
-		/* First time through in this statement? */
-		if (bdesc == NULL)
-		{
-			MemoryContextSwitchTo(indexInfo->ii_Context);
-			bdesc = brin_build_desc(idxRel);
-			indexInfo->ii_AmCache = (void *) bdesc;
-			MemoryContextSwitchTo(oldcxt);
-		}
 		/* First time through in this brininsert call? */
 		if (tupcxt == NULL)
 		{
@@ -306,7 +341,6 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 		break;
 	}
 
-	brinRevmapTerminate(revmap);
 	if (BufferIsValid(buf))
 		ReleaseBuffer(buf);
 	MemoryContextSwitchTo(oldcxt);
@@ -316,6 +350,24 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 	return false;
 }
 
+/*
+ * Callback to clean up the BrinInsertState once all tuple inserts are done.
+ */
+void
+brininsertcleanup(IndexInfo *indexInfo)
+{
+	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
+
+	Assert(bistate);
+	/*
+	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
+	 * as part of its own memory context.
+	 */
+	brinRevmapTerminate(bistate->bs_rmAccess);
+	bistate->bs_rmAccess = NULL;
+	bistate->bs_desc = NULL;
+}
+
 /*
  * Initialize state for a BRIN index scan.
  *
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 7a4cd93f301..a875c5d3d7a 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -64,6 +64,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = ginbuild;
 	amroutine->ambuildempty = ginbuildempty;
 	amroutine->aminsert = gininsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8ef5fa03290..9a1bf8f66cb 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -86,6 +86,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = gistbuild;
 	amroutine->ambuildempty = gistbuildempty;
 	amroutine->aminsert = gistinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 7a025f33cfe..6443ff21bda 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -83,6 +83,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = hashbuild;
 	amroutine->ambuildempty = hashbuildempty;
 	amroutine->aminsert = hashinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index b25b03f7abc..597b763d1f6 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -196,6 +196,21 @@ index_insert(Relation indexRelation,
 											 indexInfo);
 }
 
+/* -------------------------
+ *		index_insert_cleanup - clean up after all index inserts are done
+ * -------------------------
+ */
+void
+index_insert_cleanup(Relation indexRelation,
+					 IndexInfo *indexInfo)
+{
+	RELATION_CHECKS;
+	Assert(indexInfo);
+
+	if (indexRelation->rd_indam->aminsertcleanup)
+		indexRelation->rd_indam->aminsertcleanup(indexInfo);
+}
+
 /*
  * index_beginscan - start a scan of an index with amgettuple
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index a88b36a589a..0930f9b37e3 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -122,6 +122,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = btbuild;
 	amroutine->ambuildempty = btbuildempty;
 	amroutine->aminsert = btinsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = btbulkdelete;
 	amroutine->amvacuumcleanup = btvacuumcleanup;
 	amroutine->amcanreturn = btcanreturn;
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index c112e1e5dd4..30c00876a56 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -70,6 +70,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = spgbuild;
 	amroutine->ambuildempty = spgbuildempty;
 	amroutine->aminsert = spginsert;
+	amroutine->aminsertcleanup = NULL;
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 384b39839a0..2fa2118f3c2 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -233,15 +233,20 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
 	int			i;
 	int			numIndices;
 	RelationPtr indexDescs;
+	IndexInfo **indexInfos;
 
 	numIndices = resultRelInfo->ri_NumIndices;
 	indexDescs = resultRelInfo->ri_IndexRelationDescs;
+	indexInfos = resultRelInfo->ri_IndexRelationInfo;
 
 	for (i = 0; i < numIndices; i++)
 	{
 		if (indexDescs[i] == NULL)
 			continue;			/* shouldn't happen? */
 
+		/* Give the index a chance to do some post-insert cleanup */
+		index_insert_cleanup(indexDescs[i], indexInfos[i]);
+
 		/* Drop lock acquired by ExecOpenIndices */
 		index_close(indexDescs[i], RowExclusiveLock);
 	}
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 995725502a6..244459587fc 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -113,6 +113,9 @@ typedef bool (*aminsert_function) (Relation indexRelation,
 								   bool indexUnchanged,
 								   struct IndexInfo *indexInfo);
 
+/* cleanup after insert */
+typedef void (*aminsertcleanup_function) (struct IndexInfo *indexInfo);
+
 /* bulk delete */
 typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info,
 														 IndexBulkDeleteResult *stats,
@@ -261,6 +264,7 @@ typedef struct IndexAmRoutine
 	ambuild_function ambuild;
 	ambuildempty_function ambuildempty;
 	aminsert_function aminsert;
+	aminsertcleanup_function aminsertcleanup;
 	ambulkdelete_function ambulkdelete;
 	amvacuumcleanup_function amvacuumcleanup;
 	amcanreturn_function amcanreturn;	/* can be NULL */
diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h
index 97ddc925b27..ed231e208eb 100644
--- a/src/include/access/brin_internal.h
+++ b/src/include/access/brin_internal.h
@@ -96,6 +96,7 @@ extern bool brininsert(Relation idxRel, Datum *values, bool *nulls,
 					   IndexUniqueCheck checkUnique,
 					   bool indexUnchanged,
 					   struct IndexInfo *indexInfo);
+extern void brininsertcleanup(struct IndexInfo *indexInfo);
 extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys);
 extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index f31dec6ee0f..80dc8d54066 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -148,6 +148,8 @@ extern bool index_insert(Relation indexRelation,
 						 IndexUniqueCheck checkUnique,
 						 bool indexUnchanged,
 						 struct IndexInfo *indexInfo);
+extern void index_insert_cleanup(Relation indexRelation,
+								 struct IndexInfo *indexInfo);
 
 extern IndexScanDesc index_beginscan(Relation heapRelation,
 									 Relation indexRelation,
-- 
2.41.0

#13Matthias van de Meent
boekewurm+postgres@gmail.com
In reply to: Tomas Vondra (#12)
Re: brininsert optimization opportunity

On Fri, 3 Nov 2023 at 19:37, Tomas Vondra <tomas.vondra@enterprisedb.com> wrote:

Hi,

I took a look at this patch today. I had to rebase the patch, due to
some minor bitrot related to 9f0602539d (but nothing major). I also did
a couple tiny cosmetic tweaks, but other than that the patch seems OK.
See the attached v6.
[...]
Barring objections, I'll try to push this early next week, after another
round of cleanup.

No hard objections: The principle looks fine.

I do think we should choose a better namespace than bs_* for the
fields of BrinInsertState, as BrinBuildState already uses the bs_*
namespace for its fields in the same file, but that's only cosmetic.

Kind regards,

Matthias van de Meent
Neon (https://neon.tech)

#14Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Matthias van de Meent (#13)
Re: brininsert optimization opportunity

On Fri, 3 Nov 2023 at 19:37, Tomas Vondra <tomas.vondra@enterprisedb.com> wrote:

The one thing I'm not entirely sure about is adding new stuff to the
IndexAmRoutine. I don't think we want to end up with too many callbacks
that all AMs have to initialize etc. I can't think of a different/better
way to do this, though.

Yes there is not really an alternative. Also, aminsertcleanup() is very similar
to amvacuumcleanup(), so it is not awkward. Why should vacuum be an
exclusive VIP? :)
And there are other indexam callbacks that not every AM implements. So this
addition is not unprecedented in that sense.

Barring objections, I'll try to push this early next week, after another
round of cleanup.

Many thanks for resurrecting this patch!

On Fri, Nov 3, 2023 at 12:16 PM Matthias van de Meent
<boekewurm+postgres@gmail.com> wrote:

I do think we should choose a better namespace than bs_* for the
fields of BrinInsertState, as BrinBuildState already uses the bs_*
namespace for its fields in the same file, but that's only cosmetic.

bis_* then.

Regards,
Soumyadeep (VMware)

#15Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Soumyadeep Chakraborty (#14)
Re: brininsert optimization opportunity

I've done a bit more cleanup on the last version of the patch (renamed
the fields to start with bis_ as agreed, rephrased the comments / docs /
commit message a bit) and pushed.

Thanks for the patch!

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#16Ashwin Agrawal
ashwinstar@gmail.com
In reply to: Tomas Vondra (#15)
Re: brininsert optimization opportunity

On Sat, Nov 25, 2023 at 12:06 PM Tomas Vondra <tomas.vondra@enterprisedb.com>
wrote:

I've done a bit more cleanup on the last version of the patch (renamed
the fields to start with bis_ as agreed, rephrased the comments / docs /
commit message a bit) and pushed.

Thanks a lot Tomas for helping to drive the patch to completion iteratively
and realizing the benefits.

- Ashwin

#17Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Ashwin Agrawal (#16)
Re: brininsert optimization opportunity

Thanks a lot for reviewing and pushing! :)

Regards,
Soumyadeep (VMware)

#18Richard Guo
guofenglinux@gmail.com
In reply to: Tomas Vondra (#15)
Re: brininsert optimization opportunity

On Sun, Nov 26, 2023 at 4:06 AM Tomas Vondra <tomas.vondra@enterprisedb.com>
wrote:

I've done a bit more cleanup on the last version of the patch (renamed
the fields to start with bis_ as agreed, rephrased the comments / docs /
commit message a bit) and pushed.

It seems that we have an oversight in this commit. If no tuples have
been inserted, there will not be an available insert state during the
cleanup phase. So the Assert in brininsertcleanup() is not always
right. For example:

regression=# update brin_summarize set value = brin_summarize.value;
server closed the connection unexpectedly

So I wonder if we should check 'bistate' and do the cleanup only if
there is an available one, something like below.

--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -359,7 +359,9 @@ brininsertcleanup(IndexInfo *indexInfo)
 {
    BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
-   Assert(bistate);
+   /* We don't have an available insert state, nothing to do */
+   if (!bistate)
+       return;

Thanks
Richard

#19Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Richard Guo (#18)
Re: brininsert optimization opportunity

On Sun, Nov 26, 2023 at 9:28 PM Richard Guo <guofenglinux@gmail.com> wrote:

On Sun, Nov 26, 2023 at 4:06 AM Tomas Vondra <tomas.vondra@enterprisedb.com> wrote:

I've done a bit more cleanup on the last version of the patch (renamed
the fields to start with bis_ as agreed, rephrased the comments / docs /
commit message a bit) and pushed.

It seems that we have an oversight in this commit. If there is no tuple
that has been inserted, we wouldn't have an available insert state in
the clean up phase. So the Assert in brininsertcleanup() is not always
right. For example:

regression=# update brin_summarize set value = brin_summarize.value;
server closed the connection unexpectedly

I wasn't able to repro the issue on 86b64bafc19c4c60136a4038d2a8d1e6eecc59f2.
with UPDATE/INSERT:

postgres=# drop table a;
DROP TABLE
postgres=# create table a(i int);
CREATE TABLE
postgres=# create index on a using brin(i);
CREATE INDEX
postgres=# insert into a select 1 where 1!=1;
INSERT 0 0
postgres=# update a set i = 2 where i = 0;
UPDATE 0
postgres=# update a set i = a.i;
UPDATE 0

This could be because since c5b7ba4e67aeb5d6f824b74f94114d99ed6e42b7,
we have moved ExecOpenIndices()
from ExecInitModifyTable() to ExecInsert(). Since we never open the
indices if nothing is
inserted, we would never attempt to close them with ExecCloseIndices()
while the ii_AmCache
is NULL (which is what causes this assertion failure).

However, it is possible to repro the issue with:
# create empty file
touch /tmp/a.csv
postgres=# create table a(i int);
CREATE TABLE
postgres=# create index on a using brin(i);
CREATE INDEX
postgres=# copy a from '/tmp/a.csv';
TRAP: failed Assert("bistate"), File: "brin.c", Line: 362, PID: 995511

So I wonder if we should check 'bistate' and do the clean up only if

there is an available one, something like below.

Yes, this is the right way to go. Thanks for reporting!

Regards,
Soumyadeep (VMware)

#20Richard Guo
guofenglinux@gmail.com
In reply to: Soumyadeep Chakraborty (#19)
Re: brininsert optimization opportunity

On Mon, Nov 27, 2023 at 1:53 PM Soumyadeep Chakraborty <
soumyadeep2007@gmail.com> wrote:

On Sun, Nov 26, 2023 at 9:28 PM Richard Guo <guofenglinux@gmail.com>
wrote:

It seems that we have an oversight in this commit. If there is no tuple
that has been inserted, we wouldn't have an available insert state in
the clean up phase. So the Assert in brininsertcleanup() is not always
right. For example:

regression=# update brin_summarize set value = brin_summarize.value;
server closed the connection unexpectedly

I wasn't able to repro the issue on
86b64bafc19c4c60136a4038d2a8d1e6eecc59f2.
with UPDATE/INSERT:

This could be because since c5b7ba4e67aeb5d6f824b74f94114d99ed6e42b7,
we have moved ExecOpenIndices()
from ExecInitModifyTable() to ExecInsert(). Since we never open the
indices if nothing is
inserted, we would never attempt to close them with ExecCloseIndices()
while the ii_AmCache
is NULL (which is what causes this assertion failure).

AFAICS we would also open the indices from ExecUpdate(). So if we
update the table in a way that no new tuples are inserted, we will have
this issue. As I showed previously, the query below crashes for me on
latest master (dc9f8a7983).

regression=# update brin_summarize set value = brin_summarize.value;
server closed the connection unexpectedly

There are other code paths that call ExecOpenIndices(), such as
ExecMerge(). I believe it's not hard to create queries that trigger this
Assert for those cases.

Thanks
Richard

#21Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Richard Guo (#20)
Re: brininsert optimization opportunity

On 11/27/23 08:37, Richard Guo wrote:

On Mon, Nov 27, 2023 at 1:53 PM Soumyadeep Chakraborty
<soumyadeep2007@gmail.com <mailto:soumyadeep2007@gmail.com>> wrote:

On Sun, Nov 26, 2023 at 9:28 PM Richard Guo <guofenglinux@gmail.com
<mailto:guofenglinux@gmail.com>> wrote:

It seems that we have an oversight in this commit.  If there is no

tuple

that has been inserted, we wouldn't have an available insert state in
the clean up phase.  So the Assert in brininsertcleanup() is not

always

right.  For example:

regression=# update brin_summarize set value = brin_summarize.value;
server closed the connection unexpectedly

I wasn't able to repro the issue on
86b64bafc19c4c60136a4038d2a8d1e6eecc59f2.
with UPDATE/INSERT:

This could be because since c5b7ba4e67aeb5d6f824b74f94114d99ed6e42b7,
we have moved ExecOpenIndices()
from ExecInitModifyTable() to ExecInsert(). Since we never open the
indices if nothing is
inserted, we would never attempt to close them with ExecCloseIndices()
while the ii_AmCache
is NULL (which is what causes this assertion failure).

AFAICS we would also open the indices from ExecUpdate().  So if we
update the table in a way that no new tuples are inserted, we will have
this issue.  As I showed previously, the query below crashes for me on
latest master (dc9f8a7983).

regression=# update brin_summarize set value = brin_summarize.value;
server closed the connection unexpectedly

There are other code paths that call ExecOpenIndices(), such as
ExecMerge(). I believe it's not hard to create queries that trigger
this Assert for those cases.

FWIW I can readily reproduce it like this:

drop table t;
create table t (a int);
insert into t values (1);
create index on t using brin (a);
update t set a = a;

I however wonder if maybe we should do the check in index_insert_cleanup
and not in the AM callback. That seems simpler / better, because the AM
callbacks then can't make this mistake.

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#22Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Tomas Vondra (#21)
Re: brininsert optimization opportunity

On 11/27/23 11:34, Tomas Vondra wrote:

On 11/27/23 08:37, Richard Guo wrote:

On Mon, Nov 27, 2023 at 1:53 PM Soumyadeep Chakraborty
<soumyadeep2007@gmail.com <mailto:soumyadeep2007@gmail.com>> wrote:

On Sun, Nov 26, 2023 at 9:28 PM Richard Guo <guofenglinux@gmail.com
<mailto:guofenglinux@gmail.com>> wrote:

It seems that we have an oversight in this commit.  If there is no

tuple

that has been inserted, we wouldn't have an available insert state in
the clean up phase.  So the Assert in brininsertcleanup() is not

always

right.  For example:

regression=# update brin_summarize set value = brin_summarize.value;
server closed the connection unexpectedly

I wasn't able to repro the issue on
86b64bafc19c4c60136a4038d2a8d1e6eecc59f2.
with UPDATE/INSERT:

This could be because since c5b7ba4e67aeb5d6f824b74f94114d99ed6e42b7,
we have moved ExecOpenIndices()
from ExecInitModifyTable() to ExecInsert(). Since we never open the
indices if nothing is
inserted, we would never attempt to close them with ExecCloseIndices()
while the ii_AmCache
is NULL (which is what causes this assertion failure).

AFAICS we would also open the indices from ExecUpdate().  So if we
update the table in a way that no new tuples are inserted, we will have
this issue.  As I showed previously, the query below crashes for me on
latest master (dc9f8a7983).

regression=# update brin_summarize set value = brin_summarize.value;
server closed the connection unexpectedly

There are other code paths that call ExecOpenIndices(), such as
ExecMerge(). I believe it's not hard to create queries that trigger
this Assert for those cases.

FWIW I can readily reproduce it like this:

drop table t;
create table t (a int);
insert into t values (1);
create index on t using brin (a);
update t set a = a;

I however wonder if maybe we should do the check in index_insert_cleanup
and not in the AM callback. That seems simpler / better, because the AM
callbacks then can't make this mistake.

I did it this way (check in index_insert_cleanup) and pushed. Thanks for
the report!

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#23Alexander Lakhin
exclusion@gmail.com
In reply to: Tomas Vondra (#15)
Re: brininsert optimization opportunity

Hello Tomas and Soumyadeep,

25.11.2023 23:06, Tomas Vondra wrote:

I've done a bit more cleanup on the last version of the patch (renamed
the fields to start with bis_ as agreed, rephrased the comments / docs /
commit message a bit) and pushed.

Please look at a query, which produces warnings similar to the ones
observed upthread:
CREATE TABLE t(a INT);
INSERT INTO t SELECT x FROM generate_series(1,10000) x;
CREATE INDEX idx ON t USING brin (a);
REINDEX index CONCURRENTLY idx;

WARNING:  resource was not closed: [1863] (rel=base/16384/16389, blockNum=1, flags=0x93800000, refcount=1 1)
WARNING:  resource was not closed: [1862] (rel=base/16384/16389, blockNum=0, flags=0x93800000, refcount=1 1)

The first bad commit for this anomaly is c1ec02be1.

May be you would also want to fix in passing some typos/inconsistencies
introduced with recent brin-related commits:
s/bs_blkno/bt_blkno/
s/emptry/empty/
s/firt/first/
s/indexinsertcleanup/aminsertcleanup/
s/ maxRange/ nextRange/
s/paga /page /

Best regards,
Alexander

#24Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Alexander Lakhin (#23)
Re: brininsert optimization opportunity

On 12/11/23 16:00, Alexander Lakhin wrote:

Hello Tomas and Soumyadeep,

25.11.2023 23:06, Tomas Vondra wrote:

I've done a bit more cleanup on the last version of the patch (renamed
the fields to start with bis_ as agreed, rephrased the comments / docs /
commit message a bit) and pushed.

Please look at a query, which produces warnings similar to the ones
observed upthread:
CREATE TABLE t(a INT);
INSERT INTO t SELECT x FROM generate_series(1,10000) x;
CREATE INDEX idx ON t USING brin (a);
REINDEX index CONCURRENTLY idx;

WARNING:  resource was not closed: [1863] (rel=base/16384/16389,
blockNum=1, flags=0x93800000, refcount=1 1)
WARNING:  resource was not closed: [1862] (rel=base/16384/16389,
blockNum=0, flags=0x93800000, refcount=1 1)

The first bad commit for this anomaly is c1ec02be1.

Thanks for the report. I haven't investigated what the issue is, but it
seems we fail to release the buffers in some situations - I'd bet we
fail to call the cleanup callback in some place, or something like that.
I'll take a look tomorrow.

May be you would also want to fix in passing some typos/inconsistencies
introduced with recent brin-related commits:
s/bs_blkno/bt_blkno/
s/emptry/empty/
s/firt/first/
s/indexinsertcleanup/aminsertcleanup/
s/ maxRange/ nextRange/
s/paga /page /

Definitely. Thanks for noticing those!

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#25Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Tomas Vondra (#24)
2 attachment(s)
Re: brininsert optimization opportunity

On 12/11/23 16:41, Tomas Vondra wrote:

On 12/11/23 16:00, Alexander Lakhin wrote:

Hello Tomas and Soumyadeep,

25.11.2023 23:06, Tomas Vondra wrote:

I've done a bit more cleanup on the last version of the patch (renamed
the fields to start with bis_ as agreed, rephrased the comments / docs /
commit message a bit) and pushed.

Please look at a query, which produces warnings similar to the ones
observed upthread:
CREATE TABLE t(a INT);
INSERT INTO t SELECT x FROM generate_series(1,10000) x;
CREATE INDEX idx ON t USING brin (a);
REINDEX index CONCURRENTLY idx;

WARNING:  resource was not closed: [1863] (rel=base/16384/16389,
blockNum=1, flags=0x93800000, refcount=1 1)
WARNING:  resource was not closed: [1862] (rel=base/16384/16389,
blockNum=0, flags=0x93800000, refcount=1 1)

The first bad commit for this anomaly is c1ec02be1.

Thanks for the report. I haven't investigated what the issue is, but it
seems we fail to release the buffers in some situations - I'd bet we
fail to call the cleanup callback in some place, or something like that.
I'll take a look tomorrow.

Yeah, just as I expected this seems to be a case of failing to call the
index_insert_cleanup after doing some inserts - in this case the inserts
happen in table_index_validate_scan, but validate_index has no idea it
needs to do the cleanup.

The attached 0001 patch fixes this by adding the call to validate_index,
which seems like the proper place as it's where the indexInfo is
allocated and destroyed.

But this makes me think - are there any other places that might call
index_insert without then also doing the cleanup? I did grep for the
index_insert() calls, and it seems OK except for this one.

There's a call in toast_internals.c, but that seems OK because that only
deals with btree indexes (and those don't need any cleanup). The same
logic applies to unique_key_recheck(). The rest goes through
execIndexing.c, which should do the cleanup in ExecCloseIndices().

Note: We should probably call the cleanup even in the btree cases, even
if only to make it clear it needs to be called after index_insert().

I was thinking maybe we should have some explicit call to destroy the
IndexInfo, but that seems rather inconvenient - it'd force everyone to
carefully track lifetimes of the IndexInfo instead of just relying on
memory context reset/destruction. That seems quite error-prone.

I propose we do a much simpler thing instead - allow the cache to be
initialized / cleaned up repeatedly, and make sure it gets reset at
convenient place (typically after index_insert calls that don't go
through execIndexing). That'd mean the cleanup does not need to happen
very far from the index_insert(), which makes the reasoning much easier.
0002 does this.

But maybe there's a better way to ensure the cleanup gets called even
when not using execIndexing.

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

Attachments:

0001-call-cleanup-in-validate_index.patchtext/x-patch; charset=UTF-8; name=0001-call-cleanup-in-validate_index.patchDownload
From 0cf18dd53842e3809b76525867214ce8ff823f32 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Tue, 12 Dec 2023 12:01:07 +0100
Subject: [PATCH 1/2] call cleanup in validate_index

---
 src/backend/catalog/index.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 7b186c0220b..7a0e337a418 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3414,6 +3414,8 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	/* Done with tuplesort object */
 	tuplesort_end(state.tuplesort);
 
+	index_insert_cleanup(indexRelation, indexInfo);
+
 	elog(DEBUG2,
 		 "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples",
 		 state.htups, state.itups, state.tups_inserted);
-- 
2.42.0

0002-call-index_insert_cleanup-repeatedly.patchtext/x-patch; charset=UTF-8; name=0002-call-index_insert_cleanup-repeatedly.patchDownload
From 0c04b53573cf164c5a0108d909f05ffcbae4e72d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Tue, 12 Dec 2023 12:24:05 +0100
Subject: [PATCH 2/2] call index_insert_cleanup repeatedly

---
 src/backend/access/heap/heapam_handler.c |  3 +++
 src/backend/access/index/indexam.c       | 11 ++++++++++-
 src/backend/catalog/index.c              |  2 --
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 7c28dafb728..5db0cf1dffa 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1976,6 +1976,9 @@ heapam_index_validate_scan(Relation heapRelation,
 	/* These may have been pointing to the now-gone estate */
 	indexInfo->ii_ExpressionsState = NIL;
 	indexInfo->ii_PredicateState = NULL;
+
+	/* also make sure the IndexInfo cache gets freed */
+	index_insert_cleanup(indexRelation, indexInfo);
 }
 
 /*
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index f23e0199f08..515d7649c4d 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -207,8 +207,17 @@ index_insert_cleanup(Relation indexRelation,
 	RELATION_CHECKS;
 	Assert(indexInfo);
 
-	if (indexRelation->rd_indam->aminsertcleanup && indexInfo->ii_AmCache)
+	if (!indexInfo->ii_AmCache)
+		return;
+
+	if (indexRelation->rd_indam->aminsertcleanup)
 		indexRelation->rd_indam->aminsertcleanup(indexInfo);
+
+	if (indexInfo->ii_AmCache)
+	{
+		pfree(indexInfo->ii_AmCache);
+		indexInfo->ii_AmCache = NULL;
+	}
 }
 
 /*
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 7a0e337a418..7b186c0220b 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3414,8 +3414,6 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	/* Done with tuplesort object */
 	tuplesort_end(state.tuplesort);
 
-	index_insert_cleanup(indexRelation, indexInfo);
-
 	elog(DEBUG2,
 		 "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples",
 		 state.htups, state.itups, state.tups_inserted);
-- 
2.42.0

#26Soumyadeep Chakraborty
soumyadeep2007@gmail.com
In reply to: Tomas Vondra (#25)
Re: brininsert optimization opportunity

On Tue, Dec 12, 2023 at 3:25 AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:

The attached 0001 patch fixes this by adding the call to validate_index,

which seems like the proper place as it's where the indexInfo is
allocated and destroyed.

Yes, and by reading validate_index's header comment, there is a clear
expectation that we will be adding tuples to the index in the table AM call
table_index_validate_scan (albeit "validating" doesn't necessarily convey this
side effect). So, it makes perfect sense to call it here.

But this makes me think - are there any other places that might call
index_insert without then also doing the cleanup? I did grep for the
index_insert() calls, and it seems OK except for this one.

There's a call in toast_internals.c, but that seems OK because that only
deals with btree indexes (and those don't need any cleanup). The same
logic applies to unique_key_recheck(). The rest goes through
execIndexing.c, which should do the cleanup in ExecCloseIndices().

Note: We should probably call the cleanup even in the btree cases, even
if only to make it clear it needs to be called after index_insert().

Agreed. Doesn't feel great, but yeah all of these btree specific code does call
through index_* functions, instead of calling btree functions directly. So,
ideally we should follow through with that pattern and call the cleanup
explicitly. But we are introducing per-tuple overhead that is totally wasted.
Maybe we can add a comment instead like:

void
toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock)
{
int i;

/*
* Save a few cycles by choosing not to call index_insert_cleanup(). Toast
* indexes are btree, which don't have an aminsertcleanup() anyway.
*/

/* Close relations and clean up things */
...
}

And add something similar for unique_key_recheck()? That said, I am also not
opposed to just accepting these wasted cycles, if the commenting seems wonky.

I propose we do a much simpler thing instead - allow the cache to be
initialized / cleaned up repeatedly, and make sure it gets reset at
convenient place (typically after index_insert calls that don't go
through execIndexing). That'd mean the cleanup does not need to happen
very far from the index_insert(), which makes the reasoning much easier.
0002 does this.

That kind of goes against the ethos of the ii_AmCache, which is meant to capture
state to be used across multiple index inserts. Also, quoting the current docs:

"If the index AM wishes to cache data across successive index insertions
within an SQL statement, it can allocate space
in <literal>indexInfo-&gt;ii_Context</literal> and store a pointer to the
data in <literal>indexInfo-&gt;ii_AmCache</literal> (which will be NULL
initially). After the index insertions complete, the memory will be freed
automatically. If additional cleanup is required (e.g. if the cache contains
buffers and tuple descriptors), the AM may define an optional function
<literal>indexinsertcleanup</literal>, called before the memory is released."

The memory will be freed automatically (as soon as ii_Context goes away). So,
why would we explicitly free it, like in the attached 0002 patch? And the whole
point of the cleanup function is to do something other than freeing memory, as
the docs note highlights so well.

Also, the 0002 patch does introduce per-tuple function call overhead in
heapam_index_validate_scan().

Also, now we have split brain...in certain situations we want to call
index_insert_cleanup() per index insert and in certain cases we would like it
called once for all inserts. Not very easy to understand IMO.

Finally, I don't think that any index AM would have anything to clean up after
every insert.

I had tried (abused) an approach with MemoryContextCallbacks upthread and that
seems a no-go. And yes I agree, having a dual to makeIndexInfo() (like
destroyIndexInfo()) means that we lose the benefits of ii_Context. That could
be disruptive to existing AMs in-core and outside.

All said and done, v1 has my vote.

Regards,
Soumyadeep (VMware)

#27Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Soumyadeep Chakraborty (#26)
Re: brininsert optimization opportunity

On 12/13/23 09:12, Soumyadeep Chakraborty wrote:

On Tue, Dec 12, 2023 at 3:25 AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:

The attached 0001 patch fixes this by adding the call to validate_index,

which seems like the proper place as it's where the indexInfo is
allocated and destroyed.

Yes, and by reading validate_index's header comment, there is a clear
expectation that we will be adding tuples to the index in the table AM call
table_index_validate_scan (albeit "validating" doesn't necessarily convey this
side effect). So, it makes perfect sense to call it here.

OK

But this makes me think - are there any other places that might call
index_insert without then also doing the cleanup? I did grep for the
index_insert() calls, and it seems OK except for this one.

There's a call in toast_internals.c, but that seems OK because that only
deals with btree indexes (and those don't need any cleanup). The same
logic applies to unique_key_recheck(). The rest goes through
execIndexing.c, which should do the cleanup in ExecCloseIndices().

Note: We should probably call the cleanup even in the btree cases, even
if only to make it clear it needs to be called after index_insert().

Agreed. Doesn't feel great, but yeah all of these btree specific code does call
through index_* functions, instead of calling btree functions directly. So,
ideally we should follow through with that pattern and call the cleanup
explicitly. But we are introducing per-tuple overhead that is totally wasted.

I haven't tried but I very much doubt this will be measurable. It's just
a trivial check if a pointer is NULL. We do far more expensive stuff in
this code path.

Maybe we can add a comment instead like:

void
toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock)
{
int i;

/*
* Save a few cycles by choosing not to call index_insert_cleanup(). Toast
* indexes are btree, which don't have an aminsertcleanup() anyway.
*/

/* Close relations and clean up things */
...
}

And add something similar for unique_key_recheck()? That said, I am also not
opposed to just accepting these wasted cycles, if the commenting seems wonky.

I really don't want to do this sort of stuff unless we know it actually
saves something.

I propose we do a much simpler thing instead - allow the cache to be
initialized / cleaned up repeatedly, and make sure it gets reset at
convenient place (typically after index_insert calls that don't go
through execIndexing). That'd mean the cleanup does not need to happen
very far from the index_insert(), which makes the reasoning much easier.
0002 does this.

That kind of goes against the ethos of the ii_AmCache, which is meant
to capture state to be used across multiple index inserts.

Why would it be against the ethos? The point is that we reuse stuff over
multiple index_insert() calls. If we can do that for all inserts, cool.
But if that's difficult, it's maybe better to cache for smaller batches
of inserts (instead of making it more complex for all index AMs, even
those not doing any caching).

Also, quoting the current docs:

"If the index AM wishes to cache data across successive index insertions
within an SQL statement, it can allocate space
in <literal>indexInfo-&gt;ii_Context</literal> and store a pointer to the
data in <literal>indexInfo-&gt;ii_AmCache</literal> (which will be NULL
initially). After the index insertions complete, the memory will be freed
automatically. If additional cleanup is required (e.g. if the cache contains
buffers and tuple descriptors), the AM may define an optional function
<literal>indexinsertcleanup</literal>, called before the memory is released."

The memory will be freed automatically (as soon as ii_Context goes away). So,
why would we explicitly free it, like in the attached 0002 patch? And the whole
point of the cleanup function is to do something other than freeing memory, as
the docs note highlights so well.

Not sure I follow. The whole reason for having the index_insert_cleanup
callback is we can't rely on ii_Context going away, exactly because that
just throws away the memory and we need to release buffers etc.

The only reason why the 0002 patch does pfree() is that it clearly
indicates whether the cache is initialized. We could have a third state
"allocated but not initialized", but that doesn't seem worth it.

If you're saying the docs are misleading in some way, then maybe we need
to clarify that.

Also, the 0002 patch does introduce per-tuple function call overhead in
heapam_index_validate_scan().

How come? The cleanup is done only once after the scan completes. Isn't
that exactly what we want to do?

Also, now we have split brain...in certain situations we want to call
index_insert_cleanup() per index insert and in certain cases we would like it
called once for all inserts. Not very easy to understand IMO.

Finally, I don't think that any index AM would have anything to clean up after
every insert.

But I didn't suggest to call the cleanup after every index insert. If a
function does a bunch of index_insert calls, it'd do the cleanup only
once. The point is that it'd happen "close" to the inserts, when we know
it needs to be done. Yes, it might happen multiple times for the same
query, but that still likely saves quite a bit of work (compared to
per-insert init+cleanup).

We need to call the cleanup at some point, and the only alternative I
can think of is to call it in every place that calls BuildIndexInfo
(unless we can guarantee the place can't do index_insert).

I had tried (abused) an approach with MemoryContextCallbacks upthread and that
seems a no-go. And yes I agree, having a dual to makeIndexInfo() (like
destroyIndexInfo()) means that we lose the benefits of ii_Context. That could
be disruptive to existing AMs in-core and outside.

What do you mean by "dual makeIndexInfo"?

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#28James Wang
jwang25610@gmail.com
In reply to: Tomas Vondra (#27)
Re: brininsert optimization opportunity

Hi All, not sure how to "Specify thread msgid" - I chose one which I think is close to my new feature request.

query:

SELECT count(1) FROM table1 t1 JOIN table2 t2 ON t1.id = t2.id WHERE t1.a_indexed_col='some_value' OR t2.a_indexed_col='some_value';

can the server automatically replace the OR logic above with UNION please? i.e. replace it with:

(SELECT count(1) FROM table1 t1 JOIN table2 t2 ON t1.id = t2.id WHERE t1.a_indexed_col='some_value' )
UNION
(SELECT count(1) FROM table1 t1 JOIN table2 t2 ON t1.id = t2.id WHERE t2.a_indexed_col='some_value');

Thanks

#29Alvaro Herrera
alvherre@alvh.no-ip.org
In reply to: James Wang (#28)
Re: brininsert optimization opportunity

On 2023-Dec-21, James Wang wrote:

Hi All, not sure how to "Specify thread msgid" - choose one which i think is close to my new feature request.

Hello James, based on the "Specify thread msgid" message it looks like
you were trying to request a feature using the Commitfest website? That
won't work; the commitfest website is only intended as a tracker of
in-progress development work. Without a Postgres code patch, that
website doesn't help you any. What you have done amounts to hijacking
an unrelated mailing list thread, which is discouraged and frowned upon.

That said, sadly we don't have any official feature request system.
Please start a new thread by composing an entirely new message to
pgsql-hackers@lists.postgresql.org, and don't use the commitfest
website for it.

That said,

query:

SELECT count(1) FROM table1 t1 JOIN table2 t2 ON t1.id = t2.id WHERE t1.a_indexed_col='some_value' OR t2.a_indexed_col='some_value';

can the server automatically replace the OR logic above with UNION please? i.e. replace it with:

(SELECT count(1) FROM table1 t1 JOIN table2 t2 ON t1.id = t2.id WHERE t1.a_indexed_col='some_value' )
UNION
(SELECT count(1) FROM table1 t1 JOIN table2 t2 ON t1.id = t2.id WHERE t2.a_indexed_col='some_value');

I have the feeling that this has already been discussed, but I can't
find anything useful in the mailing list archives.

--
Álvaro Herrera Breisgau, Deutschland — https://www.EnterpriseDB.com/
"Saca el libro que tu religión considere como el indicado para encontrar la
oración que traiga paz a tu alma. Luego rebootea el computador
y ve si funciona" (Carlos Duclós)

#30Alvaro Herrera
alvherre@alvh.no-ip.org
In reply to: Tomas Vondra (#25)
Re: brininsert optimization opportunity

On 2023-Dec-12, Tomas Vondra wrote:

I propose we do a much simpler thing instead - allow the cache to be
initialized / cleaned up repeatedly, and make sure it gets reset at
convenient place (typically after index_insert calls that don't go
through execIndexing). That'd mean the cleanup does not need to happen
very far from the index_insert(), which makes the reasoning much easier.
0002 does this.

I'm not in love with this 0002 patch; I think the layering after 0001 is
correct in that the insert_cleanup call should remain in validate_index
and called after the whole thing is done, but 0002 changes things so
that now every table AM has to worry about doing this correctly; and a
bug of omission will not be detected unless you have a BRIN index on
such a table and happen to use CREATE INDEX CONCURRENTLY. So a
developer has essentially zero chance to do things correctly, which I
think we'd rather avoid.

So I think we should do 0001 and perhaps some further tweaks to the
original brininsert optimization commit: I think the aminsertcleanup
callback should receive the indexRelation as first argument; and also I
think it's not index_insert_cleanup() job to worry about whether
ii_AmCache is NULL or not, but instead the callback should be invoked
always, and then it's aminsertcleanup job to do nothing if ii_AmCache is
NULL. That way, the index AM API doesn't have to worry about which
parts of IndexInfo (or the indexRelation) is aminsertcleanup going to
care about. If we decide to change this, then the docs also need a bit
of tweaking I think.

Lastly, I kinda disagree with the notion that only some of the callers
of aminsert should call aminsertcleanup, even though btree doesn't have
an aminsertcleanup and thus it can't affect TOAST or catalogs. Maybe we
can turn index_insert_cleanup into an inline function, which can quickly
do nothing if aminsertcleanup isn't defined. Then we no longer have the
layering violation where we assume that btree doesn't care. But the
proposed change in this paragraph can be maybe handled separately to
avoid confusing things with the bugfix in the two paragraphs above.

--
Álvaro Herrera 48°01'N 7°57'E — https://www.EnterpriseDB.com/
Essentially, you're proposing Kevlar shoes as a solution for the problem
that you want to walk around carrying a loaded gun aimed at your foot.
(Tom Lane)

#31Alvaro Herrera
alvherre@alvh.no-ip.org
In reply to: Alvaro Herrera (#30)
1 attachment(s)
Re: brininsert optimization opportunity

On 2024-Jan-08, Alvaro Herrera wrote:

So I think we should do 0001 and perhaps some further tweaks to the
original brininsert optimization commit: [...]

So I propose the attached patch, which should fix the reported bug and
the things I mentioned above, and also the typos Alexander mentioned
elsewhere in the thread.

Lastly, I kinda disagree with the notion that only some of the callers
of aminsert should call aminsertcleanup, even though btree doesn't have
an aminsertcleanup and thus it can't affect TOAST or catalogs. [...]

I didn't do anything about this.

--
Álvaro Herrera Breisgau, Deutschland — https://www.EnterpriseDB.com/

Attachments:

v2-0001-call-cleanup-in-validate_index.patchtext/x-diff; charset=utf-8Download
From 5eff6872a446c7a3e86607b5d73a18e1a95b7da7 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Tue, 12 Dec 2023 12:01:07 +0100
Subject: [PATCH v2] call cleanup in validate_index

---
 doc/src/sgml/indexam.sgml          | 14 +++++++-------
 src/backend/access/brin/brin.c     | 19 +++++++++++--------
 src/backend/access/index/indexam.c |  5 ++---
 src/backend/catalog/index.c        |  2 ++
 src/include/access/amapi.h         |  2 +-
 src/include/access/brin_internal.h |  2 +-
 6 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index cc4135e394..4817acd31f 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -367,21 +367,21 @@ aminsert (Relation indexRelation,
    within an SQL statement, it can allocate space
    in <literal>indexInfo-&gt;ii_Context</literal> and store a pointer to the
    data in <literal>indexInfo-&gt;ii_AmCache</literal> (which will be NULL
-   initially). After the index insertions complete, the memory will be freed
-   automatically. If additional cleanup is required (e.g. if the cache contains
-   buffers and tuple descriptors), the AM may define an optional function
-   <literal>indexinsertcleanup</literal>, called before the memory is released.
+   initially).  If resources other than memory have to be released
+   after index insertions, <function>aminsertcleanup</function> may
+   be provided, which will be called before the memory is released.
   </para>
 
   <para>
 <programlisting>
 void
-aminsertcleanup (IndexInfo *indexInfo);
+aminsertcleanup (Relation indexRelation,
+                 IndexInfo *indexInfo);
 </programlisting>
    Clean up state that was maintained across successive inserts in
    <literal>indexInfo-&gt;ii_AmCache</literal>. This is useful if the data
-   requires additional cleanup steps, and simply releasing the memory is not
-   sufficient.
+   requires additional cleanup steps (e.g., releasing pinned
+   buffers), and simply releasing the memory is not sufficient.
   </para>
 
   <para>
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 1087a9011e..6ac12b9ba8 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -222,7 +222,7 @@ static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
 								BrinMemTuple *dtup, const Datum *values, const bool *nulls);
 static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
 static void brin_fill_empty_ranges(BrinBuildState *state,
-								   BlockNumber prevRange, BlockNumber maxRange);
+								   BlockNumber prevRange, BlockNumber nextRange);
 
 /* parallel index builds */
 static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
@@ -498,11 +498,14 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
  * Callback to clean up the BrinInsertState once all tuple inserts are done.
  */
 void
-brininsertcleanup(IndexInfo *indexInfo)
+brininsertcleanup(Relation idx, IndexInfo *indexInfo)
 {
-	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
+	BrinInsertState *bistate;
 
-	Assert(bistate);
+	if (indexInfo->ii_AmCache == NULL)
+		return;
+
+	bistate = (BrinInsertState *) indexInfo->ii_AmCache;
 
 	/*
 	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
@@ -1149,8 +1152,8 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	 *
 	 * XXX plan_create_index_workers makes the number of workers dependent on
 	 * maintenance_work_mem, requiring 32MB for each worker. That makes sense
-	 * for btree, but not for BRIN, which can do away with much less memory.
-	 * So maybe make that somehow less strict, optionally?
+	 * for btree, but not for BRIN, which can do with much less memory.  So
+	 * maybe make that somehow less strict, optionally?
 	 */
 	if (indexInfo->ii_ParallelWorkers > 0)
 		_brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
@@ -2543,7 +2546,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
 
 	/*
 	 * Initialize BrinMemTuple we'll use to union summaries from workers (in
-	 * case they happened to produce parts of the same paga range).
+	 * case they happened to produce parts of the same page range).
 	 */
 	memtuple = brin_new_memtuple(state->bs_bdesc);
 
@@ -2867,7 +2870,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
  * specified block number. The empty tuple is initialized only once, when it's
  * needed for the first time, stored in the memory context bs_context to ensure
  * proper life span, and reused on following calls. All empty tuples are
- * exactly the same except for the bs_blkno field, which is set to the value
+ * exactly the same except for the bt_blkno field, which is set to the value
  * in blkno parameter.
  */
 static void
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 63dff101e2..ff84bf0fa0 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -205,10 +205,9 @@ index_insert_cleanup(Relation indexRelation,
 					 IndexInfo *indexInfo)
 {
 	RELATION_CHECKS;
-	Assert(indexInfo);
 
-	if (indexRelation->rd_indam->aminsertcleanup && indexInfo->ii_AmCache)
-		indexRelation->rd_indam->aminsertcleanup(indexInfo);
+	if (indexRelation->rd_indam->aminsertcleanup)
+		indexRelation->rd_indam->aminsertcleanup(indexRelation, indexInfo);
 }
 
 /*
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 88f7994b5a..40ac683b35 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3414,6 +3414,8 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	/* Done with tuplesort object */
 	tuplesort_end(state.tuplesort);
 
+	index_insert_cleanup(indexRelation, indexInfo);
+
 	elog(DEBUG2,
 		 "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples",
 		 state.htups, state.itups, state.tups_inserted);
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 2c6c307efc..683ad13f07 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -114,7 +114,7 @@ typedef bool (*aminsert_function) (Relation indexRelation,
 								   struct IndexInfo *indexInfo);
 
 /* cleanup after insert */
-typedef void (*aminsertcleanup_function) (struct IndexInfo *indexInfo);
+typedef void (*aminsertcleanup_function) (Relation indexRelation, struct IndexInfo *indexInfo);
 
 /* bulk delete */
 typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info,
diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h
index 1c7eabe604..a5a9772621 100644
--- a/src/include/access/brin_internal.h
+++ b/src/include/access/brin_internal.h
@@ -96,7 +96,7 @@ extern bool brininsert(Relation idxRel, Datum *values, bool *nulls,
 					   IndexUniqueCheck checkUnique,
 					   bool indexUnchanged,
 					   struct IndexInfo *indexInfo);
-extern void brininsertcleanup(struct IndexInfo *indexInfo);
+extern void brininsertcleanup(Relation index, struct IndexInfo *indexInfo);
 extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys);
 extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
-- 
2.39.2

#32Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Alvaro Herrera (#30)
Re: brininsert optimization opportunity

On 1/8/24 16:51, Alvaro Herrera wrote:

On 2023-Dec-12, Tomas Vondra wrote:

I propose we do a much simpler thing instead - allow the cache may be
initialized / cleaned up repeatedly, and make sure it gets reset at
convenient place (typically after index_insert calls that don't go
through execIndexing). That'd mean the cleanup does not need to happen
very far from the index_insert(), which makes the reasoning much easier.
0002 does this.

I'm not in love with this 0002 patch; I think the layering after 0001 is
correct in that the insert_cleanup call should remain in validate_index
and called after the whole thing is done, but 0002 changes things so
that now every table AM has to worry about doing this correctly; and a
bug of omission will not be detected unless you have a BRIN index on
such a table and happen to use CREATE INDEX CONCURRENTLY. So a
developer has essentially zero chance to do things correctly, which I
think we'd rather avoid.

True. If the AM code does not need to worry about this kind of stuff,
that would be good / less error prone.

One thing that is not very clear to me is that I don't think there's a
very good way to determine which places need the cleanup call. Because
it depends on (a) what kind of index is used and (b) what happens in the
code called earlier (which may easily do arbitrary stuff). Which means
we have to call the cleanup whenever the code *might* have done inserts
into the index. Maybe it's not such an issue in practice, though.

So I think we should do 0001 and perhaps some further tweaks to the
original brininsert optimization commit: I think the aminsertcleanup
callback should receive the indexRelation as first argument; and also I
think it's not index_insert_cleanup() job to worry about whether
ii_AmCache is NULL or not, but instead the callback should be invoked
always, and then it's aminsertcleanup job to do nothing if ii_AmCache is
NULL. That way, the index AM API doesn't have to worry about which
parts of IndexInfo (or the indexRelation) is aminsertcleanup going to
care about. If we decide to change this, then the docs also need a bit
of tweaking I think.

Yeah, passing the indexRelation to the am callback seems reasonable.
It's more consistent what we do for other callbacks, and perhaps the
callback might need the indexRelation.

I don't quite see why we should invoke the callback with ii_AmCache=NULL
though. If there's nothing cached, why bother? It just means all cleanup
callbacks have to do this NULL check on their own.

Lastly, I kinda disagree with the notion that only some of the callers
of aminsert should call aminsertcleanup, even though btree doesn't have
an aminsertcleanup and thus it can't affect TOAST or catalogs. Maybe we
can turn index_insert_cleanup into an inline function, which can quickly
do nothing if aminsertcleanup isn't defined. Then we no longer have the
layering violation where we assume that btree doesn't care. But the
proposed change in this paragraph can be maybe handled separately to
avoid confusing things with the bugfix in the two paragraphs above.

After thinking about this a bit more I agree with you - we should call
the cleanup from each place calling aminsert, even if it's for nbtree
(or other index types that don't require cleanup at the moment).

I wonder if there's a nice way to check this in assert-enabled builds?
Could we tweak nbtree (or index AM in general) to check that all places
that called aminsert also called aminsertcleanup?

For example, I was thinking we might add a flag to IndexInfo (separate
from the ii_AmCache), tracking if aminsert() was called, and then later
check the aminsertcleanup() got called too. The problem however is
there's no obviously convenient place for this check, because IndexInfo
is not freed explicitly ...

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#33Alvaro Herrera
alvherre@alvh.no-ip.org
In reply to: Tomas Vondra (#32)
Re: brininsert optimization opportunity

On 2024-Feb-13, Tomas Vondra wrote:

One thing that is not very clear to me is that I don't think there's a
very good way to determine which places need the cleanup call. Because
it depends on (a) what kind of index is used and (b) what happens in the
code called earlier (which may easily do arbitrary stuff). Which means
we have to call the cleanup whenever the code *might* have done inserts
into the index. Maybe it's not such an issue in practice, though.

I think it's not an issue, or rather that we should not try to guess.
Instead make it a simple rule: if aminsert is called, then
aminsertcleanup must be called afterwards, period.

On 1/8/24 16:51, Alvaro Herrera wrote:

So I think we should do 0001 and perhaps some further tweaks to the
original brininsert optimization commit: I think the aminsertcleanup
callback should receive the indexRelation as first argument; and also I
think it's not index_insert_cleanup() job to worry about whether
ii_AmCache is NULL or not, but instead the callback should be invoked
always, and then it's aminsertcleanup job to do nothing if ii_AmCache is
NULL. [...]

I don't quite see why we should invoke the callback with ii_AmCache=NULL
though. If there's nothing cached, why bother? It just means all cleanup
callbacks have to do this NULL check on their own.

Guessing that aminsertcleanup is not needed when ii_AmCache is NULL
seems like a leaky abstraction. I propose to have only the AM know
whether the cleanup call is important or not, without
index_insert_cleanup assuming that it's related to ii_AmCache. Somebody
could decide to have something completely different during insert
cleanup, which is not in ii_AmCache.

I wonder if there's a nice way to check this in assert-enabled builds?
Could we tweak nbtree (or index AM in general) to check that all places
that called aminsert also called aminsertcleanup?

For example, I was thinking we might add a flag to IndexInfo (separate
from the ii_AmCache), tracking if aminsert() was called, and then later
check the aminsertcleanup() got called too. The problem however is
there's no obviously convenient place for this check, because IndexInfo
is not freed explicitly ...

I agree it would be nice to have a way to verify, but it doesn't seem
100% essential. After all, it's not very common to add new calls to
aminsert.

--
Álvaro Herrera 48°01'N 7°57'E — https://www.EnterpriseDB.com/

#34Michael Paquier
michael@paquier.xyz
In reply to: Alvaro Herrera (#33)
Re: brininsert optimization opportunity

On Thu, Feb 29, 2024 at 01:20:39PM +0100, Alvaro Herrera wrote:

I think it's not an issue, or rather that we should not try to guess.
Instead make it a simple rule: if aminsert is called, then
aminsertcleanup must be called afterwards, period.

I agree it would be nice to have a way to verify, but it doesn't seem
100% essential. After all, it's not very common to add new calls to
aminsert.

This thread is listed as an open item. What's the follow-up plan?
The last email of this thread is dated as of the 29th of February,
which was 6 weeks ago.
--
Michael

#35Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Michael Paquier (#34)
Re: brininsert optimization opportunity

On 4/18/24 09:07, Michael Paquier wrote:

On Thu, Feb 29, 2024 at 01:20:39PM +0100, Alvaro Herrera wrote:

I think it's not an issue, or rather that we should not try to guess.
Instead make it a simple rule: if aminsert is called, then
aminsertcleanup must be called afterwards, period.

I agree it would be nice to have a way to verify, but it doesn't seem
100% essential. After all, it's not very common to add new calls to
aminsert.

This thread is listed as an open item. What's the follow-up plan?
The last email of this thread is dated as of the 29th of February,
which was 6 weeks ago.

Apologies, I got distracted by the other patches. The bug is still
there, I believe the patch shared by Alvaro in [1] is the right way to
deal with it. I'll take care of that today/tomorrow.

[1]: /messages/by-id/202401091043.e3nrqiad6gb7@alvherre.pgsql
/messages/by-id/202401091043.e3nrqiad6gb7@alvherre.pgsql

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#36Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Tomas Vondra (#35)
2 attachment(s)
Re: brininsert optimization opportunity

Hi,

Here's two patched to deal with this open item. 0001 is a trivial fix of
typos and wording, I moved it into a separate commit for clarity. 0002
does the actual fix - adding the index_insert_cleanup(). It's 99% the
patch Alvaro shared some time ago, with only some minor formatting
tweaks by me.

I've also returned to this Alvaro's comment:

Lastly, I kinda disagree with the notion that only some of the callers
of aminsert should call aminsertcleanup, even though btree doesn't
have an aminsertcleanup and thus it can't affect TOAST or catalogs.

which was a reaction to my earlier statement about places calling
index_insert():

There's a call in toast_internals.c, but that seems OK because that
only deals with btree indexes (and those don't need any cleanup). The
same logic applies to unique_key_recheck(). The rest goes through
execIndexing.c, which should do the cleanup in ExecCloseIndices().

I think Alvaro is right, so I went through all index_insert() callers
and checked which need the cleanup. Luckily there's not that many of
them, only 5 in total call index_insert() directly:

1) toast_save_datum (src/backend/access/common/toast_internals.c)

This is safe, because the index_insert() passes indexInfo=NULL, so
there can't possibly be any cache. If we ever decide to pass a valid
indexInfo, we can add the cleanup, now it seems pointless.

Note: If someone created a BRIN index on a TOAST table, that'd already
crash, because BRIN blindly dereferences the indexInfo. Maybe that
should be fixed, but we don't support CREATE INDEX on TOAST tables.

2) heapam_index_validate_scan (src/backend/access/heap/heapam_handler.c)

Covered by the committed fix, adding cleanup to validate_index.

3) CatalogIndexInsert (src/backend/catalog/indexing.c)

Covered by all callers also calling CatalogCloseIndexes, which in turn
calls ExecCloseIndices and cleanup.

4) unique_key_recheck (src/backend/commands/constraint.c)

This seems like the only place missing the cleanup call.

5) ExecInsertIndexTuples (src/backend/executor/execIndexing.c)

Should be covered by ExecCloseIndices, called after the insertions.

So it seems only (4) unique_key_recheck needs the extra call (it can't
really happen higher because the indexInfo is a local variable). So the
0002 patch adds the call.

The patch also adds a test for this (or rather tweaks an existing one).

It's a bit too late for me to push this now, I'll do so early tomorrow.

regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

Attachments:

0001-Fix-a-couple-typos-in-BRIN-code.patchtext/x-patch; charset=UTF-8; name=0001-Fix-a-couple-typos-in-BRIN-code.patchDownload
From 0f89677b99b4b70ddfcc6c2cd08f433584bf65aa Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 18 Apr 2024 22:22:37 +0200
Subject: [PATCH 1/2] Fix a couple typos in BRIN code

Typos introduced by commits c1ec02be1d79, b43757171470 and dae761a87eda.

Author: Alvaro Herrera
Reported-by: Alexander Lakhin
Discussion: https://postgr.es/m/202401091043.e3nrqiad6gb7@alvherre.pgsql
---
 doc/src/sgml/indexam.sgml      |  2 +-
 src/backend/access/brin/brin.c | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index 76ac0fcddd7..18cf23296f2 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -370,7 +370,7 @@ aminsert (Relation indexRelation,
    initially). After the index insertions complete, the memory will be freed
    automatically. If additional cleanup is required (e.g. if the cache contains
    buffers and tuple descriptors), the AM may define an optional function
-   <literal>indexinsertcleanup</literal>, called before the memory is released.
+   <literal>aminsertcleanup</literal>, called before the memory is released.
   </para>
 
   <para>
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 32722f0961b..4f708bba658 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -222,7 +222,7 @@ static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
 								BrinMemTuple *dtup, const Datum *values, const bool *nulls);
 static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
 static void brin_fill_empty_ranges(BrinBuildState *state,
-								   BlockNumber prevRange, BlockNumber maxRange);
+								   BlockNumber prevRange, BlockNumber nextRange);
 
 /* parallel index builds */
 static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
@@ -1151,8 +1151,8 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	 *
 	 * XXX plan_create_index_workers makes the number of workers dependent on
 	 * maintenance_work_mem, requiring 32MB for each worker. That makes sense
-	 * for btree, but not for BRIN, which can do away with much less memory.
-	 * So maybe make that somehow less strict, optionally?
+	 * for btree, but not for BRIN, which can do with much less memory. So
+	 * maybe make that somehow less strict, optionally?
 	 */
 	if (indexInfo->ii_ParallelWorkers > 0)
 		_brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
@@ -2621,7 +2621,7 @@ _brin_parallel_merge(BrinBuildState *state)
 
 	/*
 	 * Initialize BrinMemTuple we'll use to union summaries from workers (in
-	 * case they happened to produce parts of the same paga range).
+	 * case they happened to produce parts of the same page range).
 	 */
 	memtuple = brin_new_memtuple(state->bs_bdesc);
 
@@ -2932,7 +2932,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
  * specified block number. The empty tuple is initialized only once, when it's
  * needed for the first time, stored in the memory context bs_context to ensure
  * proper life span, and reused on following calls. All empty tuples are
- * exactly the same except for the bs_blkno field, which is set to the value
+ * exactly the same except for the bt_blkno field, which is set to the value
  * in blkno parameter.
  */
 static void
-- 
2.44.0

0002-Add-index_insert_cleanup-call-to-validate_index.patchtext/x-patch; charset=UTF-8; name=0002-Add-index_insert_cleanup-call-to-validate_index.patchDownload
From b9f93f05fb0060135eddb7f68f754793d61fe08b Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 18 Apr 2024 23:58:30 +0200
Subject: [PATCH 2/2] Add index_insert_cleanup call to validate_index

The optimization for inserts into BRIN indexes added by commit
c1ec02be1d79 relies on a cache that needs to be explicitly released
after calling index_insert(). The commit however failed to call the
cleanup in validate_index(), as it calls index_insert() indirectly
through table_index_validate_scan().

Fixed by adding the missing call to validate_index(), and also to
unique_key_recheck().

The commit does two additional improvements. It adds the index as the
first arguments to aminsertcleanup(), to make it more like the other AM
callbacks. And it calls the aminsertcleanup() always, even if the
ii_AmCache is NULL - this means it's up to the AM callback to decide if
cleanup is needed.

Author: Alvaro Herrera, Tomas Vondra
Reported-by: Alexander Lakhin
---
 doc/src/sgml/indexam.sgml          | 14 +++++++-------
 src/backend/access/brin/brin.c     |  6 ++++--
 src/backend/access/index/indexam.c |  5 ++---
 src/backend/catalog/index.c        |  3 +++
 src/backend/commands/constraint.c  |  2 ++
 src/include/access/amapi.h         |  3 ++-
 src/include/access/brin_internal.h |  2 +-
 src/test/regress/expected/brin.out |  3 ++-
 src/test/regress/sql/brin.sql      |  3 ++-
 9 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index 18cf23296f2..e3c1539a1e3 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -367,21 +367,21 @@ aminsert (Relation indexRelation,
    within an SQL statement, it can allocate space
    in <literal>indexInfo-&gt;ii_Context</literal> and store a pointer to the
    data in <literal>indexInfo-&gt;ii_AmCache</literal> (which will be NULL
-   initially). After the index insertions complete, the memory will be freed
-   automatically. If additional cleanup is required (e.g. if the cache contains
-   buffers and tuple descriptors), the AM may define an optional function
-   <literal>aminsertcleanup</literal>, called before the memory is released.
+   initially).  If resources other than memory have to be released after
+   index insertions, <function>aminsertcleanup</function> may be provided,
+   which will be called before the memory is released.
   </para>
 
   <para>
 <programlisting>
 void
-aminsertcleanup (IndexInfo *indexInfo);
+aminsertcleanup (Relation indexRelation,
+                 IndexInfo *indexInfo);
 </programlisting>
    Clean up state that was maintained across successive inserts in
    <literal>indexInfo-&gt;ii_AmCache</literal>. This is useful if the data
-   requires additional cleanup steps, and simply releasing the memory is not
-   sufficient.
+   requires additional cleanup steps (e.g., releasing pinned buffers), and
+   simply releasing the memory is not sufficient.
   </para>
 
   <para>
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 4f708bba658..bf28400dd84 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -500,11 +500,13 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
  * Callback to clean up the BrinInsertState once all tuple inserts are done.
  */
 void
-brininsertcleanup(IndexInfo *indexInfo)
+brininsertcleanup(Relation index, IndexInfo *indexInfo)
 {
 	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
 
-	Assert(bistate);
+	/* bail out if cache not initialized */
+	if (indexInfo->ii_AmCache == NULL)
+		return;
 
 	/*
 	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 7510159fc8d..dcd04b813d8 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -242,10 +242,9 @@ index_insert_cleanup(Relation indexRelation,
 					 IndexInfo *indexInfo)
 {
 	RELATION_CHECKS;
-	Assert(indexInfo);
 
-	if (indexRelation->rd_indam->aminsertcleanup && indexInfo->ii_AmCache)
-		indexRelation->rd_indam->aminsertcleanup(indexInfo);
+	if (indexRelation->rd_indam->aminsertcleanup)
+		indexRelation->rd_indam->aminsertcleanup(indexRelation, indexInfo);
 }
 
 /*
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 9b7ef71d6fe..5a8568c55c9 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3402,6 +3402,9 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	/* Done with tuplesort object */
 	tuplesort_end(state.tuplesort);
 
+	/* Make sure to release resources cached in indexInfo (if needed). */
+	index_insert_cleanup(indexRelation, indexInfo);
+
 	elog(DEBUG2,
 		 "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples",
 		 state.htups, state.itups, state.tups_inserted);
diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c
index 94d491b7541..00880446c49 100644
--- a/src/backend/commands/constraint.c
+++ b/src/backend/commands/constraint.c
@@ -197,6 +197,8 @@ unique_key_recheck(PG_FUNCTION_ARGS)
 
 	ExecDropSingleTupleTableSlot(slot);
 
+	index_insert_cleanup(indexRel, indexInfo);
+
 	index_close(indexRel, RowExclusiveLock);
 
 	return PointerGetDatum(NULL);
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 00300dd720e..f25c9d58a7d 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -114,7 +114,8 @@ typedef bool (*aminsert_function) (Relation indexRelation,
 								   struct IndexInfo *indexInfo);
 
 /* cleanup after insert */
-typedef void (*aminsertcleanup_function) (struct IndexInfo *indexInfo);
+typedef void (*aminsertcleanup_function) (Relation indexRelation,
+										  struct IndexInfo *indexInfo);
 
 /* bulk delete */
 typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info,
diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h
index 1c7eabe6041..a5a9772621c 100644
--- a/src/include/access/brin_internal.h
+++ b/src/include/access/brin_internal.h
@@ -96,7 +96,7 @@ extern bool brininsert(Relation idxRel, Datum *values, bool *nulls,
 					   IndexUniqueCheck checkUnique,
 					   bool indexUnchanged,
 					   struct IndexInfo *indexInfo);
-extern void brininsertcleanup(struct IndexInfo *indexInfo);
+extern void brininsertcleanup(Relation index, struct IndexInfo *indexInfo);
 extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys);
 extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out
index 2662bb6ed43..d6779d8c7d2 100644
--- a/src/test/regress/expected/brin.out
+++ b/src/test/regress/expected/brin.out
@@ -575,6 +575,7 @@ DROP TABLE brintest_unlogged;
 -- test that the insert optimization works if no rows end up inserted
 CREATE TABLE brin_insert_optimization (a int);
 INSERT INTO brin_insert_optimization VALUES (1);
-CREATE INDEX ON brin_insert_optimization USING brin (a);
+CREATE INDEX brin_insert_optimization_idx ON brin_insert_optimization USING brin (a);
 UPDATE brin_insert_optimization SET a = a;
+REINDEX INDEX CONCURRENTLY brin_insert_optimization_idx;
 DROP TABLE brin_insert_optimization;
diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql
index 0d3beabb3d7..695cfad4bea 100644
--- a/src/test/regress/sql/brin.sql
+++ b/src/test/regress/sql/brin.sql
@@ -519,6 +519,7 @@ DROP TABLE brintest_unlogged;
 -- test that the insert optimization works if no rows end up inserted
 CREATE TABLE brin_insert_optimization (a int);
 INSERT INTO brin_insert_optimization VALUES (1);
-CREATE INDEX ON brin_insert_optimization USING brin (a);
+CREATE INDEX brin_insert_optimization_idx ON brin_insert_optimization USING brin (a);
 UPDATE brin_insert_optimization SET a = a;
+REINDEX INDEX CONCURRENTLY brin_insert_optimization_idx;
 DROP TABLE brin_insert_optimization;
-- 
2.44.0

#37Tomas Vondra
tomas.vondra@enterprisedb.com
In reply to: Tomas Vondra (#36)
Re: brininsert optimization opportunity

On 4/19/24 00:13, Tomas Vondra wrote:

...

It's a bit too late for me to push this now, I'll do so early tomorrow.

FWIW I've pushed both patches, which resolves the open item, so I've
moved it to the "resolved" part on wiki.

thanks

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#38Michael Paquier
michael@paquier.xyz
In reply to: Tomas Vondra (#37)
Re: brininsert optimization opportunity

On Fri, Apr 19, 2024 at 04:14:00PM +0200, Tomas Vondra wrote:

FWIW I've pushed both patches, which resolves the open item, so I've
moved it to the "resolved" part on wiki.

Thanks, Tomas!
--
Michael