[BUG] Error in BRIN summarization

Started by Anastasia Lubennikova over 5 years ago · 17 messages
#1 Anastasia Lubennikova
a.lubennikova@postgrespro.ru
2 attachment(s)

One of our clients caught the error 'failed to find parent tuple for
heap-only tuple at (50661,130) in table "tbl"' in PostgreSQL v12.

Steps to reproduce (REL_12_STABLE):

1) Create a table with a primary key, create a BRIN index, and fill the
table with some initial data:

create table tbl (id int primary key, a int) with (fillfactor=50);
create index idx on tbl using brin (a) with (autosummarize=on);
insert into tbl select i, i from generate_series(0,100000) as i;

2) Run the script brin_test.sql using pgbench:

pgbench postgres -f ../review/brin_test.sql -n -T 120

The script is a bit messy because I was trying to reproduce a
problematic workload, and I didn't manage to simplify it further.
The idea is that it inserts new values into the table to produce
unindexed pages, and also updates some values to trigger HOT updates on
these pages.

3) Open a psql session and run brin_summarize_new_values:

select brin_summarize_new_values('idx'::regclass::oid); \watch 2

Wait a bit, and in psql you will see the ERROR.

This error is caused by a problem with the root_offsets array bounds. It
occurs if a new HOT tuple was inserted after we collected root_offsets,
and thus we have no root offset for the tuple's offnum.
Concurrent insertions are possible, because brin_summarize_new_values()
only holds ShareUpdateExclusiveLock on the table and no lock (only a pin)
on the page.

The draft fix is in the attachments. It saves root_offsets_size and
checks that we only access valid entries.
The patch also adds some debug messages, just to make sure the problem is
actually caught.
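
In essence, the change looks like this (condensed from the attached
patch, with the debug messages left out):

/* heap_get_root_tuples() now reports how many entries are valid */
OffsetNumber root_offsets[MaxHeapTuplesPerPage];
OffsetNumber root_offsets_size = 0;

LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
root_offsets_size = heap_get_root_tuples(page, root_offsets);
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);

/* ... later, when resolving a heap-only tuple's root ... */
if (root_offsets_size <= offnum)
{
	/* a concurrent insert added HOT tuples; collect the map again */
	LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
	root_offsets_size = heap_get_root_tuples(page, root_offsets);
	LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
}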

TODO:

- check if heapam_index_validate_scan() has the same problem
- code cleanup
- test other PostgreSQL versions

[1]: /messages/by-id/CA+TgmoYgwjmmjK24Qxb_vWAu8_Hh7gfVFcr3+R7ocdLvYOWJXg@mail.gmail.com

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

brin_summarize_fix_v0.patch (text/x-patch)
commit 71bdcd268386c4240613b080b9ebd5ae935c1405
Author: anastasia <a.lubennikova@postgrespro.ru>
Date:   Thu Jul 23 17:55:16 2020 +0300

    WIP Fix root_offsets out of bound error in brin_summarize_new_values()

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index fc19f40a2e3..34f74a61ea3 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1176,6 +1176,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 	BlockNumber previous_blkno = InvalidBlockNumber;
 	BlockNumber root_blkno = InvalidBlockNumber;
 	OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+	OffsetNumber root_offsets_size = 0;
 
 	/*
 	 * sanity checks
@@ -1355,7 +1356,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 			Page		page = BufferGetPage(hscan->rs_cbuf);
 
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
-			heap_get_root_tuples(page, root_offsets);
+			root_offsets_size = heap_get_root_tuples(page, root_offsets);
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 
 			root_blkno = hscan->rs_cblock;
@@ -1643,13 +1644,31 @@ heapam_index_build_range_scan(Relation heapRelation,
 			rootTuple = *heapTuple;
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
 
+			elog(LOG, "HeapTupleIsHeapOnly. Check Offset last_valid %d offnum %d", root_offsets_size, offnum);
+
+			if (root_offsets_size <= offnum)
+			{
+				Page		page = BufferGetPage(hscan->rs_cbuf);
+
+				ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+						errmsg_internal("DEBUG failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
+										ItemPointerGetBlockNumber(&heapTuple->t_self),
+										offnum,
+										RelationGetRelationName(heapRelation))));
+
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+				root_offsets_size = heap_get_root_tuples(page, root_offsets);
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+
 			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
 				ereport(ERROR,
-						(errcode(ERRCODE_DATA_CORRUPTED),
-						 errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
-										 ItemPointerGetBlockNumber(&heapTuple->t_self),
-										 offnum,
-										 RelationGetRelationName(heapRelation))));
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
+									ItemPointerGetBlockNumber(&heapTuple->t_self),
+									offnum,
+									RelationGetRelationName(heapRelation))));
 
 			ItemPointerSetOffsetNumber(&rootTuple.t_self,
 									   root_offsets[offnum - 1]);
@@ -1820,6 +1839,7 @@ heapam_index_validate_scan(Relation heapRelation,
 		if (HeapTupleIsHeapOnly(heapTuple))
 		{
 			root_offnum = root_offsets[root_offnum - 1];
+			// TODO add check of root_offsets_size like in heapam_index_build_range_scan()
 			if (!OffsetNumberIsValid(root_offnum))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 0efe3ce9995..9b2bff1b8fd 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -741,11 +741,11 @@ heap_page_prune_execute(Buffer buffer,
  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
  * and reused by a completely unrelated tuple.
  */
-void
+OffsetNumber
 heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 {
-	OffsetNumber offnum,
-				maxoff;
+	OffsetNumber offnum = 0;
+	OffsetNumber maxoff;
 
 	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
@@ -832,4 +832,5 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
 		}
 	}
+	return offnum;
 }
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index dffb57bf11a..45f8622c6cb 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -181,7 +181,7 @@ extern void heap_page_prune_execute(Buffer buffer,
 									OffsetNumber *redirected, int nredirected,
 									OffsetNumber *nowdead, int ndead,
 									OffsetNumber *nowunused, int nunused);
-extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
+extern OffsetNumber heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
 /* in heap/syncscan.c */
 extern void ss_report_location(Relation rel, BlockNumber location);
brin_test.sql (application/sql)
#2 Anastasia Lubennikova
a.lubennikova@postgrespro.ru
In reply to: Anastasia Lubennikova (#1)
2 attachment(s)
Re: [BUG] Error in BRIN summarization

On 23.07.2020 20:39, Anastasia Lubennikova wrote:

One of our clients caught the error 'failed to find parent tuple for
heap-only tuple at (50661,130) in table "tbl"' in PostgreSQL v12.

Steps to reproduce (REL_12_STABLE):

1) Create a table with a primary key, create a BRIN index, and fill the
table with some initial data:

create table tbl (id int primary key, a int) with (fillfactor=50);
create index idx on tbl using brin (a) with (autosummarize=on);
insert into tbl select i, i from generate_series(0,100000) as i;

2) Run the script brin_test.sql using pgbench:

pgbench postgres -f ../review/brin_test.sql -n -T 120

The script is a bit messy because I was trying to reproduce a
problematic workload, and I didn't manage to simplify it further.
The idea is that it inserts new values into the table to produce
unindexed pages, and also updates some values to trigger HOT updates on
these pages.

3) Open a psql session and run brin_summarize_new_values:

select brin_summarize_new_values('idx'::regclass::oid); \watch 2

Wait a bit, and in psql you will see the ERROR.

This error is caused by a problem with the root_offsets array bounds. It
occurs if a new HOT tuple was inserted after we collected root_offsets,
and thus we have no root offset for the tuple's offnum.
Concurrent insertions are possible, because
brin_summarize_new_values() only holds ShareUpdateExclusiveLock on the
table and no lock (only a pin) on the page.

The draft fix is in the attachments. It saves root_offsets_size and
checks that we only access valid entries.
The patch also adds some debug messages, just to make sure the problem
is actually caught.

TODO:

- check if heapam_index_validate_scan() has the same problem
- code cleanup
- test other PostgreSQL versions

[1]: /messages/by-id/CA+TgmoYgwjmmjK24Qxb_vWAu8_Hh7gfVFcr3+R7ocdLvYOWJXg@mail.gmail.com

Here is the updated version of the fix.
The problem can be reproduced on all supported versions, so I suggest
backpatching it.
The code changed slightly in v12, so here are two patches: one for
versions 9.5 to 11 and another for versions 12 through master.

As for heapam_index_validate_scan(), I've tried to reproduce the same
error with CREATE INDEX CONCURRENTLY, but haven't found any problem with it.

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

brin_summarize_fix_REL_12_v1.patch (text/x-patch)
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index fc19f40a2e3..cb29a833663 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1176,6 +1176,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 	BlockNumber previous_blkno = InvalidBlockNumber;
 	BlockNumber root_blkno = InvalidBlockNumber;
 	OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+	OffsetNumber root_offsets_size = 0;
 
 	/*
 	 * sanity checks
@@ -1341,6 +1342,11 @@ heapam_index_build_range_scan(Relation heapRelation,
 		 * buffer continuously while visiting the page, so no pruning
 		 * operation can occur either.
 		 *
+		 * It is essential, though, to check the root_offsets_size bound
+		 * before accessing the array, because concurrently inserted HOT tuples
+		 * don't have a valid cached root offset and we need to build the map
+		 * once again for them.
+		 *
 		 * Also, although our opinions about tuple liveness could change while
 		 * we scan the page (due to concurrent transaction commits/aborts),
 		 * the chain root locations won't, so this info doesn't need to be
@@ -1355,7 +1361,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 			Page		page = BufferGetPage(hscan->rs_cbuf);
 
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
-			heap_get_root_tuples(page, root_offsets);
+			root_offsets_size = heap_get_root_tuples(page, root_offsets);
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 
 			root_blkno = hscan->rs_cblock;
@@ -1643,6 +1649,22 @@ heapam_index_build_range_scan(Relation heapRelation,
 			rootTuple = *heapTuple;
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
 
+			/*
+			 * As we do not hold buffer lock, concurrent insertion can happen.
+			 * If so, collect the map once again to find the root offset for
+			 * the new tuple.
+			 */
+			if (root_offsets_size < offnum)
+			{
+				Page	page = BufferGetPage(hscan->rs_cbuf);
+
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+				root_offsets_size = heap_get_root_tuples(page, root_offsets);
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+
+			Assert(root_offsets_size >= offnum);
+
 			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 0efe3ce9995..5ddb123f30c 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -740,12 +740,15 @@ heap_page_prune_execute(Buffer buffer,
  * Note: The information collected here is valid only as long as the caller
  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
  * and reused by a completely unrelated tuple.
+ *
+ * The return value is the size of the root_offsets array. The caller must
+ * never access values that are out of this bound.
  */
-void
+OffsetNumber
 heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 {
-	OffsetNumber offnum,
-				maxoff;
+	OffsetNumber offnum, maxoff;
+	OffsetNumber last_valid_offnum = 0;
 
 	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
@@ -778,6 +781,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			 * Remember it in the mapping.
 			 */
 			root_offsets[offnum - 1] = offnum;
+			if (offnum > last_valid_offnum)
+				last_valid_offnum = offnum;
 
 			/* If it's not the start of a HOT-chain, we're done with it */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -820,6 +825,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 
 			/* Remember the root line pointer for this item */
 			root_offsets[nextoffnum - 1] = offnum;
+			if (nextoffnum > last_valid_offnum)
+				last_valid_offnum = nextoffnum;
 
 			/* Advance to next chain member, if any */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -832,4 +839,6 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
 		}
 	}
+
+	return last_valid_offnum;
 }
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index dffb57bf11a..45f8622c6cb 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -181,7 +181,7 @@ extern void heap_page_prune_execute(Buffer buffer,
 									OffsetNumber *redirected, int nredirected,
 									OffsetNumber *nowdead, int ndead,
 									OffsetNumber *nowunused, int nunused);
-extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
+extern OffsetNumber heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
 /* in heap/syncscan.c */
 extern void ss_report_location(Relation rel, BlockNumber location);
brin_summarize_fix_REL9_5_v1.patch (text/x-patch)
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 6ff92516eda..51387d015e2 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -739,12 +739,15 @@ heap_page_prune_execute(Buffer buffer,
  * Note: The information collected here is valid only as long as the caller
  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
  * and reused by a completely unrelated tuple.
+ *
+ * The return value is the size of the root_offsets array. The caller must
+ * never access values that are out of this bound.
  */
-void
+OffsetNumber
 heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 {
-	OffsetNumber offnum,
-				maxoff;
+	OffsetNumber offnum, maxoff;
+	OffsetNumber last_valid_offnum = 0;
 
 	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
@@ -777,6 +780,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			 * Remember it in the mapping.
 			 */
 			root_offsets[offnum - 1] = offnum;
+			if (offnum > last_valid_offnum)
+				last_valid_offnum = offnum;
 
 			/* If it's not the start of a HOT-chain, we're done with it */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -819,6 +824,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 
 			/* Remember the root line pointer for this item */
 			root_offsets[nextoffnum - 1] = offnum;
+			if (nextoffnum > last_valid_offnum)
+				last_valid_offnum = nextoffnum;
 
 			/* Advance to next chain member, if any */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -828,4 +835,6 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
 		}
 	}
+
+	return last_valid_offnum;
 }
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index f5c12d3d1c9..66ce910ca81 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -2288,6 +2288,7 @@ IndexBuildHeapRangeScan(Relation heapRelation,
 	TransactionId OldestXmin;
 	BlockNumber root_blkno = InvalidBlockNumber;
 	OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+	OffsetNumber root_offsets_size = 0;
 
 	/*
 	 * sanity checks
@@ -2389,6 +2390,11 @@ IndexBuildHeapRangeScan(Relation heapRelation,
 		 * buffer continuously while visiting the page, so no pruning
 		 * operation can occur either.
 		 *
+		 * It is essential, though, to check the root_offsets_size bound
+		 * before accessing the array, because concurrently inserted HOT tuples
+		 * don't have a valid cached root offset and we need to build the map
+		 * once again for them.
+		 *
 		 * Also, although our opinions about tuple liveness could change while
 		 * we scan the page (due to concurrent transaction commits/aborts),
 		 * the chain root locations won't, so this info doesn't need to be
@@ -2403,7 +2409,7 @@ IndexBuildHeapRangeScan(Relation heapRelation,
 			Page		page = BufferGetPage(scan->rs_cbuf);
 
 			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
-			heap_get_root_tuples(page, root_offsets);
+			root_offsets_size = heap_get_root_tuples(page, root_offsets);
 			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 
 			root_blkno = scan->rs_cblock;
@@ -2659,6 +2665,22 @@ IndexBuildHeapRangeScan(Relation heapRelation,
 			rootTuple = *heapTuple;
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
 
+			/*
+			 * As we do not hold buffer lock, concurrent insertion can happen.
+			 * If so, collect the map once again to find the root offset for
+			 * the new tuple.
+			 */
+			if (root_offsets_size < offnum)
+			{
+				Page	page = BufferGetPage(scan->rs_cbuf);
+
+				LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+				root_offsets_size = heap_get_root_tuples(page, root_offsets);
+				LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+
+			Assert(root_offsets_size >= offnum);
+
 			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 175509812da..bec2a31902f 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -190,7 +190,7 @@ extern void heap_page_prune_execute(Buffer buffer,
 						OffsetNumber *redirected, int nredirected,
 						OffsetNumber *nowdead, int ndead,
 						OffsetNumber *nowunused, int nunused);
-extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
+extern OffsetNumber heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
 /* in heap/syncscan.c */
 extern void ss_report_location(Relation rel, BlockNumber location);
#3 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Anastasia Lubennikova (#2)
Re: [BUG] Error in BRIN summarization

On 2020-Jul-27, Anastasia Lubennikova wrote:

Here is the updated version of the fix.
The problem can be reproduced on all supported versions, so I suggest
backpatching it.
The code changed slightly in v12, so here are two patches: one for
versions 9.5 to 11 and another for versions 12 through master.

Hi Anastasia, thanks for this report and fix. I was considering this
last week and noticed that the patch changes the ABI of
heap_get_root_tuples, which may be problematic in back branches. I
suggest that for released branches (12 and prior) we need to create a
new function with the new signature, and keep heap_get_root_tuples
unchanged. In 13 and master we don't need that trick, so we can keep
the code as you have it in this version of the patch.

OffsetNumber
heap_get_root_tuples_new(Page page, OffsetNumber *root_offsets)
{ .. full implementation ... }

/* ABI compatibility only */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
(void) heap_get_root_tuples_new(page, root_offsets);
}

(I was also considering whether it needs to be a loop to reobtain root
tuples, in case a concurrent transaction can create a new item while
we're checking that item; but I don't think that can really happen for
one individual tuple.)

Thanks

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#4 Peter Geoghegan
In reply to: Alvaro Herrera (#3)
Re: [BUG] Error in BRIN summarization

On Mon, Jul 27, 2020 at 10:25 AM Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:

(I was also considering whether it needs to be a loop to reobtain root
tuples, in case a concurrent transaction can create a new item while
we're checking that item; but I don't think that can really happen for
one individual tuple.)

I wonder if something like that is the underlying problem in a recent
problem case involving a "REINDEX index
pg_class_tblspc_relfilenode_index" command that runs concurrently with
the regression tests:

/messages/by-id/CAH2-WzmBxu4o=pMsniur+bwHqCGCmV_AOLkuK6BuU7ngA6evqw@mail.gmail.com

We see a violation of the HOT invariant in this case, though only for
a system catalog index, and only in fairly particular circumstances
involving concurrency.

--
Peter Geoghegan

#5 Anastasia Lubennikova
a.lubennikova@postgrespro.ru
In reply to: Alvaro Herrera (#3)
Re: [BUG] Error in BRIN summarization

On 27.07.2020 20:25, Alvaro Herrera wrote:

On 2020-Jul-27, Anastasia Lubennikova wrote:

Here is the updated version of the fix.
The problem can be reproduced on all supported versions, so I suggest
backpatching it.
The code changed slightly in v12, so here are two patches: one for
versions 9.5 to 11 and another for versions 12 through master.

(I was also considering whether it needs to be a loop to reobtain root
tuples, in case a concurrent transaction can create a new item while
we're checking that item; but I don't think that can really happen for
one individual tuple.)

I don't think we need a recheck for a single tuple, because we only care
about finding its root, which must exist somewhere on this page, as
concurrent pruning is not allowed. We may also pick up root_offsets[]
entries for subsequent tuples, but it's okay if we don't: those tuples
will do the same recheck in their turn.

While testing this fix, Alexander Lakhin spotted another problem. I
simplified the test case to this:

1) prepare a table with brin index

create table tbl (i int, b char(1000)) with (fillfactor=10);
insert into tbl select i, md5(i::text) from generate_series(0,1000) as i;
create index idx on tbl using brin(i, b) with (pages_per_range=1);

2) run brin_desummarize_range() in a loop:

echo "-- desummarize all ranges
SELECT FROM generate_series(0, pg_relation_size('tbl')/8192 - 1) as i, lateral (SELECT brin_desummarize_range('idx', i)) as d;
-- summarize them back
VACUUM tbl" > brin_desum_test.sql

pgbench postgres -f brin_desum_test.sql -n -T 120

3) run a search that invokes bringetbitmap:

set enable_seqscan to off;
explain analyze select * from tbl where i>10 and i<100; \watch 1

After a few runs, it will fail with "ERROR: corrupted BRIN index:
inconsistent range map"

The problem is caused by a race in page locking in
brinGetTupleForHeapBlock [1]:

(1) bitmapscan locks revmap->rm_currBuf and finds the address of the
tuple on a regular page "page", then unlocks revmap->rm_currBuf
(2) in another transaction, desummarize locks both revmap->rm_currBuf and
"page", cleans up the tuple, and unlocks both buffers
(1) bitmapscan locks the buffer containing "page", attempts to access the
tuple, and fails to find it

At first, I tried to fix it by holding the lock on revmap->rm_currBuf
until we locked the regular page, but that causes a deadlock with
brinsummarize(), which can be easily reproduced with the same test as above.
Is there any rule about the order of locking revmap and regular pages in
BRIN? I haven't found anything in the README.

As an alternative, we can leave the locks as they are and add a recheck
before throwing an error.
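
Something along these lines (an untested sketch, with a bool in_retry
that starts out false; retry only once, so a genuinely corrupted index
still errors out):

/* inside the lookup loop of brinGetTupleForHeapBlock() */
if (*off > PageGetMaxOffsetNumber(page))
{
	if (in_retry)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("corrupted BRIN index: inconsistent range map")));

	/* assume the revmap was updated concurrently and retry once */
	LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
	in_retry = true;
	continue;
}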

What do you think?

[1]: https://github.com/postgres/postgres/blob/master/src/backend/access/brin/brin_revmap.c#L269

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#6 Anastasia Lubennikova
a.lubennikova@postgrespro.ru
In reply to: Anastasia Lubennikova (#5)
2 attachment(s)
Re: [BUG] Error in BRIN summarization

On 30.07.2020 16:40, Anastasia Lubennikova wrote:

While testing this fix, Alexander Lakhin spotted another problem.

After a few runs, it will fail with "ERROR: corrupted BRIN index:
inconsistent range map"

The problem is caused by a race in page locking in
brinGetTupleForHeapBlock [1]:

(1) bitmapscan locks revmap->rm_currBuf and finds the address of the
tuple on a regular page "page", then unlocks revmap->rm_currBuf
(2) in another transaction, desummarize locks both revmap->rm_currBuf
and "page", cleans up the tuple, and unlocks both buffers
(1) bitmapscan locks the buffer containing "page", attempts to access the
tuple, and fails to find it

At first, I tried to fix it by holding the lock on revmap->rm_currBuf
until we locked the regular page, but that causes a deadlock with
brinsummarize(), which can be easily reproduced with the same test as above.
Is there any rule about the order of locking revmap and regular pages
in BRIN? I haven't found anything in the README.

As an alternative, we can leave the locks as they are and add a recheck
before throwing an error.

Here are the updated patches for both problems.

1) brin_summarize_fix_REL_12_v2 fixes
'failed to find parent tuple for heap-only tuple at (50661,130) in table
"tbl"'

This patch checks that we only access initialized entries of the
root_offsets[] array. If necessary, we collect the array again. One
recheck is enough here, since concurrent pruning is not possible.

2) brin_pagelock_fix_REL_12_v1.patch fixes
"ERROR: corrupted BRIN index: inconsistent range map"

This patch adds a recheck, as suggested in the previous message.
I am not sure whether one recheck is enough to eliminate the race
completely, but the problem can no longer be reproduced.

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

brin_pagelock_fix_REL_12_v1.patch (text/x-patch)
commit 0b1fbfb34236882591c5ac6665a407d9780f4017
Author: anastasia <a.lubennikova@postgrespro.ru>
Date:   Wed Aug 5 15:56:10 2020 +0300

    fix race in BRIN page locking

diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
index 7dcb1cd73f..89724c3bd4 100644
--- a/src/backend/access/brin/brin_revmap.c
+++ b/src/backend/access/brin/brin_revmap.c
@@ -207,6 +207,7 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
 	ItemId		lp;
 	BrinTuple  *tup;
 	ItemPointerData previptr;
+	bool in_retry = false;
 
 	/* normalize the heap block number to be the first page in the range */
 	heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
@@ -283,9 +284,23 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
 		if (BRIN_IS_REGULAR_PAGE(page))
 		{
 			if (*off > PageGetMaxOffsetNumber(page))
-				ereport(ERROR,
-						(errcode(ERRCODE_INDEX_CORRUPTED),
-						 errmsg_internal("corrupted BRIN index: inconsistent range map")));
+			{
+				if (in_retry)
+					ereport(ERROR,
+							(errcode(ERRCODE_INDEX_CORRUPTED),
+							errmsg_internal("corrupted BRIN index: inconsistent range map")));
+				else
+				{
+					/*
+					 * Assume that the revmap was updated concurrently.
+					 * Retry only once to avoid eternal looping in case
+					 * the index is really corrupted.
+					 */
+					LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+					in_retry = true;
+					continue;
+				}
+			}
 			lp = PageGetItemId(page, *off);
 			if (ItemIdIsUsed(lp))
 			{
brin_summarize_fix_REL_12_v2.patch (text/x-patch)
commit 16f0387dbeba3570836b506241bf0a1820de3390
Author: anastasia <a.lubennikova@postgrespro.ru>
Date:   Mon Aug 10 20:07:22 2020 +0300

    brin_summarize_fix_REL_12_v2.patch

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 267a6ee25a..c651d8b04e 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1159,6 +1159,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 	BlockNumber previous_blkno = InvalidBlockNumber;
 	BlockNumber root_blkno = InvalidBlockNumber;
 	OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+	OffsetNumber root_offsets_size = 0;
 
 	/*
 	 * sanity checks
@@ -1324,6 +1325,11 @@ heapam_index_build_range_scan(Relation heapRelation,
 		 * buffer continuously while visiting the page, so no pruning
 		 * operation can occur either.
 		 *
+		 * It is essential, though, to check the root_offsets_size bound
+		 * before accessing the array, because concurrently inserted HOT tuples
+		 * don't have a valid cached root offset and we need to build the map
+		 * once again for them.
+		 *
 		 * Also, although our opinions about tuple liveness could change while
 		 * we scan the page (due to concurrent transaction commits/aborts),
 		 * the chain root locations won't, so this info doesn't need to be
@@ -1338,7 +1344,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 			Page		page = BufferGetPage(hscan->rs_cbuf);
 
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
-			heap_get_root_tuples(page, root_offsets);
+			root_offsets_size = heap_get_root_tuples(page, root_offsets);
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 
 			root_blkno = hscan->rs_cblock;
@@ -1625,6 +1631,25 @@ heapam_index_build_range_scan(Relation heapRelation,
 
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
 
+			/*
+			 * As we do not hold buffer lock, concurrent insertion can happen.
+			 * If so, collect the map once again to find the root offset for
+			 * the new tuple.
+			 * (MaxOffsetNumber+1) is a special value that we use to
+			 * differentiate uninitialized entries.
+			 */
+			if (root_offsets_size < offnum ||
+				root_offsets[offnum - 1] == (MaxOffsetNumber+1))
+			{
+				Page	page = BufferGetPage(hscan->rs_cbuf);
+
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+				root_offsets_size = heap_get_root_tuples(page, root_offsets);
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+
+			Assert(root_offsets_size >= offnum);
+
 			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 256df4de10..3810481df7 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -732,7 +732,9 @@ heap_page_prune_execute(Buffer buffer,
  * root_offsets[k - 1] = j.
  *
  * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
- * We zero out all unused entries.
+ * The return value is the size of the root_offsets array. The caller must
+ * never access values that are out of this bound.
+ * We also set special value of (MaxOffsetNumber+1) to unused entries.
  *
  * The function must be called with at least share lock on the buffer, to
  * prevent concurrent prune operations.
@@ -740,12 +742,13 @@ heap_page_prune_execute(Buffer buffer,
  * Note: The information collected here is valid only as long as the caller
  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
  * and reused by a completely unrelated tuple.
+ *
  */
-void
+OffsetNumber
 heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 {
-	OffsetNumber offnum,
-				maxoff;
+	OffsetNumber offnum, maxoff;
+	OffsetNumber last_valid_offnum = 0;
 
 	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
@@ -759,7 +762,14 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 
 		/* skip unused and dead items */
 		if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
+		{
+			/*
+			 * (MaxOffsetNumber+1) is a special value that we use to
+			 * differentiate values that we've skipped.
+			 */
+			root_offsets[offnum - 1] = (MaxOffsetNumber+1);
 			continue;
+		}
 
 		if (ItemIdIsNormal(lp))
 		{
@@ -778,6 +788,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			 * Remember it in the mapping.
 			 */
 			root_offsets[offnum - 1] = offnum;
+			if (offnum > last_valid_offnum)
+				last_valid_offnum = offnum;
 
 			/* If it's not the start of a HOT-chain, we're done with it */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -820,6 +832,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 
 			/* Remember the root line pointer for this item */
 			root_offsets[nextoffnum - 1] = offnum;
+			if (nextoffnum > last_valid_offnum)
+				last_valid_offnum = nextoffnum;
 
 			/* Advance to next chain member, if any */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -832,4 +846,6 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
 		}
 	}
+
+	return last_valid_offnum;
 }
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index b31de38910..b7f570887e 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -180,7 +180,7 @@ extern void heap_page_prune_execute(Buffer buffer,
 									OffsetNumber *redirected, int nredirected,
 									OffsetNumber *nowdead, int ndead,
 									OffsetNumber *nowunused, int nunused);
-extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
+extern OffsetNumber heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
 /* in heap/vacuumlazy.c */
 struct VacuumParams;
#7 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Anastasia Lubennikova (#5)
1 attachment(s)
Re: [BUG] Error in BRIN summarization

On 2020-Jul-30, Anastasia Lubennikova wrote:

While testing this fix, Alexander Lakhin spotted another problem. I
simplified the test case to this:

Ah, good catch. I think a cleaner way to fix this problem is to just
consider the range as not summarized and return NULL from there, as in
the attached patch. Running your test case with a telltale WARNING
added at that point, it's clear that it's being hit.

By returning NULL, we're forcing the caller to scan the heap, which is
not great. But note that if you retry, and your VACUUM hasn't run yet
by the time we go through the loop again, the same thing would happen.
So it seems to me a good enough answer.

A much more troubling thought is what happens if the range is
desummarized, then the index item is used for the summary of a different
range. Then the index might end up returning corrupt results.

At first, I tried to fix it by holding the lock on revmap->rm_currBuf until
we locked the regular page, but that causes a deadlock with brinsummarize(),
which can be easily reproduced with the same test as above.
Is there any rule about the order of locking revmap and regular pages in
BRIN? I haven't found anything in the README.

Umm, I thought that stuff was in the README, but it seems I didn't add
it there. I think I had a .org file with my notes on that ... must be
in an older laptop disk, because it's not in my worktree for that. I'll
see if I can fish it out.

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

brin-desumm-race.patch (text/x-diff)
diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
index e8b8308f82..35746714a7 100644
--- a/src/backend/access/brin/brin_revmap.c
+++ b/src/backend/access/brin/brin_revmap.c
@@ -282,10 +282,17 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
 		/* If we land on a revmap page, start over */
 		if (BRIN_IS_REGULAR_PAGE(page))
 		{
+			/*
+			 * If the offset number is greater than what's in the page, it's
+			 * possible that the range was desummarized concurrently. Just
+			 * return NULL to handle that case.
+			 */
 			if (*off > PageGetMaxOffsetNumber(page))
-				ereport(ERROR,
-						(errcode(ERRCODE_INDEX_CORRUPTED),
-						 errmsg_internal("corrupted BRIN index: inconsistent range map")));
+			{
+				LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+				return NULL;
+			}
+
 			lp = PageGetItemId(page, *off);
 			if (ItemIdIsUsed(lp))
 			{
#8 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Anastasia Lubennikova (#1)
1 attachment(s)
Re: [BUG] Error in BRIN summarization

On 2020-Jul-23, Anastasia Lubennikova wrote:

This error is caused by a problem with the root_offsets array bounds. It
occurs if a new HOT tuple was inserted after we collected root_offsets,
and thus we have no root offset for the tuple's offnum. Concurrent
insertions are possible, because brin_summarize_new_values() only holds
ShareUpdateExclusiveLock on the table and no lock (only a pin) on the page.

Excellent detective work, thanks.

The draft fix is in the attachments. It saves root_offsets_size and
checks that we only access valid entries.

I think this is more complicated than necessary. It seems easier to
solve this problem by just checking whether the given root pointer is
set to InvalidOffsetNumber, which is already done in the existing coding
of heap_get_root_tuples (only they spell it "0" rather than
InvalidOffsetNumber, which I propose to change). AFAIR this should only
happen in the 'anyvisible' mode, so I added that in an assert.

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

hot-summarization.patch (text/x-diff)
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 267a6ee25a..08252bc0c0 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1324,6 +1324,12 @@ heapam_index_build_range_scan(Relation heapRelation,
 		 * buffer continuously while visiting the page, so no pruning
 		 * operation can occur either.
 		 *
+		 * In the case where we don't hold ShareUpdateExclusiveLock on the
+		 * table, it's possible for some HOT tuples to appear that we didn't
+		 * know about when we first read the page.  To handle that case, we
+		 * re-obtain the list of root offsets when a HOT tuple points to a
+		 * root item that we don't know about.
+		 *
 		 * Also, although our opinions about tuple liveness could change while
 		 * we scan the page (due to concurrent transaction commits/aborts),
 		 * the chain root locations won't, so this info doesn't need to be
@@ -1625,6 +1631,23 @@ heapam_index_build_range_scan(Relation heapRelation,
 
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
 
+			/*
+			 * If a HOT tuple points to a root that we don't know
+			 * about, obtain root items afresh.  If that still fails,
+			 * report it as corruption.
+			 */
+			if (root_offsets[offnum - 1] == InvalidOffsetNumber)
+			{
+				Page	page = BufferGetPage(hscan->rs_cbuf);
+
+				/* This can only happen in 'anyvisible' mode */
+				Assert(anyvisible);
+
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+				heap_get_root_tuples(page, root_offsets);
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+
 			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 256df4de10..7e3d44dfd6 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -732,7 +732,7 @@ heap_page_prune_execute(Buffer buffer,
  * root_offsets[k - 1] = j.
  *
  * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
- * We zero out all unused entries.
+ * Unused entries are filled with InvalidOffsetNumber (zero).
  *
  * The function must be called with at least share lock on the buffer, to
  * prevent concurrent prune operations.
@@ -747,7 +747,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 	OffsetNumber offnum,
 				maxoff;
 
-	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
+	MemSet(root_offsets, InvalidOffsetNumber,
+		   MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
 	maxoff = PageGetMaxOffsetNumber(page);
 	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
#9 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Peter Geoghegan (#4)
Re: [BUG] Error in BRIN summarization

On 2020-Jul-28, Peter Geoghegan wrote:

On Mon, Jul 27, 2020 at 10:25 AM Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:

(I was also considering whether it needs to be a loop to reobtain root
tuples, in case a concurrent transaction can create a new item while
we're checking that item; but I don't think that can really happen for
one individual tuple.)

I wonder if something like that is the underlying problem in a recent
problem case involving a "REINDEX index
pg_class_tblspc_relfilenode_index" command that runs concurrently with
the regression tests:

/messages/by-id/CAH2-WzmBxu4o=pMsniur+bwHqCGCmV_AOLkuK6BuU7ngA6evqw@mail.gmail.com

We see a violation of the HOT invariant in this case, though only for
a system catalog index, and only in fairly particular circumstances
involving concurrency.

Hmm. As far as I understand, the bug Anastasia reports can only hit an
index build that occurs concurrently with heap updates; and that cannot
happen for a regular index build, only for CREATE INDEX CONCURRENTLY and
REINDEX CONCURRENTLY. So unless I'm missing something, it's not related
to that other bug.

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#10 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Alvaro Herrera (#8)
Re: [BUG] Error in BRIN summarization

On 2020-Aug-11, Alvaro Herrera wrote:

I think this is more complicated than necessary. It seems easier to
solve this problem by just checking whether the given root pointer is
set to InvalidOffsetNumber, which is already done in the existing coding
of heap_get_root_tuples (only they spell it "0" rather than
InvalidOffsetNumber, which I propose to change). AFAIR this should only
happen in the 'anyvisible' mode, so I added that in an assert.

'anyvisible' mode is not required AFAICS; reading the code, I think this
could also hit REINDEX CONCURRENTLY and CREATE INDEX CONCURRENTLY, which
do not use that flag. I didn't try to reproduce it there, though.
Anyway, I'm going to remove that Assert() I added.

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#11 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Alvaro Herrera (#7)
Re: [BUG] Error in BRIN summarization

On 2020-Aug-11, Alvaro Herrera wrote:

A much more troubling thought is what happens if the range is
desummarized, then the index item is used for the summary of a different
range. Then the index might end up returning corrupt results.

Actually, this is not a concern, because the brin tuple's bt_blkno is
rechecked before returning it, and if it doesn't match what we're
searching for, the loop is restarted. It would only become an infinite
loop if the revmap were pointing to a tuple that's labelled with a
different range's blkno. So I think my patch as posted is a sufficient
fix for this problem.
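
For reference, the recheck works roughly like this (heavily simplified
from brinGetTupleForHeapBlock() in brin_revmap.c):

/* the revmap gave us *off; verify the tuple is really for our range */
lp = PageGetItemId(page, *off);
if (ItemIdIsUsed(lp))
{
	tup = (BrinTuple *) PageGetItem(page, lp);
	if (tup->bt_blkno == heapBlk)
		return tup;		/* it summarizes the range we asked for */
}
/* stale revmap entry: unlock and restart the lookup loop */
LockBuffer(*buf, BUFFER_LOCK_UNLOCK);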

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#12 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Alvaro Herrera (#10)
1 attachment(s)
Re: [BUG] Error in BRIN summarization

On 2020-Aug-12, Alvaro Herrera wrote:

'anyvisible' mode is not required AFAICS; reading the code, I think this
could also hit REINDEX CONCURRENTLY and CREATE INDEX CONCURRENTLY, which
do not use that flag. I didn't try to reproduce it there, though.
Anyway, I'm going to remove that Assert() I added.

So this is what I propose as the final form of the fix.

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

v4-0001-fix-HOT-tuples-while-scanning-for-index-builds.patch (text/x-diff)
From e0abe5d957155285980a40fb33c192100699e8c0 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Wed, 12 Aug 2020 14:02:58 -0400
Subject: [PATCH v4] fix HOT tuples while scanning for index builds

---
 src/backend/access/heap/heapam_handler.c | 20 ++++++++++++++++++++
 src/backend/access/heap/pruneheap.c      |  5 +++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 267a6ee25a..ba44e30035 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1324,6 +1324,12 @@ heapam_index_build_range_scan(Relation heapRelation,
 		 * buffer continuously while visiting the page, so no pruning
 		 * operation can occur either.
 		 *
+		 * In cases with only ShareUpdateExclusiveLock on the table, it's
+		 * possible for some HOT tuples to appear that we didn't know about
+		 * when we first read the page.  To handle that case, we re-obtain the
+		 * list of root offsets when a HOT tuple points to a root item that we
+		 * don't know about.
+		 *
 		 * Also, although our opinions about tuple liveness could change while
 		 * we scan the page (due to concurrent transaction commits/aborts),
 		 * the chain root locations won't, so this info doesn't need to be
@@ -1625,6 +1631,20 @@ heapam_index_build_range_scan(Relation heapRelation,
 
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
 
+			/*
+			 * If a HOT tuple points to a root that we don't know
+			 * about, obtain root items afresh.  If that still fails,
+			 * report it as corruption.
+			 */
+			if (root_offsets[offnum - 1] == InvalidOffsetNumber)
+			{
+				Page	page = BufferGetPage(hscan->rs_cbuf);
+
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+				heap_get_root_tuples(page, root_offsets);
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+
 			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 256df4de10..7e3d44dfd6 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -732,7 +732,7 @@ heap_page_prune_execute(Buffer buffer,
  * root_offsets[k - 1] = j.
  *
  * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
- * We zero out all unused entries.
+ * Unused entries are filled with InvalidOffsetNumber (zero).
  *
  * The function must be called with at least share lock on the buffer, to
  * prevent concurrent prune operations.
@@ -747,7 +747,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 	OffsetNumber offnum,
 				maxoff;
 
-	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
+	MemSet(root_offsets, InvalidOffsetNumber,
+		   MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
 	maxoff = PageGetMaxOffsetNumber(page);
 	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
-- 
2.20.1

#13 Anastasia Lubennikova
a.lubennikova@postgrespro.ru
In reply to: Alvaro Herrera (#12)
Re: [BUG] Error in BRIN summarization

On 12.08.2020 22:58, Alvaro Herrera wrote:

On 2020-Aug-12, Alvaro Herrera wrote:

'anyvisible' mode is not required AFAICS; reading the code, I think this
could also hit REINDEX CONCURRENTLY and CREATE INDEX CONCURRENTLY, which
do not use that flag. I didn't try to reproduce it there, though.
Anyway, I'm going to remove that Assert() I added.

So this is what I propose as the final form of the fix.

Cool.
This version looks much simpler than mine and passes the tests fine.

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#14 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Anastasia Lubennikova (#13)
Re: [BUG] Error in BRIN summarization

On 2020-Aug-13, Anastasia Lubennikova wrote:

Cool.
This version looks much simpler than mine and passes the tests fine.

Thanks, pushed it to all branches. Thanks for diagnosing this problem!

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#15 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Alvaro Herrera (#12)
Re: [BUG] Error in BRIN summarization

hyrax's latest report suggests that this patch has issues under
CLOBBER_CACHE_ALWAYS:

https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=hyrax&dt=2020-08-13%2005%3A09%3A58

Hard to tell whether there's an actual bug there or just test instability,
but either way it needs to be resolved.

regards, tom lane

#16 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Tom Lane (#15)
Re: [BUG] Error in BRIN summarization

On 2020-Aug-15, Tom Lane wrote:

hyrax's latest report suggests that this patch has issues under
CLOBBER_CACHE_ALWAYS:

https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=hyrax&dt=2020-08-13%2005%3A09%3A58

Hard to tell whether there's an actual bug there or just test instability,
but either way it needs to be resolved.

Hmm, the only explanation I can see for this is that autovacuum managed
to summarize the range before the test script did it. So the solution
would simply be to disable autovacuum for the table across the whole
script.

I'm running the scripts and dependencies to verify that fix, but under
CLOBBER_CACHE_ALWAYS that takes quite a bit.

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#17 Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Alvaro Herrera (#16)
1 attachment(s)
Re: [BUG] Error in BRIN summarization

On 2020-Aug-17, Alvaro Herrera wrote:

Hmm, the only explanation I can see for this is that autovacuum managed
to summarize the range before the test script did it. So the solution
would simply be to disable autovacuum for the table across the whole
script.

I'm running the scripts and dependencies to verify that fix, but under
CLOBBER_CACHE_ALWAYS that takes quite a bit.

I ran a subset of tests a few times, but was unable to reproduce the
problem. I'll just push this to all branches and hope for the best.

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

0001-Disable-autovacuum-for-BRIN-test-table.patch (text/x-diff)
From 6e70443edacfc86674995c0c10ade0aec7a4fddf Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Mon, 17 Aug 2020 16:20:06 -0400
Subject: [PATCH] Disable autovacuum for BRIN test table

This should improve stability in the tests.

Per buildfarm member hyrax (CLOBBER_CACHE_ALWAYS) via Tom Lane.

Discussion: https://postgr.es/m/871534.1597503261@sss.pgh.pa.us
---
 src/test/regress/expected/brin.out | 2 +-
 src/test/regress/sql/brin.sql      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out
index 0b14c73fc6..18403498df 100644
--- a/src/test/regress/expected/brin.out
+++ b/src/test/regress/expected/brin.out
@@ -26,7 +26,7 @@ CREATE TABLE brintest (byteacol bytea,
 	int4rangecol int4range,
 	lsncol pg_lsn,
 	boxcol box
-) WITH (fillfactor=10);
+) WITH (fillfactor=10, autovacuum_enabled=off);
 INSERT INTO brintest SELECT
 	repeat(stringu1, 8)::bytea,
 	substr(stringu1, 1, 1)::"char",
diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql
index 1289e76ecb..d1a82474f3 100644
--- a/src/test/regress/sql/brin.sql
+++ b/src/test/regress/sql/brin.sql
@@ -26,7 +26,7 @@ CREATE TABLE brintest (byteacol bytea,
 	int4rangecol int4range,
 	lsncol pg_lsn,
 	boxcol box
-) WITH (fillfactor=10);
+) WITH (fillfactor=10, autovacuum_enabled=off);
 
 INSERT INTO brintest SELECT
 	repeat(stringu1, 8)::bytea,
-- 
2.20.1