From 846f4a434a1cc7a72a5beb88326f9c03c9d599f1 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 20 Oct 2022 19:55:23 +0200
Subject: [PATCH 2/3] Fix handling of NULLs when building BRIN summaries

The existing code initializes all_nulls = true for new summaries, but
this poses issue when adding the first non-NULL value. We have to reset
all_nulls to false, but we don't know whether to modify has_nulls.

If there was a NULL value before, we need to set has_nulls=true. But if
the range was empty and we're adding the first value, we must not set
has_nulls.

The current code resets all_nulls=false, without setting has_nulls, but
that means we may forget the range contains NULL values.  So this is a
index corruption, producing incorrect results to IS NULL conditions.

We might always set has_nulls=true whenever resetting all_nulls, which
would resolve the index corruption, but it'd mean all ranges have either
all_nulls or has_nulls set, making the index useless for IS [NOT] NULL
queries.

Ideally, we'd add a new flag to identify empty summaries, but that does
not really work for backpatching - we'd need to add the flag to some
struct, e.g. BrinValues, but that'd change stride of the bt_columns
array, i.e. an ABI break.

So instead we use an "impossible" combination with both all_nulls and
has_nulls set to true to identify this case. And we never store index
tuples with this combination.

Note: This may be an issue because we won't store summaries for empty
ranges, making them match any condition. So far we had the same issue
for IS NULL conditions only.
---
 .../expected/pg_freespacemap.out              | 11 ++-
 .../pg_freespacemap/sql/pg_freespacemap.sql   |  8 +-
 src/backend/access/brin/brin.c                | 88 +++++++++++++++++--
 src/backend/access/brin/brin_tuple.c          | 19 +++-
 ...summarization-and-inprogress-insertion.out | 20 +----
 ...ummarization-and-inprogress-insertion.spec |  4 +-
 src/test/modules/brin/t/01_workitems.pl       | 11 +--
 src/test/regress/expected/brin.out            |  2 +-
 src/test/regress/expected/brin_bloom.out      |  2 +-
 src/test/regress/expected/brin_multi.out      |  2 +-
 10 files changed, 126 insertions(+), 41 deletions(-)

diff --git a/contrib/pg_freespacemap/expected/pg_freespacemap.out b/contrib/pg_freespacemap/expected/pg_freespacemap.out
index eb574c23736..fa0d78c88a4 100644
--- a/contrib/pg_freespacemap/expected/pg_freespacemap.out
+++ b/contrib/pg_freespacemap/expected/pg_freespacemap.out
@@ -1,8 +1,11 @@
 CREATE EXTENSION pg_freespacemap;
-CREATE TABLE freespace_tab (c1 int) WITH (autovacuum_enabled = off);
-CREATE INDEX freespace_brin ON freespace_tab USING brin (c1);
+CREATE TABLE freespace_tab (c1 int) WITH (autovacuum_enabled = off, fillfactor = 10);
+CREATE INDEX freespace_brin ON freespace_tab USING brin (c1) WITH (pages_per_range=1);
 CREATE INDEX freespace_btree ON freespace_tab USING btree (c1);
 CREATE INDEX freespace_hash ON freespace_tab USING hash (c1);
+-- necessary to build the first BRIN index tuple
+INSERT INTO freespace_tab VALUES (1);
+VACUUM;
 -- report all the sizes of the FSMs for all the relation blocks.
 WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace')
   SELECT rel.id, fsm.blkno, (fsm.avail > 0) AS is_avail
@@ -10,10 +13,12 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace
     ORDER BY 1, 2;
        id        | blkno | is_avail 
 -----------------+-------+----------
+ freespace_tab   |     0 | t
  freespace_brin  |     0 | f
  freespace_brin  |     1 | f
  freespace_brin  |     2 | t
  freespace_btree |     0 | f
+ freespace_btree |     1 | f
  freespace_hash  |     0 | f
  freespace_hash  |     1 | f
  freespace_hash  |     2 | f
@@ -24,7 +29,7 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace
  freespace_hash  |     7 | f
  freespace_hash  |     8 | f
  freespace_hash  |     9 | f
-(14 rows)
+(16 rows)
 
 INSERT INTO freespace_tab VALUES (1);
 VACUUM freespace_tab;
diff --git a/contrib/pg_freespacemap/sql/pg_freespacemap.sql b/contrib/pg_freespacemap/sql/pg_freespacemap.sql
index 06275d8fac8..efc0699aa6f 100644
--- a/contrib/pg_freespacemap/sql/pg_freespacemap.sql
+++ b/contrib/pg_freespacemap/sql/pg_freespacemap.sql
@@ -1,10 +1,14 @@
 CREATE EXTENSION pg_freespacemap;
 
-CREATE TABLE freespace_tab (c1 int) WITH (autovacuum_enabled = off);
-CREATE INDEX freespace_brin ON freespace_tab USING brin (c1);
+CREATE TABLE freespace_tab (c1 int) WITH (autovacuum_enabled = off, fillfactor = 10);
+CREATE INDEX freespace_brin ON freespace_tab USING brin (c1) WITH (pages_per_range=1);
 CREATE INDEX freespace_btree ON freespace_tab USING btree (c1);
 CREATE INDEX freespace_hash ON freespace_tab USING hash (c1);
 
+-- necessary to build the first BRIN index tuple
+INSERT INTO freespace_tab VALUES (1);
+VACUUM;
+
 -- report all the sizes of the FSMs for all the relation blocks.
 WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace')
   SELECT rel.id, fsm.blkno, (fsm.avail > 0) AS is_avail
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 7e386250ae9..3ed8eefab86 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -1568,15 +1568,48 @@ form_and_insert_tuple(BrinBuildState *state)
 {
 	BrinTuple  *tup;
 	Size		size;
+	bool		modified = false;
+	BrinMemTuple   *dtuple = state->bs_dtuple;
+	int				i;
 
-	tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
-						  state->bs_dtuple, &size);
-	brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
-				  &state->bs_currentInsertBuf, state->bs_currRangeStart,
-				  tup, size);
-	state->bs_numtuples++;
+	/*
+	 * Check if any rows were processed for the page range represented by this
+	 * memtuple. We initially set both allnulls/hasnulls to true to identify
+	 * if the range is in this initial/empty state.
+	 *
+	 * XXX It should be enough to check only the first summary - either all
+	 * summaries are empty or none of them.
+	 */
+	for (i = 0; i < state->bs_bdesc->bd_tupdesc->natts; i++)
+	{
+		if (!(dtuple->bt_columns[i].bv_allnulls &&
+			  dtuple->bt_columns[i].bv_hasnulls))
+		{
+			modified = true;
+			break;
+		}
+	}
 
-	pfree(tup);
+	/*
+	 * If the memtuple was modified (i.e. we added any rows to it), insert it
+	 * into the index. That is, we don't store index tuples not representing
+	 * any rows from table.
+	 *
+	 * XXX This has the undesirable consequence, that if the table has a gap
+	 * (a long sequence of pages with no remaining tuples), we won't have any
+	 * BRIN summaries for this part of the table. Which means that we'll have
+	 * to scan this gap for each bitmap index scan.
+	 */
+	if (modified)
+	{
+		tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
+							  state->bs_dtuple, &size);
+		brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
+					  &state->bs_currentInsertBuf, state->bs_currRangeStart,
+					  tup, size);
+		state->bs_numtuples++;
+		pfree(tup);
+	}
 }
 
 /*
@@ -1710,24 +1743,53 @@ add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
 		Datum		result;
 		BrinValues *bval;
 		FmgrInfo   *addValue;
+		bool		first_row;
+		bool		has_nulls = false;
 
 		bval = &dtup->bt_columns[keyno];
 
+		/*
+		 * Is this the first tuple we're adding to the range range? We track
+		 * that by setting both bv_hasnulls and bval->bv_allnulls to true
+		 * during initialization. But it's not a valid combination (at most
+		 * one of those flags should be set), so we reset the second flag.
+		 */
+		first_row = (bval->bv_hasnulls && bval->bv_allnulls);
+
+		if (bval->bv_hasnulls && bval->bv_allnulls)
+		{
+			bval->bv_hasnulls = false;
+			modified = true;
+		}
+
 		if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
 		{
 			/*
 			 * If the new value is null, we record that we saw it if it's the
 			 * first one; otherwise, there's nothing to do.
+			 *
+			 * XXX This used to check "hasnulls" but now that might result in
+			 * having both flags set. That used to be OK, because we just
+			 * ignore hasnulls flag in brin_form_tuple when allnulls=true.
+			 * But now we interpret this combination as "firt row" so it
+			 * would confuse following calls. So make sure to only set one
+			 * of the flags - when allnulls=true we're done, as it already
+			 * marks the range as containing ranges.
 			 */
-			if (!bval->bv_hasnulls)
+			if (!bval->bv_allnulls)
 			{
 				bval->bv_hasnulls = true;
 				modified = true;
 			}
-
 			continue;
 		}
 
+		/*
+		 * Does the range already has NULL values? Either of the flags can
+		 * be set, but we ignore the state before adding first row.
+		 */
+		has_nulls = (bval->bv_hasnulls || bval->bv_allnulls) && !first_row;
+
 		addValue = index_getprocinfo(idxRel, keyno + 1,
 									 BRIN_PROCNUM_ADDVALUE);
 		result = FunctionCall4Coll(addValue,
@@ -1736,8 +1798,16 @@ add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
 								   PointerGetDatum(bval),
 								   values[keyno],
 								   nulls[keyno]);
+
 		/* if that returned true, we need to insert the updated tuple */
 		modified |= DatumGetBool(result);
+
+		/*
+		 * If we had NULLS, and the opclass didn't set allnulls=true, set
+		 * the hasnulls so that we know there are NULL values.
+		 */
+		if (has_nulls && !bval->bv_allnulls)
+			bval->bv_hasnulls = true;
 	}
 
 	return modified;
diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c
index c0e2dbd23ba..1b5e72cde24 100644
--- a/src/backend/access/brin/brin_tuple.c
+++ b/src/backend/access/brin/brin_tuple.c
@@ -136,6 +136,13 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
 	{
 		int			datumno;
 
+		/*
+		 * We should never get here for memtuples in the initial state, i.e.
+		 * before any rows were added to it.
+		 */
+		Assert(!(tuple->bt_columns[keyno].bv_hasnulls &&
+				 tuple->bt_columns[keyno].bv_allnulls));
+
 		/*
 		 * "allnulls" is set when there's no nonnull value in any row in the
 		 * column; when this happens, there is no data to store.  Thus set the
@@ -516,8 +523,10 @@ brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
 	for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
 	{
 		dtuple->bt_columns[i].bv_attno = i + 1;
+
+		/* each memtuple starts as if it represents no rows */
 		dtuple->bt_columns[i].bv_allnulls = true;
-		dtuple->bt_columns[i].bv_hasnulls = false;
+		dtuple->bt_columns[i].bv_hasnulls = true;
 		dtuple->bt_columns[i].bv_values = (Datum *) currdatum;
 
 		dtuple->bt_columns[i].bv_mem_value = PointerGetDatum(NULL);
@@ -585,6 +594,14 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
 	{
 		int			i;
 
+		Assert(!(allnulls[keyno] && hasnulls[keyno]));
+
+		/*
+		 * Make sure to overwrite the hasnulls flag, because it might have
+		 * been initialized as true by brin_memtuple_initialize.
+		 */
+		dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno];
+
 		if (allnulls[keyno])
 		{
 			valueno += brdesc->bd_info[keyno]->oi_nstored;
diff --git a/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out b/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
index 02ef52d299a..2266012eac7 100644
--- a/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
+++ b/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
@@ -1,12 +1,6 @@
 Parsed test spec with 2 sessions
 
-starting permutation: s2check s1b s2b s1i s2summ s1c s2c s2check
-step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
-itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value   
-----------+------+------+--------+--------+-----------+--------
-         1|     0|     1|f       |t       |f          |{1 .. 1}
-(1 row)
-
+starting permutation: s1b s2b s1i s2summ s1c s2c s2check
 step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
 step s2b: BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT 1;
 ?column?
@@ -18,7 +12,7 @@ step s1i: INSERT INTO brin_iso VALUES (1000);
 step s2summ: SELECT brin_summarize_new_values('brinidx'::regclass);
 brin_summarize_new_values
 -------------------------
-                        1
+                        2
 (1 row)
 
 step s1c: COMMIT;
@@ -31,13 +25,7 @@ itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value
 (2 rows)
 
 
-starting permutation: s2check s1b s1i s2vacuum s1c s2check
-step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
-itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value   
-----------+------+------+--------+--------+-----------+--------
-         1|     0|     1|f       |t       |f          |{1 .. 1}
-(1 row)
-
+starting permutation: s1b s1i s2vacuum s1c s2check
 step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
 step s1i: INSERT INTO brin_iso VALUES (1000);
 step s2vacuum: VACUUM brin_iso;
@@ -45,7 +33,7 @@ step s1c: COMMIT;
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value      
 ----------+------+------+--------+--------+-----------+-----------
-         1|     0|     1|f       |f       |f          |{1 .. 1}   
+         1|     0|     1|f       |t       |f          |{1 .. 1}   
          2|     1|     1|f       |f       |f          |{1 .. 1000}
 (2 rows)
 
diff --git a/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec b/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
index 18ba92b7ba1..6319ae4c38d 100644
--- a/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
+++ b/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
@@ -41,5 +41,5 @@ step "s2vacuum"	{ VACUUM brin_iso; }
 
 step "s2check"	{ SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); }
 
-permutation "s2check" "s1b" "s2b" "s1i" "s2summ" "s1c" "s2c" "s2check"
-permutation "s2check" "s1b" "s1i" "s2vacuum" "s1c" "s2check"
+permutation "s1b" "s2b" "s1i" "s2summ" "s1c" "s2c" "s2check"
+permutation "s1b" "s1i" "s2vacuum" "s1c" "s2check"
diff --git a/src/test/modules/brin/t/01_workitems.pl b/src/test/modules/brin/t/01_workitems.pl
index 3108c02cf4d..eeec44b0060 100644
--- a/src/test/modules/brin/t/01_workitems.pl
+++ b/src/test/modules/brin/t/01_workitems.pl
@@ -24,20 +24,21 @@ $node->safe_psql(
 	 create index brin_wi_idx on brin_wi using brin (a) with (pages_per_range=1, autosummarize=on);
 	 '
 );
-my $count = $node->safe_psql('postgres',
-	"select count(*) from brin_page_items(get_raw_page('brin_wi_idx', 2), 'brin_wi_idx'::regclass)"
-);
-is($count, '1', "initial index state is correct");
 
 $node->safe_psql('postgres',
 	'insert into brin_wi select * from generate_series(1, 100)');
 
+$node->poll_query_until(
+	'postgres',
+	"select pg_relation_size('brin_wi_idx'::regclass) / current_setting('block_size')::int = 3",
+	't');
+
 $node->poll_query_until(
 	'postgres',
 	"select count(*) > 1 from brin_page_items(get_raw_page('brin_wi_idx', 2), 'brin_wi_idx'::regclass)",
 	't');
 
-$count = $node->safe_psql('postgres',
+my $count = $node->safe_psql('postgres',
 	"select count(*) > 1 from brin_page_items(get_raw_page('brin_wi_idx', 2), 'brin_wi_idx'::regclass)"
 );
 is($count, 't', "index got summarized");
diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out
index 73fa38396e4..ebc31222354 100644
--- a/src/test/regress/expected/brin.out
+++ b/src/test/regress/expected/brin.out
@@ -454,7 +454,7 @@ $$;
 SELECT brin_summarize_range('brin_summarize_idx', 0);
  brin_summarize_range 
 ----------------------
-                    0
+                    1
 (1 row)
 
 -- nothing: already summarized
diff --git a/src/test/regress/expected/brin_bloom.out b/src/test/regress/expected/brin_bloom.out
index 32c56a996a2..6e847f9113d 100644
--- a/src/test/regress/expected/brin_bloom.out
+++ b/src/test/regress/expected/brin_bloom.out
@@ -373,7 +373,7 @@ $$;
 SELECT brin_summarize_range('brin_summarize_bloom_idx', 0);
  brin_summarize_range 
 ----------------------
-                    0
+                    1
 (1 row)
 
 -- nothing: already summarized
diff --git a/src/test/regress/expected/brin_multi.out b/src/test/regress/expected/brin_multi.out
index f3309f433f8..e65f1c20d4f 100644
--- a/src/test/regress/expected/brin_multi.out
+++ b/src/test/regress/expected/brin_multi.out
@@ -407,7 +407,7 @@ $$;
 SELECT brin_summarize_range('brin_summarize_multi_idx', 0);
  brin_summarize_range 
 ----------------------
-                    0
+                    1
 (1 row)
 
 -- nothing: already summarized
-- 
2.38.1

