Ignore dropped columns when checking the column list in logical replication

Started by shiy.fnst@fujitsu.comabout 3 years ago2 messages
#1shiy.fnst@fujitsu.com
shiy.fnst@fujitsu.com
1 attachment(s)

Hi hackers,

I saw a problem related to column list.

There's a restriction that different column lists for same table can't be used
in the publications of single subscription. But we will get unexpected errors in
some cases because the dropped columns are not ignored.

For example:
-- publisher
create table tbl1 (a int primary key, b int, c int);
create publication pub1 for table tbl1(a,b);
create publication pub2 for table tbl1;
alter table tbl1 drop column c;

-- subscriber
create table tbl1 (a int primary key, b int, c int);
create subscription sub connection 'dbname=postgres port=5432' publication pub1, pub2;

-- publisher
insert into tbl1 values (1,2);

The publisher and subscriber will report the error:
ERROR: cannot use different column lists for table "public.tbl1" in different publications

This is caused by:
a. walsender (in pgoutput_column_list_init())
/*
* If column list includes all the columns of the table,
* set it to NULL.
*/
if (bms_num_members(cols) == RelationGetNumberOfAttributes(relation))
{
bms_free(cols);
cols = NULL;
}

The returned value of RelationGetNumberOfAttributes() contains dropped columns.

b. table synchronization (in fetch_remote_table_info())
appendStringInfo(&cmd,
"SELECT DISTINCT"
" (CASE WHEN (array_length(gpt.attrs, 1) = c.relnatts)"
" THEN NULL ELSE gpt.attrs END)"
" FROM pg_publication p,"
" LATERAL pg_get_publication_tables(p.pubname) gpt,"
" pg_class c"
" WHERE gpt.relid = %u AND c.oid = gpt.relid"
" AND p.pubname IN ( %s )",
lrel->remoteid,
pub_names.data);

If the result of the above SQL contains more than one tuple, an error will be
report (cannot use different column lists for table ...). In this SQL, attrs is
NULL if `array_length(gpt.attrs, 1) = c.relnatts`, but `c.relnatts` contains
dropped columns, what we want is the count of alive columns.

I tried to fix them in the attached patch.

Regards,
Shi yu

Attachments:

v1-0001-Ignore-dropped-columns-when-checking-the-column-l.patchapplication/octet-stream; name=v1-0001-Ignore-dropped-columns-when-checking-the-column-l.patchDownload
From db6073ec96089875a6286469c882910d8e553e9d Mon Sep 17 00:00:00 2001
From: Shi Yu <shiy.fnst@fujitsu.com>
Date: Wed, 4 Jan 2023 13:37:31 +0800
Subject: [PATCH v1] Ignore dropped columns when checking the column list in
 logical replication

There's a restriction that different column lists for same table can't be used
in the publications of single subscription. But in the check of walsender and
table synchronization, the dropped columns are included when getting the number
of columns in a table. This can lead to unexpected errors in some cases. Fix it
in this patch.
---
 src/backend/replication/logical/tablesync.c | 10 +++---
 src/backend/replication/pgoutput/pgoutput.c | 13 +++++++-
 src/test/subscription/t/031_column_list.pl  | 34 +++++++++++++++++++++
 3 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c
index 2fdfeb5b4c..9e873881d0 100644
--- a/src/backend/replication/logical/tablesync.c
+++ b/src/backend/replication/logical/tablesync.c
@@ -817,14 +817,16 @@ fetch_remote_table_info(char *nspname, char *relname,
 		resetStringInfo(&cmd);
 		appendStringInfo(&cmd,
 						 "SELECT DISTINCT"
-						 "  (CASE WHEN (array_length(gpt.attrs, 1) = c.relnatts)"
+						 "  (CASE WHEN (array_length(gpt.attrs, 1) = "
+						 "   (SELECT count(*) FROM pg_attribute a"
+						 "    WHERE a.attrelid = %u AND a.attnum > 0 AND NOT a.attisdropped))"
 						 "   THEN NULL ELSE gpt.attrs END)"
 						 "  FROM pg_publication p,"
-						 "  LATERAL pg_get_publication_tables(p.pubname) gpt,"
-						 "  pg_class c"
-						 " WHERE gpt.relid = %u AND c.oid = gpt.relid"
+						 "  LATERAL pg_get_publication_tables(p.pubname) gpt"
+						 " WHERE gpt.relid = %u"
 						 "   AND p.pubname IN ( %s )",
 						 lrel->remoteid,
+						 lrel->remoteid,
 						 pub_names.data);
 
 		pubres = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data,
diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c
index 7737242516..0188b596c1 100644
--- a/src/backend/replication/pgoutput/pgoutput.c
+++ b/src/backend/replication/pgoutput/pgoutput.c
@@ -1050,16 +1050,27 @@ pgoutput_column_list_init(PGOutputData *data, List *publications,
 				/* Build the column list bitmap in the per-entry context. */
 				if (!pub_no_list)	/* when not null */
 				{
+					int			i;
+					int			nliveatts = 0;
+					TupleDesc	desc = RelationGetDescr(relation);
+
 					pgoutput_ensure_entry_cxt(data, entry);
 
 					cols = pub_collist_to_bitmapset(cols, cfdatum,
 													entry->entry_cxt);
 
+					/* Get the number of live attributes. */
+					for (i = 0; i < desc->natts; i++)
+					{
+						if (!TupleDescAttr(desc, i)->attisdropped)
+							nliveatts++;
+					}
+
 					/*
 					 * If column list includes all the columns of the table,
 					 * set it to NULL.
 					 */
-					if (bms_num_members(cols) == RelationGetNumberOfAttributes(relation))
+					if (bms_num_members(cols) == nliveatts)
 					{
 						bms_free(cols);
 						cols = NULL;
diff --git a/src/test/subscription/t/031_column_list.pl b/src/test/subscription/t/031_column_list.pl
index 8835ab30ff..58d4f97db6 100644
--- a/src/test/subscription/t/031_column_list.pl
+++ b/src/test/subscription/t/031_column_list.pl
@@ -1184,6 +1184,40 @@ $result = $node_publisher->safe_psql(
 is( $result, qq(t
 t), 'check the number of columns in the old tuple');
 
+# TEST: With a table included in multiple publications with same column lists,
+# it works fine. Dropped columns are not considered.
+$node_publisher->safe_psql(
+	'postgres', qq(
+	CREATE TABLE test_mix_4 (a int PRIMARY KEY, b int, c int);
+	ALTER TABLE test_mix_4 DROP COLUMN c;
+	CREATE PUBLICATION pub_mix_7 FOR TABLE test_mix_4 (a, b);
+	CREATE PUBLICATION pub_mix_8 FOR TABLE test_mix_4;
+));
+
+$node_subscriber->safe_psql(
+	'postgres', qq(
+	DROP SUBSCRIPTION sub1;
+	CREATE TABLE test_mix_4 (a int PRIMARY KEY, b int, c int);
+));
+
+$node_subscriber->safe_psql(
+	'postgres', qq(
+	CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub_mix_7, pub_mix_8;
+));
+
+$node_subscriber->wait_for_subscription_sync;
+
+$node_publisher->safe_psql(
+	'postgres', qq(
+	INSERT INTO test_mix_4 VALUES (1, 2);
+));
+
+$node_publisher->wait_for_catchup('sub1');
+
+# test_mix_4: only (a,b) is replicated
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT * FROM test_mix_4");
+is( $result, qq(1|2|), 'insert on test_mix_4 is replicated after dropping column c');
 
 # TEST: With a table included in multiple publications with different column
 # lists, we should catch the error when creating the subscription.
-- 
2.31.1

#2Amit Kapila
amit.kapila16@gmail.com
In reply to: shiy.fnst@fujitsu.com (#1)
Re: Ignore dropped columns when checking the column list in logical replication

On Wed, Jan 4, 2023 at 12:28 PM shiy.fnst@fujitsu.com
<shiy.fnst@fujitsu.com> wrote:

I tried to fix them in the attached patch.

Don't we need a similar handling for generated columns? We don't send
those to the subscriber side, see checks in proto.c.

--
With Regards,
Amit Kapila.