From bce1c4353ebea12d7f5f19bb18b1ea00acb37085 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Thu, 25 Jul 2019 19:36:40 +0200
Subject: [PATCH] Add facilities to fetch real timeline from SQL

The only way to fetch the timeline from SQL was to query
pg_control_checkpoint() which read the controldata file from disk.

This is fine on a primay cluster, because its controldata file is
synched to disk during important operation, eg. promotion. However, on
a standby, the controldata file is only synched during restartpoint. This
means the timeline read from there can be wrong during several minutes
after a timeline change with recoevery_target_timeline set to latest.

This patch overload pg_current_wal_lsn and pg_last_wal_received_lsn with
new functions taking a boolean as parameter. If true, they will report both
the requested LSN and its timeline.
---
 doc/src/sgml/func.sgml                     |  41 +++++-
 src/backend/access/transam/xlogfuncs.c     | 145 +++++++++++++++++++++
 src/include/catalog/pg_proc.dat            |  13 ++
 src/test/recovery/t/004_timeline_switch.pl |  37 +++++-
 4 files changed, 234 insertions(+), 2 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index cc3041f637..853e344b19 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -20338,6 +20338,13 @@ SELECT set_config('log_statement_stats', 'off', false);
        <entry><type>pg_lsn</type></entry>
        <entry>Get current write-ahead log write location</entry>
       </row>
+      <row>
+       <entry>
+        <literal><function>pg_current_wal_lsn(<parameter>with_tli</parameter> <type>boolean</type>)</function></literal>
+        </entry>
+       <entry><type>setof record</type></entry>
+       <entry>Get current write-ahead log write location and timeline f</entry>
+      </row>
       <row>
        <entry>
         <literal><function>pg_start_backup(<parameter>label</parameter> <type>text</type> <optional>, <parameter>fast</parameter> <type>boolean</type> <optional>, <parameter>exclusive</parameter> <type>boolean</type> </optional></optional>)</function></literal>
@@ -20486,7 +20493,15 @@ postgres=# select pg_start_backup('label_goes_here');
 
    <para>
     <function>pg_current_wal_lsn</function> displays the current write-ahead log write
-    location in the same format used by the above functions.  Similarly,
+    location in the same format used by the above functions.
+    There is an optional parameter of type <type>boolean</type>.  If <literal>true</literal>,
+    the result includes the current timeline as second parameter. Do not set this
+    parameter, even to <literal>false</literal>, if you only need the current write-ahead
+    log write location as the result would include a useless <literal>NULL</literal>
+    second field.
+   </para>
+
+   <para>Similarly,
     <function>pg_current_wal_insert_lsn</function> displays the current write-ahead log
     insertion location and <function>pg_current_wal_flush_lsn</function> displays the
     current write-ahead log flush location. The insertion location is the <quote>logical</quote>
@@ -20587,6 +20602,21 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup());
         started, the function returns NULL.
        </entry>
       </row>
+      <row>
+       <entry>
+        <literal><function>pg_last_wal_receive_lsn(<parameter>with_tli</parameter> <type>boolean</type>)</function></literal>
+        </entry>
+       <entry><type>pg_lsn</type></entry>
+       <entry>Get last write-ahead log location received and synced to disk by
+        streaming replication and optionally its timeline.
+        While streaming replication is in progress
+        this will increase monotonically. If recovery has completed this will
+        remain static at
+        the value of the last WAL record received and synced to disk during
+        recovery. If streaming replication is disabled, or if it has not yet
+        started, the function returns NULL.
+       </entry>
+      </row>
       <row>
        <entry>
         <literal><function>pg_last_wal_replay_lsn()</function></literal>
@@ -20620,6 +20650,15 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup());
     </tgroup>
    </table>
 
+   <para>
+    <function>pg_last_wal_receive_lsn</function> has an optional parameter of type
+    <type>boolean</type>.  If <literal>true</literal>,
+    the result includes the current timeline as second parameter. Do not set this
+    parameter, even to <literal>false</literal>, if you only need the last write-ahead
+    log location received as the result would include a useless <literal>NULL</literal>
+    second field.
+   </para>
+
    <indexterm>
     <primary>pg_is_wal_replay_paused</primary>
    </indexterm>
diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index 8f179887ab..1fe8ef1c32 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -360,6 +360,78 @@ pg_current_wal_lsn(PG_FUNCTION_ARGS)
 	PG_RETURN_LSN(current_recptr);
 }
 
+/*
+ * pg_current_wal_lsn_tl: report the current WAL write location (same format
+ * as pg_start_backup etc) and optionally the timeline.
+ *
+ * When the first parameter (variable 'with_tli') is true, returns the current
+ * timeline as second field. If false, second field is null.
+ *
+ * Note: this version is only called if the second parameter is set. It is
+ * 		 overloaded as pg_current_wal_lsn in SQL.
+ */
+Datum
+pg_current_wal_lsn_tl(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	Datum		values[2];
+	bool		nulls[2];
+
+	XLogRecPtr	current_recptr;
+	bool		with_tli = PG_GETARG_BOOL(0);
+
+	if (RecoveryInProgress())
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("recovery is in progress"),
+				 errhint("WAL control functions cannot be executed during recovery.")));
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	MemSet(values, 0, sizeof(values));
+	MemSet(nulls, 0, sizeof(nulls));
+
+	current_recptr = GetXLogWriteRecPtr();
+
+	values[0] = LSNGetDatum(current_recptr);
+	if (!with_tli)
+		nulls[1] = 1;
+	else
+		values[1] = Int32GetDatum(ThisTimeLineID);
+
+	tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
+
 /*
  * Report the current WAL insert location (same format as pg_start_backup etc)
  *
@@ -421,6 +493,79 @@ pg_last_wal_receive_lsn(PG_FUNCTION_ARGS)
 	PG_RETURN_LSN(recptr);
 }
 
+/*
+ * pg_last_wal_receive_lsn_tl: report the last WAL receive location (same format
+ * as pg_start_backup etc) and optionally its timeline.
+ *
+ * When the first parameter (variable 'with_tli') is true, returns the current
+ * timeline as second field. If false, the second field is null.
+ *
+ * Note: this version is only called if the second parameter is set. It is
+ * 		 overloaded as pg_last_wal_receive_lsn in SQL.
+ */
+Datum
+pg_last_wal_receive_lsn_tl(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	Datum		values[2];
+	bool		nulls[2];
+
+	XLogRecPtr	recptr;
+	TimeLineID	lastReceivedTL;
+	bool		with_tli = PG_GETARG_BOOL(0);
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	MemSet(values, 0, sizeof(values));
+	MemSet(nulls, 0, sizeof(nulls));
+
+	recptr = GetWalRcvWriteRecPtr(NULL, &lastReceivedTL);
+
+	if (recptr == 0) {
+		nulls[0] = 1;
+		nulls[1] = 1;
+	}
+	else {
+		values[0] = LSNGetDatum(recptr);
+		if (!with_tli || !lastReceivedTL)
+			nulls[1] = 1;
+		else
+			values[1] = Int32GetDatum(lastReceivedTL);
+	}
+
+	tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
+
 /*
  * Report the last WAL replay location (same format as pg_start_backup etc)
  *
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 58ea5b982b..a612c7955f 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5998,6 +5998,13 @@
 { oid => '2849', descr => 'current wal write location',
   proname => 'pg_current_wal_lsn', provolatile => 'v', prorettype => 'pg_lsn',
   proargtypes => '', prosrc => 'pg_current_wal_lsn' },
+{ oid => '3434',
+  descr => 'current wal write location',
+  proname => 'pg_current_wal_lsn', provolatile => 'v', proisstrict => 'f',
+  proargtypes => 'bool', prorettype => 'record', prorows => '1', proretset => 't',
+  proargmodes => '{i,o,o}', proallargtypes => '{bool,pg_lsn,int4}',
+  proargnames => '{with_tli,lsn,timeline}',
+  prosrc => 'pg_current_wal_lsn_tl' },
 { oid => '2852', descr => 'current wal insert location',
   proname => 'pg_current_wal_insert_lsn', provolatile => 'v',
   prorettype => 'pg_lsn', proargtypes => '',
@@ -6032,6 +6039,12 @@
   proname => 'pg_last_wal_receive_lsn', provolatile => 'v',
   prorettype => 'pg_lsn', proargtypes => '',
   prosrc => 'pg_last_wal_receive_lsn' },
+{ oid => '3435', descr => 'current wal flush location',
+  proname => 'pg_last_wal_receive_lsn', provolatile => 'v', proisstrict => 'f',
+  proargtypes => 'bool', prorettype => 'record', prorows => '1', proretset => 't',
+  proargmodes => '{i,o,o}', proallargtypes => '{bool,pg_lsn,int4}',
+  proargnames => '{with_tli,lsn,timeline}',
+  prosrc => 'pg_last_wal_receive_lsn_tl' },
 { oid => '3821', descr => 'last wal replay location',
   proname => 'pg_last_wal_replay_lsn', provolatile => 'v',
   prorettype => 'pg_lsn', proargtypes => '',
diff --git a/src/test/recovery/t/004_timeline_switch.pl b/src/test/recovery/t/004_timeline_switch.pl
index 7e952d3667..fd15f3f4aa 100644
--- a/src/test/recovery/t/004_timeline_switch.pl
+++ b/src/test/recovery/t/004_timeline_switch.pl
@@ -6,7 +6,7 @@ use warnings;
 use File::Path qw(rmtree);
 use PostgresNode;
 use TestLib;
-use Test::More tests => 2;
+use Test::More tests => 8;
 
 $ENV{PGDATABASE} = 'postgres';
 
@@ -37,6 +37,27 @@ $node_master->safe_psql('postgres',
 $node_master->wait_for_catchup($node_standby_1, 'replay',
 	$node_master->lsn('write'));
 
+# Check pg_current_wal_lsn(true) result
+my $node_master_lsn = $node_master->safe_psql('postgres',
+	'SELECT timeline, lsn FROM pg_current_wal_lsn(true)');
+my @node_master_lsn = split /\|/, $node_master_lsn;
+is($node_master_lsn[1], $node_master->lsn('write'), 'check pg_current_wal_lsn(true) return values');
+is($node_master_lsn[0], 1, 'current primary is on timeline 1');
+
+# Check pg_last_wal_receive_lsn(true) result
+my $node_standby_1_lsn = $node_standby_1->safe_psql('postgres',
+	'SELECT timeline, lsn FROM pg_last_wal_receive_lsn(true)');
+is($node_standby_1_lsn, $node_master_lsn, 'check pg_last_wal_receive_lsn(true) return values on standby');
+
+# Check pg_current_wal_lsn(true) fails on a standby
+my $psql_err = '';
+$node_standby_1->psql('postgres',
+	'SELECT timeline, lsn FROM pg_current_wal_lsn(true)',
+	stderr => \$psql_err);
+like($psql_err,
+   qr/ERROR:  recovery is in progress\nHINT:  WAL control functions cannot be executed during recovery.\s*$/s,
+   'check pg_current_wal_lsn(true) fails on a standby');
+
 # Stop and remove master
 $node_master->teardown_node;
 
@@ -48,6 +69,20 @@ $node_standby_1->psql(
 	stdout => \$psql_out);
 is($psql_out, 't', "promotion of standby with pg_promote");
 
+# check new timeline and new LSN using pg_current_wal_lsn(true)
+my @old_standby_1_lsn = split /\|/, $node_standby_1_lsn;
+$psql_out = $node_standby_1->safe_psql('postgres',
+	'SELECT timeline, lsn FROM pg_current_wal_lsn(true)');
+my @new_standby_1_lsn = split /\|/, $psql_out;
+is($new_standby_1_lsn[0], $old_standby_1_lsn[0]+1, "check new timeline");
+
+# minimal walrecord size written to WAL after a promotion
+my $END_OF_RECOVERY_SZ = 42;
+$psql_out = $node_standby_1->safe_psql('postgres',
+	qq{SELECT pg_wal_lsn_diff('$new_standby_1_lsn[1]',
+							  '$old_standby_1_lsn[1]') > $END_OF_RECOVERY_SZ});
+is($psql_out, 't', 'check new LSN');
+
 # Switch standby 2 to replay from standby 1
 my $connstr_1 = $node_standby_1->connstr;
 $node_standby_2->append_conf(
-- 
2.20.1

