Add pg_walinspect function with block info columns
Hi,
When using pg_walinspect, and calling functions like
pg_get_wal_records_info(), I often wish that the various information in
the block_ref column was separated out into columns so that I could
easily access them and pass them to various other functions to add
information -- like getting the relname from pg_class like this:
SELECT n.nspname, c.relname, wal_info.*
FROM pg_get_wal_records_extended_info(:start_lsn, :end_lsn) wal_info
JOIN pg_class c
ON wal_info.relfilenode = pg_relation_filenode(c.oid) AND
wal_info.reldatabase IN (0, (SELECT oid FROM pg_database
WHERE datname = current_database()))
JOIN pg_namespace n ON n.oid = c.relnamespace;
This has been mentioned in [1]/messages/by-id/CAH2-Wz=acGKoP8cZ+6Af2inoai0N5cZKCY13DaqXCwQNupK8qg@mail.gmail.com amongst other places.
So, attached is a patch with pg_get_wal_records_extended_info(). I
suspect the name is not very good. Also, it is nearly a direct copy of
pg_get_wal_fpi_infos() except for the helper called to fill in the
tuplestore, so it might be worth doing something about that.
However, I am mainly looking for feedback about whether or not others
would find this useful, and, if so, what columns they would like to see
in the returned tuplestore.
Note that I didn't include the cumulative fpi_len for all the pages
since pg_get_wal_fpi_info() now exists. I noticed that
pg_get_wal_fpi_info() doesn't list compression information (which is in
the block_ref column of pg_get_wal_records_info()). I don't know if this
is worth including in my proposed function
pg_get_wal_records_extended_info().
- Melanie
[1]: /messages/by-id/CAH2-Wz=acGKoP8cZ+6Af2inoai0N5cZKCY13DaqXCwQNupK8qg@mail.gmail.com
Attachments:
v1-0001-Add-extended-block-info-function-to-pg_walinspect.patchtext/x-patch; charset=US-ASCII; name=v1-0001-Add-extended-block-info-function-to-pg_walinspect.patchDownload
From 4066b480c35b5de75589812ae2d3c2fc9626bf0a Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 1 Mar 2023 12:16:07 -0500
Subject: [PATCH v1] Add extended block info function to pg_walinspect
Add a function which returns denormalized block data for every block in
every WAL record in the specified range.
---
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 27 +++++
contrib/pg_walinspect/pg_walinspect.c | 111 ++++++++++++++++++
2 files changed, 138 insertions(+)
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 1e9e1e6115..debbe43791 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -22,3 +22,30 @@ LANGUAGE C STRICT PARALLEL SAFE;
REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) FROM PUBLIC;
GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) TO pg_read_server_files;
+--
+-- pg_get_wal_records_extended_info()
+--
+CREATE FUNCTION pg_get_wal_records_extended_info(IN start_lsn pg_lsn,
+ IN end_lsn pg_lsn,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT description text,
+ OUT block_id int8,
+ OUT relblocknumber int8,
+ OUT reltablespace oid,
+ OUT reldatabase oid,
+ OUT relfilenode oid,
+ OUT forkname text
+)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pg_get_wal_records_extended_info'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+REVOKE EXECUTE ON FUNCTION pg_get_wal_records_extended_info(pg_lsn, pg_lsn) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wal_records_extended_info(pg_lsn, pg_lsn) TO pg_read_server_files;
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index b7b0a805ee..aaf7928591 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -33,6 +33,7 @@ PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(pg_get_wal_fpi_info);
PG_FUNCTION_INFO_V1(pg_get_wal_record_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info);
+PG_FUNCTION_INFO_V1(pg_get_wal_records_extended_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info_till_end_of_wal);
PG_FUNCTION_INFO_V1(pg_get_wal_stats);
PG_FUNCTION_INFO_V1(pg_get_wal_stats_till_end_of_wal);
@@ -220,6 +221,7 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
}
+
/*
* Store a set of full page images from a single record.
*/
@@ -342,6 +344,115 @@ pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+/*
+ * Store a set of block ref infos from a single record.
+ */
+static void
+GetWALBlockRefInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
+{
+#define PG_GET_WAL_BLOCK_REF_INFO_COLS 15
+ StringInfoData rec_desc;
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ uint32 main_data_len = XLogRecGetDataLen(record);
+ RmgrData desc = GetRmgr(XLogRecGetRmid(record));
+ const char *id = desc.rm_identify(XLogRecGetInfo(record));
+
+ if (id == NULL)
+ id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
+
+
+ for (int block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ {
+ DecodedBkpBlock *block;
+ Datum values[PG_GET_WAL_BLOCK_REF_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_BLOCK_REF_INFO_COLS] = {0};
+ int i = 0;
+
+ if (!XLogRecHasBlockRef(record, block_id))
+ continue;
+
+ block = XLogRecGetBlock(record, block_id);
+
+ values[i++] = LSNGetDatum(record->ReadRecPtr);
+ values[i++] = LSNGetDatum(record->EndRecPtr);
+ values[i++] = LSNGetDatum(XLogRecGetPrev(record));
+ values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+ values[i++] = CStringGetTextDatum(desc.rm_name);
+ values[i++] = CStringGetTextDatum(id);
+ values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
+ values[i++] = UInt32GetDatum(main_data_len);
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ values[i++] = Int64GetDatum(block_id);
+ values[i++] = Int64GetDatum((int64) block->blkno);
+ values[i++] = ObjectIdGetDatum(block->rlocator.spcOid);
+ values[i++] = ObjectIdGetDatum(block->rlocator.dbOid);
+ values[i++] = ObjectIdGetDatum(block->rlocator.relNumber);
+
+ if (block->forknum >= 0 && block->forknum <= MAX_FORKNUM)
+ values[i++] = CStringGetTextDatum(forkNames[block->forknum]);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg_internal("invalid fork number: %u", block->forknum)));
+
+ Assert(i == PG_GET_WAL_BLOCK_REF_INFO_COLS);
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+
+#undef PG_GET_WAL_BLOCK_REF_INFO_COLS
+}
+
+Datum
+pg_get_wal_records_extended_info(PG_FUNCTION_ARGS)
+{
+ XLogRecPtr start_lsn;
+ XLogRecPtr end_lsn;
+ XLogReaderState *xlogreader;
+ MemoryContext old_cxt;
+ MemoryContext tmp_cxt;
+
+ start_lsn = PG_GETARG_LSN(0);
+ end_lsn = PG_GETARG_LSN(1);
+
+ end_lsn = ValidateInputLSNs(false, start_lsn, end_lsn);
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ xlogreader = InitXLogReaderState(start_lsn);
+
+ tmp_cxt = AllocSetContextCreate(CurrentMemoryContext,
+ "pg_get_wal_records_extended_info temporary cxt",
+ ALLOCSET_DEFAULT_SIZES);
+
+ while (ReadNextXLogRecord(xlogreader) &&
+ xlogreader->EndRecPtr <= end_lsn)
+ {
+ /* Use the tmp context so we can clean up after each tuple is done */
+ old_cxt = MemoryContextSwitchTo(tmp_cxt);
+
+
+ GetWALBlockRefInfo(fcinfo, xlogreader);
+
+ /* clean up and switch back */
+ MemoryContextSwitchTo(old_cxt);
+ MemoryContextReset(tmp_cxt);
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ MemoryContextDelete(tmp_cxt);
+ pfree(xlogreader->private_data);
+ XLogReaderFree(xlogreader);
+
+ PG_RETURN_VOID();
+}
+
+
/*
* Get WAL record info.
*
--
2.37.2
On Wed, Mar 1, 2023 at 12:51 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
When using pg_walinspect, and calling functions like
pg_get_wal_records_info(), I often wish that the various information in
the block_ref column was separated out into columns so that I could
easily access them and pass them to various other functions to add
information -- like getting the relname from pg_class like this:SELECT n.nspname, c.relname, wal_info.*
FROM pg_get_wal_records_extended_info(:start_lsn, :end_lsn) wal_info
JOIN pg_class c
ON wal_info.relfilenode = pg_relation_filenode(c.oid) AND
wal_info.reldatabase IN (0, (SELECT oid FROM pg_database
WHERE datname = current_database()))
JOIN pg_namespace n ON n.oid = c.relnamespace;This has been mentioned in [1] amongst other places.
So, attached is a patch with pg_get_wal_records_extended_info(). I
suspect the name is not very good. Also, it is nearly a direct copy of
pg_get_wal_fpi_infos() except for the helper called to fill in the
tuplestore, so it might be worth doing something about that.However, I am mainly looking for feedback about whether or not others
would find this useful, and, if so, what columns they would like to see
in the returned tuplestore.Note that I didn't include the cumulative fpi_len for all the pages
since pg_get_wal_fpi_info() now exists. I noticed that
pg_get_wal_fpi_info() doesn't list compression information (which is in
the block_ref column of pg_get_wal_records_info()). I don't know if this
is worth including in my proposed function
pg_get_wal_records_extended_info().
Thinking about this more, it could make sense to have a function which
gives you this extended block information and has a parameter like
with_fpi which would include the information returned by
pg_get_wal_fpi_info(). It might be nice to have it still include the
information about the record itself as well.
I don't know if it would be instead of pg_get_wal_fpi_info(), though.
The way I would use this is when I want to see the record level
information but with some additional information aggregated across the
relevant blocks. For example, I could group by the record information
and relfilenode and using the query in my example above, see all the
information for the record along with the relname (when possible).
- Melanie
On Thu, Mar 02, 2023 at 11:17:05AM -0500, Melanie Plageman wrote:
Thinking about this more, it could make sense to have a function which
gives you this extended block information and has a parameter like
with_fpi which would include the information returned by
pg_get_wal_fpi_info(). It might be nice to have it still include the
information about the record itself as well.
Hmm. I am OK if you want to include more information about the
blocks, and it may be nicer to not bloat the interface with more
functions than necessary.
I don't know if it would be instead of pg_get_wal_fpi_info(), though.
The way I would use this is when I want to see the record level
information but with some additional information aggregated across the
relevant blocks. For example, I could group by the record information
and relfilenode and using the query in my example above, see all the
information for the record along with the relname (when possible).
As far as I know, a block reference could have some data or a FPW, so
it is true that pg_get_wal_fpi_info() is not extensive enough if you
want to get more information about the blocks in use for each record,
especially if there is some data, and grouping the information about
whole set of blocks into a single function call can some time.
In order to satisfy your case, why not having one function that does
everything, looping over the blocks of a single record as long as
XLogRecHasBlockRef() is satisfied, returning the FPW if the block
includes an image (or NULL if !XLogRecHasBlockImage()), as well as its
data in bytea if XLogRecGetData() gives something (?).
I am not sure that this should return anything about the record itself
except its ReadRecPtr, though, as ReadRecPtr would be enough to
cross-check with the information provided by GetWALRecordInfo() with a
join. Hence, I guess that we could update the existing FPI function
with:
- the addition of some of the flags of bimg_info, like the compression
type, if they apply, with a text[].
- the addition of bimg_len, if the block has a FPW, or NULL if none.
- the addition of apply_image, if the block has a FPW, or NULL if
none.
- the addition of the block data, if any, or NULL if there is no
data.
- an update for the FPW handling, where we would return NULL if there
is no FPW references in the block, but still return the full,
decompressed 8kB image if it is there.
--
Michael
On Thu, Mar 2, 2023 at 9:47 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
However, I am mainly looking for feedback about whether or not others
would find this useful, and, if so, what columns they would like to see
in the returned tuplestore.
IMO, pg_get_wal_records_extended_info as proposed doesn't look good to
me as it outputs most of the columns that are already given by
pg_get_wal_records_info.What I think the best way at this point is to
make it return the following:
lsn pg_lsn
block_id int8
spcOid oid
dbOid oid
relNumber oid
forkNames text
fpi bytea
fpi_info text
So, there can be multiple columns for the same record LSN, which
means, essentially (lsn, block_id) can be a unique value for the row.
If a block has FPI, fpi and fpi_info are non-null, otherwise, nulls.
If needed, this output can be joined with pg_get_wal_records_info on
lsn, to get all the record level details.What do you think? Will this
serve your purpose?
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Mon, 6 Mar 2023 at 15:40, Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:
On Thu, Mar 2, 2023 at 9:47 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:However, I am mainly looking for feedback about whether or not others
would find this useful, and, if so, what columns they would like to see
in the returned tuplestore.IMO, pg_get_wal_records_extended_info as proposed doesn't look good to
me as it outputs most of the columns that are already given by
pg_get_wal_records_info.What I think the best way at this point is to
make it return the following:
lsn pg_lsn
block_id int8
spcOid oid
dbOid oid
relNumber oid
forkNames text
fpi bytea
fpi_info text
Wouldn't it be more useful to have
type PgXlogRecordBlock
( block_id int
...
, blockimage bytea
, data bytea
)
type PgXlogRecord
( start lsn
, ...
, blocks PgXlogRecordBlock[] -- array of record's registered blocks,
c.q. DecodedBkpBlock->blocks
, main_data bytea
)
which is returned by one sql function, and then used and processed
(unnest()ed) in the relevant views? It would allow anyone to build
their own processing on pg_walinspect where they want or need it,
without us having to decide what the user wants, and without having to
associate blocks with the main xlog record data through the joining of
several (fairly expensive) xlog decoding passes.
The basic idea is to create a single entrypoint to all relevant data
from DecodedXLogRecord in SQL, not multiple.
Kind regards,
Matthias van de Meent
On Mon, Mar 06, 2023 at 04:08:28PM +0100, Matthias van de Meent wrote:
On Mon, 6 Mar 2023 at 15:40, Bharath Rupireddy
IMO, pg_get_wal_records_extended_info as proposed doesn't look good to
me as it outputs most of the columns that are already given by
pg_get_wal_records_info.What I think the best way at this point is to
make it return the following:
lsn pg_lsn
block_id int8
spcOid oid
dbOid oid
relNumber oid
forkNames text
fpi bytea
fpi_info text
I would add the length of the block data (without the hole and
compressed, as the FPI data should always be presented as
uncompressed), and the block data if any (without the block data
length as one can guess it based on the bytea data length). Note that
a block can have both a FPI and some data assigned to it, as far as I
recall.
The basic idea is to create a single entrypoint to all relevant data
from DecodedXLogRecord in SQL, not multiple.
While I would agree with this principle on simplicity's ground in
terms of minimizing the SQL interface and the pg_wal/ lookups, I
disagree about it on unsability ground, because we can avoid extra SQL
tweaks with more functions. One recent example I have in mind is
partitionfuncs.c, which can actually be achieved with a WITH RECURSIVE
on the catalogs. There are of course various degrees of complexity,
and perhaps unnest() cannot qualify as one, but having two functions
returning normalized records (one for the record information, and a
second for the block information), is a rather good balance between
usability and interface complexity, in my experience. If you have two
functions, a JOIN is enough to cross-check the block data and the
record data, while an unnest() heavily bloats the main function output
(aka byteas of FPIs in a single array).
--
Michael
At Tue, 7 Mar 2023 09:34:24 +0900, Michael Paquier <michael@paquier.xyz> wrote in
On Mon, Mar 06, 2023 at 04:08:28PM +0100, Matthias van de Meent wrote:
On Mon, 6 Mar 2023 at 15:40, Bharath Rupireddy
IMO, pg_get_wal_records_extended_info as proposed doesn't look good to
me as it outputs most of the columns that are already given by
pg_get_wal_records_info.What I think the best way at this point is to
make it return the following:
lsn pg_lsn
block_id int8
spcOid oid
dbOid oid
relNumber oid
forkNames text
fpi bytea
fpi_info textI would add the length of the block data (without the hole and
compressed, as the FPI data should always be presented as
uncompressed), and the block data if any (without the block data
length as one can guess it based on the bytea data length). Note that
a block can have both a FPI and some data assigned to it, as far as I
recall.
+1
The basic idea is to create a single entrypoint to all relevant data
from DecodedXLogRecord in SQL, not multiple.While I would agree with this principle on simplicity's ground in
terms of minimizing the SQL interface and the pg_wal/ lookups, I
disagree about it on unsability ground, because we can avoid extra SQL
tweaks with more functions. One recent example I have in mind is
partitionfuncs.c, which can actually be achieved with a WITH RECURSIVE
on the catalogs. There are of course various degrees of complexity,
and perhaps unnest() cannot qualify as one, but having two functions
returning normalized records (one for the record information, and a
second for the block information), is a rather good balance between
usability and interface complexity, in my experience. If you have two
functions, a JOIN is enough to cross-check the block data and the
record data, while an unnest() heavily bloats the main function output
(aka byteas of FPIs in a single array).
FWIW, my initial thought about the proposal was similar to Matthias,
and tried a function that would convert (for simplicity) the block_ref
string to a json object. Although this approach did work, I was not
satisfied with its limited usability and poor performance (mainly the
poor performance is due to text->json conversion, though)..
Finally, I realized that the initial discomfort I experienced stemmed
from the name of the function, which suggests that it returns
information of "records". This discomfort would disappear if the
function were instead named pg_get_wal_blockref_info() or something
similar.
Thus I'm inclined to agree with Michael's suggestion of creating a new
normalized set-returning function that returns information of
"blocks".
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Tue, Mar 07, 2023 at 11:17:45AM +0900, Kyotaro Horiguchi wrote:
Thus I'm inclined to agree with Michael's suggestion of creating a new
normalized set-returning function that returns information of
"blocks".
Just to be clear here, I am not suggesting to add a new function for
only the block information, just a rename of the existing
pg_get_wal_fpi_info() to something like pg_get_wal_block_info() that
includes both the FPI (if any or NULL if none) and the block data (if
any or NULL is none) so as all of them are governed by the same lookup
at pg_wal/. The fpi information (aka compression type) is displayed
if there is a FPI in the block.
--
Michael
At Tue, 7 Mar 2023 14:44:49 +0900, Michael Paquier <michael@paquier.xyz> wrote in
On Tue, Mar 07, 2023 at 11:17:45AM +0900, Kyotaro Horiguchi wrote:
Thus I'm inclined to agree with Michael's suggestion of creating a new
normalized set-returning function that returns information of
"blocks".Just to be clear here, I am not suggesting to add a new function for
only the block information, just a rename of the existing
pg_get_wal_fpi_info() to something like pg_get_wal_block_info() that
includes both the FPI (if any or NULL if none) and the block data (if
any or NULL is none) so as all of them are governed by the same lookup
at pg_wal/. The fpi information (aka compression type) is displayed
if there is a FPI in the block.
Ah. Yes, that expansion sounds sensible.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Tue, Mar 07, 2023 at 03:49:02PM +0900, Kyotaro Horiguchi wrote:
Ah. Yes, that expansion sounds sensible.
Okay, so, based on this idea, I have hacked on this stuff and finish
with the attached that shows block data if it exists, as well as FPI
stuff if any. bimg_info is showed as a text[] for its flags.
I guess that I'd better add a test that shows correctly a record with
some block data attached to it, on top of the existing one for FPIs..
Any suggestions? Perhaps just a heap/heap2 record?
Thoughts?
--
Michael
Attachments:
0001-Rework-pg_walinspect-to-retrieve-more-block-informat.patchtext/x-diff; charset=us-asciiDownload
From 237be9d768c20f9c8fec0a11a8fcbee6c4e9bf8a Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Tue, 7 Mar 2023 16:06:28 +0900
Subject: [PATCH] Rework pg_walinspect to retrieve more block information
---
doc/src/sgml/pgwalinspect.sgml | 36 +++--
.../pg_walinspect/expected/pg_walinspect.out | 16 +--
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 16 ++-
contrib/pg_walinspect/pg_walinspect.c | 134 +++++++++++++-----
contrib/pg_walinspect/sql/pg_walinspect.sql | 16 +--
5 files changed, 145 insertions(+), 73 deletions(-)
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 3d7cdb95cc..3a65beb08d 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -190,31 +190,39 @@ combined_size_percentage | 2.8634072910530795
<varlistentry>
<term>
- <function>pg_get_wal_fpi_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
+ <function>pg_get_wal_block_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
</term>
<listitem>
<para>
- Gets a copy of full page images as <type>bytea</type> values (after
- applying decompression when necessary) and their information associated
- with all the valid WAL records between
+ Gets a copy of the block information stored in WAL records. This includes
+ copies of the block data (<literal>NULL</literal> if none) and full page
+ images as <type>bytea</type> values (after
+ applying decompression when necessary, or <literal>NULL</literal> if none)
+ and their information associated with all the valid WAL records between
<replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per full page image.
- If <replaceable>start_lsn</replaceable> or
+ <replaceable>end_lsn</replaceable>. Returns one row per block registered
+ in a WAL record. If <replaceable>start_lsn</replaceable> or
<replaceable>end_lsn</replaceable> are not yet available, the function
will raise an error. For example:
<screen>
-postgres=# SELECT lsn, reltablespace, reldatabase, relfilenode, relblocknumber,
- forkname, substring(fpi for 24) as fpi_trimmed
- FROM pg_get_wal_fpi_info('0/1801690', '0/1825C60');
+postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
+ relblocknumber, forkname,
+ substring(blockdata for 24) as block_trimmed,
+ substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
+ FROM pg_get_wal_fpi_info('0/20BC498', '0/20BCD80');
-[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/1807E20
+lsn | 0/20BC498
+blockid | 0
reltablespace | 1663
-reldatabase | 5
-relfilenode | 16396
-relblocknumber | 43
+reldatabase | 16384
+relfilenode | 24576
+relblocknumber | 44
forkname | main
-fpi_trimmed | \x00000000b89e660100000000a003c0030020042000000000
+block_trimmed | null
+fpi_trimmed | \x0000000018ed0a02000000000401a0180020042000000000
+fpilen | 2148
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
</listitem>
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index 9bcb05354e..b1682582e8 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -74,7 +74,7 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2'
(1 row)
-- ===================================================================
--- Tests to get full page image (FPI) from WAL record
+-- Tests to get block information from WAL record
-- ===================================================================
SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
-- Force FPI on the next update.
@@ -83,8 +83,8 @@ CHECKPOINT;
UPDATE sample_tbl SET col1 = col1 * 100 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
-- Check if we get FPI from WAL record.
-SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_fpi_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid';
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
ok
----
t
@@ -116,7 +116,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
has_function_privilege
------------------------
f
@@ -146,7 +146,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -160,7 +160,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_record_info(pg_lsn)', 'EXECUTE'); -- yes
@@ -184,7 +184,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -196,7 +196,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
-- ===================================================================
-- Clean up
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 1e9e1e6115..f6f9c00281 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -4,21 +4,25 @@
\echo Use "ALTER EXTENSION pg_walinspect UPDATE TO '1.1'" to load this file. \quit
--
--- pg_get_wal_fpi_info()
+-- pg_get_wal_block_info()
--
-CREATE FUNCTION pg_get_wal_fpi_info(IN start_lsn pg_lsn,
+CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
OUT lsn pg_lsn,
+ OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
OUT forkname text,
- OUT fpi bytea
+ OUT blockdata bytea,
+ OUT fpi bytea,
+ OUT fpilen int,
+ OUT fpiinfo text[]
)
RETURNS SETOF record
-AS 'MODULE_PATHNAME', 'pg_get_wal_fpi_info'
+AS 'MODULE_PATHNAME', 'pg_get_wal_block_info'
LANGUAGE C STRICT PARALLEL SAFE;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) FROM PUBLIC;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) TO pg_read_server_files;
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) TO pg_read_server_files;
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index b7b0a805ee..7cc3260022 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -20,6 +20,7 @@
#include "access/xlogutils.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
@@ -30,7 +31,7 @@
PG_MODULE_MAGIC;
-PG_FUNCTION_INFO_V1(pg_get_wal_fpi_info);
+PG_FUNCTION_INFO_V1(pg_get_wal_block_info);
PG_FUNCTION_INFO_V1(pg_get_wal_record_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info_till_end_of_wal);
@@ -56,7 +57,7 @@ static void FillXLogStatsRow(const char *name, uint64 n, uint64 total_count,
Datum *values, bool *nulls, uint32 ncols);
static void GetWalStats(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
XLogRecPtr end_lsn, bool stats_per_record);
-static void GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
+static void GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
/*
* Check if the given LSN is in future. Also, return the LSN up to which the
@@ -221,49 +222,41 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of full page images from a single record.
+ * Store a set of block information from a single record (FPI and block
+ * information).
*/
static void
-GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
+GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_FPI_INFO_COLS 7
+#define PG_GET_WAL_BLOCK_INFO_COLS 11
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
- PGAlignedBlock buf;
- Page page;
- bytea *raw_page;
+ DecodedBkpBlock *block;
BlockNumber blk;
RelFileLocator rnode;
ForkNumber fork;
- Datum values[PG_GET_WAL_FPI_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_FPI_INFO_COLS] = {0};
+ Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
int i = 0;
if (!XLogRecHasBlockRef(record, block_id))
continue;
- if (!XLogRecHasBlockImage(record, block_id))
- continue;
-
- page = (Page) buf.data;
-
- if (!RestoreBlockImage(record, block_id, page))
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg_internal("%s", record->errormsg_buf)));
+ block = XLogRecGetBlock(record, block_id);
/* Full page exists, so let's save it. */
(void) XLogRecGetBlockTagExtended(record, block_id,
&rnode, &fork, &blk, NULL);
values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = ObjectIdGetDatum(rnode.spcOid);
- values[i++] = ObjectIdGetDatum(rnode.dbOid);
- values[i++] = ObjectIdGetDatum(rnode.relNumber);
- values[i++] = Int64GetDatum((int64) blk);
+ values[i++] = Int16GetDatum(block_id);
+ values[i++] = ObjectIdGetDatum(block->rlocator.spcOid);
+ values[i++] = ObjectIdGetDatum(block->rlocator.dbOid);
+ values[i++] = ObjectIdGetDatum(block->rlocator.relNumber);
+ values[i++] = Int64GetDatum((int64) block->blkno);
if (fork >= 0 && fork <= MAX_FORKNUM)
values[i++] = CStringGetTextDatum(forkNames[fork]);
@@ -272,34 +265,101 @@ GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg_internal("invalid fork number: %u", fork)));
- /* Initialize bytea buffer to copy the FPI to. */
- raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
- SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+ /* Block data */
+ if (block->has_data)
+ {
+ bytea *raw_data;
- /* Take a verbatim copy of the FPI. */
- memcpy(VARDATA(raw_page), page, BLCKSZ);
+ /* Initialize bytea buffer to copy the data to. */
+ raw_data = (bytea *) palloc(block->data_len + VARHDRSZ);
+ SET_VARSIZE(raw_data, block->data_len + VARHDRSZ);
- values[i++] = PointerGetDatum(raw_page);
+ /* Copy the data */
+ memcpy(VARDATA(raw_data), block->data, block->data_len);
+ values[i++] = PointerGetDatum(raw_data);
+ }
+ else
+ {
+ /* no data, so set this field to NULL */
+ nulls[i++] = true;
+ }
- Assert(i == PG_GET_WAL_FPI_INFO_COLS);
+ /* Full-page image */
+ if (block->has_image)
+ {
+ PGAlignedBlock buf;
+ Page page;
+ bytea *raw_page;
+ int bitcnt;
+ int cnt = 0;
+ Datum *flags;
+ ArrayType *a;
+
+ page = (Page) buf.data;
+
+ if (!RestoreBlockImage(record, block_id, page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg_internal("%s", record->errormsg_buf)));
+
+ /* Initialize bytea buffer to copy the FPI to. */
+ raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+ SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+
+ /* Take a verbatim copy of the FPI. */
+ memcpy(VARDATA(raw_page), page, BLCKSZ);
+
+ values[i++] = PointerGetDatum(raw_page);
+ values[i++] = Int32GetDatum(block->bimg_len);
+
+ /* FPI flags */
+ bitcnt = pg_popcount((const char *) &block->bimg_info,
+ sizeof(uint8));
+ /* build set of raw flags */
+ flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
+ if ((block->bimg_info & BKPIMAGE_HAS_HOLE) != 0)
+ flags[cnt++] = CStringGetTextDatum("HAS_HOLE");
+ if ((block->bimg_info & BKPIMAGE_APPLY) != 0)
+ flags[cnt++] = CStringGetTextDatum("APPLY");
+ if ((block->bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_PGLZ");
+ if ((block->bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_LZ4");
+ if ((block->bimg_info & BKPIMAGE_COMPRESS_ZSTD) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_ZSTD");
+
+ Assert(cnt <= bitcnt);
+ a = construct_array_builtin(flags, cnt, TEXTOID);
+ values[i++] = PointerGetDatum(a);
+ }
+ else
+ {
+ /* No full page image, so store NULLs for all its fields */
+ memset(&nulls[i], true, 3 * sizeof(bool));
+ i += 3;
+ }
+
+ Assert(i == PG_GET_WAL_BLOCK_INFO_COLS);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
}
-#undef PG_GET_WAL_FPI_INFO_COLS
+#undef PG_GET_WAL_FPI_BLOCK_COLS
}
/*
- * Get full page images with their relation information for all the WAL
- * records between start and end LSNs. Decompression is applied to the
- * blocks, if necessary.
+ * Get information about all the blocks saved in WAL records between start
+ * and end LSNs. This produces information about the full page images with
+ * their relation information, and the data saved in each block associated
+ * to a record. Decompression is applied to the full-page images, if
+ * necessary.
*
* This function emits an error if a future start or end WAL LSN i.e. WAL LSN
* the database system doesn't know about is specified.
*/
Datum
-pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
+pg_get_wal_block_info(PG_FUNCTION_ARGS)
{
XLogRecPtr start_lsn;
XLogRecPtr end_lsn;
@@ -317,7 +377,7 @@ pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
xlogreader = InitXLogReaderState(start_lsn);
tmp_cxt = AllocSetContextCreate(CurrentMemoryContext,
- "pg_get_wal_fpi_info temporary cxt",
+ "pg_get_wal_block_info temporary cxt",
ALLOCSET_DEFAULT_SIZES);
while (ReadNextXLogRecord(xlogreader) &&
@@ -326,7 +386,7 @@ pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
- GetWALFPIInfo(fcinfo, xlogreader);
+ GetWALBlockInfo(fcinfo, xlogreader);
/* clean up and switch back */
MemoryContextSwitchTo(old_cxt);
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 849201a1f8..10988b4782 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -53,7 +53,7 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2'
WHERE resource_manager = 'Heap' AND record_type = 'INSERT';
-- ===================================================================
--- Tests to get full page image (FPI) from WAL record
+-- Tests to get block information from WAL record
-- ===================================================================
SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
@@ -65,8 +65,8 @@ UPDATE sample_tbl SET col1 = col1 * 100 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
-- Check if we get FPI from WAL record.
-SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_fpi_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid';
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
-- ===================================================================
-- Tests for permissions
@@ -83,7 +83,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- no
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
-- Functions accessible by users with role pg_read_server_files
@@ -99,7 +99,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
REVOKE pg_read_server_files FROM regress_pg_walinspect;
@@ -113,7 +113,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
@@ -126,7 +126,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
REVOKE EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn)
FROM regress_pg_walinspect;
@@ -137,7 +137,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
-- ===================================================================
--
2.39.2
At Tue, 7 Mar 2023 16:18:21 +0900, Michael Paquier <michael@paquier.xyz> wrote in
On Tue, Mar 07, 2023 at 03:49:02PM +0900, Kyotaro Horiguchi wrote:
Ah. Yes, that expansion sounds sensible.
Okay, so, based on this idea, I have hacked on this stuff and finish
with the attached that shows block data if it exists, as well as FPI
stuff if any. bimg_info is showed as a text[] for its flags.
# The naming convetion looks inconsistent between
# pg_get_wal_records_info and pg_get_wal_block_info but it's not an
# issue of this patch..
I guess that I'd better add a test that shows correctly a record with
some block data attached to it, on top of the existing one for FPIs..
Any suggestions? Perhaps just a heap/heap2 record?Thoughts?
I thought that we needed a test for block data when I saw the patch.
I don't have great idea but a single insert should work.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Tue, Mar 7, 2023 at 12:48 PM Michael Paquier <michael@paquier.xyz> wrote:
On Tue, Mar 07, 2023 at 03:49:02PM +0900, Kyotaro Horiguchi wrote:
Ah. Yes, that expansion sounds sensible.
Okay, so, based on this idea, I have hacked on this stuff and finish
with the attached that shows block data if it exists, as well as FPI
stuff if any. bimg_info is showed as a text[] for its flags.
+1.
I guess that I'd better add a test that shows correctly a record with
some block data attached to it, on top of the existing one for FPIs..
Any suggestions? Perhaps just a heap/heap2 record?Thoughts?
That would be a lot better. Not just the test, but also the
documentation can have it. Simple way to generate such a record (both
block data and FPI) is to just change the wal_level to logical in
walinspect.conf [1], see code around REGBUF_KEEP_DATA and
RelationIsLogicallyLogged in heapam.c
I had the following comments and fixed them in the attached v2 patch:
1. Still a trace of pg_get_wal_fpi_info in docs, removed it.
2. Used int4 instead of int for fpilen just to be in sync with
fpi_length of pg_get_wal_record_info.
3. Changed to be consistent and use just FPI or "F/full page".
/* FPI flags */
/* No full page image, so store NULLs for all its fields */
/* Full-page image */
/* Full page exists, so let's save it. */
* and end LSNs. This produces information about the full page images with
* to a record. Decompression is applied to the full-page images, if
4. I think we need to free raw_data, raw_page and flags as we loop
over multiple blocks (XLR_MAX_BLOCK_ID) and will leak memory which can
be a problem if we have many blocks assocated with a single WAL
record.
flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
Also, we will leak all CStringGetTextDatum memory in the block_id for loop.
Another way is to use and reset temp memory context in the for loop
over block_ids. I prefer this approach over multiple pfree()s in
block_id for loop.
5. I think it'd be good to say if the FPI is for WAL_VERIFICATION, so
I changed it to the following. Feel free to ignore this if you think
it's not required.
if (blk->apply_image)
flags[cnt++] = CStringGetTextDatum("APPLY");
else
flags[cnt++] = CStringGetTextDatum("WAL_VERIFICATION");
6. Did minor wordsmithing.
7. Added test case which shows both block data and fpi in the documentation.
8. Changed wal_level to logical in walinspect.conf to test case with block data.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v2-0001-Rework-pg_walinspect-to-retrieve-more-block-infor.patchapplication/octet-stream; name=v2-0001-Rework-pg_walinspect-to-retrieve-more-block-infor.patchDownload
From 1472b08f64acb398e2c9bf830939719ca8d722ea Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Tue, 7 Mar 2023 09:41:22 +0000
Subject: [PATCH v2] Rework pg_walinspect to retrieve more block information
---
.../pg_walinspect/expected/pg_walinspect.out | 24 ++-
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 16 +-
contrib/pg_walinspect/pg_walinspect.c | 168 ++++++++++++------
contrib/pg_walinspect/sql/pg_walinspect.sql | 20 ++-
contrib/pg_walinspect/walinspect.conf | 2 +-
doc/src/sgml/pgwalinspect.sgml | 48 +++--
6 files changed, 189 insertions(+), 89 deletions(-)
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index 9bcb05354e..413046a7e3 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -74,7 +74,7 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2'
(1 row)
-- ===================================================================
--- Tests to get full page image (FPI) from WAL record
+-- Tests to get block information from WAL record
-- ===================================================================
SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
-- Force FPI on the next update.
@@ -82,9 +82,17 @@ CHECKPOINT;
-- Update table to generate an FPI.
UPDATE sample_tbl SET col1 = col1 * 100 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
+-- Check if we get block data from WAL record.
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ ok
+----
+ t
+(1 row)
+
-- Check if we get FPI from WAL record.
-SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_fpi_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid';
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
ok
----
t
@@ -116,7 +124,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
has_function_privilege
------------------------
f
@@ -146,7 +154,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -160,7 +168,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_record_info(pg_lsn)', 'EXECUTE'); -- yes
@@ -184,7 +192,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -196,7 +204,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
-- ===================================================================
-- Clean up
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 1e9e1e6115..e674ef25aa 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -4,21 +4,25 @@
\echo Use "ALTER EXTENSION pg_walinspect UPDATE TO '1.1'" to load this file. \quit
--
--- pg_get_wal_fpi_info()
+-- pg_get_wal_block_info()
--
-CREATE FUNCTION pg_get_wal_fpi_info(IN start_lsn pg_lsn,
+CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
OUT lsn pg_lsn,
+ OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
OUT forkname text,
- OUT fpi bytea
+ OUT blockdata bytea,
+ OUT fpi bytea,
+ OUT fpilen int4,
+ OUT fpiinfo text[]
)
RETURNS SETOF record
-AS 'MODULE_PATHNAME', 'pg_get_wal_fpi_info'
+AS 'MODULE_PATHNAME', 'pg_get_wal_block_info'
LANGUAGE C STRICT PARALLEL SAFE;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) FROM PUBLIC;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) TO pg_read_server_files;
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) TO pg_read_server_files;
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index b7b0a805ee..4902d8d7d3 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -20,6 +20,7 @@
#include "access/xlogutils.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
@@ -30,7 +31,7 @@
PG_MODULE_MAGIC;
-PG_FUNCTION_INFO_V1(pg_get_wal_fpi_info);
+PG_FUNCTION_INFO_V1(pg_get_wal_block_info);
PG_FUNCTION_INFO_V1(pg_get_wal_record_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info_till_end_of_wal);
@@ -56,7 +57,7 @@ static void FillXLogStatsRow(const char *name, uint64 n, uint64 total_count,
Datum *values, bool *nulls, uint32 ncols);
static void GetWalStats(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
XLogRecPtr end_lsn, bool stats_per_record);
-static void GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
+static void GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
/*
* Check if the given LSN is in future. Also, return the LSN up to which the
@@ -221,49 +222,49 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of full page images from a single record.
+ * Store a set of block information from a single record (FPI and block
+ * information).
*/
static void
-GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
+GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_FPI_INFO_COLS 7
+#define PG_GET_WAL_BLOCK_INFO_COLS 11
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ MemoryContext old_cxt;
+ MemoryContext tmp_cxt;
+
+ tmp_cxt = AllocSetContextCreate(CurrentMemoryContext,
+ "GetWALBlockInfo temporary cxt",
+ ALLOCSET_DEFAULT_SIZES);
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
- PGAlignedBlock buf;
- Page page;
- bytea *raw_page;
- BlockNumber blk;
+ DecodedBkpBlock *blk;
+ BlockNumber blkno;
RelFileLocator rnode;
ForkNumber fork;
- Datum values[PG_GET_WAL_FPI_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_FPI_INFO_COLS] = {0};
+ Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
int i = 0;
if (!XLogRecHasBlockRef(record, block_id))
continue;
- if (!XLogRecHasBlockImage(record, block_id))
- continue;
-
- page = (Page) buf.data;
+ /* Use the tmp context so we can clean up after each tuple is done */
+ old_cxt = MemoryContextSwitchTo(tmp_cxt);
- if (!RestoreBlockImage(record, block_id, page))
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg_internal("%s", record->errormsg_buf)));
+ blk = XLogRecGetBlock(record, block_id);
- /* Full page exists, so let's save it. */
(void) XLogRecGetBlockTagExtended(record, block_id,
- &rnode, &fork, &blk, NULL);
+ &rnode, &fork, &blkno, NULL);
values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = ObjectIdGetDatum(rnode.spcOid);
- values[i++] = ObjectIdGetDatum(rnode.dbOid);
- values[i++] = ObjectIdGetDatum(rnode.relNumber);
- values[i++] = Int64GetDatum((int64) blk);
+ values[i++] = Int16GetDatum(block_id);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
+ values[i++] = Int64GetDatum((int64) blkno);
if (fork >= 0 && fork <= MAX_FORKNUM)
values[i++] = CStringGetTextDatum(forkNames[fork]);
@@ -272,40 +273,115 @@ GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg_internal("invalid fork number: %u", fork)));
- /* Initialize bytea buffer to copy the FPI to. */
- raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
- SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+ /* Block data */
+ if (blk->has_data)
+ {
+ bytea *raw_data;
- /* Take a verbatim copy of the FPI. */
- memcpy(VARDATA(raw_page), page, BLCKSZ);
+ /* Initialize bytea buffer to copy the data to */
+ raw_data = (bytea *) palloc(blk->data_len + VARHDRSZ);
+ SET_VARSIZE(raw_data, blk->data_len + VARHDRSZ);
- values[i++] = PointerGetDatum(raw_page);
+ /* Copy the data */
+ memcpy(VARDATA(raw_data), blk->data, blk->data_len);
+ values[i++] = PointerGetDatum(raw_data);
+ }
+ else
+ {
+ /* No data, so set this field to NULL */
+ nulls[i++] = true;
+ }
- Assert(i == PG_GET_WAL_FPI_INFO_COLS);
+ if (blk->has_image)
+ {
+ PGAlignedBlock buf;
+ Page page;
+ bytea *raw_page;
+ int bitcnt;
+ int cnt = 0;
+ Datum *flags;
+ ArrayType *a;
+
+ page = (Page) buf.data;
+
+ /* Full page image exists, so let's save it */
+ if (!RestoreBlockImage(record, block_id, page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg_internal("%s", record->errormsg_buf)));
+
+ /* Initialize bytea buffer to copy the FPI to */
+ raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+ SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+
+ /* Take a verbatim copy of the FPI */
+ memcpy(VARDATA(raw_page), page, BLCKSZ);
+
+ values[i++] = PointerGetDatum(raw_page);
+ values[i++] = UInt32GetDatum(blk->bimg_len);
+
+ /* FPI flags */
+ bitcnt = pg_popcount((const char *) &blk->bimg_info,
+ sizeof(uint8));
+ /* Build set of raw flags */
+ flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
+ if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) != 0)
+ flags[cnt++] = CStringGetTextDatum("HAS_HOLE");
+
+ if (blk->apply_image)
+ flags[cnt++] = CStringGetTextDatum("APPLY");
+ else
+ flags[cnt++] = CStringGetTextDatum("WAL_VERIFICATION");
+
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_PGLZ");
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_LZ4");
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_ZSTD) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_ZSTD");
+
+ Assert(cnt <= bitcnt);
+ a = construct_array_builtin(flags, cnt, TEXTOID);
+ values[i++] = PointerGetDatum(a);
+ }
+ else
+ {
+ /* No full page image, so store NULLs for all its fields */
+ memset(&nulls[i], true, 3 * sizeof(bool));
+ i += 3;
+ }
+
+ Assert(i == PG_GET_WAL_BLOCK_INFO_COLS);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
+
+ /* clean up and switch back */
+ MemoryContextSwitchTo(old_cxt);
+ MemoryContextReset(tmp_cxt);
}
-#undef PG_GET_WAL_FPI_INFO_COLS
+ MemoryContextDelete(tmp_cxt);
+
+#undef PG_GET_WAL_FPI_BLOCK_COLS
}
/*
- * Get full page images with their relation information for all the WAL
- * records between start and end LSNs. Decompression is applied to the
- * blocks, if necessary.
+ * Get information about all the blocks saved in WAL records between start
+ * and end LSNs. This produces information about the full page images with
+ * their relation information, and the data saved in each block associated
+ * to a record. Decompression is applied to the full page images, if
+ * necessary.
*
* This function emits an error if a future start or end WAL LSN i.e. WAL LSN
* the database system doesn't know about is specified.
*/
Datum
-pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
+pg_get_wal_block_info(PG_FUNCTION_ARGS)
{
XLogRecPtr start_lsn;
XLogRecPtr end_lsn;
XLogReaderState *xlogreader;
- MemoryContext old_cxt;
- MemoryContext tmp_cxt;
start_lsn = PG_GETARG_LSN(0);
end_lsn = PG_GETARG_LSN(1);
@@ -316,26 +392,14 @@ pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
xlogreader = InitXLogReaderState(start_lsn);
- tmp_cxt = AllocSetContextCreate(CurrentMemoryContext,
- "pg_get_wal_fpi_info temporary cxt",
- ALLOCSET_DEFAULT_SIZES);
-
while (ReadNextXLogRecord(xlogreader) &&
xlogreader->EndRecPtr <= end_lsn)
{
- /* Use the tmp context so we can clean up after each tuple is done */
- old_cxt = MemoryContextSwitchTo(tmp_cxt);
-
- GetWALFPIInfo(fcinfo, xlogreader);
-
- /* clean up and switch back */
- MemoryContextSwitchTo(old_cxt);
- MemoryContextReset(tmp_cxt);
+ GetWALBlockInfo(fcinfo, xlogreader);
CHECK_FOR_INTERRUPTS();
}
- MemoryContextDelete(tmp_cxt);
pfree(xlogreader->private_data);
XLogReaderFree(xlogreader);
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 849201a1f8..9ad0e29a5c 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -53,7 +53,7 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2'
WHERE resource_manager = 'Heap' AND record_type = 'INSERT';
-- ===================================================================
--- Tests to get full page image (FPI) from WAL record
+-- Tests to get block information from WAL record
-- ===================================================================
SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
@@ -64,9 +64,13 @@ CHECKPOINT;
UPDATE sample_tbl SET col1 = col1 * 100 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
+-- Check if we get block data from WAL record.
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+
-- Check if we get FPI from WAL record.
-SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_fpi_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid';
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
-- ===================================================================
-- Tests for permissions
@@ -83,7 +87,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- no
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
-- Functions accessible by users with role pg_read_server_files
@@ -99,7 +103,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
REVOKE pg_read_server_files FROM regress_pg_walinspect;
@@ -113,7 +117,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
@@ -126,7 +130,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
REVOKE EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn)
FROM regress_pg_walinspect;
@@ -137,7 +141,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
-- ===================================================================
diff --git a/contrib/pg_walinspect/walinspect.conf b/contrib/pg_walinspect/walinspect.conf
index 67ceb2bb07..367f706651 100644
--- a/contrib/pg_walinspect/walinspect.conf
+++ b/contrib/pg_walinspect/walinspect.conf
@@ -1,2 +1,2 @@
-wal_level = replica
+wal_level = logical
max_replication_slots = 4
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 3d7cdb95cc..d26115d6f2 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -190,31 +190,51 @@ combined_size_percentage | 2.8634072910530795
<varlistentry>
<term>
- <function>pg_get_wal_fpi_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
+ <function>pg_get_wal_block_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
</term>
<listitem>
<para>
- Gets a copy of full page images as <type>bytea</type> values (after
- applying decompression when necessary) and their information associated
- with all the valid WAL records between
+ Gets a copy of the block information stored in WAL records. This includes
+ copies of the block data (<literal>NULL</literal> if none) and full page
+ images as <type>bytea</type> values (after
+ applying decompression when necessary, or <literal>NULL</literal> if none)
+ and their information associated with all the valid WAL records between
<replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per full page image.
- If <replaceable>start_lsn</replaceable> or
+ <replaceable>end_lsn</replaceable>. Returns one row per block registered
+ in a WAL record. If <replaceable>start_lsn</replaceable> or
<replaceable>end_lsn</replaceable> are not yet available, the function
will raise an error. For example:
<screen>
-postgres=# SELECT lsn, reltablespace, reldatabase, relfilenode, relblocknumber,
- forkname, substring(fpi for 24) as fpi_trimmed
- FROM pg_get_wal_fpi_info('0/1801690', '0/1825C60');
+postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
+ relblocknumber, forkname,
+ substring(blockdata for 24) as block_trimmed,
+ substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
+ FROM pg_get_wal_block_info('0/1871080', '0/1871440');
-[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/1807E20
+lsn | 0/1871208
+blockid | 0
reltablespace | 1663
-reldatabase | 5
-relfilenode | 16396
-relblocknumber | 43
+reldatabase | 16384
+relfilenode | 1255
+relblocknumber | 96
forkname | main
-fpi_trimmed | \x00000000b89e660100000000a003c0030020042000000000
+block_trimmed | \x0700080009000a00
+fpi_trimmed |
+fpilen |
+fpiinfo |
+-[ RECORD 2 ]--+---------------------------------------------------
+lsn | 0/18712F8
+blockid | 0
+reltablespace | 1663
+reldatabase | 16384
+relfilenode | 16392
+relblocknumber | 0
+forkname | main
+block_trimmed | \x02800128180164000000
+fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
</listitem>
--
2.34.1
On Tue, 7 Mar 2023 at 01:34, Michael Paquier <michael@paquier.xyz> wrote:
On Mon, Mar 06, 2023 at 04:08:28PM +0100, Matthias van de Meent wrote:
On Mon, 6 Mar 2023 at 15:40, Bharath Rupireddy
IMO, pg_get_wal_records_extended_info as proposed doesn't look good to
me as it outputs most of the columns that are already given by
pg_get_wal_records_info.What I think the best way at this point is to
make it return the following:
lsn pg_lsn
block_id int8
spcOid oid
dbOid oid
relNumber oid
forkNames text
fpi bytea
fpi_info textI would add the length of the block data (without the hole and
compressed, as the FPI data should always be presented as
uncompressed), and the block data if any (without the block data
length as one can guess it based on the bytea data length). Note that
a block can have both a FPI and some data assigned to it, as far as I
recall.The basic idea is to create a single entrypoint to all relevant data
from DecodedXLogRecord in SQL, not multiple.While I would agree with this principle on simplicity's ground in
terms of minimizing the SQL interface and the pg_wal/ lookups, I
disagree about it on unsability ground, because we can avoid extra SQL
tweaks with more functions. One recent example I have in mind is
partitionfuncs.c, which can actually be achieved with a WITH RECURSIVE
on the catalogs.
Correct, but in that case the user would build the same query (or at
least with the same complexity) as what we're executing under the
hood, right?
There are of course various degrees of complexity,
and perhaps unnest() cannot qualify as one, but having two functions
returning normalized records (one for the record information, and a
second for the block information), is a rather good balance between
usability and interface complexity, in my experience.
I would agree, if it weren't for the reasons written below.
If you have two
functions, a JOIN is enough to cross-check the block data and the
record data,
Joins are expensive on large datasets; and because WAL is one of the
largest datasets in the system, why would we want to force the user to
JOIN them if we can produce the data in one pre-baked data structure
without a need to join?
while an unnest() heavily bloats the main function output
(aka byteas of FPIs in a single array).
I don't see how that would be bad. You can select a subset of columns
without much issue, which can allow you to ignore any and all bloat.
It is also not easy to imagine that we'd have arguments in the
function that determine whether it includes the largest fields (main
data, blocks, block data, and block images) or leaves them NULL so
that we need to pass less data around if the user doesn't want the
data.
Matthias van de Meent
On Tue, Mar 07, 2023 at 03:56:22PM +0530, Bharath Rupireddy wrote:
That would be a lot better. Not just the test, but also the
documentation can have it. Simple way to generate such a record (both
block data and FPI) is to just change the wal_level to logical in
walinspect.conf [1], see code around REGBUF_KEEP_DATA and
RelationIsLogicallyLogged in heapam.c
I don't agree that we need to go down to wal_level=logical for this.
The important part is to check that the non-NULL and NULL paths for
the block data and FPI data are both taken, making 4 paths to check.
So we need two tests at minimum, which would be either:
- One SQL generating no FPI with no block data and a second generating
a FPI with block data. v2 was doing that but did not cover the first
case.
- One SQL generating a FPI with no block data and a second generating
no FPI with block data.
So let's just geenrate a heap record on an UPDATE, for example, like
in the version attached.
2. Used int4 instead of int for fpilen just to be in sync with
fpi_length of pg_get_wal_record_info.
Okay.
3. Changed to be consistent and use just FPI or "F/full page".
/* FPI flags */
/* No full page image, so store NULLs for all its fields */
/* Full-page image */
/* Full page exists, so let's save it. */
* and end LSNs. This produces information about the full page images with
* to a record. Decompression is applied to the full-page images, if
Fine by me.
4. I think we need to free raw_data, raw_page and flags as we loop
over multiple blocks (XLR_MAX_BLOCK_ID) and will leak memory which can
be a problem if we have many blocks assocated with a single WAL
record.
flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
Also, we will leak all CStringGetTextDatum memory in the block_id for loop.
Another way is to use and reset temp memory context in the for loop
over block_ids. I prefer this approach over multiple pfree()s in
block_id for loop.
I disagree, this was on purpose in the last version. This version
finishes by calling AllocSetContextCreate() and MemoryContextDelete()
once per *record*, which will not be free, and we are arguing about
resetting the memory context after scanning up to XLR_MAX_BLOCK_ID
blocks, or 32 blocks which would go up to 32kB per page in the worst
case. That's not going to matter in a large scan for each record, but
the extra AllocSet*() calls could. And we basically do the same thing
on HEAD.
5. I think it'd be good to say if the FPI is for WAL_VERIFICATION, so
I changed it to the following. Feel free to ignore this if you think
it's not required.
if (blk->apply_image)
flags[cnt++] = CStringGetTextDatum("APPLY");
else
flags[cnt++] = CStringGetTextDatum("WAL_VERIFICATION");
Disagreed here as well. WAL_VERIFICATION does not map with any of the
internal flags, and actually it may be finished by not being used
at replay if the LSN of the page read if higher than what the WAL
stores.
7. Added test case which shows both block data and fpi in the
documentation.
Okay on that.
8. Changed wal_level to logical in walinspect.conf to test case with block data.
This change is not necessary, per the argument above.
Any comments?
--
Michael
Attachments:
v3-0001-Rework-pg_walinspect-to-retrieve-more-block-infor.patchtext/x-diff; charset=us-asciiDownload
From 8a2bc9ab2c4d28f37f28b93ad9713fbd8ecb9dd3 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Wed, 8 Mar 2023 16:20:33 +0900
Subject: [PATCH v3] Rework pg_walinspect to retrieve more block information
---
doc/src/sgml/pgwalinspect.sgml | 48 ++++--
.../pg_walinspect/expected/pg_walinspect.out | 35 +++--
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 16 +-
contrib/pg_walinspect/pg_walinspect.c | 140 +++++++++++++-----
contrib/pg_walinspect/sql/pg_walinspect.sql | 33 +++--
5 files changed, 186 insertions(+), 86 deletions(-)
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 3d7cdb95cc..d26115d6f2 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -190,31 +190,51 @@ combined_size_percentage | 2.8634072910530795
<varlistentry>
<term>
- <function>pg_get_wal_fpi_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
+ <function>pg_get_wal_block_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
</term>
<listitem>
<para>
- Gets a copy of full page images as <type>bytea</type> values (after
- applying decompression when necessary) and their information associated
- with all the valid WAL records between
+ Gets a copy of the block information stored in WAL records. This includes
+ copies of the block data (<literal>NULL</literal> if none) and full page
+ images as <type>bytea</type> values (after
+ applying decompression when necessary, or <literal>NULL</literal> if none)
+ and their information associated with all the valid WAL records between
<replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per full page image.
- If <replaceable>start_lsn</replaceable> or
+ <replaceable>end_lsn</replaceable>. Returns one row per block registered
+ in a WAL record. If <replaceable>start_lsn</replaceable> or
<replaceable>end_lsn</replaceable> are not yet available, the function
will raise an error. For example:
<screen>
-postgres=# SELECT lsn, reltablespace, reldatabase, relfilenode, relblocknumber,
- forkname, substring(fpi for 24) as fpi_trimmed
- FROM pg_get_wal_fpi_info('0/1801690', '0/1825C60');
+postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
+ relblocknumber, forkname,
+ substring(blockdata for 24) as block_trimmed,
+ substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
+ FROM pg_get_wal_block_info('0/1871080', '0/1871440');
-[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/1807E20
+lsn | 0/1871208
+blockid | 0
reltablespace | 1663
-reldatabase | 5
-relfilenode | 16396
-relblocknumber | 43
+reldatabase | 16384
+relfilenode | 1255
+relblocknumber | 96
forkname | main
-fpi_trimmed | \x00000000b89e660100000000a003c0030020042000000000
+block_trimmed | \x0700080009000a00
+fpi_trimmed |
+fpilen |
+fpiinfo |
+-[ RECORD 2 ]--+---------------------------------------------------
+lsn | 0/18712F8
+blockid | 0
+reltablespace | 1663
+reldatabase | 16384
+relfilenode | 16392
+relblocknumber | 0
+forkname | main
+block_trimmed | \x02800128180164000000
+fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
</listitem>
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index 9bcb05354e..e0eb7ca08f 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -74,17 +74,28 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2'
(1 row)
-- ===================================================================
--- Tests to get full page image (FPI) from WAL record
+-- Tests to get block information from WAL record
-- ===================================================================
+-- Update table to generate some block data
SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
--- Force FPI on the next update.
-CHECKPOINT;
--- Update table to generate an FPI.
-UPDATE sample_tbl SET col1 = col1 * 100 WHERE col1 = 1;
+UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
+-- Check if we get block data from WAL record.
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ ok
+----
+ t
+(1 row)
+
+-- Force full-page image on the next update.
+SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
+CHECKPOINT;
+UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
+SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
-SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_fpi_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid';
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
ok
----
t
@@ -116,7 +127,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
has_function_privilege
------------------------
f
@@ -146,7 +157,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -160,7 +171,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_record_info(pg_lsn)', 'EXECUTE'); -- yes
@@ -184,7 +195,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -196,7 +207,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
-- ===================================================================
-- Clean up
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 1e9e1e6115..e674ef25aa 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -4,21 +4,25 @@
\echo Use "ALTER EXTENSION pg_walinspect UPDATE TO '1.1'" to load this file. \quit
--
--- pg_get_wal_fpi_info()
+-- pg_get_wal_block_info()
--
-CREATE FUNCTION pg_get_wal_fpi_info(IN start_lsn pg_lsn,
+CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
OUT lsn pg_lsn,
+ OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
OUT forkname text,
- OUT fpi bytea
+ OUT blockdata bytea,
+ OUT fpi bytea,
+ OUT fpilen int4,
+ OUT fpiinfo text[]
)
RETURNS SETOF record
-AS 'MODULE_PATHNAME', 'pg_get_wal_fpi_info'
+AS 'MODULE_PATHNAME', 'pg_get_wal_block_info'
LANGUAGE C STRICT PARALLEL SAFE;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) FROM PUBLIC;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) TO pg_read_server_files;
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) TO pg_read_server_files;
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index b7b0a805ee..ee88dc4992 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -20,6 +20,7 @@
#include "access/xlogutils.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
@@ -30,7 +31,7 @@
PG_MODULE_MAGIC;
-PG_FUNCTION_INFO_V1(pg_get_wal_fpi_info);
+PG_FUNCTION_INFO_V1(pg_get_wal_block_info);
PG_FUNCTION_INFO_V1(pg_get_wal_record_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info_till_end_of_wal);
@@ -56,7 +57,7 @@ static void FillXLogStatsRow(const char *name, uint64 n, uint64 total_count,
Datum *values, bool *nulls, uint32 ncols);
static void GetWalStats(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
XLogRecPtr end_lsn, bool stats_per_record);
-static void GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
+static void GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
/*
* Check if the given LSN is in future. Also, return the LSN up to which the
@@ -221,49 +222,40 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of full page images from a single record.
+ * Store a set of block information from a single record (FPI and block
+ * information).
*/
static void
-GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
+GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_FPI_INFO_COLS 7
+#define PG_GET_WAL_BLOCK_INFO_COLS 11
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
- PGAlignedBlock buf;
- Page page;
- bytea *raw_page;
- BlockNumber blk;
+ DecodedBkpBlock *blk;
+ BlockNumber blkno;
RelFileLocator rnode;
ForkNumber fork;
- Datum values[PG_GET_WAL_FPI_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_FPI_INFO_COLS] = {0};
+ Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
int i = 0;
if (!XLogRecHasBlockRef(record, block_id))
continue;
- if (!XLogRecHasBlockImage(record, block_id))
- continue;
+ blk = XLogRecGetBlock(record, block_id);
- page = (Page) buf.data;
-
- if (!RestoreBlockImage(record, block_id, page))
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg_internal("%s", record->errormsg_buf)));
-
- /* Full page exists, so let's save it. */
(void) XLogRecGetBlockTagExtended(record, block_id,
- &rnode, &fork, &blk, NULL);
+ &rnode, &fork, &blkno, NULL);
values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = ObjectIdGetDatum(rnode.spcOid);
- values[i++] = ObjectIdGetDatum(rnode.dbOid);
- values[i++] = ObjectIdGetDatum(rnode.relNumber);
- values[i++] = Int64GetDatum((int64) blk);
+ values[i++] = Int16GetDatum(block_id);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
+ values[i++] = Int64GetDatum((int64) blkno);
if (fork >= 0 && fork <= MAX_FORKNUM)
values[i++] = CStringGetTextDatum(forkNames[fork]);
@@ -272,34 +264,102 @@ GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg_internal("invalid fork number: %u", fork)));
- /* Initialize bytea buffer to copy the FPI to. */
- raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
- SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+ /* Block data */
+ if (blk->has_data)
+ {
+ bytea *raw_data;
- /* Take a verbatim copy of the FPI. */
- memcpy(VARDATA(raw_page), page, BLCKSZ);
+ /* Initialize bytea buffer to copy the data to */
+ raw_data = (bytea *) palloc(blk->data_len + VARHDRSZ);
+ SET_VARSIZE(raw_data, blk->data_len + VARHDRSZ);
- values[i++] = PointerGetDatum(raw_page);
+ /* Copy the data */
+ memcpy(VARDATA(raw_data), blk->data, blk->data_len);
+ values[i++] = PointerGetDatum(raw_data);
+ }
+ else
+ {
+ /* No data, so set this field to NULL */
+ nulls[i++] = true;
+ }
- Assert(i == PG_GET_WAL_FPI_INFO_COLS);
+ if (blk->has_image)
+ {
+ PGAlignedBlock buf;
+ Page page;
+ bytea *raw_page;
+ int bitcnt;
+ int cnt = 0;
+ Datum *flags;
+ ArrayType *a;
+
+ page = (Page) buf.data;
+
+ /* Full page image exists, so let's save it */
+ if (!RestoreBlockImage(record, block_id, page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg_internal("%s", record->errormsg_buf)));
+
+ /* Initialize bytea buffer to copy the FPI to */
+ raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+ SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+
+ /* Take a verbatim copy of the FPI */
+ memcpy(VARDATA(raw_page), page, BLCKSZ);
+
+ values[i++] = PointerGetDatum(raw_page);
+ values[i++] = UInt32GetDatum(blk->bimg_len);
+
+ /* FPI flags */
+ bitcnt = pg_popcount((const char *) &blk->bimg_info,
+ sizeof(uint8));
+ /* Build set of raw flags */
+ flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
+
+ if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) != 0)
+ flags[cnt++] = CStringGetTextDatum("HAS_HOLE");
+ if (blk->apply_image)
+ flags[cnt++] = CStringGetTextDatum("APPLY");
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_PGLZ");
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_LZ4");
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_ZSTD) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_ZSTD");
+
+ Assert(cnt <= bitcnt);
+ a = construct_array_builtin(flags, cnt, TEXTOID);
+ values[i++] = PointerGetDatum(a);
+ }
+ else
+ {
+ /* No full page image, so store NULLs for all its fields */
+ memset(&nulls[i], true, 3 * sizeof(bool));
+ i += 3;
+ }
+
+ Assert(i == PG_GET_WAL_BLOCK_INFO_COLS);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
}
-#undef PG_GET_WAL_FPI_INFO_COLS
+#undef PG_GET_WAL_FPI_BLOCK_COLS
}
/*
- * Get full page images with their relation information for all the WAL
- * records between start and end LSNs. Decompression is applied to the
- * blocks, if necessary.
+ * Get information about all the blocks saved in WAL records between start
+ * and end LSNs. This produces information about the full page images with
+ * their relation information, and the data saved in each block associated
+ * to a record. Decompression is applied to the full page images, if
+ * necessary.
*
* This function emits an error if a future start or end WAL LSN i.e. WAL LSN
* the database system doesn't know about is specified.
*/
Datum
-pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
+pg_get_wal_block_info(PG_FUNCTION_ARGS)
{
XLogRecPtr start_lsn;
XLogRecPtr end_lsn;
@@ -317,7 +377,7 @@ pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
xlogreader = InitXLogReaderState(start_lsn);
tmp_cxt = AllocSetContextCreate(CurrentMemoryContext,
- "pg_get_wal_fpi_info temporary cxt",
+ "pg_get_block_fpi_info temporary cxt",
ALLOCSET_DEFAULT_SIZES);
while (ReadNextXLogRecord(xlogreader) &&
@@ -326,7 +386,7 @@ pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
- GetWALFPIInfo(fcinfo, xlogreader);
+ GetWALBlockInfo(fcinfo, xlogreader);
/* clean up and switch back */
MemoryContextSwitchTo(old_cxt);
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 849201a1f8..01a120f398 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -53,20 +53,25 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2'
WHERE resource_manager = 'Heap' AND record_type = 'INSERT';
-- ===================================================================
--- Tests to get full page image (FPI) from WAL record
+-- Tests to get block information from WAL record
-- ===================================================================
+
+-- Update table to generate some block data
SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
-
--- Force FPI on the next update.
-CHECKPOINT;
-
--- Update table to generate an FPI.
-UPDATE sample_tbl SET col1 = col1 * 100 WHERE col1 = 1;
+UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
+-- Check if we get block data from WAL record.
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+-- Force full-page image on the next update.
+SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
+CHECKPOINT;
+UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
+SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
-SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_fpi_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid';
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
-- ===================================================================
-- Tests for permissions
@@ -83,7 +88,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- no
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
-- Functions accessible by users with role pg_read_server_files
@@ -99,7 +104,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
REVOKE pg_read_server_files FROM regress_pg_walinspect;
@@ -113,7 +118,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
@@ -126,7 +131,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
REVOKE EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn)
FROM regress_pg_walinspect;
@@ -137,7 +142,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
-- ===================================================================
--
2.39.2
On Wed, Mar 8, 2023 at 12:58 PM Michael Paquier <michael@paquier.xyz> wrote:
On Tue, Mar 07, 2023 at 03:56:22PM +0530, Bharath Rupireddy wrote:
That would be a lot better. Not just the test, but also the
documentation can have it. Simple way to generate such a record (both
block data and FPI) is to just change the wal_level to logical in
walinspect.conf [1], see code around REGBUF_KEEP_DATA and
RelationIsLogicallyLogged in heapam.cI don't agree that we need to go down to wal_level=logical for this.
The important part is to check that the non-NULL and NULL paths for
the block data and FPI data are both taken, making 4 paths to check.
So we need two tests at minimum, which would be either:
- One SQL generating no FPI with no block data and a second generating
a FPI with block data. v2 was doing that but did not cover the first
case.
- One SQL generating a FPI with no block data and a second generating
no FPI with block data.So let's just geenrate a heap record on an UPDATE, for example, like
in the version attached.
Yup, that should work too because block data gets logged [1]needs_backup = (page_lsn <= RedoRecPtr);.
4. I think we need to free raw_data, raw_page and flags as we loop
over multiple blocks (XLR_MAX_BLOCK_ID) and will leak memory which can
be a problem if we have many blocks assocated with a single WAL
record.
flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
Also, we will leak all CStringGetTextDatum memory in the block_id for loop.
Another way is to use and reset temp memory context in the for loop
over block_ids. I prefer this approach over multiple pfree()s in
block_id for loop.I disagree, this was on purpose in the last version. This version
finishes by calling AllocSetContextCreate() and MemoryContextDelete()
once per *record*, which will not be free, and we are arguing about
resetting the memory context after scanning up to XLR_MAX_BLOCK_ID
blocks, or 32 blocks which would go up to 32kB per page in the worst
case. That's not going to matter in a large scan for each record, but
the extra AllocSet*() calls could. And we basically do the same thing
on HEAD.
It's not just 32kB per page right? 32*8KB on HEAD (no block data,
flags and CStringGetTextDatum there). With the patch, the number of
pallocs for each block_id = 6 CStringGetTextDatum + BLCKSZ (8KB) +
flags (5*size of ptr) + block data_len. In the worst case, all
XLR_MAX_BLOCK_ID can have both FPIs and block data. Furthermore,
imagine if someone initialized their cluster with a higher BLCKSZ (>=
8KB), then the memory leak happens noticeably on a lower-end system.
I understand that performance is critical here but we need to ensure
memory is used wisely. Therefore, I'd still vote to free at least the
major contributors here, that is, pfree(raw_data);, pfree(raw_page);
and pfree(flags); right after they are done using. I'm sure pfree()s
don't hurt more than resetting memory context for every block_id.
Any comments?
I think we need to output block data length (blk->data_len) similar to
fpilen to save users from figuring out how to get the length of a
bytea column. This will also keep block data in sync with FPI info.
[1]: needs_backup = (page_lsn <= RedoRecPtr);
needs_backup = (page_lsn <= RedoRecPtr);
(gdb) p page_lsn
$2 = 21581544
(gdb) p RedoRecPtr
$3 = 21484808
(gdb) p needs_backup
$4 = false
(gdb)
(gdb) bt
#0 XLogRecordAssemble (rmid=10 '\n', info=64 '@',
RedoRecPtr=21484808, doPageWrites=true, fpw_lsn=0x7ffde118d640,
num_fpi=0x7ffde118d634, topxid_included=0x7ffde118d633) at xloginsert.c:582
#1 0x00005598cd9c3ef7 in XLogInsert (rmid=10 '\n', info=64 '@') at
xloginsert.c:497
#2 0x00005598cd930452 in log_heap_update (reln=0x7f4a4c7cd808,
oldbuf=136, newbuf=136, oldtup=0x7ffde118d820,
newtup=0x5598d00cb098, old_key_tuple=0x0,
all_visible_cleared=false, new_all_visible_cleared=false)
at heapam.c:8473
#3 0x00005598cd92876e in heap_update (relation=0x7f4a4c7cd808,
otid=0x7ffde118dab2, newtup=0x5598d00cb098, cid=0,
crosscheck=0x0, wait=true, tmfd=0x7ffde118db60,
lockmode=0x7ffde118da74) at heapam.c:3741
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Wed, Mar 08, 2023 at 04:01:56PM +0530, Bharath Rupireddy wrote:
I understand that performance is critical here but we need to ensure
memory is used wisely. Therefore, I'd still vote to free at least the
major contributors here, that is, pfree(raw_data);, pfree(raw_page);
and pfree(flags); right after they are done using. I'm sure pfree()s
don't hurt more than resetting memory context for every block_id.
Okay by me to have intermediate pfrees between each block scanned if
you feel strongly about it.
I think we need to output block data length (blk->data_len) similar to
fpilen to save users from figuring out how to get the length of a
bytea column. This will also keep block data in sync with FPI info.
length() works fine on bytea, so it can be used on the block data.
fpilen is a very different matter as it would be the length of a page
without a hole, or just something compressed.
--
Michael
On Wed, Mar 8, 2023 at 4:23 PM Michael Paquier <michael@paquier.xyz> wrote:
On Wed, Mar 08, 2023 at 04:01:56PM +0530, Bharath Rupireddy wrote:
I understand that performance is critical here but we need to ensure
memory is used wisely. Therefore, I'd still vote to free at least the
major contributors here, that is, pfree(raw_data);, pfree(raw_page);
and pfree(flags); right after they are done using. I'm sure pfree()s
don't hurt more than resetting memory context for every block_id.Okay by me to have intermediate pfrees between each block scanned if
you feel strongly about it.
Thanks. Attached v4 with that change.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v4-0001-Rework-pg_walinspect-to-retrieve-more-block-infor.patchapplication/x-patch; name=v4-0001-Rework-pg_walinspect-to-retrieve-more-block-infor.patchDownload
From e73245277cfaf95462ef1f1fe5334bf5aba87a2a Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Wed, 8 Mar 2023 14:29:50 +0000
Subject: [PATCH v4] Rework pg_walinspect to retrieve more block information
---
.../pg_walinspect/expected/pg_walinspect.out | 35 ++--
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 16 +-
contrib/pg_walinspect/pg_walinspect.c | 150 +++++++++++++-----
contrib/pg_walinspect/sql/pg_walinspect.sql | 33 ++--
doc/src/sgml/pgwalinspect.sgml | 48 ++++--
5 files changed, 196 insertions(+), 86 deletions(-)
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index 9bcb05354e..e0eb7ca08f 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -74,17 +74,28 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2'
(1 row)
-- ===================================================================
--- Tests to get full page image (FPI) from WAL record
+-- Tests to get block information from WAL record
-- ===================================================================
+-- Update table to generate some block data
SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
--- Force FPI on the next update.
-CHECKPOINT;
--- Update table to generate an FPI.
-UPDATE sample_tbl SET col1 = col1 * 100 WHERE col1 = 1;
+UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
+-- Check if we get block data from WAL record.
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ ok
+----
+ t
+(1 row)
+
+-- Force full-page image on the next update.
+SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
+CHECKPOINT;
+UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
+SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
-SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_fpi_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid';
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
ok
----
t
@@ -116,7 +127,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
has_function_privilege
------------------------
f
@@ -146,7 +157,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -160,7 +171,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_record_info(pg_lsn)', 'EXECUTE'); -- yes
@@ -184,7 +195,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -196,7 +207,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
-- ===================================================================
-- Clean up
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 1e9e1e6115..e674ef25aa 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -4,21 +4,25 @@
\echo Use "ALTER EXTENSION pg_walinspect UPDATE TO '1.1'" to load this file. \quit
--
--- pg_get_wal_fpi_info()
+-- pg_get_wal_block_info()
--
-CREATE FUNCTION pg_get_wal_fpi_info(IN start_lsn pg_lsn,
+CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
OUT lsn pg_lsn,
+ OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
OUT forkname text,
- OUT fpi bytea
+ OUT blockdata bytea,
+ OUT fpi bytea,
+ OUT fpilen int4,
+ OUT fpiinfo text[]
)
RETURNS SETOF record
-AS 'MODULE_PATHNAME', 'pg_get_wal_fpi_info'
+AS 'MODULE_PATHNAME', 'pg_get_wal_block_info'
LANGUAGE C STRICT PARALLEL SAFE;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) FROM PUBLIC;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn) TO pg_read_server_files;
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) TO pg_read_server_files;
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index b7b0a805ee..5ead30628e 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -20,6 +20,7 @@
#include "access/xlogutils.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
@@ -30,7 +31,7 @@
PG_MODULE_MAGIC;
-PG_FUNCTION_INFO_V1(pg_get_wal_fpi_info);
+PG_FUNCTION_INFO_V1(pg_get_wal_block_info);
PG_FUNCTION_INFO_V1(pg_get_wal_record_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info);
PG_FUNCTION_INFO_V1(pg_get_wal_records_info_till_end_of_wal);
@@ -56,7 +57,7 @@ static void FillXLogStatsRow(const char *name, uint64 n, uint64 total_count,
Datum *values, bool *nulls, uint32 ncols);
static void GetWalStats(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
XLogRecPtr end_lsn, bool stats_per_record);
-static void GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
+static void GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
/*
* Check if the given LSN is in future. Also, return the LSN up to which the
@@ -221,49 +222,43 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of full page images from a single record.
+ * Store a set of block information from a single record (FPI and block
+ * information).
*/
static void
-GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
+GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_FPI_INFO_COLS 7
+#define PG_GET_WAL_BLOCK_INFO_COLS 11
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
- PGAlignedBlock buf;
- Page page;
- bytea *raw_page;
- BlockNumber blk;
+ DecodedBkpBlock *blk;
+ BlockNumber blkno;
RelFileLocator rnode;
ForkNumber fork;
- Datum values[PG_GET_WAL_FPI_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_FPI_INFO_COLS] = {0};
+ bytea *raw_data = NULL;
+ bytea *raw_page = NULL;
+ Datum *flags = NULL;
+ Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
int i = 0;
if (!XLogRecHasBlockRef(record, block_id))
continue;
- if (!XLogRecHasBlockImage(record, block_id))
- continue;
-
- page = (Page) buf.data;
-
- if (!RestoreBlockImage(record, block_id, page))
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg_internal("%s", record->errormsg_buf)));
+ blk = XLogRecGetBlock(record, block_id);
- /* Full page exists, so let's save it. */
(void) XLogRecGetBlockTagExtended(record, block_id,
- &rnode, &fork, &blk, NULL);
+ &rnode, &fork, &blkno, NULL);
values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = ObjectIdGetDatum(rnode.spcOid);
- values[i++] = ObjectIdGetDatum(rnode.dbOid);
- values[i++] = ObjectIdGetDatum(rnode.relNumber);
- values[i++] = Int64GetDatum((int64) blk);
+ values[i++] = Int16GetDatum(block_id);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
+ values[i++] = Int64GetDatum((int64) blkno);
if (fork >= 0 && fork <= MAX_FORKNUM)
values[i++] = CStringGetTextDatum(forkNames[fork]);
@@ -272,34 +267,109 @@ GetWALFPIInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg_internal("invalid fork number: %u", fork)));
- /* Initialize bytea buffer to copy the FPI to. */
- raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
- SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+ /* Block data */
+ if (blk->has_data)
+ {
+ /* Initialize bytea buffer to copy the data to */
+ raw_data = (bytea *) palloc(blk->data_len + VARHDRSZ);
+ SET_VARSIZE(raw_data, blk->data_len + VARHDRSZ);
- /* Take a verbatim copy of the FPI. */
- memcpy(VARDATA(raw_page), page, BLCKSZ);
+ /* Copy the data */
+ memcpy(VARDATA(raw_data), blk->data, blk->data_len);
+ values[i++] = PointerGetDatum(raw_data);
+ }
+ else
+ {
+ /* No data, so set this field to NULL */
+ nulls[i++] = true;
+ }
- values[i++] = PointerGetDatum(raw_page);
+ if (blk->has_image)
+ {
+ PGAlignedBlock buf;
+ Page page;
+ int bitcnt;
+ int cnt = 0;
+ ArrayType *a;
+
+ page = (Page) buf.data;
+
+ /* Full page image exists, so let's save it */
+ if (!RestoreBlockImage(record, block_id, page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg_internal("%s", record->errormsg_buf)));
+
+ /* Initialize bytea buffer to copy the FPI to */
+ raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+ SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+
+ /* Take a verbatim copy of the FPI */
+ memcpy(VARDATA(raw_page), page, BLCKSZ);
+
+ values[i++] = PointerGetDatum(raw_page);
+ values[i++] = UInt32GetDatum(blk->bimg_len);
+
+ /* FPI flags */
+ bitcnt = pg_popcount((const char *) &blk->bimg_info,
+ sizeof(uint8));
+ /* Build set of raw flags */
+ flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
+
+ if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) != 0)
+ flags[cnt++] = CStringGetTextDatum("HAS_HOLE");
+ if (blk->apply_image)
+ flags[cnt++] = CStringGetTextDatum("APPLY");
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_PGLZ");
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_LZ4");
+ if ((blk->bimg_info & BKPIMAGE_COMPRESS_ZSTD) != 0)
+ flags[cnt++] = CStringGetTextDatum("COMPRESS_ZSTD");
+
+ Assert(cnt <= bitcnt);
+ a = construct_array_builtin(flags, cnt, TEXTOID);
+ values[i++] = PointerGetDatum(a);
+ }
+ else
+ {
+ /* No full page image, so store NULLs for all its fields */
+ memset(&nulls[i], true, 3 * sizeof(bool));
+ i += 3;
+ }
- Assert(i == PG_GET_WAL_FPI_INFO_COLS);
+ Assert(i == PG_GET_WAL_BLOCK_INFO_COLS);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
+
+ /*
+ * Free up some space to not leak memory while traversing all block
+ * IDs.
+ */
+ if (raw_data)
+ pfree(raw_data);
+ if (raw_page)
+ pfree(raw_page);
+ if (flags)
+ pfree(flags);
}
-#undef PG_GET_WAL_FPI_INFO_COLS
+#undef PG_GET_WAL_FPI_BLOCK_COLS
}
/*
- * Get full page images with their relation information for all the WAL
- * records between start and end LSNs. Decompression is applied to the
- * blocks, if necessary.
+ * Get information about all the blocks saved in WAL records between start
+ * and end LSNs. This produces information about the full page images with
+ * their relation information, and the data saved in each block associated
+ * to a record. Decompression is applied to the full page images, if
+ * necessary.
*
* This function emits an error if a future start or end WAL LSN i.e. WAL LSN
* the database system doesn't know about is specified.
*/
Datum
-pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
+pg_get_wal_block_info(PG_FUNCTION_ARGS)
{
XLogRecPtr start_lsn;
XLogRecPtr end_lsn;
@@ -317,7 +387,7 @@ pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
xlogreader = InitXLogReaderState(start_lsn);
tmp_cxt = AllocSetContextCreate(CurrentMemoryContext,
- "pg_get_wal_fpi_info temporary cxt",
+ "pg_get_block_fpi_info temporary cxt",
ALLOCSET_DEFAULT_SIZES);
while (ReadNextXLogRecord(xlogreader) &&
@@ -326,7 +396,7 @@ pg_get_wal_fpi_info(PG_FUNCTION_ARGS)
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
- GetWALFPIInfo(fcinfo, xlogreader);
+ GetWALBlockInfo(fcinfo, xlogreader);
/* clean up and switch back */
MemoryContextSwitchTo(old_cxt);
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 849201a1f8..01a120f398 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -53,20 +53,25 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2'
WHERE resource_manager = 'Heap' AND record_type = 'INSERT';
-- ===================================================================
--- Tests to get full page image (FPI) from WAL record
+-- Tests to get block information from WAL record
-- ===================================================================
-SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
-
--- Force FPI on the next update.
-CHECKPOINT;
--- Update table to generate an FPI.
-UPDATE sample_tbl SET col1 = col1 * 100 WHERE col1 = 1;
+-- Update table to generate some block data
+SELECT pg_current_wal_lsn() AS wal_lsn3 \gset
+UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
+-- Check if we get block data from WAL record.
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
+ WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+-- Force full-page image on the next update.
+SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
+CHECKPOINT;
+UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
+SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
-SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_fpi_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid';
+SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
-- ===================================================================
-- Tests for permissions
@@ -83,7 +88,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- no
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
-- Functions accessible by users with role pg_read_server_files
@@ -99,7 +104,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
REVOKE pg_read_server_files FROM regress_pg_walinspect;
@@ -113,7 +118,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
@@ -126,7 +131,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_fpi_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
REVOKE EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn)
FROM regress_pg_walinspect;
@@ -137,7 +142,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_fpi_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
-- ===================================================================
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 3d7cdb95cc..d26115d6f2 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -190,31 +190,51 @@ combined_size_percentage | 2.8634072910530795
<varlistentry>
<term>
- <function>pg_get_wal_fpi_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
+ <function>pg_get_wal_block_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
</term>
<listitem>
<para>
- Gets a copy of full page images as <type>bytea</type> values (after
- applying decompression when necessary) and their information associated
- with all the valid WAL records between
+ Gets a copy of the block information stored in WAL records. This includes
+ copies of the block data (<literal>NULL</literal> if none) and full page
+ images as <type>bytea</type> values (after
+ applying decompression when necessary, or <literal>NULL</literal> if none)
+ and their information associated with all the valid WAL records between
<replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per full page image.
- If <replaceable>start_lsn</replaceable> or
+ <replaceable>end_lsn</replaceable>. Returns one row per block registered
+ in a WAL record. If <replaceable>start_lsn</replaceable> or
<replaceable>end_lsn</replaceable> are not yet available, the function
will raise an error. For example:
<screen>
-postgres=# SELECT lsn, reltablespace, reldatabase, relfilenode, relblocknumber,
- forkname, substring(fpi for 24) as fpi_trimmed
- FROM pg_get_wal_fpi_info('0/1801690', '0/1825C60');
+postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
+ relblocknumber, forkname,
+ substring(blockdata for 24) as block_trimmed,
+ substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
+ FROM pg_get_wal_block_info('0/1871080', '0/1871440');
-[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/1807E20
+lsn | 0/1871208
+blockid | 0
reltablespace | 1663
-reldatabase | 5
-relfilenode | 16396
-relblocknumber | 43
+reldatabase | 16384
+relfilenode | 1255
+relblocknumber | 96
forkname | main
-fpi_trimmed | \x00000000b89e660100000000a003c0030020042000000000
+block_trimmed | \x0700080009000a00
+fpi_trimmed |
+fpilen |
+fpiinfo |
+-[ RECORD 2 ]--+---------------------------------------------------
+lsn | 0/18712F8
+blockid | 0
+reltablespace | 1663
+reldatabase | 16384
+relfilenode | 16392
+relblocknumber | 0
+forkname | main
+block_trimmed | \x02800128180164000000
+fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
</listitem>
--
2.34.1
At Wed, 8 Mar 2023 20:18:06 +0530, Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> wrote in
On Wed, Mar 8, 2023 at 4:23 PM Michael Paquier <michael@paquier.xyz> wrote:
On Wed, Mar 08, 2023 at 04:01:56PM +0530, Bharath Rupireddy wrote:
I understand that performance is critical here but we need to ensure
memory is used wisely. Therefore, I'd still vote to free at least the
major contributors here, that is, pfree(raw_data);, pfree(raw_page);
and pfree(flags); right after they are done using. I'm sure pfree()s
don't hurt more than resetting memory context for every block_id.Okay by me to have intermediate pfrees between each block scanned if
you feel strongly about it.Thanks. Attached v4 with that change.
Although I'm not strongly opposed to pfreeing them, I'm not sure I
like the way the patch frees them. The life times of all of raw_data,
raw_page and flags are within a block. They can be freed
unconditionally after they are actually used and the scope of the
pointer variables can be properly narowed.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Thu, Mar 09, 2023 at 09:46:12AM +0900, Kyotaro Horiguchi wrote:
Although I'm not strongly opposed to pfreeing them, I'm not sure I
like the way the patch frees them. The life times of all of raw_data,
raw_page and flags are within a block. They can be freed
unconditionally after they are actually used and the scope of the
pointer variables can be properly narowed.
The thing is that you cannot keep them inside each individual blocks
because they have to be freed once their values are stored in the
tuplestore, which is why I guess Bharath has done things this way.
After sleeping on that, I tend to prefer the simplicity of v3 where we
keep track of the block and fpi data in each of their respective
blocks. It means that we lose track of them each time we go to a
different block, but the memory context reset done after each record
means that scanning through a large WAL history will not cause a leak
across the function call.
The worst scenario with v3 is a record that makes use of all the 32
blocks with a hell lot of block data in each one of them, which is
possible in theory, but very unlikely in practice except if someone
uses a custom RGMR to generate crazily-shaped WAL records. I am aware
of the fact that it is possible to generate such records if you are
really willing to do so, aka this thread:
/messages/by-id/CAEze2WgGiw+LZt+vHf8tWqB_6VxeLsMeoAuod0N=ij1q17n5pw@mail.gmail.com
In short, my choice would still be simplicity here with v3, I guess.
--
Michael
At Thu, 9 Mar 2023 10:15:39 +0900, Michael Paquier <michael@paquier.xyz> wrote in
On Thu, Mar 09, 2023 at 09:46:12AM +0900, Kyotaro Horiguchi wrote:
Although I'm not strongly opposed to pfreeing them, I'm not sure I
like the way the patch frees them. The life times of all of raw_data,
raw_page and flags are within a block. They can be freed
unconditionally after they are actually used and the scope of the
pointer variables can be properly narowed.The thing is that you cannot keep them inside each individual blocks
because they have to be freed once their values are stored in the
tuplestore, which is why I guess Bharath has done things this way.
Ugh.. Right.
After sleeping on that, I tend to prefer the simplicity of v3 where we
keep track of the block and fpi data in each of their respective
blocks. It means that we lose track of them each time we go to a
different block, but the memory context reset done after each record
means that scanning through a large WAL history will not cause a leak
across the function call.The worst scenario with v3 is a record that makes use of all the 32
blocks with a hell lot of block data in each one of them, which is
possible in theory, but very unlikely in practice except if someone
uses a custom RGMR to generate crazily-shaped WAL records. I am aware
of the fact that it is possible to generate such records if you are
really willing to do so, aka this thread:
/messages/by-id/CAEze2WgGiw+LZt+vHf8tWqB_6VxeLsMeoAuod0N=ij1q17n5pw@mail.gmail.com
I agree to the view that that "leakage" for at-most 32 blocks and
typically 0 to 2 blcoks won't be a matter.
In short, my choice would still be simplicity here with v3, I guess.
FWIW, I slightly prefer v3 for the reason I mentioned above.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Thu, Mar 9, 2023 at 7:34 AM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:
In short, my choice would still be simplicity here with v3, I guess.
FWIW, I slightly prefer v3 for the reason I mentioned above.
Hm, then, +1 for v3.
FWIW, I quickly tried to hit that case where a single WAL record has
max_block_id = XLR_MAX_BLOCK_ID with both FPIs and block data, but I
couldn't. I could generate WAL records with 45K FPIs, 10mn block data
and the total palloc'd length in the block_id for loop has not crossed
8K.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Thu, Mar 09, 2023 at 09:52:57AM +0530, Bharath Rupireddy wrote:
Hm, then, +1 for v3.
Okay, thanks. Let's use that, then.
--
Michael
On Thu, Mar 09, 2023 at 03:37:21PM +0900, Michael Paquier wrote:
Okay, thanks. Let's use that, then.
I have done one pass over that today, and applied it. Thanks!
I'd really like to do something about the errors we raise in the
module when specifying LSNs in the future for this release, now. I
got annoyed by it again this morning while doing \watch queries that
kept failing randomly while stressing this patch.
--
Michael
On Fri, Mar 10, 2023 at 6:43 AM Michael Paquier <michael@paquier.xyz> wrote:
I'd really like to do something about the errors we raise in the
module when specifying LSNs in the future for this release, now. I
got annoyed by it again this morning while doing \watch queries that
kept failing randomly while stressing this patch.
Perhaps what is proposed here
/messages/by-id/CALj2ACWqJ+m0HoQj9qkAV2uQfq97yk5jN2MOdfKcXusXsyptKQ@mail.gmail.com
might help and avoid many errors around input LSN validations. Let's
discuss that in that thread.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Fri, Mar 10, 2023 at 06:50:05AM +0530, Bharath Rupireddy wrote:
Perhaps what is proposed here
/messages/by-id/CALj2ACWqJ+m0HoQj9qkAV2uQfq97yk5jN2MOdfKcXusXsyptKQ@mail.gmail.com
might help and avoid many errors around input LSN validations. Let's
discuss that in that thread.
Yep, I am going to look at your proposal.
--
Michael
I'm excited to see that pg_get_wal_block_info() was merged. Thanks for
working on this!
Apologies for jumping back in here a bit late. I've been playing around
with it and wanted to comment on the performance of JOINing to
pg_get_wal_records_info().
On Tue, Mar 7, 2023 at 7:08 AM Matthias van de Meent
<boekewurm+postgres@gmail.com> wrote:
On Tue, 7 Mar 2023 at 01:34, Michael Paquier <michael@paquier.xyz> wrote:
On Mon, Mar 06, 2023 at 04:08:28PM +0100, Matthias van de Meent wrote:
On Mon, 6 Mar 2023 at 15:40, Bharath Rupireddy
IMO, pg_get_wal_records_extended_info as proposed doesn't look good to
me as it outputs most of the columns that are already given by
pg_get_wal_records_info.What I think the best way at this point is to
make it return the following:
lsn pg_lsn
block_id int8
spcOid oid
dbOid oid
relNumber oid
forkNames text
fpi bytea
fpi_info textI would add the length of the block data (without the hole and
compressed, as the FPI data should always be presented as
uncompressed), and the block data if any (without the block data
length as one can guess it based on the bytea data length). Note that
a block can have both a FPI and some data assigned to it, as far as I
recall.The basic idea is to create a single entrypoint to all relevant data
from DecodedXLogRecord in SQL, not multiple.While I would agree with this principle on simplicity's ground in
terms of minimizing the SQL interface and the pg_wal/ lookups, I
disagree about it on unsability ground, because we can avoid extra SQL
tweaks with more functions. One recent example I have in mind is
partitionfuncs.c, which can actually be achieved with a WITH RECURSIVE
on the catalogs.Correct, but in that case the user would build the same query (or at
least with the same complexity) as what we're executing under the
hood, right?There are of course various degrees of complexity,
and perhaps unnest() cannot qualify as one, but having two functions
returning normalized records (one for the record information, and a
second for the block information), is a rather good balance between
usability and interface complexity, in my experience.I would agree, if it weren't for the reasons written below.
If you have two
functions, a JOIN is enough to cross-check the block data and the
record data,Joins are expensive on large datasets; and because WAL is one of the
largest datasets in the system, why would we want to force the user to
JOIN them if we can produce the data in one pre-baked data structure
without a need to join?
I wanted to experiment to see how much slower it is to do the join
between pg_get_wal_block_info() and pg_get_wal_records_info() and
profile where the time was spent.
I saved the wal lsn before and after a bunch of inserts (generates
~1,000,000 records).
On master, I did a join like this:
SELECT count(*) FROM
pg_get_wal_block_info(:start_lsn, :end_lsn) b JOIN
pg_get_wal_records_info(:start_lsn, :end_lsn) w ON w.start_lsn = b.lsn;
which took 1191 ms.
After patching master to add in the columns from
pg_get_wal_records_info() which are not returned by
pg_get_wal_block_info() (except block_ref column of course), this query:
SELECT COUNT(*) FROM pg_get_wal_block_info(:start_lsn, :end_lsn);
took 467 ms.
Perhaps this difference isn't important, but I found it noticeable.
A large chunk of the time is spent joining the tuplestores.
The second largest chunk of time seems to be in GetWalRecordInfo()'s
calls to XLogRecGetBlockRefInfo(), which spends quite a bit of time in
string construction -- mainly for strings we wouldn't end up needing
after joining to the block info function.
Surprisingly, the string construction seemed to overshadow the
performance impact of doubling the decoding passes over the WAL records.
So maybe it is worth including more record-level info?
On an unrelated note, I had thought that we should have some kind of
parameter to pg_get_wal_block_info() to control whether or not we output
the FPI to save us from wasting time decompressing the FPIs.
AFAIK, we don't have access to projection information from inside the
function, so a parameter like "output_fpi" or the like would have to do.
I wanted to share what I found in trying this in case someone else had
had that thought.
TL;DR, it doesn't seem to matter from a performance perspective if we
skip decompressing the FPIs in pg_get_wal_block_info().
I hacked "output_fpi" into pg_get_wal_block_info(), enabled pglz wal
compression, and generated a boatload of FPIs by dirtying buffers, doing
a checkpoint and then updating those pages again right after the
checkpoint. With output_fpi = true (same as master), my call to
pg_get_wal_block_info() took around 7 seconds and with output_fpi =
false, it took around 6 seconds. Not an impressive difference.
I noticed that only around 2% of the time is spent in pglz_decompress().
Most of the time (about half) is spent in building tuples for the
tuplestore, copying memory around, and writing the tuplestore out to a
file. Another 10-20% is spent decoding the records--which has to be done
regardless.
I wonder if there are cases where the decompression overhead would matter.
My conclusion is that it isn't worth bothering with such a parameter.
- Melanie
On Tue, Mar 14, 2023 at 3:34 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
After patching master to add in the columns from
pg_get_wal_records_info() which are not returned by
pg_get_wal_block_info() (except block_ref column of course), this query:SELECT COUNT(*) FROM pg_get_wal_block_info(:start_lsn, :end_lsn);
took 467 ms.
Perhaps this difference isn't important, but I found it noticeable.
This seems weird to me too. It's not so much the performance overhead
that bothers me (though that's not great either). It seems *illogical*
to me. The query you end up writing must do two passes over the WAL
records, but its structure almost suggests that it's necessary to do
two separate passes over distinct "streams".
Why doesn't it already work like this? Why do we need a separate
pg_get_wal_block_info() function at all?
--
Peter Geoghegan
On Tue, Mar 14, 2023 at 6:57 PM Peter Geoghegan <pg@bowt.ie> wrote:
On Tue, Mar 14, 2023 at 3:34 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:After patching master to add in the columns from
pg_get_wal_records_info() which are not returned by
pg_get_wal_block_info() (except block_ref column of course), this query:SELECT COUNT(*) FROM pg_get_wal_block_info(:start_lsn, :end_lsn);
took 467 ms.
Perhaps this difference isn't important, but I found it noticeable.
This seems weird to me too. It's not so much the performance overhead
that bothers me (though that's not great either). It seems *illogical*
to me. The query you end up writing must do two passes over the WAL
records, but its structure almost suggests that it's necessary to do
two separate passes over distinct "streams".Why doesn't it already work like this? Why do we need a separate
pg_get_wal_block_info() function at all?
Well, I think if you only care about the WAL record-level information
and not the block-level information, having the WAL record information
denormalized like that with all the block information would be a
nuisance.
But, perhaps you are suggesting a parameter to pg_get_wal_records_info()
like "with_block_info" or something, which produces the full
denormalized block + record output?
- Melanie
On Tue, Mar 14, 2023 at 5:34 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
On Tue, Mar 14, 2023 at 6:57 PM Peter Geoghegan <pg@bowt.ie> wrote:
Why doesn't it already work like this? Why do we need a separate
pg_get_wal_block_info() function at all?Well, I think if you only care about the WAL record-level information
and not the block-level information, having the WAL record information
denormalized like that with all the block information would be a
nuisance.
I generally care about both. When I want to look at things at the
pg_get_wal_records_info() level (as opposed to a summary), the
block_ref information is *always* of primary importance. I don't want
to have to write my own bug-prone parser for block_ref, but why should
the only alternative be joining against pg_get_wal_block_info()? The
information that I'm interested in is "close at hand" to
pg_get_wal_records_info() already.
I understand that in the general case there might be quite a few
blocks associated with a WAL record. For complicated cases,
pg_get_wal_block_info() does make sense. However, the vast majority of
individual WAL records (and possibly most WAL record types) are
related to one block only. One block that is generally from the
relation's main fork.
But, perhaps you are suggesting a parameter to pg_get_wal_records_info()
like "with_block_info" or something, which produces the full
denormalized block + record output?
I was thinking of something like that, yes -- though it wouldn't
necessarily have to be the *full* denormalized block_ref info, the FPI
itself, etc. Just the more useful stuff.
It occurs to me that my concern about the information that
pg_get_wal_records_info() lacks could be restated as a concern about
what pg_get_wal_block_info() lacks: pg_get_wal_block_info() fails to
show basic information about the WAL record whose blocks it reports
on, even though it could easily show all of the
pg_get_wal_records_info() info once per block (barring block_ref). So
addressing my concern by adjusting pg_get_wal_block_info() might be
the best approach. I'd probably be happy with that -- I'd likely just
stop using pg_get_wal_records_info() completely under this scheme.
Overall, I'm concerned that we may have missed the opportunity to make
simple things easier. Again, wanting to see (say) all of the PRUNE
records and VACUUM records with an "order by relfilenode,
block_number, lsn" seems likely to be a very common requirement to me.
It's exactly the kind of thing that you'd expect an SQL interface to
make easy.
--
Peter Geoghegan
On Wed, Mar 15, 2023 at 7:20 AM Peter Geoghegan <pg@bowt.ie> wrote:
But, perhaps you are suggesting a parameter to pg_get_wal_records_info()
like "with_block_info" or something, which produces the full
denormalized block + record output?I was thinking of something like that, yes -- though it wouldn't
necessarily have to be the *full* denormalized block_ref info, the FPI
itself, etc. Just the more useful stuff.It occurs to me that my concern about the information that
pg_get_wal_records_info() lacks could be restated as a concern about
what pg_get_wal_block_info() lacks: pg_get_wal_block_info() fails to
show basic information about the WAL record whose blocks it reports
on, even though it could easily show all of the
pg_get_wal_records_info() info once per block (barring block_ref). So
addressing my concern by adjusting pg_get_wal_block_info() might be
the best approach. I'd probably be happy with that -- I'd likely just
stop using pg_get_wal_records_info() completely under this scheme.
How about something like the attached? It adds the per-record columns
to pg_get_wal_block_info() avoiding "possibly expensive" joins with
pg_get_wal_records_info().
With this, pg_get_wal_records_info() too will be useful for users
scanning WAL at record level. That is to say that we can retain both
pg_get_wal_records_info() and pg_get_wal_block_info().
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v1-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/x-patch; name=v1-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From c98e250011d9944b44fe166bb8bdf8ac81ac9955 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Wed, 15 Mar 2023 06:33:31 +0000
Subject: [PATCH v1] Emit WAL record info via pg_get_wal_block_info()
---
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 10 +++-
contrib/pg_walinspect/pg_walinspect.c | 48 ++++++++++++++-----
doc/src/sgml/pgwalinspect.sgml | 38 +++++++++------
3 files changed, 68 insertions(+), 28 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b4467..46fcaff3a1 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,7 +12,15 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT description text,
OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 3b3215daf5..d766a050a4 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -43,7 +43,8 @@ static XLogRecPtr GetCurrentLSN(void);
static XLogReaderState *InitXLogReaderState(XLogRecPtr lsn);
static XLogRecord *ReadNextXLogRecord(XLogReaderState *xlogreader);
static void GetWALRecordInfo(XLogReaderState *record, Datum *values,
- bool *nulls, uint32 ncols);
+ bool *nulls, uint32 ncols,
+ bool fetch_blk_ref);
static void GetWALRecordsInfo(FunctionCallInfo fcinfo,
XLogRecPtr start_lsn,
XLogRecPtr end_lsn);
@@ -180,7 +181,8 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
- bool *nulls, uint32 ncols)
+ bool *nulls, uint32 ncols,
+ bool fetch_blk_ref)
{
const char *id;
RmgrData desc;
@@ -200,8 +202,11 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
desc.rm_desc(&rec_desc, record);
/* Block references. */
- initStringInfo(&rec_blk_ref);
- XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ if (fetch_blk_ref)
+ {
+ initStringInfo(&rec_blk_ref);
+ XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ }
main_data_len = XLogRecGetDataLen(record);
@@ -213,9 +218,14 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
values[i++] = CStringGetTextDatum(id);
values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
values[i++] = UInt32GetDatum(main_data_len);
- values[i++] = UInt32GetDatum(fpi_len);
+
+ if (fetch_blk_ref)
+ values[i++] = UInt32GetDatum(fpi_len);
+
values[i++] = CStringGetTextDatum(rec_desc.data);
- values[i++] = CStringGetTextDatum(rec_blk_ref.data);
+
+ if (fetch_blk_ref)
+ values[i++] = CStringGetTextDatum(rec_blk_ref.data);
Assert(i == ncols);
}
@@ -228,19 +238,30 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
+#define PG_GET_WAL_BLOCK_INFO_COLS 19
+#define PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS 9
+ Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ /* Get per-record info. */
+ GetWALRecordInfo(record, values, nulls,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS, false);
+
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
DecodedBkpBlock *blk;
BlockNumber blkno;
RelFileLocator rnode;
ForkNumber fork;
- Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
- int i = 0;
+ int i = PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS;
+
+ /* Reset only the per-block output columns. */
+ memset(&nulls[PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS], 0,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS * sizeof(bool));
+ memset(&values[PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS], 0,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS * sizeof(bool));
if (!XLogRecHasBlockRef(record, block_id))
continue;
@@ -250,7 +271,6 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
(void) XLogRecGetBlockTagExtended(record, block_id,
&rnode, &fork, &blkno, NULL);
- values[i++] = LSNGetDatum(record->ReadRecPtr);
values[i++] = Int16GetDatum(block_id);
values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
@@ -345,6 +365,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
values, nulls);
}
+#undef PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS
#undef PG_GET_WAL_FPI_BLOCK_COLS
}
@@ -434,7 +455,8 @@ pg_get_wal_record_info(PG_FUNCTION_ARGS)
errmsg("could not read WAL at %X/%X",
LSN_FORMAT_ARGS(xlogreader->EndRecPtr))));
- GetWALRecordInfo(xlogreader, values, nulls, PG_GET_WAL_RECORD_INFO_COLS);
+ GetWALRecordInfo(xlogreader, values, nulls, PG_GET_WAL_RECORD_INFO_COLS,
+ true);
pfree(xlogreader->private_data);
XLogReaderFree(xlogreader);
@@ -505,7 +527,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
old_cxt = MemoryContextSwitchTo(tmp_cxt);
GetWALRecordInfo(xlogreader, values, nulls,
- PG_GET_WAL_RECORDS_INFO_COLS);
+ PG_GET_WAL_RECORDS_INFO_COLS, true);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d6..cf94f36200 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -177,23 +177,33 @@ combined_size_percentage | 2.8634072910530795
<replaceable>start_lsn</replaceable> is not available. For example,
usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
+postgres=# SELECT start_lsn, end_lsn, prev_lsn, xid, resource_manager,
+ record_type, record_length, main_data_length, description,
+ blockid, reltablespace, reldatabase, relfilenode,
relblocknumber, forkname,
substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/15457B0', '0/1545A40');
+-[ RECORD 1 ]----+---------------------------------------------------
+start_lsn | 0/1545860
+end_lsn | 0/1545998
+prev_lsn | 0/15457E8
+xid | 750
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 305
+main_data_length | 14
+description | off 2 xmax 750 flags 0x00 ; new off 6 xmax 0
+blockid | 0
+reltablespace | 1663
+reldatabase | 5
+relfilenode | 24594
+relblocknumber | 0
+forkname | main
+block_trimmed |
+fpi_trimmed | \x00000000203a5401000000003000401f00200420ed020000
+fpilen | 240
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
</listitem>
--
2.34.1
On Tue, Mar 14, 2023 at 06:50:15PM -0700, Peter Geoghegan wrote:
On Tue, Mar 14, 2023 at 5:34 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:Well, I think if you only care about the WAL record-level information
and not the block-level information, having the WAL record information
denormalized like that with all the block information would be a
nuisance.I generally care about both. When I want to look at things at the
pg_get_wal_records_info() level (as opposed to a summary), the
block_ref information is *always* of primary importance. I don't want
to have to write my own bug-prone parser for block_ref, but why should
the only alternative be joining against pg_get_wal_block_info()? The
information that I'm interested in is "close at hand" to
pg_get_wal_records_info() already.
I am not sure to get the concern here. As long as one is smart enough
with SQL, there is no need to perform a double scan of the contents of
pg_wal with a large scan on the start LSN. If one wishes to only
extract some block for a given record type, or for a filter of your
choice, it is possible to use a LATERAL on pg_get_wal_block_info(),
say:
SELECT r.start_lsn, b.blockid
FROM pg_get_wal_records_info('0/01000028', '0/1911AA8') AS r,
LATERAL pg_get_wal_block_info(start_lsn, end_lsn) as b
WHERE r.resource_manager = 'Heap2';
This will extract the block information that you'd want for a given
record type.
I understand that in the general case there might be quite a few
blocks associated with a WAL record. For complicated cases,
pg_get_wal_block_info() does make sense. However, the vast majority of
individual WAL records (and possibly most WAL record types) are
related to one block only. One block that is generally from the
relation's main fork.
Sure, though there may be more complicated scenarios, like custom
RMGRs. At the end it comes to how much normalization should be
applied to the data extracted. FWIW, I think that the current
interface is a pretty good balance in usability.
--
Michael
On Wed, Mar 15, 2023 at 12:13:56PM +0530, Bharath Rupireddy wrote:
How about something like the attached? It adds the per-record columns
to pg_get_wal_block_info() avoiding "possibly expensive" joins with
pg_get_wal_records_info().With this, pg_get_wal_records_info() too will be useful for users
scanning WAL at record level. That is to say that we can retain both
pg_get_wal_records_info() and pg_get_wal_block_info().
FWIW, I am not convinced that there is any need to bloat more the
attributes of these functions, as filters for records could basically
touch all the fields returned by pg_get_wal_records_info(). What
about adding an example in the docs with the LATERAL query I mentioned
previously?
--
Michael
On Wed, Mar 15, 2023 at 12:20 PM Michael Paquier <michael@paquier.xyz> wrote:
On Tue, Mar 14, 2023 at 06:50:15PM -0700, Peter Geoghegan wrote:
On Tue, Mar 14, 2023 at 5:34 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:Well, I think if you only care about the WAL record-level information
and not the block-level information, having the WAL record information
denormalized like that with all the block information would be a
nuisance.I generally care about both. When I want to look at things at the
pg_get_wal_records_info() level (as opposed to a summary), the
block_ref information is *always* of primary importance. I don't want
to have to write my own bug-prone parser for block_ref, but why should
the only alternative be joining against pg_get_wal_block_info()? The
information that I'm interested in is "close at hand" to
pg_get_wal_records_info() already.I am not sure to get the concern here. As long as one is smart enough
with SQL, there is no need to perform a double scan of the contents of
pg_wal with a large scan on the start LSN. If one wishes to only
extract some block for a given record type, or for a filter of your
choice, it is possible to use a LATERAL on pg_get_wal_block_info(),
say:
SELECT r.start_lsn, b.blockid
FROM pg_get_wal_records_info('0/01000028', '0/1911AA8') AS r,
LATERAL pg_get_wal_block_info(start_lsn, end_lsn) as b
WHERE r.resource_manager = 'Heap2';This will extract the block information that you'd want for a given
record type.
It looks like nested-loop join is chosen for LATERAL query [1]postgres=# EXPLAIN (ANALYZE) SELECT * FROM pg_get_wal_records_info(:'start_lsn', :'end_lsn') AS r, LATERAL pg_get_wal_block_info(start_lsn, end_lsn) AS b WHERE r.resource_manager = 'Heap'; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------- Nested Loop (cost=0.01..112.50 rows=5000 width=330) (actual time=3175.114..49596.749 rows=5000019 loops=1) -> Function Scan on pg_get_wal_records_info r (cost=0.00..12.50 rows=5 width=168) (actual time=3175.058..4142.507 rows=4000019 loops=1) Filter: (resource_manager = 'Heap'::text) Rows Removed by Filter: 52081 -> Function Scan on pg_get_wal_block_info b (cost=0.00..10.00 rows=1000 width=162) (actual time=0.011..0.011 rows=1 loops=4000019) Planning Time: 0.076 ms Execution Time: 49998.850 ms (7 rows), that
is, for every start_lsn and end_lsn that we get from
pg_get_wal_records_info, pg_get_wal_block_info gets called. Whereas,
for non-LATERAL join [2]postgres=# EXPLAIN (ANALYZE) SELECT * FROM pg_get_wal_block_info(:'start_lsn', :'end_lsn') AS b JOIN pg_get_wal_records_info(:'start_lsn', :'end_lsn') AS w ON w.start_lsn = b.lsn WHERE w.resource_manager = 'Heap';, hash/merge join is chosen which is pretty
fast (5x) over 5mn WAL records. Therefore, I'm not sure if adding the
LATERAL query as an example is a better idea.
IIUC, the concern raised so far in this thread is not just on the
performance of JOIN queries to get both block info and record level
info, but on ease of using pg_walinspect functions. If
pg_get_wal_block_info emits the record level information too (which
turns out to be 50 LOC more), one doesn't have to be expert at writing
JOIN queries or such, but just can run the function, which actually
takes way less time (3sec) to scan the same 5mn WAL records [3]postgres=# EXPLAIN ANALYZE SELECT * FROM pg_get_wal_block_info(:'start_lsn', :'end_lsn'); QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------- Function Scan on pg_get_wal_block_info (cost=0.00..10.00 rows=1000 width=286) (actual time=2617.755..3081.526 rows=5004478 loops=1) Planning Time: 0.039 ms Execution Time: 3301.217 ms (3 rows).
[1]: postgres=# EXPLAIN (ANALYZE) SELECT * FROM pg_get_wal_records_info(:'start_lsn', :'end_lsn') AS r, LATERAL pg_get_wal_block_info(start_lsn, end_lsn) AS b WHERE r.resource_manager = 'Heap'; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------- Nested Loop (cost=0.01..112.50 rows=5000 width=330) (actual time=3175.114..49596.749 rows=5000019 loops=1) -> Function Scan on pg_get_wal_records_info r (cost=0.00..12.50 rows=5 width=168) (actual time=3175.058..4142.507 rows=4000019 loops=1) Filter: (resource_manager = 'Heap'::text) Rows Removed by Filter: 52081 -> Function Scan on pg_get_wal_block_info b (cost=0.00..10.00 rows=1000 width=162) (actual time=0.011..0.011 rows=1 loops=4000019) Planning Time: 0.076 ms Execution Time: 49998.850 ms (7 rows)
postgres=# EXPLAIN (ANALYZE) SELECT * FROM
pg_get_wal_records_info(:'start_lsn', :'end_lsn') AS r,
LATERAL pg_get_wal_block_info(start_lsn, end_lsn) AS b WHERE
r.resource_manager = 'Heap';
QUERY
PLAN
---------------------------------------------------------------------------------------------------------------------------------------------
Nested Loop (cost=0.01..112.50 rows=5000 width=330) (actual
time=3175.114..49596.749 rows=5000019 loops=1)
-> Function Scan on pg_get_wal_records_info r (cost=0.00..12.50
rows=5 width=168) (actual time=3175.058..4142.507 rows=4000019
loops=1)
Filter: (resource_manager = 'Heap'::text)
Rows Removed by Filter: 52081
-> Function Scan on pg_get_wal_block_info b (cost=0.00..10.00
rows=1000 width=162) (actual time=0.011..0.011 rows=1 loops=4000019)
Planning Time: 0.076 ms
Execution Time: 49998.850 ms
(7 rows)
Time: 49999.203 ms (00:49.999)
[2]: postgres=# EXPLAIN (ANALYZE) SELECT * FROM pg_get_wal_block_info(:'start_lsn', :'end_lsn') AS b JOIN pg_get_wal_records_info(:'start_lsn', :'end_lsn') AS w ON w.start_lsn = b.lsn WHERE w.resource_manager = 'Heap';
postgres=# EXPLAIN (ANALYZE) SELECT * FROM
pg_get_wal_block_info(:'start_lsn', :'end_lsn') AS b
JOIN pg_get_wal_records_info(:'start_lsn', :'end_lsn') AS w ON
w.start_lsn = b.lsn WHERE w.resource_manager = 'Heap';
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------
Hash Join (cost=12.57..26.57 rows=25 width=330) (actual
time=6241.449..9901.715 rows=5000019 loops=1)
Hash Cond: (b.lsn = w.start_lsn)
-> Function Scan on pg_get_wal_block_info b (cost=0.00..10.00
rows=1000 width=162) (actual time=1415.815..1870.522 rows=5067960
loops=1)
-> Hash (cost=12.50..12.50 rows=5 width=168) (actual
time=4665.292..4665.292 rows=4000019 loops=1)
Buckets: 65536 (originally 1024) Batches: 128 (originally 1)
Memory Usage: 7681kB
-> Function Scan on pg_get_wal_records_info w
(cost=0.00..12.50 rows=5 width=168) (actual time=3160.010..3852.332
rows=4000019 loops=1)
Filter: (resource_manager = 'Heap'::text)
Rows Removed by Filter: 52081
Planning Time: 0.082 ms
Execution Time: 10159.066 ms
(10 rows)
Time: 10159.465 ms (00:10.159)
[3]: postgres=# EXPLAIN ANALYZE SELECT * FROM pg_get_wal_block_info(:'start_lsn', :'end_lsn'); QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------- Function Scan on pg_get_wal_block_info (cost=0.00..10.00 rows=1000 width=286) (actual time=2617.755..3081.526 rows=5004478 loops=1) Planning Time: 0.039 ms Execution Time: 3301.217 ms (3 rows)
postgres=# EXPLAIN ANALYZE SELECT * FROM
pg_get_wal_block_info(:'start_lsn', :'end_lsn');
QUERY
PLAN
--------------------------------------------------------------------------------------------------------------------------------------
Function Scan on pg_get_wal_block_info (cost=0.00..10.00 rows=1000
width=286) (actual time=2617.755..3081.526 rows=5004478 loops=1)
Planning Time: 0.039 ms
Execution Time: 3301.217 ms
(3 rows)
Time: 3301.817 ms (00:03.302)
postgres=#
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Thu, Mar 16, 2023 at 2:19 AM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:
On Wed, Mar 15, 2023 at 12:20 PM Michael Paquier <michael@paquier.xyz> wrote:
I am not sure to get the concern here. As long as one is smart enough
with SQL, there is no need to perform a double scan of the contents of
pg_wal with a large scan on the start LSN. If one wishes to only
extract some block for a given record type, or for a filter of your
choice, it is possible to use a LATERAL on pg_get_wal_block_info(),
say:
SELECT r.start_lsn, b.blockid
FROM pg_get_wal_records_info('0/01000028', '0/1911AA8') AS r,
LATERAL pg_get_wal_block_info(start_lsn, end_lsn) as b
WHERE r.resource_manager = 'Heap2';This will extract the block information that you'd want for a given
record type.
The same information *already* appears in pg_get_wal_records_info()'s
block_ref output! Why should the user be expected to use a LATERAL
join (or any type of join) to get _the same information_, just in a
usable form?
IIUC, the concern raised so far in this thread is not just on the
performance of JOIN queries to get both block info and record level
info, but on ease of using pg_walinspect functions. If
pg_get_wal_block_info emits the record level information too (which
turns out to be 50 LOC more), one doesn't have to be expert at writing
JOIN queries or such, but just can run the function, which actually
takes way less time (3sec) to scan the same 5mn WAL records [3].
That's exactly my concern, yes. As you say, it's not just the
performance aspect. Requiring users to write a needlessly ornamental
query is actively misleading. It suggests that block_ref is distinct
information from the blocks output by pg_get_wal_block_info().
--
Peter Geoghegan
On Fri, Mar 17, 2023 at 7:33 AM Peter Geoghegan <pg@bowt.ie> wrote:
IIUC, the concern raised so far in this thread is not just on the
performance of JOIN queries to get both block info and record level
info, but on ease of using pg_walinspect functions. If
pg_get_wal_block_info emits the record level information too (which
turns out to be 50 LOC more), one doesn't have to be expert at writing
JOIN queries or such, but just can run the function, which actually
takes way less time (3sec) to scan the same 5mn WAL records [3].That's exactly my concern, yes. As you say, it's not just the
performance aspect. Requiring users to write a needlessly ornamental
query is actively misleading. It suggests that block_ref is distinct
information from the blocks output by pg_get_wal_block_info().
+1 for pg_get_wal_block_info emitting per-record WAL info too along
with block info, attached v2 patch does that. IMO, usability wins the
race here.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v2-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/octet-stream; name=v2-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From 5874b0028c6b7f1ef0ed37bc31a343118a7c2a20 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Fri, 17 Mar 2023 07:16:38 +0000
Subject: [PATCH v2] Emit WAL record info via pg_get_wal_block_info()
---
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 10 +++-
contrib/pg_walinspect/pg_walinspect.c | 50 +++++++++++++------
doc/src/sgml/pgwalinspect.sgml | 38 ++++++++------
3 files changed, 69 insertions(+), 29 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b4467..46fcaff3a1 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,7 +12,15 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT description text,
OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 3b3215daf5..3019b18382 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -43,7 +43,8 @@ static XLogRecPtr GetCurrentLSN(void);
static XLogReaderState *InitXLogReaderState(XLogRecPtr lsn);
static XLogRecord *ReadNextXLogRecord(XLogReaderState *xlogreader);
static void GetWALRecordInfo(XLogReaderState *record, Datum *values,
- bool *nulls, uint32 ncols);
+ bool *nulls, uint32 ncols,
+ bool fetch_blk_ref);
static void GetWALRecordsInfo(FunctionCallInfo fcinfo,
XLogRecPtr start_lsn,
XLogRecPtr end_lsn);
@@ -180,7 +181,8 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
- bool *nulls, uint32 ncols)
+ bool *nulls, uint32 ncols,
+ bool fetch_blk_ref)
{
const char *id;
RmgrData desc;
@@ -199,9 +201,12 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
initStringInfo(&rec_desc);
desc.rm_desc(&rec_desc, record);
- /* Block references. */
- initStringInfo(&rec_blk_ref);
- XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ /* Fetch block references and FPI length if asked. */
+ if (fetch_blk_ref)
+ {
+ initStringInfo(&rec_blk_ref);
+ XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ }
main_data_len = XLogRecGetDataLen(record);
@@ -213,9 +218,14 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
values[i++] = CStringGetTextDatum(id);
values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
values[i++] = UInt32GetDatum(main_data_len);
- values[i++] = UInt32GetDatum(fpi_len);
+
+ if (fetch_blk_ref)
+ values[i++] = UInt32GetDatum(fpi_len);
+
values[i++] = CStringGetTextDatum(rec_desc.data);
- values[i++] = CStringGetTextDatum(rec_blk_ref.data);
+
+ if (fetch_blk_ref)
+ values[i++] = CStringGetTextDatum(rec_blk_ref.data);
Assert(i == ncols);
}
@@ -228,19 +238,30 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
+#define PG_GET_WAL_BLOCK_INFO_COLS 19
+#define PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS 9
+ Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ /* Get per-record info. */
+ GetWALRecordInfo(record, values, nulls,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS, false);
+
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
DecodedBkpBlock *blk;
BlockNumber blkno;
RelFileLocator rnode;
ForkNumber fork;
- Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
- int i = 0;
+ int i = PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS;
+
+ /* Reset only per-block output columns, keep per-record info as-is. */
+ memset(&nulls[PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS], 0,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS * sizeof(bool));
+ memset(&values[PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS], 0,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS * sizeof(bool));
if (!XLogRecHasBlockRef(record, block_id))
continue;
@@ -250,7 +271,6 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
(void) XLogRecGetBlockTagExtended(record, block_id,
&rnode, &fork, &blkno, NULL);
- values[i++] = LSNGetDatum(record->ReadRecPtr);
values[i++] = Int16GetDatum(block_id);
values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
@@ -345,6 +365,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
values, nulls);
}
+#undef PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS
#undef PG_GET_WAL_FPI_BLOCK_COLS
}
@@ -434,7 +455,8 @@ pg_get_wal_record_info(PG_FUNCTION_ARGS)
errmsg("could not read WAL at %X/%X",
LSN_FORMAT_ARGS(xlogreader->EndRecPtr))));
- GetWALRecordInfo(xlogreader, values, nulls, PG_GET_WAL_RECORD_INFO_COLS);
+ GetWALRecordInfo(xlogreader, values, nulls, PG_GET_WAL_RECORD_INFO_COLS,
+ true);
pfree(xlogreader->private_data);
XLogReaderFree(xlogreader);
@@ -505,7 +527,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
old_cxt = MemoryContextSwitchTo(tmp_cxt);
GetWALRecordInfo(xlogreader, values, nulls,
- PG_GET_WAL_RECORDS_INFO_COLS);
+ PG_GET_WAL_RECORDS_INFO_COLS, true);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d6..864ea298f3 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -177,23 +177,33 @@ combined_size_percentage | 2.8634072910530795
<replaceable>start_lsn</replaceable> is not available. For example,
usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
+postgres=# SELECT start_lsn, end_lsn, prev_lsn, xid, resource_manager,
+ record_type, record_length, main_data_length, description,
+ blockid, reltablespace, reldatabase, relfilenode,
relblocknumber, forkname,
substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/15457B0', '0/1545A40');
+-[ RECORD 1 ]----+---------------------------------------------------
+start_lsn | 0/1545860
+end_lsn | 0/1545998
+prev_lsn | 0/15457E8
+xid | 750
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 305
+main_data_length | 14
+description | off 2 xmax 750 flags 0x00 ; new off 6 xmax 0
+blockid | 0
+reltablespace | 1663
+reldatabase | 5
+relfilenode | 24594
+relblocknumber | 0
+forkname | main
+block_trimmed | \x02800128180164000000
+fpi_trimmed | \x00000000203a5401000000003000401f00200420ed020000
+fpilen | 240
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
</listitem>
--
2.34.1
On Fri, Mar 17, 2023 at 12:20 AM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:
+1 for pg_get_wal_block_info emitting per-record WAL info too along
with block info, attached v2 patch does that. IMO, usability wins the
race here.
I think that this direction makes a lot of sense. Under this scheme,
we still have pg_get_wal_records_info(), which is more or less an SQL
interface to the information that pg_waldump presents by default.
That's important when the record view of things is of primary
importance. But now you also have a "block oriented" view of WAL
presented by pg_get_wal_block_info(), which is useful when particular
blocks are of primary interest. I think that I'll probably end up
using both, while primarily using pg_get_wal_block_info() for more
advanced analysis that focuses on what happened to particular blocks
over time.
It makes sense to present pg_get_wal_block_info() immediately after
pg_get_wal_records_info() in the documentation under this scheme,
since they're closely related. It would make sense to explain the
relationship directly: pg_get_wal_block_info() doesn't have the
block_ref column because it breaks that same information out by block
instead, occasionally showing multiple rows for particular record
types (which is what its "extra" columns describe). And,
pg_get_wal_block_info() won't show anything for those records whose
block_ref column is null according to pg_get_wal_records_info(), such
as commit records.
(Checks pg_walinspect once more...)
Actually, I now see that block_ref won't be NULL for those records
that have no block references at all -- it just outputs an empty
string. But wouldn't it be better if it actually output NULL? Better
for its own sake, but also better because doing so enables describing
the relationship between the two functions with reference to
block_ref. It seems particularly helpful to me to be able to say that
pg_get_wal_block_info() doesn't show anything for precisely those WAL
records whose block_ref is NULL according to
pg_get_wal_records_info().
--
Peter Geoghegan
On Fri, Mar 17, 2023 at 12:36:04PM -0700, Peter Geoghegan wrote:
I think that this direction makes a lot of sense. Under this scheme,
we still have pg_get_wal_records_info(), which is more or less an SQL
interface to the information that pg_waldump presents by default.
That's important when the record view of things is of primary
importance. But now you also have a "block oriented" view of WAL
presented by pg_get_wal_block_info(), which is useful when particular
blocks are of primary interest. I think that I'll probably end up
using both, while primarily using pg_get_wal_block_info() for more
advanced analysis that focuses on what happened to particular blocks
over time.
FWIW, I am not sure that it is a good idea and that we'd better not
encourage too much the use of block_info() across a large range of
WAL, which is what this function will make users eager to do in this
case as it is possible to apply directly more filters to it. This is
a community, so, anyway, if you feel strongly about doing this change,
feel free to.
--
Michael
On Fri, Mar 17, 2023 at 4:11 PM Michael Paquier <michael@paquier.xyz> wrote:
FWIW, I am not sure that it is a good idea and that we'd better not
encourage too much the use of block_info() across a large range of
WAL, which is what this function will make users eager to do in this
case as it is possible to apply directly more filters to it.
I'm sure that they will do that much more than they would have
otherwise. Since we'll have made pg_get_wal_block_info() so much more
useful than pg_get_wal_records_info() for many important use cases.
Why is that a bad thing? Are you concerned about the overhead of
pulling in FPIs when pg_get_wal_block_info() is run, if Bharath's
patch is committed? That could be a problem, I suppose -- but it would
be good to get more data on that. Do you think that this will be much
of an issue, Bharath?
I have pushed pg_walinspect to its limits myself (which is how I found
that memory leak). Performance matters a great deal when you're doing
an analysis of how blocks change over time, on a system that has
written a realistically large amount of WAL over minutes or even
hours. Why shouldn't that be a priority for pg_walinspect? My concerns
have little to do with aesthetics, and everything to do with making
those kinds of queries feasible.
If the FPI thing is a problem then it seems to me that it should be
addressed directly. For example, perhaps it would make sense to add a
way to not incur the overhead of decompressing FPIs uselessly in cases
where they're of no interest to us (likely the majority of cases once
the patch is committed). It also might well make sense to rename
pg_get_wal_block_info() to something more general, to reflect its more
general purpose once it is expanded by Bharath's patch. As I said, it
will become a lot closer to pg_get_wal_records_info(). We should be
clear on that.
--
Peter Geoghegan
On Fri, Mar 17, 2023 at 04:36:58PM -0700, Peter Geoghegan wrote:
I'm sure that they will do that much more than they would have
otherwise. Since we'll have made pg_get_wal_block_info() so much more
useful than pg_get_wal_records_info() for many important use cases.
Why is that a bad thing? Are you concerned about the overhead of
pulling in FPIs when pg_get_wal_block_info() is run, if Bharath's
patch is committed? That could be a problem, I suppose -- but it would
be good to get more data on that. Do you think that this will be much
of an issue, Bharath?
Yes. The CPU cost is one thing, but I am also worrying about the
I/O cost with a tuplestore spilling to disk a large number of FPIs,
and some workloads can generate WAL so as FPIs is what makes for most
of the contents stored in the WAL. (wal_compression is very effective
in such cases, for example.)
It is true that it is possible to tweak SQLs that exactly do that with
a large amount of data materialized, or just eat so much CPU that they
basically DoS the backend. Still I'd rather keep a minimalistic
design for each function with block_info having only one field able to
track back to which record a block information refers to, and I'd like
to think one able to look at WAL internals will be smart enough to
write SQL in such a way that they avoid that on a production machine.
The current design allows to do that in this view, but that's just one
way I see how to represent structures at SQL level. Extending
block_info() with more record-level attributes allows that as well,
still it bloats its interface unnecessarily. Designing software is
hard, and it looks like our point of view on that is different. If
you wish to change the current interface of block_info, feel free to
do so. It does not mean that it cannot be changed, just that I
recommend not to do that, and that's just one opinion.
This said, your point about having rec_blk_ref reported as an empty
string rather than NULL if there are no block references does not feel
natural to me, either.. Reporting NULL would be better.
--
Michael
On Fri, Mar 17, 2023 at 5:51 PM Michael Paquier <michael@paquier.xyz> wrote:
Yes. The CPU cost is one thing, but I am also worrying about the
I/O cost with a tuplestore spilling to disk a large number of FPIs,
and some workloads can generate WAL so as FPIs is what makes for most
of the contents stored in the WAL. (wal_compression is very effective
in such cases, for example.)It is true that it is possible to tweak SQLs that exactly do that with
a large amount of data materialized, or just eat so much CPU that they
basically DoS the backend. Still I'd rather keep a minimalistic
design for each function with block_info having only one field able to
track back to which record a block information refers to, and I'd like
to think one able to look at WAL internals will be smart enough to
write SQL in such a way that they avoid that on a production machine.
The current design allows to do that in this view, but that's just one
way I see how to represent structures at SQL level.
Not really. It has nothing to do with some abstract ideal about how
the data should be logically structured. It is about how the actual
underlying physical data structures work, and are accessed in
practice, during query execution. And its about the constraints placed
on us by the laws of physics. Some ways of doing this are measurably,
provably much faster than other ways. It's very much not like we're
querying tables whose general structure is under our control, via
schema design, where the optimizer could reasonably be expected to
make better choices as the data distribution changes. So why treat it
like that?
Right now, you're still basically standing by a design that is
*fundamentally* less efficient for certain types of queries -- queries
that I am very interested in, that I'm sure many of us will be
interested in. It's not a matter of opinion. It is very much in
evidence from Bharath's analysis. If a similar analysis reached the
opposite conclusion, then you would be right and I would be wrong. It
really is that simple.
This said, your point about having rec_blk_ref reported as an empty
string rather than NULL if there are no block references does not feel
natural to me, either.. Reporting NULL would be better.
You have it backwards. It outputs an empty string right now. I want to
change that, so that it outputs NULLs instead.
--
Peter Geoghegan
On Fri, Mar 17, 2023 at 06:09:05PM -0700, Peter Geoghegan wrote:
This said, your point about having rec_blk_ref reported as an empty
string rather than NULL if there are no block references does not feel
natural to me, either.. Reporting NULL would be better.You have it backwards. It outputs an empty string right now. I want to
change that, so that it outputs NULLs instead.
My previous paragraph means exactly the same? I have just read what I
wrote again. Twice. So, yes, I agree with this point. Sorry if my
words meant the contrary to you. :)
--
Michael
On Sat, Mar 18, 2023 at 1:06 AM Peter Geoghegan <pg@bowt.ie> wrote:
On Fri, Mar 17, 2023 at 12:20 AM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:+1 for pg_get_wal_block_info emitting per-record WAL info too along
with block info, attached v2 patch does that. IMO, usability wins the
race here.I think that this direction makes a lot of sense. Under this scheme,
we still have pg_get_wal_records_info(), which is more or less an SQL
interface to the information that pg_waldump presents by default.
That's important when the record view of things is of primary
importance. But now you also have a "block oriented" view of WAL
presented by pg_get_wal_block_info(), which is useful when particular
blocks are of primary interest. I think that I'll probably end up
using both, while primarily using pg_get_wal_block_info() for more
advanced analysis that focuses on what happened to particular blocks
over time.
Hm.
It makes sense to present pg_get_wal_block_info() immediately after
pg_get_wal_records_info() in the documentation under this scheme,
since they're closely related.
-1. I don't think we need that and even if we did, it's hard to
maintain that ordering in future. One who knows to use these functions
will anyway get to know how they're related.
(Checks pg_walinspect once more...)
Actually, I now see that block_ref won't be NULL for those records
that have no block references at all -- it just outputs an empty
string.
Yes, that's unnecessary.
But wouldn't it be better if it actually output NULL?
+1 done so in the attached 0001 patch.
Better
for its own sake, but also better because doing so enables describing
the relationship between the two functions with reference to
block_ref. It seems particularly helpful to me to be able to say that
pg_get_wal_block_info() doesn't show anything for precisely those WAL
records whose block_ref is NULL according to
pg_get_wal_records_info().
Hm.
Attaching v3 patch set - 0001 optimizations around block references,
0002 enables pg_get_wal_block_info() to emit per-record info. Any
thoughts?
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v3-0001-Few-optimizations-around-block-references-in-pg_w.patchapplication/x-patch; name=v3-0001-Few-optimizations-around-block-references-in-pg_w.patchDownload
From 153d1d8ae7f03471aa74b685d20eb110ba7eda98 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Sat, 18 Mar 2023 04:01:46 +0000
Subject: [PATCH v3] Few optimizations around block references in pg_walinspect
---
contrib/pg_walinspect/pg_walinspect.c | 29 +++++++++++++++++++--------
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 3b3215daf5..c3ed31db8f 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -199,9 +199,12 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
initStringInfo(&rec_desc);
desc.rm_desc(&rec_desc, record);
- /* Block references. */
- initStringInfo(&rec_blk_ref);
- XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ /* Get block references, if any. */
+ if (XLogRecHasAnyBlockRefs(record))
+ {
+ initStringInfo(&rec_blk_ref);
+ XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ }
main_data_len = XLogRecGetDataLen(record);
@@ -215,7 +218,12 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
values[i++] = UInt32GetDatum(main_data_len);
values[i++] = UInt32GetDatum(fpi_len);
values[i++] = CStringGetTextDatum(rec_desc.data);
- values[i++] = CStringGetTextDatum(rec_blk_ref.data);
+
+ /* Output block references, if any. */
+ if (XLogRecHasAnyBlockRefs(record))
+ values[i++] = CStringGetTextDatum(rec_blk_ref.data);
+ else
+ nulls[i++] = true;
Assert(i == ncols);
}
@@ -377,6 +385,12 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
while (ReadNextXLogRecord(xlogreader) &&
xlogreader->EndRecPtr <= end_lsn)
{
+ CHECK_FOR_INTERRUPTS();
+
+ /* Get block references, if any, otherwise continue. */
+ if (!XLogRecHasAnyBlockRefs(xlogreader))
+ continue;
+
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
@@ -385,8 +399,6 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
/* clean up and switch back */
MemoryContextSwitchTo(old_cxt);
MemoryContextReset(tmp_cxt);
-
- CHECK_FOR_INTERRUPTS();
}
MemoryContextDelete(tmp_cxt);
@@ -483,8 +495,6 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
#define PG_GET_WAL_RECORDS_INFO_COLS 11
XLogReaderState *xlogreader;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
- Datum values[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
MemoryContext old_cxt;
MemoryContext tmp_cxt;
@@ -501,6 +511,9 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
while (ReadNextXLogRecord(xlogreader) &&
xlogreader->EndRecPtr <= end_lsn)
{
+ Datum values[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
+
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
--
2.34.1
v3-0002-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/x-patch; name=v3-0002-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From dabc3c0a5625a45e0c179f1a0b1ede5187702198 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Sat, 18 Mar 2023 04:06:27 +0000
Subject: [PATCH v3] Emit WAL record info via pg_get_wal_block_info()
---
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 10 +++-
contrib/pg_walinspect/pg_walinspect.c | 47 +++++++++++++------
doc/src/sgml/pgwalinspect.sgml | 38 +++++++++------
3 files changed, 65 insertions(+), 30 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b4467..46fcaff3a1 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,7 +12,15 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT description text,
OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index c3ed31db8f..520d39a27f 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -43,7 +43,8 @@ static XLogRecPtr GetCurrentLSN(void);
static XLogReaderState *InitXLogReaderState(XLogRecPtr lsn);
static XLogRecord *ReadNextXLogRecord(XLogReaderState *xlogreader);
static void GetWALRecordInfo(XLogReaderState *record, Datum *values,
- bool *nulls, uint32 ncols);
+ bool *nulls, uint32 ncols,
+ bool fetch_blk_ref);
static void GetWALRecordsInfo(FunctionCallInfo fcinfo,
XLogRecPtr start_lsn,
XLogRecPtr end_lsn);
@@ -180,7 +181,8 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
- bool *nulls, uint32 ncols)
+ bool *nulls, uint32 ncols,
+ bool fetch_blk_ref)
{
const char *id;
RmgrData desc;
@@ -199,8 +201,8 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
initStringInfo(&rec_desc);
desc.rm_desc(&rec_desc, record);
- /* Get block references, if any. */
- if (XLogRecHasAnyBlockRefs(record))
+ /* Get block references, if any and if asked. */
+ if (XLogRecHasAnyBlockRefs(record) && fetch_blk_ref)
{
initStringInfo(&rec_blk_ref);
XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
@@ -216,13 +218,16 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
values[i++] = CStringGetTextDatum(id);
values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
values[i++] = UInt32GetDatum(main_data_len);
- values[i++] = UInt32GetDatum(fpi_len);
+
+ if (fetch_blk_ref)
+ values[i++] = UInt32GetDatum(fpi_len);
+
values[i++] = CStringGetTextDatum(rec_desc.data);
- /* Output block references, if any. */
- if (XLogRecHasAnyBlockRefs(record))
+ /* Output block references, if any and if asked. */
+ if (XLogRecHasAnyBlockRefs(record) && fetch_blk_ref)
values[i++] = CStringGetTextDatum(rec_blk_ref.data);
- else
+ else if (fetch_blk_ref)
nulls[i++] = true;
Assert(i == ncols);
@@ -236,19 +241,30 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
+#define PG_GET_WAL_BLOCK_INFO_COLS 19
+#define PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS 9
+ Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ /* Get per-record info. */
+ GetWALRecordInfo(record, values, nulls,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS, false);
+
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
DecodedBkpBlock *blk;
BlockNumber blkno;
RelFileLocator rnode;
ForkNumber fork;
- Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
- int i = 0;
+ int i = PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS;
+
+ /* Reset only per-block output columns, keep per-record info as-is. */
+ memset(&nulls[PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS], 0,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS * sizeof(bool));
+ memset(&values[PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS], 0,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS * sizeof(bool));
if (!XLogRecHasBlockRef(record, block_id))
continue;
@@ -258,7 +274,6 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
(void) XLogRecGetBlockTagExtended(record, block_id,
&rnode, &fork, &blkno, NULL);
- values[i++] = LSNGetDatum(record->ReadRecPtr);
values[i++] = Int16GetDatum(block_id);
values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
@@ -353,6 +368,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
values, nulls);
}
+#undef PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS
#undef PG_GET_WAL_FPI_BLOCK_COLS
}
@@ -446,7 +462,8 @@ pg_get_wal_record_info(PG_FUNCTION_ARGS)
errmsg("could not read WAL at %X/%X",
LSN_FORMAT_ARGS(xlogreader->EndRecPtr))));
- GetWALRecordInfo(xlogreader, values, nulls, PG_GET_WAL_RECORD_INFO_COLS);
+ GetWALRecordInfo(xlogreader, values, nulls, PG_GET_WAL_RECORD_INFO_COLS,
+ true);
pfree(xlogreader->private_data);
XLogReaderFree(xlogreader);
@@ -518,7 +535,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
old_cxt = MemoryContextSwitchTo(tmp_cxt);
GetWALRecordInfo(xlogreader, values, nulls,
- PG_GET_WAL_RECORDS_INFO_COLS);
+ PG_GET_WAL_RECORDS_INFO_COLS, true);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d6..864ea298f3 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -177,23 +177,33 @@ combined_size_percentage | 2.8634072910530795
<replaceable>start_lsn</replaceable> is not available. For example,
usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
+postgres=# SELECT start_lsn, end_lsn, prev_lsn, xid, resource_manager,
+ record_type, record_length, main_data_length, description,
+ blockid, reltablespace, reldatabase, relfilenode,
relblocknumber, forkname,
substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/15457B0', '0/1545A40');
+-[ RECORD 1 ]----+---------------------------------------------------
+start_lsn | 0/1545860
+end_lsn | 0/1545998
+prev_lsn | 0/15457E8
+xid | 750
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 305
+main_data_length | 14
+description | off 2 xmax 750 flags 0x00 ; new off 6 xmax 0
+blockid | 0
+reltablespace | 1663
+reldatabase | 5
+relfilenode | 24594
+relblocknumber | 0
+forkname | main
+block_trimmed | \x02800128180164000000
+fpi_trimmed | \x00000000203a5401000000003000401f00200420ed020000
+fpilen | 240
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
</listitem>
--
2.34.1
At Sat, 18 Mar 2023 10:08:53 +0530, Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> wrote in
On Sat, Mar 18, 2023 at 1:06 AM Peter Geoghegan <pg@bowt.ie> wrote:
On Fri, Mar 17, 2023 at 12:20 AM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:+1 for pg_get_wal_block_info emitting per-record WAL info too along
with block info, attached v2 patch does that. IMO, usability wins the
race here.I think that this direction makes a lot of sense. Under this scheme,
we still have pg_get_wal_records_info(), which is more or less an SQL
interface to the information that pg_waldump presents by default.
That's important when the record view of things is of primary
importance. But now you also have a "block oriented" view of WAL
presented by pg_get_wal_block_info(), which is useful when particular
blocks are of primary interest. I think that I'll probably end up
using both, while primarily using pg_get_wal_block_info() for more
advanced analysis that focuses on what happened to particular blocks
over time.Hm.
Even though I haven't explored every aspect of the view, I believe it
makes sense to have at least the record_type in the block data. I
don't know how often it will be used, but considering that, I have no
objections to adding all the record information (apart from the block
data itself) to the block data.
It makes sense to present pg_get_wal_block_info() immediately after
pg_get_wal_records_info() in the documentation under this scheme,
since they're closely related.-1. I don't think we need that and even if we did, it's hard to
maintain that ordering in future. One who knows to use these functions
will anyway get to know how they're related.
The documentation has just one section titled "General Functions"
which directly contains detailed explation of four functions, making
it hard to get clear understanding of the available functions. I
considered breaking it down into a few subsections, but that wouldn't
look great since most of them would only contain one function.
However, I feel it would be helpful to add a list of all functions at
the beginning of the section.
(Checks pg_walinspect once more...)
Actually, I now see that block_ref won't be NULL for those records
that have no block references at all -- it just outputs an empty
string.Yes, that's unnecessary.
But wouldn't it be better if it actually output NULL?
+1 done so in the attached 0001 patch.
Better
for its own sake, but also better because doing so enables describing
the relationship between the two functions with reference to
block_ref. It seems particularly helpful to me to be able to say that
pg_get_wal_block_info() doesn't show anything for precisely those WAL
records whose block_ref is NULL according to
pg_get_wal_records_info().Hm.
I agree that adding a note about the characteristics would helpful to
avoid the misuse of pg_get_wal_block_info(). How about something like,
"Note that pg_get_wal_block_info() omits records that contains no
block references."?
Attaching v3 patch set - 0001 optimizations around block references,
0002 enables pg_get_wal_block_info() to emit per-record info. Any
thoughts?
+ /* Get block references, if any, otherwise continue. */
+ if (!XLogRecHasAnyBlockRefs(xlogreader))
+ continue;
I'm not sure, but the "continue" might be confusing since the code
"continue"s if the condition is true and continues the process
otherwise.. And it seems like a kind of "explaination of what the
code does". I feel we don't need the a comment there.
It is not an issue with this patch, but as I look at this version, I'm
starting to feel uneasy about the subtle differences between what
GetWALRecordsInfo and GetWALBlockInfo do. One solution might be to
have GetWALBlockInfo return a values array for each block, but that
could make things more complex than needed. Alternatively, could we
get GetWALRecordsInfo to call tuplestore_putvalues() internally? This
way, both functions can manage the temporary memory context within
themselves.
About 0002:
+ /* Reset only per-block output columns, keep per-record info as-is. */
+ memset(&nulls[PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS], 0,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS * sizeof(bool));
+ memset(&values[PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS], 0,
+ PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS * sizeof(bool));
sizeof(*values) is not sizeof(bool), but sizeof(Datum).
It seems to me that the starting elemnt of the arrays is
(PG_GET_WAL_BLOCK_INFO_COLS -
PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS). But I don't think simply
rewriting that way is great.
#define PG_GET_WAL_RECORD_INFO_COLS 11
...
+#define PG_GET_WAL_BLOCK_INFO_PER_RECORD_COLS 9
This means GetWALBlockInfo overwrites the last two columns generated
by GetWalRecordInfo, but I don't think this approach is clean and
stable. I agree we don't want the final columns in a block info tuple
but we don't want to duplicate the common code path.
I initially thought we could devide the function into
GetWALCommonInfo(), GetWALRecordInfo() and GetWALBlockInfo(), but it
doesn't seem that simple.. In the end, I think we should have separate
GetWALRecordInfo() and GetWALBlockInfo() that have duplicate
"values[i++] = .." lines.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Sun, Mar 19, 2023 at 8:21 PM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:
The documentation has just one section titled "General Functions"
which directly contains detailed explation of four functions, making
it hard to get clear understanding of the available functions. I
considered breaking it down into a few subsections, but that wouldn't
look great since most of them would only contain one function.
However, I feel it would be helpful to add a list of all functions at
the beginning of the section.
I like the idea of sections, even if there is only one function per
section in some cases.
I also think that we should add a "Tip" that advises users that they
may use an "end LSN" that is the largest possible LSN,
'FFFFFFFF/FFFFFFFF' to get information about records up until the
current LSN of the cluster (per commit 5c1b6628).
Is there a straightforward way to get a usable LSN constant for this
purpose? The simplest way I could come up with quickly is "SELECT
pg_lsn(2^64.-1)" -- which still isn't very simple. Actually, it might
be even worse than 'FFFFFFFF/FFFFFFFF', so perhaps we should just use
that in the docs new "Tip".
I agree that adding a note about the characteristics would helpful to
avoid the misuse of pg_get_wal_block_info(). How about something like,
"Note that pg_get_wal_block_info() omits records that contains no
block references."?
This should be a strict invariant. In other words, it should be part
of the documented contract of pg_get_wal_block_info and
pg_get_wal_records_info. The two functions should be defined in terms
of each other. Their relationship is important.
Users should be able to safely assume that the records that have a
NULL block_ref according to pg_get_wal_records_info are *precisely*
those records that won't have any entries within pg_get_wal_block_info
(assuming that the same LSN range is used with both functions).
pg_walinspect should explicitly promise this, and promise the
corollary condition around non-NULL block_ref records. It is a useful
promise from the point of view of users. It also makes it easier to
understand what's really going on here without any ambiguity.
I don't completely disagree with Michael about the redundancy. I just
think that it's worth it on performance grounds. We might want to say
that directly in the docs, too.
Attaching v3 patch set - 0001 optimizations around block references,
0002 enables pg_get_wal_block_info() to emit per-record info. Any
thoughts?+ /* Get block references, if any, otherwise continue. */ + if (!XLogRecHasAnyBlockRefs(xlogreader)) + continue;I'm not sure, but the "continue" might be confusing since the code
"continue"s if the condition is true and continues the process
otherwise.. And it seems like a kind of "explaination of what the
code does". I feel we don't need the a comment there.
+1.
Also, if GetWALBlockInfo() is now supposed to only be called when
XLogRecHasAnyBlockRefs() now then it should probably have an assertion
to verify the precondition.
It is not an issue with this patch, but as I look at this version, I'm
starting to feel uneasy about the subtle differences between what
GetWALRecordsInfo and GetWALBlockInfo do. One solution might be to
have GetWALBlockInfo return a values array for each block, but that
could make things more complex than needed. Alternatively, could we
get GetWALRecordsInfo to call tuplestore_putvalues() internally? This
way, both functions can manage the temporary memory context within
themselves.
Agreed. I'm also not sure what to do about it, though.
This means GetWALBlockInfo overwrites the last two columns generated
by GetWalRecordInfo, but I don't think this approach is clean and
stable. I agree we don't want the final columns in a block info tuple
but we don't want to duplicate the common code path.
I initially thought we could devide the function into
GetWALCommonInfo(), GetWALRecordInfo() and GetWALBlockInfo(), but it
doesn't seem that simple.. In the end, I think we should have separate
GetWALRecordInfo() and GetWALBlockInfo() that have duplicate
"values[i++] = .." lines.
I agree. A little redundancy is better when the alternative is fragile
code, and I'm pretty sure that that applies here -- there won't be
very many duplicated lines, and the final code will be significantly
clearer. There can be a comment about keeping GetWALRecordInfo and
GetWALBlockInfo in sync.
--
Peter Geoghegan
On Mon, Mar 20, 2023 at 4:34 PM Peter Geoghegan <pg@bowt.ie> wrote:
I agree. A little redundancy is better when the alternative is fragile
code, and I'm pretty sure that that applies here -- there won't be
very many duplicated lines, and the final code will be significantly
clearer. There can be a comment about keeping GetWALRecordInfo and
GetWALBlockInfo in sync.
The new pg_get_wal_block_info outputs columns in an order that doesn't
seem like the most useful possible order to me. This gives us another
reason to have separate GetWALRecordInfo and GetWALBlockInfo utility
functions rather than sharing logic for building output tuples.
Specifically, I think that pg_get_wal_block_info should ouput the
"primary key" columns first:
reltablespace, reldatabase, relfilenode, blockid, start_lsn, end_lsn
Next comes the columns that duplicate the columns output by
pg_get_wal_records_info, in the same order as they appear in
pg_get_wal_records_info. (Obviously this won't include block_ref).
--
Peter Geoghegan
On Mon, Mar 20, 2023 at 4:51 PM Peter Geoghegan <pg@bowt.ie> wrote:
The new pg_get_wal_block_info outputs columns in an order that doesn't
seem like the most useful possible order to me. This gives us another
reason to have separate GetWALRecordInfo and GetWALBlockInfo utility
functions rather than sharing logic for building output tuples.
One more piece of feedback for Bharath:
I think that we should also make the description output column display
NULLs for those records that don't output any description string. This
at least includes the "FPI" record type from the "XLOG" rmgr.
Alternatively, we could find a way of making it show a description.
--
Peter Geoghegan
On Mon, Mar 20, 2023 at 05:00:25PM -0700, Peter Geoghegan wrote:
I think that we should also make the description output column display
NULLs for those records that don't output any description string. This
at least includes the "FPI" record type from the "XLOG" rmgr.
Alternatively, we could find a way of making it show a description.
An empty StringInfo is sent to rm_desc, so hardcoding something in
pg_walinspect to show some data does not look right to me. Saying
that, using NULL when there is no description is OK by me, as much as
is using an empty string because it maps with the reality of the empty
StringInfo sent to the rm_desc callback.
--
Michael
On Mon, Mar 20, 2023 at 04:51:19PM -0700, Peter Geoghegan wrote:
On Mon, Mar 20, 2023 at 4:34 PM Peter Geoghegan <pg@bowt.ie> wrote:
I agree. A little redundancy is better when the alternative is fragile
code, and I'm pretty sure that that applies here -- there won't be
very many duplicated lines, and the final code will be significantly
clearer. There can be a comment about keeping GetWALRecordInfo and
GetWALBlockInfo in sync.
Well. Vox populi, Vox dei. I am deadly outnumbered.
The new pg_get_wal_block_info outputs columns in an order that doesn't
seem like the most useful possible order to me. This gives us another
reason to have separate GetWALRecordInfo and GetWALBlockInfo utility
functions rather than sharing logic for building output tuples.Specifically, I think that pg_get_wal_block_info should ouput the
"primary key" columns first:reltablespace, reldatabase, relfilenode, blockid, start_lsn, end_lsn
It seems to me that this is up to the one using this SQL? I am not
sure to follow why this is important. For the cases you have poked
at, I guess it is, but is strikes me that it is just natural to shape
that to match the C structures we use for the WAL records
themselves, so the other way around.
Next comes the columns that duplicate the columns output by
pg_get_wal_records_info, in the same order as they appear in
pg_get_wal_records_info. (Obviously this won't include block_ref).
Hmm. Once you add more record data into pg_get_wal_block_info(), I am
not sure to agree with this statement, actually. I would think that
the most logical order would be the start_lsn, the end_lsn, the record
information that you want for your quals, the block ID for the block
registered in the record, and finally then the rest of the block
information. So this puts the record-level data first, and the block
info after. This would be a bit closer with the order of the WAL
record structure itself, aka XLogRecord and such.
Hence, it seems to me that 0002 has the order pretty much right.
What's the point in adding the description, by the way? Only
consistency with the other function? Is that really useful if you
want to apply more quals when retrieving some block data?
Calling GetWALRecordInfo() as part of GetWALBlockInfo() leads to a
rather confusing result, IMO...
@@ -377,6 +385,12 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
while (ReadNextXLogRecord(xlogreader) &&
xlogreader->EndRecPtr <= end_lsn)
{
+ CHECK_FOR_INTERRUPTS();
+
+ /* Get block references, if any, otherwise continue. */
+ if (!XLogRecHasAnyBlockRefs(xlogreader))
+ continue;
This early shortcut in 0001 is a good idea.
--
Michael
On Fri, Mar 17, 2023 at 8:51 PM Michael Paquier <michael@paquier.xyz> wrote:
On Fri, Mar 17, 2023 at 04:36:58PM -0700, Peter Geoghegan wrote:
I'm sure that they will do that much more than they would have
otherwise. Since we'll have made pg_get_wal_block_info() so much more
useful than pg_get_wal_records_info() for many important use cases.
Why is that a bad thing? Are you concerned about the overhead of
pulling in FPIs when pg_get_wal_block_info() is run, if Bharath's
patch is committed? That could be a problem, I suppose -- but it would
be good to get more data on that. Do you think that this will be much
of an issue, Bharath?Yes. The CPU cost is one thing, but I am also worrying about the
I/O cost with a tuplestore spilling to disk a large number of FPIs,
and some workloads can generate WAL so as FPIs is what makes for most
of the contents stored in the WAL. (wal_compression is very effective
in such cases, for example.)
I had done some analysis about CPU costs for decompressing FPI upthread
in [1]/messages/by-id/CAAKRu_bJvbcYBRj2cN6G2xV7B7-Ja+pjTO1nEnEhRR8OXYiABA@mail.gmail.com, finding that adding a parameter to allow skipping outputting FPI
would not have much impact when FPI are compressed, as decompressing the
images comprised very little of the overall time.
After reading what you said, I was interested to see how substantial the
I/O cost with non-compressed FPI would be.
Using a patch with a parameter to pg_get_wal_block_info() to skip
outputting FPI, I found that on a fast local nvme ssd, the timing
difference between doing so and not still isn't huge -- 9 seconds when
outputting the FPI vs 8.5 seconds when skipping outputting FPI. (with
~50,000 records all with non-compressed FPIs).
However, perhaps obviously, the I/O cost is worse.
Doing nothing but
SELECT * FROM pg_get_wal_block_info(:start_lsn, :end_lsn, true)
where fpi is not null;
per iostat, the write latency was double for the query which output fpi
from the one that didn't and the wkB/s was much higher. This is probably
obvious, but I'm just wondering if it makes sense to have such a
parameter to avoid impacting a system which is doing concurrent I/O with
walinspect.
I have had use for block info without seeing the FPIs, personally.
- Melanie
[1]: /messages/by-id/CAAKRu_bJvbcYBRj2cN6G2xV7B7-Ja+pjTO1nEnEhRR8OXYiABA@mail.gmail.com
On Wed, Mar 22, 2023 at 11:35 AM Melanie Plageman
<melanieplageman@gmail.com> wrote:
On Fri, Mar 17, 2023 at 8:51 PM Michael Paquier <michael@paquier.xyz> wrote:
On Fri, Mar 17, 2023 at 04:36:58PM -0700, Peter Geoghegan wrote:
I'm sure that they will do that much more than they would have
otherwise. Since we'll have made pg_get_wal_block_info() so much more
useful than pg_get_wal_records_info() for many important use cases.
Why is that a bad thing? Are you concerned about the overhead of
pulling in FPIs when pg_get_wal_block_info() is run, if Bharath's
patch is committed? That could be a problem, I suppose -- but it would
be good to get more data on that. Do you think that this will be much
of an issue, Bharath?Yes. The CPU cost is one thing, but I am also worrying about the
I/O cost with a tuplestore spilling to disk a large number of FPIs,
and some workloads can generate WAL so as FPIs is what makes for most
of the contents stored in the WAL. (wal_compression is very effective
in such cases, for example.)I had done some analysis about CPU costs for decompressing FPI upthread
in [1], finding that adding a parameter to allow skipping outputting FPI
would not have much impact when FPI are compressed, as decompressing the
images comprised very little of the overall time.After reading what you said, I was interested to see how substantial the
I/O cost with non-compressed FPI would be.Using a patch with a parameter to pg_get_wal_block_info() to skip
outputting FPI, I found that on a fast local nvme ssd, the timing
difference between doing so and not still isn't huge -- 9 seconds when
outputting the FPI vs 8.5 seconds when skipping outputting FPI. (with
~50,000 records all with non-compressed FPIs).However, perhaps obviously, the I/O cost is worse.
Doing nothing butSELECT * FROM pg_get_wal_block_info(:start_lsn, :end_lsn, true)
where fpi is not null;
Sorry, I should have been more clear: similar results with a select list
simply excluding fpi and no where clause.
- Melanie
On Mon, Mar 20, 2023 at 7:34 PM Peter Geoghegan <pg@bowt.ie> wrote:
On Sun, Mar 19, 2023 at 8:21 PM Kyotaro Horiguchi <horikyota.ntt@gmail.com> wrote:
It is not an issue with this patch, but as I look at this version, I'm
starting to feel uneasy about the subtle differences between what
GetWALRecordsInfo and GetWALBlockInfo do. One solution might be to
have GetWALBlockInfo return a values array for each block, but that
could make things more complex than needed. Alternatively, could we
get GetWALRecordsInfo to call tuplestore_putvalues() internally? This
way, both functions can manage the temporary memory context within
themselves.Agreed. I'm also not sure what to do about it, though.
This means GetWALBlockInfo overwrites the last two columns generated
by GetWalRecordInfo, but I don't think this approach is clean and
stable. I agree we don't want the final columns in a block info tuple
but we don't want to duplicate the common code path.I initially thought we could devide the function into
GetWALCommonInfo(), GetWALRecordInfo() and GetWALBlockInfo(), but it
doesn't seem that simple.. In the end, I think we should have separate
GetWALRecordInfo() and GetWALBlockInfo() that have duplicate
"values[i++] = .." lines.I agree. A little redundancy is better when the alternative is fragile
code, and I'm pretty sure that that applies here -- there won't be
very many duplicated lines, and the final code will be significantly
clearer. There can be a comment about keeping GetWALRecordInfo and
GetWALBlockInfo in sync.
So, I also agree that it is better to have the two separate functions
instead of overwriting the last two columns. As for keeping them in
sync, we could define the number of common columns as a macro like:
#define WALINSPECT_INFO_NUM_COMMON_COLS 10
and use that to calculate the size of the values/nulls array in
GetWalRecordInfo() and GetWALBlockInfo() (assuming a new version where
those two functions duplicate the setting of values[x] = y).
That way, if a new column of information is added and one of the two
functions forgets to set it in the values array, it would still cause an
empty column and it will be easier for the programmer to see it needs to
be added.
We could even define an enum like:
typedef enum walinspect_common_col
{
WALINSPECT_START_LSN,
WALINSPECT_END_LSN,
WALINSPECT_PREV_LSN,
WALINSPECT_XID,
WALINSPECT_RMGR,
WALINSPECT_REC_TYPE,
WALINSPECT_REC_LENGTH,
WALINSPECT_MAIN_DATA_LENGTH,
WALINSPECT_FPILEN,
WALINSPECT_DESC,
WALINSPECT_NUM_COMMON_COL,
} walinspect_common_col;
and set values in both functions like
values[WALINSPECT_FPILEN] = y
if we kept the order of common columns the same and as the first N
columns for both functions. This would keep us from having to manually
update a macro like WALINSPECT_INFO_NUM_COMMON_COLS.
Though, I'm not sure how much value that actually adds.
- Melanie
On Wed, Mar 22, 2023 at 8:35 AM Melanie Plageman
<melanieplageman@gmail.com> wrote:
After reading what you said, I was interested to see how substantial the
I/O cost with non-compressed FPI would be.Using a patch with a parameter to pg_get_wal_block_info() to skip
outputting FPI, I found that on a fast local nvme ssd, the timing
difference between doing so and not still isn't huge -- 9 seconds when
outputting the FPI vs 8.5 seconds when skipping outputting FPI. (with
~50,000 records all with non-compressed FPIs).However, perhaps obviously, the I/O cost is worse.
Doing nothing butSELECT * FROM pg_get_wal_block_info(:start_lsn, :end_lsn, true)
where fpi is not null;per iostat, the write latency was double for the query which output fpi
from the one that didn't and the wkB/s was much higher.
I think that we should also have something like the patch that you
wrote to skip FPIs. It's not something that I feel as strongly about
as the main point about including all the fields from
pg_get_wal_records_info. but it does seem worth doing.
I have had use for block info without seeing the FPIs, personally.
I'd go further than that myself: I haven't had any use for FPIs at
all. If I was going to do something with FPIs then I'd just use
pg_waldump, since I'd likely want to get them onto the filesystem for
analysis anyway. (Just my experience.)
--
Peter Geoghegan
On Tue, Mar 21, 2023 at 11:33 PM Michael Paquier <michael@paquier.xyz> wrote:
The new pg_get_wal_block_info outputs columns in an order that doesn't
seem like the most useful possible order to me. This gives us another
reason to have separate GetWALRecordInfo and GetWALBlockInfo utility
functions rather than sharing logic for building output tuples.Specifically, I think that pg_get_wal_block_info should ouput the
"primary key" columns first:reltablespace, reldatabase, relfilenode, blockid, start_lsn, end_lsn
It seems to me that this is up to the one using this SQL?
If you see it that way, then why does it matter what I may want to do
with the declared order?
I am not
sure to follow why this is important. For the cases you have poked
at, I guess it is, but is strikes me that it is just natural to shape
that to match the C structures we use for the WAL records
themselves, so the other way around.
I don't feel very strongly about it, but it seems better to highlight
the difference that exists between this and pg_get_wal_records_info.
Hence, it seems to me that 0002 has the order pretty much right.
What's the point in adding the description, by the way? Only
consistency with the other function? Is that really useful if you
want to apply more quals when retrieving some block data?
I don't understand. It's useful to include the description for the
same reason as it's useful to include it in pg_get_wal_records_info.
Why wouldn't it be useful?
Most individual records that have any block_ref blocks have exactly
one. Most individual WAL records are very simple record types. So
pg_get_wal_block_info just isn't going to look that different to
pg_get_wal_records_info, once they share most of the same columns. The
way that pg_get_wal_block_info disaggregates on block won't make the
output look all that different. So each distinct "description" will
usually only appear once in pg_get_wal_block_info anyway.
--
Peter Geoghegan
On Wed, Mar 22, 2023 at 05:05:10PM -0700, Peter Geoghegan wrote:
I'd go further than that myself: I haven't had any use for FPIs at
all. If I was going to do something with FPIs then I'd just use
pg_waldump, since I'd likely want to get them onto the filesystem for
analysis anyway. (Just my experience.)
FWIW, being able to get access to raw FPIs with a SQL interface is
useful if you cannot log into the host, which is something that a lot
of cloud providers don't allow, and not everybody is able to have
access to archived WAL segments in a different host.
--
Michael
On Wed, Mar 22, 2023 at 5:14 PM Michael Paquier <michael@paquier.xyz> wrote:
On Wed, Mar 22, 2023 at 05:05:10PM -0700, Peter Geoghegan wrote:
I'd go further than that myself: I haven't had any use for FPIs at
all. If I was going to do something with FPIs then I'd just use
pg_waldump, since I'd likely want to get them onto the filesystem for
analysis anyway. (Just my experience.)FWIW, being able to get access to raw FPIs with a SQL interface is
useful if you cannot log into the host, which is something that a lot
of cloud providers don't allow, and not everybody is able to have
access to archived WAL segments in a different host.
I'm not saying that it's not ever useful. Just that finding FPIs
interesting isn't necessarily all that common when using
pg_get_wal_block_info (or won't be, once it has those additional
columns from pg_get_wal_records_info).
--
Peter Geoghegan
On Mon, Mar 20, 2023 at 8:51 AM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:
+ /* Get block references, if any, otherwise continue. */ + if (!XLogRecHasAnyBlockRefs(xlogreader))code does". I feel we don't need the a comment there.
Removed.
This means GetWALBlockInfo overwrites the last two columns generated
by GetWalRecordInfo, but I don't think this approach is clean and
stable. I agree we don't want the final columns in a block info tuple
but we don't want to duplicate the common code path.I initially thought we could devide the function into
GetWALCommonInfo(), GetWALRecordInfo() and GetWALBlockInfo(), but it
doesn't seem that simple.. In the end, I think we should have separate
GetWALRecordInfo() and GetWALBlockInfo() that have duplicate
"values[i++] = .." lines.
Done as per Peter's suggestion (keeping primary key columns first and
having a bit of code duplicated instead of making it complex in the
name of deduplication). Please see the attached v4 patch set.
On Tue, Mar 21, 2023 at 5:04 AM Peter Geoghegan <pg@bowt.ie> wrote:
On Sun, Mar 19, 2023 at 8:21 PM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:The documentation has just one section titled "General Functions"
which directly contains detailed explation of four functions, making
it hard to get clear understanding of the available functions. I
considered breaking it down into a few subsections, but that wouldn't
look great since most of them would only contain one function.
However, I feel it would be helpful to add a list of all functions at
the beginning of the section.I like the idea of sections, even if there is only one function per
section in some cases.
Hm, -1 for now. Most of the extensions that I have seen don't have
anything like that. If needed, someone can start a separate thread for
such a proposal for all of the extensions.
I also think that we should add a "Tip" that advises users that they
may use an "end LSN" that is the largest possible LSN,
'FFFFFFFF/FFFFFFFF' to get information about records up until the
current LSN of the cluster (per commit 5c1b6628).Is there a straightforward way to get a usable LSN constant for this
purpose? The simplest way I could come up with quickly is "SELECT
pg_lsn(2^64.-1)" -- which still isn't very simple. Actually, it might
be even worse than 'FFFFFFFF/FFFFFFFF', so perhaps we should just use
that in the docs new "Tip".
Done.
I agree that adding a note about the characteristics would helpful to
avoid the misuse of pg_get_wal_block_info(). How about something like,
"Note that pg_get_wal_block_info() omits records that contains no
block references."?This should be a strict invariant. In other words, it should be part
of the documented contract of pg_get_wal_block_info and
pg_get_wal_records_info. The two functions should be defined in terms
of each other. Their relationship is important.Users should be able to safely assume that the records that have a
NULL block_ref according to pg_get_wal_records_info are *precisely*
those records that won't have any entries within pg_get_wal_block_info
(assuming that the same LSN range is used with both functions).
pg_walinspect should explicitly promise this, and promise the
corollary condition around non-NULL block_ref records. It is a useful
promise from the point of view of users. It also makes it easier to
understand what's really going on here without any ambiguity.I don't completely disagree with Michael about the redundancy. I just
think that it's worth it on performance grounds. We might want to say
that directly in the docs, too.
Added a note in the docs.
Also, if GetWALBlockInfo() is now supposed to only be called when
XLogRecHasAnyBlockRefs() now then it should probably have an assertion
to verify the precondition.
Done.
I initially thought we could devide the function into
GetWALCommonInfo(), GetWALRecordInfo() and GetWALBlockInfo(), but it
doesn't seem that simple.. In the end, I think we should have separate
GetWALRecordInfo() and GetWALBlockInfo() that have duplicate
"values[i++] = .." lines.I agree. A little redundancy is better when the alternative is fragile
code, and I'm pretty sure that that applies here -- there won't be
very many duplicated lines, and the final code will be significantly
clearer. There can be a comment about keeping GetWALRecordInfo and
GetWALBlockInfo in sync.
Done.
On Tue, Mar 21, 2023 at 5:21 AM Peter Geoghegan <pg@bowt.ie> wrote:
The new pg_get_wal_block_info outputs columns in an order that doesn't
seem like the most useful possible order to me. This gives us another
reason to have separate GetWALRecordInfo and GetWALBlockInfo utility
functions rather than sharing logic for building output tuples.Specifically, I think that pg_get_wal_block_info should ouput the
"primary key" columns first:reltablespace, reldatabase, relfilenode, blockid, start_lsn, end_lsn
Next comes the columns that duplicate the columns output by
pg_get_wal_records_info, in the same order as they appear in
pg_get_wal_records_info. (Obviously this won't include block_ref).
Done.
On Tue, Mar 21, 2023 at 5:30 AM Peter Geoghegan <pg@bowt.ie> wrote:
I think that we should also make the description output column display
NULLs for those records that don't output any description string. This
at least includes the "FPI" record type from the "XLOG" rmgr.
Alternatively, we could find a way of making it show a description.
Done.
Please see the attached v4 patch set addressing all the review comments.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v4-0001-Few-optimizations-in-pg_walinspect.patchapplication/x-patch; name=v4-0001-Few-optimizations-in-pg_walinspect.patchDownload
From dc2bf1a2109211df4650c635b7094240d5bef65b Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Thu, 23 Mar 2023 11:25:08 +0000
Subject: [PATCH v4] Few optimizations in pg_walinspect
---
contrib/pg_walinspect/pg_walinspect.c | 58 ++++++++++++++++-----------
1 file changed, 35 insertions(+), 23 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 3b3215daf5..6b3e71b9cd 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -187,35 +187,43 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
uint32 fpi_len = 0;
StringInfoData rec_desc;
StringInfoData rec_blk_ref;
- uint32 main_data_len;
int i = 0;
- desc = GetRmgr(XLogRecGetRmid(record));
- id = desc.rm_identify(XLogRecGetInfo(record));
-
- if (id == NULL)
- id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
-
- initStringInfo(&rec_desc);
- desc.rm_desc(&rec_desc, record);
-
- /* Block references. */
- initStringInfo(&rec_blk_ref);
- XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
-
- main_data_len = XLogRecGetDataLen(record);
-
values[i++] = LSNGetDatum(record->ReadRecPtr);
values[i++] = LSNGetDatum(record->EndRecPtr);
values[i++] = LSNGetDatum(XLogRecGetPrev(record));
values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+
+ desc = GetRmgr(XLogRecGetRmid(record));
values[i++] = CStringGetTextDatum(desc.rm_name);
+
+ id = desc.rm_identify(XLogRecGetInfo(record));
+ if (id == NULL)
+ id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
values[i++] = CStringGetTextDatum(id);
values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
- values[i++] = UInt32GetDatum(main_data_len);
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
+
+ if (XLogRecHasAnyBlockRefs(record))
+ {
+ initStringInfo(&rec_blk_ref);
+ XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ }
+
values[i++] = UInt32GetDatum(fpi_len);
- values[i++] = CStringGetTextDatum(rec_desc.data);
- values[i++] = CStringGetTextDatum(rec_blk_ref.data);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ else
+ nulls[i++] = true;
+
+ if (XLogRecHasAnyBlockRefs(record))
+ values[i++] = CStringGetTextDatum(rec_blk_ref.data);
+ else
+ nulls[i++] = true;
Assert(i == ncols);
}
@@ -377,6 +385,11 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
while (ReadNextXLogRecord(xlogreader) &&
xlogreader->EndRecPtr <= end_lsn)
{
+ CHECK_FOR_INTERRUPTS();
+
+ if (!XLogRecHasAnyBlockRefs(xlogreader))
+ continue;
+
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
@@ -385,8 +398,6 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
/* clean up and switch back */
MemoryContextSwitchTo(old_cxt);
MemoryContextReset(tmp_cxt);
-
- CHECK_FOR_INTERRUPTS();
}
MemoryContextDelete(tmp_cxt);
@@ -483,8 +494,6 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
#define PG_GET_WAL_RECORDS_INFO_COLS 11
XLogReaderState *xlogreader;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
- Datum values[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
MemoryContext old_cxt;
MemoryContext tmp_cxt;
@@ -501,6 +510,9 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
while (ReadNextXLogRecord(xlogreader) &&
xlogreader->EndRecPtr <= end_lsn)
{
+ Datum values[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
+
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
--
2.34.1
v4-0002-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/x-patch; name=v4-0002-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From 72f20a1914c9b02a36d78364b8865913175bb881 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Thu, 23 Mar 2023 17:10:07 +0000
Subject: [PATCH v4] Emit WAL record info via pg_get_wal_block_info
---
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 12 ++-
contrib/pg_walinspect/pg_walinspect.c | 58 ++++++++---
doc/src/sgml/pgwalinspect.sgml | 99 ++++++++++++-------
3 files changed, 119 insertions(+), 50 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b4467..f767211fed 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,12 +12,20 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
- OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
+ OUT blockid int2,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT description text,
OUT forkname text,
OUT blockdata bytea,
OUT fpi bytea,
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 6b3e71b9cd..d46738774e 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -177,6 +177,10 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
/*
* Get a single WAL record info.
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
@@ -230,15 +234,32 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of block information from a single record (FPI and block
- * information).
+ * Get a single WAL record info along with its block information (block data
+ * and FPI).
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
+#define PG_GET_WAL_BLOCK_INFO_COLS 19
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ RmgrData desc;
+ const char *id;
+ StringInfoData rec_desc;
+
+ Assert(XLogRecHasAnyBlockRefs(record));
+
+ desc = GetRmgr(XLogRecGetRmid(record));
+ id = desc.rm_identify(XLogRecGetInfo(record));
+ if (id == NULL)
+ id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
@@ -258,12 +279,24 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
(void) XLogRecGetBlockTagExtended(record, block_id,
&rnode, &fork, &blkno, NULL);
- values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = Int16GetDatum(block_id);
values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
values[i++] = Int64GetDatum((int64) blkno);
+ values[i++] = Int16GetDatum(block_id);
+ values[i++] = LSNGetDatum(record->ReadRecPtr);
+ values[i++] = LSNGetDatum(record->EndRecPtr);
+ values[i++] = LSNGetDatum(XLogRecGetPrev(record));
+ values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+ values[i++] = CStringGetTextDatum(desc.rm_name);
+ values[i++] = CStringGetTextDatum(id);
+ values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
+
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ else
+ nulls[i++] = true;
if (fork >= 0 && fork <= MAX_FORKNUM)
values[i++] = CStringGetTextDatum(forkNames[fork]);
@@ -357,11 +390,12 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
}
/*
- * Get information about all the blocks saved in WAL records between start
- * and end LSNs. This produces information about the full page images with
- * their relation information, and the data saved in each block associated
- * to a record. Decompression is applied to the full page images, if
- * necessary.
+ * Get info of all WAL records having block references between start LSN and
+ * end LSN. It outputs one row for each WAL record's block reference (note that
+ * a single WAL record can have one or more blocks associated with it). It
+ * outputs WAL record information as well as block information (block data
+ * and/or FPI). Decompression is applied to the full page images, if necessary.
+ * It omits WAL records that contain no block references.
*/
Datum
pg_get_wal_block_info(PG_FUNCTION_ARGS)
@@ -485,7 +519,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn)
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
static void
GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
@@ -537,7 +571,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
Datum
pg_get_wal_records_info(PG_FUNCTION_ARGS)
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d6..13429a2162 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -94,9 +94,10 @@ block_ref | blkref #0: rel 1663/5/60221 fork main blk 2
<para>
Gets information of all the valid WAL records between
<replaceable>start_lsn</replaceable> and <replaceable>end_lsn</replaceable>.
- Returns one row per WAL record. If a future
- <replaceable>end_lsn</replaceable> (i.e. ahead of the current LSN of
- the server) is specified, it returns information until the end of WAL.
+ Returns one row per WAL record. If <replaceable>end_lsn</replaceable>
+ specified is in future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
The function raises an error if <replaceable>start_lsn</replaceable>
is not available. For example, usage of the function is as follows:
<screen>
@@ -133,11 +134,13 @@ block_ref |
<replaceable>end_lsn</replaceable>. By default, it returns one row per
<replaceable>resource_manager</replaceable> type. When
<replaceable>per_record</replaceable> is set to <literal>true</literal>,
- it returns one row per <replaceable>record_type</replaceable>. If a
- future <replaceable>end_lsn</replaceable> (i.e. ahead of the current
- LSN of the server) is specified, it returns statistics until the end
- of WAL. An error is raised if <replaceable>start_lsn</replaceable> is
- not available. For example, usage of the function is as follows:
+ it returns one row per <replaceable>record_type</replaceable>.
+ If <replaceable>end_lsn</replaceable> specified is in future (i.e. ahead
+ of the current LSN of the server) or <literal>FFFFFFFF/FFFFFFFF</literal>
+ (i.e. highest possible value for <type>pg_lsn</type> type), it returns
+ statistics until the end of WAL. An error is raised if
+ <replaceable>start_lsn</replaceable> is not available. For example, usage
+ of the function is as follows:
<screen>
postgres=# SELECT * FROM pg_get_wal_stats('0/1E847D00', '0/1E84F500')
WHERE count > 0 LIMIT 1 AND
@@ -164,38 +167,62 @@ combined_size_percentage | 2.8634072910530795
<listitem>
<para>
- Gets a copy of the block information stored in WAL records. This includes
- copies of the block data (<literal>NULL</literal> if none) and full page
- images as <type>bytea</type> values (after
- applying decompression when necessary, or <literal>NULL</literal> if none)
- and their information associated with all the valid WAL records between
- <replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per block registered
- in a WAL record. If a future <replaceable>end_lsn</replaceable> (i.e.
- ahead of the current LSN of the server) is specified, it returns
- statistics until the end of WAL. An error is raised if
- <replaceable>start_lsn</replaceable> is not available. For example,
- usage of the function is as follows:
+ Gets both WAL record and block information of all the valid WAL records
+ between <replaceable>start_lsn</replaceable> and
+ <replaceable>end_lsn</replaceable>. It omits WAL records that contain no
+ block references. The block information includes copies of the block data
+ (<literal>NULL</literal> if none) and full page images as
+ <type>bytea</type> values (after applying decompression when necessary,
+ or <literal>NULL</literal> if none). Returns one row per block registered
+ in a WAL record. If <replaceable>end_lsn</replaceable> specified is in
+ future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
+ An error is raised if <replaceable>start_lsn</replaceable> is not
+ available. For example, usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
- relblocknumber, forkname,
- substring(blockdata for 24) as block_trimmed,
+postgres=# SELECT reltablespace, reldatabase, relfilenode,
+ relblocknumber, blockid, start_lsn, end_lsn,
+ prev_lsn, xid, resource_manager, record_type,
+ record_length, main_data_length, description,
+ forkname, substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/14BF938', '0/14BFBA0');
+-[ RECORD 1 ]----+---------------------------------------------------
+reltablespace | 1663
+reldatabase | 5
+relfilenode | 16391
+relblocknumber | 9
+blockid | 0
+start_lsn | 0/14BFA20
+end_lsn | 0/14BFB38
+prev_lsn | 0/14BF9A8
+xid | 735
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 279
+main_data_length | 14
+description | off 1 xmax 735 flags 0x10 ; new off 5 xmax 0
+forkname | main
+block_trimmed | \x02800128180102000000
+fpi_trimmed | \x00000000d0f84b01000000002c00601f00200420df020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
+ <note>
+ <para>
+ Note that although <function>pg_get_wal_records_info</function>
+ and <function>pg_get_wal_block_info</function> functions return
+ information of WAL records in common, the
+ <function>pg_get_wal_records_info</function> outputs block information
+ (without block data and FPI data) as a text column
+ (<literal>NULL</literal> if none), and
+ <function>pg_get_wal_block_info</function> outputs one row for each block
+ (including block data and FPI data) associated with each WAL record, omitting
+ WAL records that contain no block references.
+ </para>
+ </note>
</listitem>
</varlistentry>
--
2.34.1
On Thu, Mar 23, 2023 at 10:54:40PM +0530, Bharath Rupireddy wrote:
Please see the attached v4 patch set addressing all the review comments.
- desc = GetRmgr(XLogRecGetRmid(record));
- id = desc.rm_identify(XLogRecGetInfo(record));
-
- if (id == NULL)
- id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
-
- initStringInfo(&rec_desc);
- desc.rm_desc(&rec_desc, record);
-
- /* Block references. */
- initStringInfo(&rec_blk_ref);
- XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
-
- main_data_len = XLogRecGetDataLen(record);
I don't see any need to move this block of code? This leads to
unnecessary diffs, potentially making backpatch a bit harder. Either
way is not a big deal, still.. Except for this bit, 0001 looks fine
by me.
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
+ OUT blockid int2,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
I'd still put the LSN data before the three OIDs for consistency with
the structures, though my opinion does not seem to count much..
--
Michael
At Sat, 25 Mar 2023 12:12:50 +0900, Michael Paquier <michael@paquier.xyz> wrote in
On Thu, Mar 23, 2023 at 10:54:40PM +0530, Bharath Rupireddy wrote:
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
+ OUT blockid int2,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,I'd still put the LSN data before the three OIDs for consistency with
the structures, though my opinion does not seem to count much..
I agree with Michael on this point. Also, although it may not be
significant for SQL, the rows are output in lsn order from the
function.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Mon, Mar 27, 2023 at 9:11 AM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:
At Sat, 25 Mar 2023 12:12:50 +0900, Michael Paquier <michael@paquier.xyz> wrote in
On Thu, Mar 23, 2023 at 10:54:40PM +0530, Bharath Rupireddy wrote:
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
+ OUT blockid int2,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,I'd still put the LSN data before the three OIDs for consistency with
the structures, though my opinion does not seem to count much..I agree with Michael on this point. Also, although it may not be
significant for SQL, the rows are output in lsn order from the
function.
Done that way.
On Sat, Mar 25, 2023 at 8:42 AM Michael Paquier <michael@paquier.xyz> wrote:
On Thu, Mar 23, 2023 at 10:54:40PM +0530, Bharath Rupireddy wrote:
Please see the attached v4 patch set addressing all the review comments.
- desc = GetRmgr(XLogRecGetRmid(record));
- id = desc.rm_identify(XLogRecGetInfo(record));
-
- if (id == NULL)
- id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
-
- initStringInfo(&rec_desc);
- desc.rm_desc(&rec_desc, record);
-
- /* Block references. */
- initStringInfo(&rec_blk_ref);
- XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
-
- main_data_len = XLogRecGetDataLen(record);I don't see any need to move this block of code? This leads to
unnecessary diffs, potentially making backpatch a bit harder. Either
way is not a big deal, still.. Except for this bit, 0001 looks fine
by me.
It's a cosmetic change - I wanted to keep the calculation of column
values closer to where they're assigned to Datum values. I agree to
not cause too much diff and removed them.
Please see the attached v5 patch set.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v5-0001-Few-optimizations-in-pg_walinspect.patchapplication/x-patch; name=v5-0001-Few-optimizations-in-pg_walinspect.patchDownload
From 601c47f0e56e721da7a5fb7e0427faa926ad982c Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Mon, 27 Mar 2023 03:50:44 +0000
Subject: [PATCH v5] Few optimizations in pg_walinspect
1. Emits null values in output columns (description and block_ref)
if WAL records have no data to present.
2. Function to get block references is skipped for the records
that don't have block references.
---
contrib/pg_walinspect/pg_walinspect.c | 39 ++++++++++++++++++---------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 3b3215daf5..90fb8eaed1 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -187,7 +187,6 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
uint32 fpi_len = 0;
StringInfoData rec_desc;
StringInfoData rec_blk_ref;
- uint32 main_data_len;
int i = 0;
desc = GetRmgr(XLogRecGetRmid(record));
@@ -199,11 +198,11 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
initStringInfo(&rec_desc);
desc.rm_desc(&rec_desc, record);
- /* Block references. */
- initStringInfo(&rec_blk_ref);
- XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
-
- main_data_len = XLogRecGetDataLen(record);
+ if (XLogRecHasAnyBlockRefs(record))
+ {
+ initStringInfo(&rec_blk_ref);
+ XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ }
values[i++] = LSNGetDatum(record->ReadRecPtr);
values[i++] = LSNGetDatum(record->EndRecPtr);
@@ -212,10 +211,18 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
values[i++] = CStringGetTextDatum(desc.rm_name);
values[i++] = CStringGetTextDatum(id);
values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
- values[i++] = UInt32GetDatum(main_data_len);
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
values[i++] = UInt32GetDatum(fpi_len);
- values[i++] = CStringGetTextDatum(rec_desc.data);
- values[i++] = CStringGetTextDatum(rec_blk_ref.data);
+
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ else
+ nulls[i++] = true;
+
+ if (XLogRecHasAnyBlockRefs(record))
+ values[i++] = CStringGetTextDatum(rec_blk_ref.data);
+ else
+ nulls[i++] = true;
Assert(i == ncols);
}
@@ -232,6 +239,8 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ Assert(XLogRecHasAnyBlockRefs(record));
+
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
DecodedBkpBlock *blk;
@@ -377,6 +386,11 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
while (ReadNextXLogRecord(xlogreader) &&
xlogreader->EndRecPtr <= end_lsn)
{
+ CHECK_FOR_INTERRUPTS();
+
+ if (!XLogRecHasAnyBlockRefs(xlogreader))
+ continue;
+
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
@@ -385,8 +399,6 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
/* clean up and switch back */
MemoryContextSwitchTo(old_cxt);
MemoryContextReset(tmp_cxt);
-
- CHECK_FOR_INTERRUPTS();
}
MemoryContextDelete(tmp_cxt);
@@ -483,8 +495,6 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
#define PG_GET_WAL_RECORDS_INFO_COLS 11
XLogReaderState *xlogreader;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
- Datum values[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
- bool nulls[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
MemoryContext old_cxt;
MemoryContext tmp_cxt;
@@ -501,6 +511,9 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
while (ReadNextXLogRecord(xlogreader) &&
xlogreader->EndRecPtr <= end_lsn)
{
+ Datum values[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
+ bool nulls[PG_GET_WAL_RECORDS_INFO_COLS] = {0};
+
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
--
2.34.1
v5-0002-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/x-patch; name=v5-0002-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From 6c3c2966d12768633bd158e145d70e15d6d6652c Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Mon, 27 Mar 2023 03:59:02 +0000
Subject: [PATCH v5] Emit WAL record info via pg_get_wal_block_info
---
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 12 ++-
contrib/pg_walinspect/pg_walinspect.c | 55 ++++++++---
doc/src/sgml/pgwalinspect.sgml | 99 ++++++++++++-------
3 files changed, 117 insertions(+), 49 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b4467..f13d3a8fd3 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,12 +12,20 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
- OUT blockid int2,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
+ OUT blockid int2,
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT description text,
OUT forkname text,
OUT blockdata bytea,
OUT fpi bytea,
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 90fb8eaed1..c8d3ece910 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -177,6 +177,10 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
/*
* Get a single WAL record info.
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
@@ -229,18 +233,34 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of block information from a single record (FPI and block
- * information).
+ * Get a single WAL record info along with its block information (block data
+ * and FPI).
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
+#define PG_GET_WAL_BLOCK_INFO_COLS 19
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ RmgrData desc;
+ const char *id;
+ StringInfoData rec_desc;
Assert(XLogRecHasAnyBlockRefs(record));
+ desc = GetRmgr(XLogRecGetRmid(record));
+ id = desc.rm_identify(XLogRecGetInfo(record));
+
+ if (id == NULL)
+ id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
+
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
DecodedBkpBlock *blk;
@@ -260,11 +280,23 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
&rnode, &fork, &blkno, NULL);
values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = Int16GetDatum(block_id);
+ values[i++] = LSNGetDatum(record->EndRecPtr);
+ values[i++] = LSNGetDatum(XLogRecGetPrev(record));
values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
values[i++] = Int64GetDatum((int64) blkno);
+ values[i++] = Int16GetDatum(block_id);
+ values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+ values[i++] = CStringGetTextDatum(desc.rm_name);
+ values[i++] = CStringGetTextDatum(id);
+ values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
+
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ else
+ nulls[i++] = true;
if (fork >= 0 && fork <= MAX_FORKNUM)
values[i++] = CStringGetTextDatum(forkNames[fork]);
@@ -358,11 +390,12 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
}
/*
- * Get information about all the blocks saved in WAL records between start
- * and end LSNs. This produces information about the full page images with
- * their relation information, and the data saved in each block associated
- * to a record. Decompression is applied to the full page images, if
- * necessary.
+ * Get info of all WAL records having block references between start LSN and
+ * end LSN. It outputs one row for each WAL record's block reference (note that
+ * a single WAL record can have one or more blocks associated with it). It
+ * outputs WAL record information as well as block information (block data
+ * and/or FPI). Decompression is applied to the full page images, if necessary.
+ * It omits WAL records that contain no block references.
*/
Datum
pg_get_wal_block_info(PG_FUNCTION_ARGS)
@@ -486,7 +519,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn)
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
static void
GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
@@ -538,7 +571,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
Datum
pg_get_wal_records_info(PG_FUNCTION_ARGS)
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d6..ffcb39bb27 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -94,9 +94,10 @@ block_ref | blkref #0: rel 1663/5/60221 fork main blk 2
<para>
Gets information of all the valid WAL records between
<replaceable>start_lsn</replaceable> and <replaceable>end_lsn</replaceable>.
- Returns one row per WAL record. If a future
- <replaceable>end_lsn</replaceable> (i.e. ahead of the current LSN of
- the server) is specified, it returns information until the end of WAL.
+ Returns one row per WAL record. If <replaceable>end_lsn</replaceable>
+ specified is in future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
The function raises an error if <replaceable>start_lsn</replaceable>
is not available. For example, usage of the function is as follows:
<screen>
@@ -133,11 +134,13 @@ block_ref |
<replaceable>end_lsn</replaceable>. By default, it returns one row per
<replaceable>resource_manager</replaceable> type. When
<replaceable>per_record</replaceable> is set to <literal>true</literal>,
- it returns one row per <replaceable>record_type</replaceable>. If a
- future <replaceable>end_lsn</replaceable> (i.e. ahead of the current
- LSN of the server) is specified, it returns statistics until the end
- of WAL. An error is raised if <replaceable>start_lsn</replaceable> is
- not available. For example, usage of the function is as follows:
+ it returns one row per <replaceable>record_type</replaceable>.
+ If <replaceable>end_lsn</replaceable> specified is in future (i.e. ahead
+ of the current LSN of the server) or <literal>FFFFFFFF/FFFFFFFF</literal>
+ (i.e. highest possible value for <type>pg_lsn</type> type), it returns
+ statistics until the end of WAL. An error is raised if
+ <replaceable>start_lsn</replaceable> is not available. For example, usage
+ of the function is as follows:
<screen>
postgres=# SELECT * FROM pg_get_wal_stats('0/1E847D00', '0/1E84F500')
WHERE count > 0 LIMIT 1 AND
@@ -164,38 +167,62 @@ combined_size_percentage | 2.8634072910530795
<listitem>
<para>
- Gets a copy of the block information stored in WAL records. This includes
- copies of the block data (<literal>NULL</literal> if none) and full page
- images as <type>bytea</type> values (after
- applying decompression when necessary, or <literal>NULL</literal> if none)
- and their information associated with all the valid WAL records between
- <replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per block registered
- in a WAL record. If a future <replaceable>end_lsn</replaceable> (i.e.
- ahead of the current LSN of the server) is specified, it returns
- statistics until the end of WAL. An error is raised if
- <replaceable>start_lsn</replaceable> is not available. For example,
- usage of the function is as follows:
+ Gets both WAL record and block information of all the valid WAL records
+ between <replaceable>start_lsn</replaceable> and
+ <replaceable>end_lsn</replaceable>. It omits WAL records that contain no
+ block references. The block information includes copies of the block data
+ (<literal>NULL</literal> if none) and full page images as
+ <type>bytea</type> values (after applying decompression when necessary,
+ or <literal>NULL</literal> if none). Returns one row per block registered
+ in a WAL record. If <replaceable>end_lsn</replaceable> specified is in
+ future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
+ An error is raised if <replaceable>start_lsn</replaceable> is not
+ available. For example, usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
- relblocknumber, forkname,
- substring(blockdata for 24) as block_trimmed,
+postgres=# SELECT start_lsn, end_lsn, prev_lsn, reltablespace,
+ reldatabase, relfilenode, relblocknumber,
+ blockid, xid, resource_manager, record_type,
+ record_length, main_data_length, description,
+ forkname, substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/14BF938', '0/14BFBA0');
+-[ RECORD 1 ]----+---------------------------------------------------
+start_lsn | 0/14BFA20
+end_lsn | 0/14BFB38
+prev_lsn | 0/14BF9A8
+reltablespace | 1663
+reldatabase | 5
+relfilenode | 16391
+relblocknumber | 9
+blockid | 0
+xid | 735
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 279
+main_data_length | 14
+description | off 1 xmax 735 flags 0x10 ; new off 5 xmax 0
+forkname | main
+block_trimmed | \x02800128180102000000
+fpi_trimmed | \x00000000d0f84b01000000002c00601f00200420df020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
+ <note>
+ <para>
+ Note that although <function>pg_get_wal_records_info</function>
+ and <function>pg_get_wal_block_info</function> functions return
+ information of WAL records in common, the
+ <function>pg_get_wal_records_info</function> outputs block information
+ (without block data and FPI data) as a text column
+ (<literal>NULL</literal> if none), and
+ <function>pg_get_wal_block_info</function> outputs one row for each block
+ (including block data and FPI data) associated with each WAL record, omitting
+ WAL records that contain no block references.
+ </para>
+ </note>
</listitem>
</varlistentry>
--
2.34.1
On Sat, Mar 25, 2023 at 12:12:50PM +0900, Michael Paquier wrote:
I don't see any need to move this block of code? This leads to
unnecessary diffs, potentially making backpatch a bit harder. Either
way is not a big deal, still.. Except for this bit, 0001 looks fine
by me.
FYI, I have gone through 0001 and applied it, after tweaking a bit the
part about block references so as we have only one
XLogRecHasAnyBlockRefs, with its StringInfoData used only locally.
--
Michael
On Mon, Mar 27, 2023 at 9:49 AM Michael Paquier <michael@paquier.xyz> wrote:
On Sat, Mar 25, 2023 at 12:12:50PM +0900, Michael Paquier wrote:
I don't see any need to move this block of code? This leads to
unnecessary diffs, potentially making backpatch a bit harder. Either
way is not a big deal, still.. Except for this bit, 0001 looks fine
by me.FYI, I have gone through 0001 and applied it, after tweaking a bit the
part about block references so as we have only one
XLogRecHasAnyBlockRefs, with its StringInfoData used only locally.
Thanks. Here's the v6 patch (last patch that I have with me for
pg_walinspect) for adding per-record info to pg_get_wal_block_info.
Note that I addressed all review comments received so far. Any
thoughts?
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v6-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/x-patch; name=v6-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From 0c6839f9192b7fa077a62ff01e6bf79d030e32ae Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Mon, 27 Mar 2023 07:30:51 +0000
Subject: [PATCH v6] Emit WAL record info via pg_get_wal_block_info
---
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 12 ++-
contrib/pg_walinspect/pg_walinspect.c | 57 ++++++++---
doc/src/sgml/pgwalinspect.sgml | 99 ++++++++++++-------
3 files changed, 119 insertions(+), 49 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b4467..f13d3a8fd3 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,12 +12,20 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
- OUT blockid int2,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
OUT relblocknumber int8,
+ OUT blockid int2,
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT description text,
OUT forkname text,
OUT blockdata bytea,
OUT fpi bytea,
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 5946b9e6e7..d9305b50bd 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -177,6 +177,10 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
/*
* Get a single WAL record info.
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
@@ -230,15 +234,33 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of block information from a single record (FPI and block
- * information).
+ * Get a single WAL record info along with its block information (block data
+ * and FPI).
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
+#define PG_GET_WAL_BLOCK_INFO_COLS 19
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ RmgrData desc;
+ const char *id;
+ StringInfoData rec_desc;
+
+ Assert(XLogRecHasAnyBlockRefs(record));
+
+ desc = GetRmgr(XLogRecGetRmid(record));
+ id = desc.rm_identify(XLogRecGetInfo(record));
+
+ if (id == NULL)
+ id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
@@ -259,11 +281,23 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
&rnode, &fork, &blkno, NULL);
values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = Int16GetDatum(block_id);
+ values[i++] = LSNGetDatum(record->EndRecPtr);
+ values[i++] = LSNGetDatum(XLogRecGetPrev(record));
values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
values[i++] = Int64GetDatum((int64) blkno);
+ values[i++] = Int16GetDatum(block_id);
+ values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+ values[i++] = CStringGetTextDatum(desc.rm_name);
+ values[i++] = CStringGetTextDatum(id);
+ values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
+
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ else
+ nulls[i++] = true;
if (fork >= 0 && fork <= MAX_FORKNUM)
values[i++] = CStringGetTextDatum(forkNames[fork]);
@@ -357,11 +391,12 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
}
/*
- * Get information about all the blocks saved in WAL records between start
- * and end LSNs. This produces information about the full page images with
- * their relation information, and the data saved in each block associated
- * to a record. Decompression is applied to the full page images, if
- * necessary.
+ * Get info of all WAL records having block references between start LSN and
+ * end LSN. It outputs one row for each WAL record's block reference (note that
+ * a single WAL record can have one or more blocks associated with it). It
+ * outputs WAL record information as well as block information (block data
+ * and/or FPI). Decompression is applied to the full page images, if necessary.
+ * It omits WAL records that contain no block references.
*/
Datum
pg_get_wal_block_info(PG_FUNCTION_ARGS)
@@ -485,7 +520,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn)
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
static void
GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
@@ -537,7 +572,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
Datum
pg_get_wal_records_info(PG_FUNCTION_ARGS)
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d6..ffcb39bb27 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -94,9 +94,10 @@ block_ref | blkref #0: rel 1663/5/60221 fork main blk 2
<para>
Gets information of all the valid WAL records between
<replaceable>start_lsn</replaceable> and <replaceable>end_lsn</replaceable>.
- Returns one row per WAL record. If a future
- <replaceable>end_lsn</replaceable> (i.e. ahead of the current LSN of
- the server) is specified, it returns information until the end of WAL.
+ Returns one row per WAL record. If <replaceable>end_lsn</replaceable>
+ specified is in future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
The function raises an error if <replaceable>start_lsn</replaceable>
is not available. For example, usage of the function is as follows:
<screen>
@@ -133,11 +134,13 @@ block_ref |
<replaceable>end_lsn</replaceable>. By default, it returns one row per
<replaceable>resource_manager</replaceable> type. When
<replaceable>per_record</replaceable> is set to <literal>true</literal>,
- it returns one row per <replaceable>record_type</replaceable>. If a
- future <replaceable>end_lsn</replaceable> (i.e. ahead of the current
- LSN of the server) is specified, it returns statistics until the end
- of WAL. An error is raised if <replaceable>start_lsn</replaceable> is
- not available. For example, usage of the function is as follows:
+ it returns one row per <replaceable>record_type</replaceable>.
+ If <replaceable>end_lsn</replaceable> specified is in future (i.e. ahead
+ of the current LSN of the server) or <literal>FFFFFFFF/FFFFFFFF</literal>
+ (i.e. highest possible value for <type>pg_lsn</type> type), it returns
+ statistics until the end of WAL. An error is raised if
+ <replaceable>start_lsn</replaceable> is not available. For example, usage
+ of the function is as follows:
<screen>
postgres=# SELECT * FROM pg_get_wal_stats('0/1E847D00', '0/1E84F500')
WHERE count > 0 LIMIT 1 AND
@@ -164,38 +167,62 @@ combined_size_percentage | 2.8634072910530795
<listitem>
<para>
- Gets a copy of the block information stored in WAL records. This includes
- copies of the block data (<literal>NULL</literal> if none) and full page
- images as <type>bytea</type> values (after
- applying decompression when necessary, or <literal>NULL</literal> if none)
- and their information associated with all the valid WAL records between
- <replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per block registered
- in a WAL record. If a future <replaceable>end_lsn</replaceable> (i.e.
- ahead of the current LSN of the server) is specified, it returns
- statistics until the end of WAL. An error is raised if
- <replaceable>start_lsn</replaceable> is not available. For example,
- usage of the function is as follows:
+ Gets both WAL record and block information of all the valid WAL records
+ between <replaceable>start_lsn</replaceable> and
+ <replaceable>end_lsn</replaceable>. It omits WAL records that contain no
+ block references. The block information includes copies of the block data
+ (<literal>NULL</literal> if none) and full page images as
+ <type>bytea</type> values (after applying decompression when necessary,
+ or <literal>NULL</literal> if none). Returns one row per block registered
+ in a WAL record. If <replaceable>end_lsn</replaceable> specified is in
+ future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
+ An error is raised if <replaceable>start_lsn</replaceable> is not
+ available. For example, usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
- relblocknumber, forkname,
- substring(blockdata for 24) as block_trimmed,
+postgres=# SELECT start_lsn, end_lsn, prev_lsn, reltablespace,
+ reldatabase, relfilenode, relblocknumber,
+ blockid, xid, resource_manager, record_type,
+ record_length, main_data_length, description,
+ forkname, substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/14BF938', '0/14BFBA0');
+-[ RECORD 1 ]----+---------------------------------------------------
+start_lsn | 0/14BFA20
+end_lsn | 0/14BFB38
+prev_lsn | 0/14BF9A8
+reltablespace | 1663
+reldatabase | 5
+relfilenode | 16391
+relblocknumber | 9
+blockid | 0
+xid | 735
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 279
+main_data_length | 14
+description | off 1 xmax 735 flags 0x10 ; new off 5 xmax 0
+forkname | main
+block_trimmed | \x02800128180102000000
+fpi_trimmed | \x00000000d0f84b01000000002c00601f00200420df020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
+ <note>
+ <para>
+ Note that although <function>pg_get_wal_records_info</function>
+ and <function>pg_get_wal_block_info</function> functions return
+ information of WAL records in common, the
+ <function>pg_get_wal_records_info</function> outputs block information
+ (without block data and FPI data) as a text column
+ (<literal>NULL</literal> if none), and
+ <function>pg_get_wal_block_info</function> outputs one row for each block
+ (including block data and FPI data) associated with each WAL record, omitting
+ WAL records that contain no block references.
+ </para>
+ </note>
</listitem>
</varlistentry>
--
2.34.1
On Sun, Mar 26, 2023 at 8:41 PM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:
I'd still put the LSN data before the three OIDs for consistency with
the structures, though my opinion does not seem to count much..I agree with Michael on this point. Also, although it may not be
significant for SQL, the rows are output in lsn order from the
function.
I guess that it makes sense to have the LSN data first, but to have
the other columns after that. I certainly don't dislike that approach.
I just noticed is that "forkname" appears towards the end among
declared output parameters, even in Bharath's v6. I think that it
should be after "relblocknumber" instead, because it is conceptually
part of the "composite primary key", and so belongs right next to
"relblocknumber". I failed to mention this detail upthread, I think.
--
Peter Geoghegan
On Mon, Mar 27, 2023 at 12:42 AM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:
Thanks. Here's the v6 patch (last patch that I have with me for
pg_walinspect) for adding per-record info to pg_get_wal_block_info.
Note that I addressed all review comments received so far. Any
thoughts?
Looking at this now, with the intention of committing it for 16.
In addition to what I said a little while ago about the forknum
parameter and parameter ordering, I have a concern about the data
type: perhaps the forknum paramater should be declared as
"relforknumber smallint", instead of using text? That would match the
approach taken by pg_buffercache, and would be more efficient.
I don't think that using a text column with the fork name adds too
much, since this is after all supposed to be a tool used by experts.
Plus it's usually pretty clear what it is from context. Not that many
WAL records touch the visibility map, and those that do make it
relatively obvious which block is from the VM based on other details.
Details such as blockid and relblocknumber (the VM is approximately
32k times smaller than the heap). Once I see that the record is (say)
a VISIBLE record, I'm already looking at the order of each block
reference, and maybe at relblocknumber -- I'm not likely to visually
scan the forknum column at all.
--
Peter Geoghegan
On Mon, Mar 27, 2023 at 4:59 PM Peter Geoghegan <pg@bowt.ie> wrote:
Looking at this now, with the intention of committing it for 16.
I see a bug on HEAD, following yesterday's commit 0276ae42dd.
GetWALRecordInfo() will now output the value of the fpi_len variable
before it has actually been set by our call to XXXX. So it'll always
be 0.
Can you post a bugfix patch for this, Bharath?
--
Peter Geoghegan
On Mon, Mar 27, 2023 at 4:59 PM Peter Geoghegan <pg@bowt.ie> wrote:
Looking at this now, with the intention of committing it for 16.
Attached revision v7 adjusts the column order. This is still WIP, but
gives a good idea of the direction I'm going in.
v7 makes the column output look like this:
pg@regression:5432 [1209761]=# select * from
pg_get_wal_block_info('0/10D8280', '0/10D82B8');
┌─[ RECORD 1 ]─────┬──────────────────────────────────────────────────┐
│ start_lsn │ 0/10D8280 │
│ end_lsn │ 0/10D82B8 │
│ prev_lsn │ 0/10D8208 │
│ blockid │ 0 │
│ reltablespace │ 1,663 │
│ reldatabase │ 1 │
│ relfilenode │ 2,610 │
│ relforknumber │ 0 │
│ relblocknumber │ 0 │
│ xid │ 0 │
│ resource_manager │ Heap2 │
│ record_type │ PRUNE │
│ record_length │ 56 │
│ main_data_length │ 8 │
│ block_fpi_length │ 0 │
│ block_fpi_info │ ∅ │
│ description │ snapshotConflictHorizon 10 nredirected 0 ndead 1 │
│ block_data │ \x2b00 │
│ fpi_data │ ∅ │
└──────────────────┴──────────────────────────────────────────────────┘
Few things to note here:
* blockid is now given more prominence, in that it appears just after
the LSN related output params.
The idea is that the blockid is conceptually closer to the LSN stuff.
* There is now a smallint relblocknumber, for consistency with
pg_buffercache. This replaces the previous text column.
As I mentioned earlier on, I don't think that a text based output
param adds much.
* The integer fields record_length, main_data_length, block_fpi_length
all now all appear together. This is for consistency with the similar
output params from the other function.
v7 allows block_fpi_length to be 0 instead of NULL, for consistency
with the fpi_length param from the other function. The intention is to
make it relatively obvious which information "comes from the record"
and which information "comes from the block reference".
* The block_fpi_info output param appears right after block_fpi_info.
This is not very verbose, and I find that it is hard to find by
scrolling horizontally in pspg if it gets placed after either
block_data or fpi_data, which tend to have at least some huge/wide
outputs. It seemed sensible to place block_fpi_info next to the param
I'm now calling block_fpi_length, after it was moved next to the other
"length" fields.
How do people feel about this approach? I'll need to write
documentation to help the user to understand what's really going on
here.
--
Peter Geoghegan
Attachments:
v7-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/octet-stream; name=v7-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From 818df7e5f7350bbac65f08a186b3bd90e2c343e8 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 27 Mar 2023 19:09:05 -0700
Subject: [PATCH v7 1/2] Emit WAL record info via pg_get_wal_block_info
---
.../pg_walinspect/expected/pg_walinspect.out | 4 +-
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 20 +-
contrib/pg_walinspect/pg_walinspect.c | 172 +++++++++++-------
contrib/pg_walinspect/sql/pg_walinspect.sql | 4 +-
doc/src/sgml/pgwalinspect.sgml | 99 ++++++----
5 files changed, 185 insertions(+), 114 deletions(-)
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index b9e2bb994..a481deaac 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -121,7 +121,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
-- Check if we get block data from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_data IS NOT NULL;
ok
----
t
@@ -134,7 +134,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
- WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi_data IS NOT NULL;
ok
----
t
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b446..4ff82c373 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,17 +12,25 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
+ OUT relforknumber int2,
OUT relblocknumber int8,
- OUT forkname text,
- OUT blockdata bytea,
- OUT fpi bytea,
- OUT fpilen int4,
- OUT fpiinfo text[]
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT block_fpi_length int4,
+ OUT block_fpi_info text[],
+ OUT description text,
+ OUT block_data bytea,
+ OUT fpi_data bytea
)
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'pg_get_wal_block_info'
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 293373412..834304adc 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -177,6 +177,10 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
/*
* Get a single WAL record info.
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
@@ -230,24 +234,47 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of block information from a single record (FPI and block
- * information).
+ * Get a single WAL record info along with its block information (block data
+ * and FPI).
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
+#define PG_GET_WAL_BLOCK_INFO_COLS 19
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ RmgrData desc;
+ const char *record_type;
+ StringInfoData rec_desc;
+
+ Assert(XLogRecHasAnyBlockRefs(record));
+
+ desc = GetRmgr(XLogRecGetRmid(record));
+ record_type = desc.rm_identify(XLogRecGetInfo(record));
+
+ if (record_type == NULL)
+ record_type = psprintf("UNKNOWN (%x)",
+ XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
DecodedBkpBlock *blk;
BlockNumber blkno;
RelFileLocator rnode;
- ForkNumber fork;
+ ForkNumber forknum;
Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ uint32 block_fpi_len = 0;
+ PGAlignedBlock buf;
+ Page page;
+ ArrayType *block_fpi_info = NULL;
int i = 0;
if (!XLogRecHasBlockRef(record, block_id))
@@ -256,70 +283,22 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
blk = XLogRecGetBlock(record, block_id);
(void) XLogRecGetBlockTagExtended(record, block_id,
- &rnode, &fork, &blkno, NULL);
-
- values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = Int16GetDatum(block_id);
- values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
- values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
- values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
- values[i++] = Int64GetDatum((int64) blkno);
-
- if (fork >= 0 && fork <= MAX_FORKNUM)
- values[i++] = CStringGetTextDatum(forkNames[fork]);
- else
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg_internal("invalid fork number: %u", fork)));
-
- /* Block data */
- if (blk->has_data)
- {
- bytea *raw_data;
-
- /* Initialize bytea buffer to copy the data to */
- raw_data = (bytea *) palloc(blk->data_len + VARHDRSZ);
- SET_VARSIZE(raw_data, blk->data_len + VARHDRSZ);
-
- /* Copy the data */
- memcpy(VARDATA(raw_data), blk->data, blk->data_len);
- values[i++] = PointerGetDatum(raw_data);
- }
- else
- {
- /* No data, so set this field to NULL */
- nulls[i++] = true;
- }
+ &rnode, &forknum, &blkno, NULL);
if (blk->has_image)
{
- PGAlignedBlock buf;
- Page page;
- bytea *raw_page;
int bitcnt;
int cnt = 0;
Datum *flags;
- ArrayType *a;
-
- page = (Page) buf.data;
/* Full page image exists, so let's save it */
+ page = (Page) buf.data;
if (!RestoreBlockImage(record, block_id, page))
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg_internal("%s", record->errormsg_buf)));
-
- /* Initialize bytea buffer to copy the FPI to */
- raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
- SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
-
- /* Take a verbatim copy of the FPI */
- memcpy(VARDATA(raw_page), page, BLCKSZ);
-
- values[i++] = PointerGetDatum(raw_page);
- values[i++] = UInt32GetDatum(blk->bimg_len);
-
- /* FPI flags */
+ /* Save block_fpi_len and block_fpi_info */
+ block_fpi_len = blk->bimg_len;
bitcnt = pg_popcount((const char *) &blk->bimg_info,
sizeof(uint8));
/* Build set of raw flags */
@@ -337,15 +316,71 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
flags[cnt++] = CStringGetTextDatum("COMPRESS_ZSTD");
Assert(cnt <= bitcnt);
- a = construct_array_builtin(flags, cnt, TEXTOID);
- values[i++] = PointerGetDatum(a);
+ block_fpi_info = construct_array_builtin(flags, cnt, TEXTOID);
+ }
+
+ values[i++] = LSNGetDatum(record->ReadRecPtr);
+ values[i++] = LSNGetDatum(record->EndRecPtr);
+ values[i++] = LSNGetDatum(XLogRecGetPrev(record));
+ values[i++] = Int16GetDatum(block_id);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
+ values[i++] = Int16GetDatum(forknum);
+ values[i++] = Int64GetDatum((int64) blkno);
+ values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+ values[i++] = CStringGetTextDatum(desc.rm_name);
+ values[i++] = CStringGetTextDatum(record_type);
+
+ /* record_length, main_data_length, block_fpi_length */
+ values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
+ values[i++] = UInt32GetDatum(block_fpi_len);
+
+ /* block_fpi_info status flags */
+ if (block_fpi_info)
+ values[i++] = PointerGetDatum(block_fpi_info);
+ else
+ nulls[i++] = true;
+
+ /* description */
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ else
+ nulls[i++] = true;
+
+ /* block_data output parameter */
+ if (blk->has_data)
+ {
+ bytea *raw_data;
+
+ /* Initialize bytea buffer to copy the data to */
+ raw_data = (bytea *) palloc(blk->data_len + VARHDRSZ);
+ SET_VARSIZE(raw_data, blk->data_len + VARHDRSZ);
+
+ /* Copy the data */
+ memcpy(VARDATA(raw_data), blk->data, blk->data_len);
+ values[i++] = PointerGetDatum(raw_data);
}
else
+ nulls[i++] = true;
+
+ /* fpi_data output parameter */
+ if (blk->has_image)
{
- /* No full page image, so store NULLs for all its fields */
- memset(&nulls[i], true, 3 * sizeof(bool));
- i += 3;
+ bytea *raw_page;
+
+ /* Initialize bytea buffer to copy the FPI to */
+ raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+ SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+
+ /* Take a verbatim copy of the FPI */
+ memcpy(VARDATA(raw_page), page, BLCKSZ);
+
+ values[i++] = PointerGetDatum(raw_page);
}
+ else
+ nulls[i++] = true;
Assert(i == PG_GET_WAL_BLOCK_INFO_COLS);
@@ -357,11 +392,12 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
}
/*
- * Get information about all the blocks saved in WAL records between start
- * and end LSNs. This produces information about the full page images with
- * their relation information, and the data saved in each block associated
- * to a record. Decompression is applied to the full page images, if
- * necessary.
+ * Get info of all WAL records having block references between start LSN and
+ * end LSN. It outputs one row for each WAL record's block reference (note that
+ * a single WAL record can have one or more blocks associated with it). It
+ * outputs WAL record information as well as block information (block data
+ * and/or FPI). Decompression is applied to the full page images, if necessary.
+ * It omits WAL records that contain no block references.
*/
Datum
pg_get_wal_block_info(PG_FUNCTION_ARGS)
@@ -485,7 +521,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn)
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
static void
GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
@@ -537,7 +573,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
Datum
pg_get_wal_records_info(PG_FUNCTION_ARGS)
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 2f0a909c1..34e83f529 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -78,7 +78,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
-- Check if we get block data from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_data IS NOT NULL;
-- Force full-page image on the next update.
SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
@@ -87,7 +87,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
- WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND fpi_data IS NOT NULL;
-- ===================================================================
-- Tests for permissions
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d..ffcb39bb2 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -94,9 +94,10 @@ block_ref | blkref #0: rel 1663/5/60221 fork main blk 2
<para>
Gets information of all the valid WAL records between
<replaceable>start_lsn</replaceable> and <replaceable>end_lsn</replaceable>.
- Returns one row per WAL record. If a future
- <replaceable>end_lsn</replaceable> (i.e. ahead of the current LSN of
- the server) is specified, it returns information until the end of WAL.
+ Returns one row per WAL record. If <replaceable>end_lsn</replaceable>
+ specified is in future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
The function raises an error if <replaceable>start_lsn</replaceable>
is not available. For example, usage of the function is as follows:
<screen>
@@ -133,11 +134,13 @@ block_ref |
<replaceable>end_lsn</replaceable>. By default, it returns one row per
<replaceable>resource_manager</replaceable> type. When
<replaceable>per_record</replaceable> is set to <literal>true</literal>,
- it returns one row per <replaceable>record_type</replaceable>. If a
- future <replaceable>end_lsn</replaceable> (i.e. ahead of the current
- LSN of the server) is specified, it returns statistics until the end
- of WAL. An error is raised if <replaceable>start_lsn</replaceable> is
- not available. For example, usage of the function is as follows:
+ it returns one row per <replaceable>record_type</replaceable>.
+ If <replaceable>end_lsn</replaceable> specified is in future (i.e. ahead
+ of the current LSN of the server) or <literal>FFFFFFFF/FFFFFFFF</literal>
+ (i.e. highest possible value for <type>pg_lsn</type> type), it returns
+ statistics until the end of WAL. An error is raised if
+ <replaceable>start_lsn</replaceable> is not available. For example, usage
+ of the function is as follows:
<screen>
postgres=# SELECT * FROM pg_get_wal_stats('0/1E847D00', '0/1E84F500')
WHERE count > 0 LIMIT 1 AND
@@ -164,38 +167,62 @@ combined_size_percentage | 2.8634072910530795
<listitem>
<para>
- Gets a copy of the block information stored in WAL records. This includes
- copies of the block data (<literal>NULL</literal> if none) and full page
- images as <type>bytea</type> values (after
- applying decompression when necessary, or <literal>NULL</literal> if none)
- and their information associated with all the valid WAL records between
- <replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per block registered
- in a WAL record. If a future <replaceable>end_lsn</replaceable> (i.e.
- ahead of the current LSN of the server) is specified, it returns
- statistics until the end of WAL. An error is raised if
- <replaceable>start_lsn</replaceable> is not available. For example,
- usage of the function is as follows:
+ Gets both WAL record and block information of all the valid WAL records
+ between <replaceable>start_lsn</replaceable> and
+ <replaceable>end_lsn</replaceable>. It omits WAL records that contain no
+ block references. The block information includes copies of the block data
+ (<literal>NULL</literal> if none) and full page images as
+ <type>bytea</type> values (after applying decompression when necessary,
+ or <literal>NULL</literal> if none). Returns one row per block registered
+ in a WAL record. If <replaceable>end_lsn</replaceable> specified is in
+ future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
+ An error is raised if <replaceable>start_lsn</replaceable> is not
+ available. For example, usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
- relblocknumber, forkname,
- substring(blockdata for 24) as block_trimmed,
+postgres=# SELECT start_lsn, end_lsn, prev_lsn, reltablespace,
+ reldatabase, relfilenode, relblocknumber,
+ blockid, xid, resource_manager, record_type,
+ record_length, main_data_length, description,
+ forkname, substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/14BF938', '0/14BFBA0');
+-[ RECORD 1 ]----+---------------------------------------------------
+start_lsn | 0/14BFA20
+end_lsn | 0/14BFB38
+prev_lsn | 0/14BF9A8
+reltablespace | 1663
+reldatabase | 5
+relfilenode | 16391
+relblocknumber | 9
+blockid | 0
+xid | 735
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 279
+main_data_length | 14
+description | off 1 xmax 735 flags 0x10 ; new off 5 xmax 0
+forkname | main
+block_trimmed | \x02800128180102000000
+fpi_trimmed | \x00000000d0f84b01000000002c00601f00200420df020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
+ <note>
+ <para>
+ Note that although <function>pg_get_wal_records_info</function>
+ and <function>pg_get_wal_block_info</function> functions return
+ information of WAL records in common, the
+ <function>pg_get_wal_records_info</function> outputs block information
+ (without block data and FPI data) as a text column
+ (<literal>NULL</literal> if none), and
+ <function>pg_get_wal_block_info</function> outputs one row for each block
+ (including block data and FPI data) associated with each WAL record, omitting
+ WAL records that contain no block references.
+ </para>
+ </note>
</listitem>
</varlistentry>
--
2.39.2
On Tue, Mar 28, 2023 at 5:29 AM Peter Geoghegan <pg@bowt.ie> wrote:
On Mon, Mar 27, 2023 at 12:42 AM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:Thanks. Here's the v6 patch (last patch that I have with me for
pg_walinspect) for adding per-record info to pg_get_wal_block_info.
Note that I addressed all review comments received so far. Any
thoughts?Looking at this now, with the intention of committing it for 16.
In addition to what I said a little while ago about the forknum
parameter and parameter ordering, I have a concern about the data
type: perhaps the forknum paramater should be declared as
"relforknumber smallint", instead of using text? That would match the
approach taken by pg_buffercache, and would be more efficient.I don't think that using a text column with the fork name adds too
much, since this is after all supposed to be a tool used by experts.
Plus it's usually pretty clear what it is from context. Not that many
WAL records touch the visibility map, and those that do make it
relatively obvious which block is from the VM based on other details.
Details such as blockid and relblocknumber (the VM is approximately
32k times smaller than the heap). Once I see that the record is (say)
a VISIBLE record, I'm already looking at the order of each block
reference, and maybe at relblocknumber -- I'm not likely to visually
scan the forknum column at all.
Hm, agreed. Changed in the attached v7-0002 patch. We can as well
write a case statement in the create function SQL to output forkname
instead forknumber, but I'd stop doing that to keep in sync with
pg_buffercache.
On Tue, Mar 28, 2023 at 6:37 AM Peter Geoghegan <pg@bowt.ie> wrote:
On Mon, Mar 27, 2023 at 4:59 PM Peter Geoghegan <pg@bowt.ie> wrote:
Looking at this now, with the intention of committing it for 16.
I see a bug on HEAD, following yesterday's commit 0276ae42dd.
GetWALRecordInfo() will now output the value of the fpi_len variable
before it has actually been set by our call to XXXX. So it'll always
be 0.Can you post a bugfix patch for this, Bharath?
Oh, thanks for finding it out. Fixed in the attached v7-0001 patch. I
also removed the "invalid fork number" error as users can figure that
out if at all the fork number is wrong.
On the ordering of the columns, I kept start_lsn, end_lsn and prev_lsn
first and then the rel** columns (this rel** columns order follows
pg_buffercache) and then block data related columns. Michael and
Kyotaro are of the opinion that it's better to keep LSNs first to be
consistent and also given that this function is WAL related, it makes
sense to have LSNs first.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v7-0001-Fix-fpi_len-issue-introduced-by-0276ae42dd-in-pg_.patchapplication/x-patch; name=v7-0001-Fix-fpi_len-issue-introduced-by-0276ae42dd-in-pg_.patchDownload
From f2fba90775ef2fee6ea6f04dff3a08c2ed7724f1 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Tue, 28 Mar 2023 01:46:08 +0000
Subject: [PATCH v7] Fix fpi_len issue introduced by 0276ae42dd in
pg_walinspect
0276ae42dd changed the code to emit block references only if
exists. That change moved the block references calucation code
around which left fpi_len to be emitted as always 0. This commit
fixes that issue.
---
contrib/pg_walinspect/pg_walinspect.c | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 2933734122..062e90dbce 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -186,6 +186,7 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
RmgrData desc;
uint32 fpi_len = 0;
StringInfoData rec_desc;
+ StringInfoData rec_blk_ref;
int i = 0;
desc = GetRmgr(XLogRecGetRmid(record));
@@ -197,6 +198,12 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
initStringInfo(&rec_desc);
desc.rm_desc(&rec_desc, record);
+ if (XLogRecHasAnyBlockRefs(record))
+ {
+ initStringInfo(&rec_blk_ref);
+ XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
+ }
+
values[i++] = LSNGetDatum(record->ReadRecPtr);
values[i++] = LSNGetDatum(record->EndRecPtr);
values[i++] = LSNGetDatum(XLogRecGetPrev(record));
@@ -205,7 +212,6 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
values[i++] = CStringGetTextDatum(id);
values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
-
values[i++] = UInt32GetDatum(fpi_len);
if (rec_desc.len > 0)
@@ -213,15 +219,8 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
else
nulls[i++] = true;
- /* Block references. */
if (XLogRecHasAnyBlockRefs(record))
- {
- StringInfoData rec_blk_ref;
-
- initStringInfo(&rec_blk_ref);
- XLogRecGetBlockRefInfo(record, false, true, &rec_blk_ref, &fpi_len);
values[i++] = CStringGetTextDatum(rec_blk_ref.data);
- }
else
nulls[i++] = true;
--
2.34.1
v7-0002-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/x-patch; name=v7-0002-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From 55126ffd91bae629d1116949467a53ba8ba58f8b Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Tue, 28 Mar 2023 02:41:34 +0000
Subject: [PATCH v7] Emit WAL record info via pg_get_wal_block_info
---
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 16 ++-
contrib/pg_walinspect/pg_walinspect.c | 66 +++++++++----
doc/src/sgml/pgwalinspect.sgml | 98 ++++++++++++-------
3 files changed, 123 insertions(+), 57 deletions(-)
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b4467..5d7dabee9a 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,13 +12,21 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
- OUT blockid int2,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
+ OUT relfilenode oid,
OUT reltablespace oid,
OUT reldatabase oid,
- OUT relfilenode oid,
+ OUT relforknumber int2,
OUT relblocknumber int8,
- OUT forkname text,
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT description text,
+ OUT blockid int2,
OUT blockdata bytea,
OUT fpi bytea,
OUT fpilen int4,
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 062e90dbce..9ef7eaace9 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -177,6 +177,10 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
/*
* Get a single WAL record info.
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
@@ -229,15 +233,33 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of block information from a single record (FPI and block
- * information).
+ * Get a single WAL record info along with its block information (block data
+ * and FPI).
+ *
+ * Note that the per-record information fetching code is same in both
+ * GetWALBlockInfo() and GetWALRecordInfo(), it's only the output columns order
+ * that differs. Try to keep this code in sync.
*/
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
+#define PG_GET_WAL_BLOCK_INFO_COLS 19
int block_id;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ RmgrData desc;
+ const char *id;
+ StringInfoData rec_desc;
+
+ Assert(XLogRecHasAnyBlockRefs(record));
+
+ desc = GetRmgr(XLogRecGetRmid(record));
+ id = desc.rm_identify(XLogRecGetInfo(record));
+
+ if (id == NULL)
+ id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
@@ -258,18 +280,25 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
&rnode, &fork, &blkno, NULL);
values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = Int16GetDatum(block_id);
+ values[i++] = LSNGetDatum(record->EndRecPtr);
+ values[i++] = LSNGetDatum(XLogRecGetPrev(record));
+ values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
- values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
+ values[i++] = Int16GetDatum(fork);
values[i++] = Int64GetDatum((int64) blkno);
-
- if (fork >= 0 && fork <= MAX_FORKNUM)
- values[i++] = CStringGetTextDatum(forkNames[fork]);
+ values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+ values[i++] = CStringGetTextDatum(desc.rm_name);
+ values[i++] = CStringGetTextDatum(id);
+ values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
+
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
else
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg_internal("invalid fork number: %u", fork)));
+ nulls[i++] = true;
+
+ values[i++] = Int16GetDatum(block_id);
/* Block data */
if (blk->has_data)
@@ -356,11 +385,12 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
}
/*
- * Get information about all the blocks saved in WAL records between start
- * and end LSNs. This produces information about the full page images with
- * their relation information, and the data saved in each block associated
- * to a record. Decompression is applied to the full page images, if
- * necessary.
+ * Get info of all WAL records having block references between start LSN and
+ * end LSN. It outputs one row for each WAL record's block reference (note that
+ * a single WAL record can have one or more blocks associated with it). It
+ * outputs WAL record information as well as block information (block data
+ * and/or FPI). Decompression is applied to the full page images, if necessary.
+ * It omits WAL records that contain no block references.
*/
Datum
pg_get_wal_block_info(PG_FUNCTION_ARGS)
@@ -484,7 +514,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn)
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
static void
GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
@@ -536,7 +566,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
Datum
pg_get_wal_records_info(PG_FUNCTION_ARGS)
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d6..f39f9df2ba 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -94,9 +94,10 @@ block_ref | blkref #0: rel 1663/5/60221 fork main blk 2
<para>
Gets information of all the valid WAL records between
<replaceable>start_lsn</replaceable> and <replaceable>end_lsn</replaceable>.
- Returns one row per WAL record. If a future
- <replaceable>end_lsn</replaceable> (i.e. ahead of the current LSN of
- the server) is specified, it returns information until the end of WAL.
+ Returns one row per WAL record. If <replaceable>end_lsn</replaceable>
+ specified is in future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
The function raises an error if <replaceable>start_lsn</replaceable>
is not available. For example, usage of the function is as follows:
<screen>
@@ -133,11 +134,13 @@ block_ref |
<replaceable>end_lsn</replaceable>. By default, it returns one row per
<replaceable>resource_manager</replaceable> type. When
<replaceable>per_record</replaceable> is set to <literal>true</literal>,
- it returns one row per <replaceable>record_type</replaceable>. If a
- future <replaceable>end_lsn</replaceable> (i.e. ahead of the current
- LSN of the server) is specified, it returns statistics until the end
- of WAL. An error is raised if <replaceable>start_lsn</replaceable> is
- not available. For example, usage of the function is as follows:
+ it returns one row per <replaceable>record_type</replaceable>.
+ If <replaceable>end_lsn</replaceable> specified is in future (i.e. ahead
+ of the current LSN of the server) or <literal>FFFFFFFF/FFFFFFFF</literal>
+ (i.e. highest possible value for <type>pg_lsn</type> type), it returns
+ statistics until the end of WAL. An error is raised if
+ <replaceable>start_lsn</replaceable> is not available. For example, usage
+ of the function is as follows:
<screen>
postgres=# SELECT * FROM pg_get_wal_stats('0/1E847D00', '0/1E84F500')
WHERE count > 0 LIMIT 1 AND
@@ -164,38 +167,63 @@ combined_size_percentage | 2.8634072910530795
<listitem>
<para>
- Gets a copy of the block information stored in WAL records. This includes
- copies of the block data (<literal>NULL</literal> if none) and full page
- images as <type>bytea</type> values (after
- applying decompression when necessary, or <literal>NULL</literal> if none)
- and their information associated with all the valid WAL records between
- <replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per block registered
- in a WAL record. If a future <replaceable>end_lsn</replaceable> (i.e.
- ahead of the current LSN of the server) is specified, it returns
- statistics until the end of WAL. An error is raised if
- <replaceable>start_lsn</replaceable> is not available. For example,
- usage of the function is as follows:
+ Gets both WAL record and block information of all the valid WAL records
+ between <replaceable>start_lsn</replaceable> and
+ <replaceable>end_lsn</replaceable>. It omits WAL records that contain no
+ block references. The block information includes copies of the block data
+ (<literal>NULL</literal> if none) and full page images as
+ <type>bytea</type> values (after applying decompression when necessary,
+ or <literal>NULL</literal> if none). Returns one row per block registered
+ in a WAL record. If <replaceable>end_lsn</replaceable> specified is in
+ future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
+ An error is raised if <replaceable>start_lsn</replaceable> is not
+ available. For example, usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
- relblocknumber, forkname,
+postgres=# SELECT start_lsn, end_lsn, prev_lsn, relfilenode,
+ reltablespace, reldatabase, relforknumber,
+ relblocknumber, xid, resource_manager,
+ record_type, record_length, main_data_length,
+ description, blockid,
substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/14BF938', '0/14BFBA0');
+-[ RECORD 1 ]----+---------------------------------------------------
+start_lsn | 0/14BFA20
+end_lsn | 0/14BFB38
+prev_lsn | 0/14BF9A8
+relfilenode | 16391
+reltablespace | 1663
+reldatabase | 5
+relforknumber | 0
+relblocknumber | 9
+xid | 735
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 279
+main_data_length | 14
+description | off 1 xmax 735 flags 0x10 ; new off 5 xmax 0
+blockid | 7
+block_trimmed | \x02800128180102000000
+fpi_trimmed | \x00000000d0f84b01000000002c00601f00200420df020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
+ <note>
+ <para>
+ Note that although <function>pg_get_wal_records_info</function>
+ and <function>pg_get_wal_block_info</function> functions return
+ information of WAL records in common, the
+ <function>pg_get_wal_records_info</function> outputs block information
+ (without block data and FPI data) as a text column
+ (<literal>NULL</literal> if none), and
+ <function>pg_get_wal_block_info</function> outputs one row for each block
+ (including block data and FPI data) associated with each WAL record, omitting
+ WAL records that contain no block references.
+ </para>
+ </note>
</listitem>
</varlistentry>
--
2.34.1
On Mon, Mar 27, 2023 at 06:07:09PM -0700, Peter Geoghegan wrote:
I see a bug on HEAD, following yesterday's commit 0276ae42dd.
GetWALRecordInfo() will now output the value of the fpi_len variable
before it has actually been set by our call to XXXX. So it'll always
be 0.
Indeed, good catch. It looks like I was not careful enough with the
block controlled by XLogRecHasAnyBlockRefs().
--
Michael
On Mon, Mar 27, 2023 at 7:47 PM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:
Hm, agreed. Changed in the attached v7-0002 patch. We can as well
write a case statement in the create function SQL to output forkname
instead forknumber, but I'd stop doing that to keep in sync with
pg_buffercache.
I just don't see much value in any textual representation of fork
name, however generated. In practice it's just not adding very much
useful information. It is mostly useful as a way of filtering block
references, which makes simple integers more natural.
Oh, thanks for finding it out. Fixed in the attached v7-0001 patch. I
also removed the "invalid fork number" error as users can figure that
out if at all the fork number is wrong.
Pushed just now.
On the ordering of the columns, I kept start_lsn, end_lsn and prev_lsn
first and then the rel** columns (this rel** columns order follows
pg_buffercache) and then block data related columns. Michael and
Kyotaro are of the opinion that it's better to keep LSNs first to be
consistent and also given that this function is WAL related, it makes
sense to have LSNs first.
Right, but I didn't change that part in the revision of the patch I
posted. Those columns still came first, and were totally consistent
with the pg_get_wal_record_info function.
I think that there was a "mid air collision" here, where we both
posted patches that we each called v7 within minutes of each other.
Just to be clear, I ended up with a column order as described here in
my revision:
/messages/by-id/CAH2-WzmzO-AU4QSbnzzANBkrpg=4CuOd3scVtv+7x65e+QKBZw@mail.gmail.com
It now occurs to me that "fpi_data" should perhaps be called
"block_fpi_data". What do you think?
--
Peter Geoghegan
On Tue, Mar 28, 2023 at 11:15:17AM -0700, Peter Geoghegan wrote:
On Mon, Mar 27, 2023 at 7:47 PM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:Hm, agreed. Changed in the attached v7-0002 patch. We can as well
write a case statement in the create function SQL to output forkname
instead forknumber, but I'd stop doing that to keep in sync with
pg_buffercache.I just don't see much value in any textual representation of fork
name, however generated. In practice it's just not adding very much
useful information. It is mostly useful as a way of filtering block
references, which makes simple integers more natural.
I disagree with this argument. Personally, I have a *much* better
experience with textual representation because there is no need to
cross-check the internals of the code in case you don't remember what
a given number means in an enum or in a set of bits, especially if
you're in a hurry of looking at a production or customer deployment.
In short, it makes for less mistakes because you don't have to think
about some extra mapping between some integers and what they actually
mean through text. The clauses you'd apply for a group by on the
forks, or for a filter with IN clauses don't change, they're just made
easier to understand for the common user, and that includes
experienced people. We'd better think about that like the text[]
arrays we use for the flag values, like the FPI flags, or why we've
introduced text[] for the HEAP_* flags in the heap functions of
pageinspect.
There's even more consistency with pageinspect in using a fork name,
where we can pass down a fork name to get a raw page.
--
Michael
On Tue, Mar 28, 2023 at 3:34 PM Michael Paquier <michael@paquier.xyz> wrote:
I disagree with this argument. Personally, I have a *much* better
experience with textual representation because there is no need to
cross-check the internals of the code in case you don't remember what
a given number means in an enum or in a set of bits, especially if
you're in a hurry of looking at a production or customer deployment.
I couldn't tell you which fork number is which right off the top of my
head, either (except that main fork is 0). But it doesn't matter.
There are only ever two relevant fork numbers. And even if that
changes, it's perfectly clear which is which from context. There will
be two block references for a given (say) VISIBLE record, one of which
is obviously for the main fork, the other of which is obviously for
the VM fork.
Plus it's just more consistent that way. The existing
pg_get_wal_block_info() output parameters look like they were directly
copied from pg_buffercache (the same names are already used), so why
not do the same with relforknumber?
In short, it makes for less mistakes because you don't have to think
about some extra mapping between some integers and what they actually
mean through text. The clauses you'd apply for a group by on the
forks, or for a filter with IN clauses don't change, they're just made
easier to understand for the common user, and that includes
experienced people.
It's slightly harder to write a query that filters on text, and it
won't perform as well.
We'd better think about that like the text[]
arrays we use for the flag values, like the FPI flags, or why we've
introduced text[] for the HEAP_* flags in the heap functions of
pageinspect.
I think that those other things are fine, because they're much less
obvious, and are very unlikely to ever appear in a query predicate.
There's even more consistency with pageinspect in using a fork name,
where we can pass down a fork name to get a raw page.
pageinspect provides a way of mapping raw infomask/informask2 fields
to status flags, through its heap_tuple_infomask_flags function. But
the actual function that returns information from tuples
(heap_page_items) faithfully represents the raw on-disk format
directly -- the heap_tuple_infomask_flags part is totally optional. So
that doesn't seem like an example that supports your argument -- quite
the contrary.
I wouldn't mind adding something to the docs about fork number. In
fact it's definitely going to be necessary to have an explanation that
at least matches the one from the pg_buffercache docs.
--
Peter Geoghegan
On Mon, Mar 27, 2023 at 7:40 PM Peter Geoghegan <pg@bowt.ie> wrote:
Attached revision v7 adjusts the column order. This is still WIP, but
gives a good idea of the direction I'm going in.
A couple of small tweaks to this appear in the attached revision, v8.
Now it looks like this:
pg@regression:5432 [1294231]=# select * from
pg_get_wal_block_info('0/10E9D80' , '0/10E9DC0');
┌─[ RECORD 1 ]──────┬────────────────────────────────────┐
│ start_lsn │ 0/10E9D80 │
│ end_lsn │ 0/10E9DC0 │
│ prev_lsn │ 0/10E9860 │
│ blockid │ 0 │
│ reltablespace │ 1,663 │
│ reldatabase │ 1 │
│ relfilenode │ 2,690 │
│ relforknumber │ 0 │
│ relblocknumber │ 5 │
│ xid │ 117 │
│ resource_manager │ Btree │
│ record_type │ INSERT_LEAF │
│ record_length │ 64 │
│ main_data_length │ 2 │
│ block_data_length │ 16 │
│ block_fpi_length │ 0 │
│ block_fpi_info │ ∅ │
│ description │ off 14 │
│ block_data │ \x00005400020010001407000000000000 │
│ block_fpi_data │ ∅ │
└───────────────────┴────────────────────────────────────┘
This is similar to what I showed recently for v7. Just two changes:
* The parameter formerly called fpi_data is now called block_fpi_data,
to enforce the idea that it's block specific (and for consistency with
the related block_fpi_length param).
* There is now a new column, which makes the size of block_data
explicit: block_data_length
This made sense on consistency grounds, since we already had a
block_fpi_length. But it also seems quite useful. In this example, I
can immediately see that this INSERT_LEAF record needed 2 bytes for
the block offset number (indicating off 14), and 16 bytes of block
data for the IndexTuple data itself. There is a more recognizable
pattern to things, since the size of tuples for a given relation tends
to be somewhat homogenous. block_data_length also seems like it could
provide users with a handy way of filtering out definitely-irrelevant
block references.
--
Peter Geoghegan
Attachments:
v8-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/octet-stream; name=v8-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From d49a4329630bd9d943cd770e180c3c70bb0c7817 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 27 Mar 2023 19:09:05 -0700
Subject: [PATCH v8] Emit WAL record info via pg_get_wal_block_info
---
.../pg_walinspect/expected/pg_walinspect.out | 4 +-
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 21 +-
contrib/pg_walinspect/pg_walinspect.c | 207 +++++++++++-------
contrib/pg_walinspect/sql/pg_walinspect.sql | 4 +-
doc/src/sgml/pgwalinspect.sgml | 99 ++++++---
5 files changed, 211 insertions(+), 124 deletions(-)
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index b9e2bb994..950f0e921 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -121,7 +121,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
-- Check if we get block data from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_data IS NOT NULL;
ok
----
t
@@ -134,7 +134,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
- WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_fpi_data IS NOT NULL;
ok
----
t
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b446..8a314158e 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,17 +12,26 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
+ OUT relforknumber int2,
OUT relblocknumber int8,
- OUT forkname text,
- OUT blockdata bytea,
- OUT fpi bytea,
- OUT fpilen int4,
- OUT fpiinfo text[]
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT block_data_length int4,
+ OUT block_fpi_length int4,
+ OUT block_fpi_info text[],
+ OUT description text,
+ OUT block_data bytea,
+ OUT block_fpi_data bytea
)
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'pg_get_wal_block_info'
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 062e90dbc..324e771f1 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -176,13 +176,18 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
}
/*
- * Get a single WAL record info.
+ * Output values that make up a row describing caller's WAL record.
+ *
+ * This function leaks memory. Caller may need to use its own custom memory
+ * context.
+ *
+ * Keep this in sync with GetWALBlockInfo.
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
bool *nulls, uint32 ncols)
{
- const char *id;
+ const char *record_type;
RmgrData desc;
uint32 fpi_len = 0;
StringInfoData rec_desc;
@@ -190,10 +195,10 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
int i = 0;
desc = GetRmgr(XLogRecGetRmid(record));
- id = desc.rm_identify(XLogRecGetInfo(record));
+ record_type = desc.rm_identify(XLogRecGetInfo(record));
- if (id == NULL)
- id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+ if (record_type == NULL)
+ record_type = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
initStringInfo(&rec_desc);
desc.rm_desc(&rec_desc, record);
@@ -209,7 +214,7 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
values[i++] = LSNGetDatum(XLogRecGetPrev(record));
values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
values[i++] = CStringGetTextDatum(desc.rm_name);
- values[i++] = CStringGetTextDatum(id);
+ values[i++] = CStringGetTextDatum(record_type);
values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
values[i++] = UInt32GetDatum(fpi_len);
@@ -229,101 +234,84 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of block information from a single record (FPI and block
- * information).
+ * Output one or more rows in rsinfo tuple store, each describing a single
+ * block reference from caller's WAL record. (Should only be called with
+ * records that have block references.)
+ *
+ * This function leaks memory. Caller may need to use its own custom memory
+ * context.
+ *
+ * Keep this in sync with GetWALRecordInfo.
*/
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
- int block_id;
+#define PG_GET_WAL_BLOCK_INFO_COLS 20
+ int blockid;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ RmgrData desc;
+ const char *record_type;
+ StringInfoData rec_desc;
- for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ Assert(XLogRecHasAnyBlockRefs(record));
+
+ desc = GetRmgr(XLogRecGetRmid(record));
+ record_type = desc.rm_identify(XLogRecGetInfo(record));
+
+ if (record_type == NULL)
+ record_type = psprintf("UNKNOWN (%x)",
+ XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
+
+ for (blockid = 0; blockid <= XLogRecMaxBlockId(record); blockid++)
{
DecodedBkpBlock *blk;
BlockNumber blkno;
RelFileLocator rnode;
- ForkNumber fork;
+ ForkNumber forknum;
Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ uint32 block_data_len = 0,
+ block_fpi_len = 0;
+ PGAlignedBlock buf;
+ Page page;
+ ArrayType *block_fpi_info = NULL;
int i = 0;
- if (!XLogRecHasBlockRef(record, block_id))
+ if (!XLogRecHasBlockRef(record, blockid))
continue;
- blk = XLogRecGetBlock(record, block_id);
+ blk = XLogRecGetBlock(record, blockid);
- (void) XLogRecGetBlockTagExtended(record, block_id,
- &rnode, &fork, &blkno, NULL);
+ (void) XLogRecGetBlockTagExtended(record, blockid,
+ &rnode, &forknum, &blkno, NULL);
- values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = Int16GetDatum(block_id);
- values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
- values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
- values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
- values[i++] = Int64GetDatum((int64) blkno);
-
- if (fork >= 0 && fork <= MAX_FORKNUM)
- values[i++] = CStringGetTextDatum(forkNames[fork]);
- else
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg_internal("invalid fork number: %u", fork)));
-
- /* Block data */
+ /* Save block_data_len */
if (blk->has_data)
- {
- bytea *raw_data;
-
- /* Initialize bytea buffer to copy the data to */
- raw_data = (bytea *) palloc(blk->data_len + VARHDRSZ);
- SET_VARSIZE(raw_data, blk->data_len + VARHDRSZ);
-
- /* Copy the data */
- memcpy(VARDATA(raw_data), blk->data, blk->data_len);
- values[i++] = PointerGetDatum(raw_data);
- }
- else
- {
- /* No data, so set this field to NULL */
- nulls[i++] = true;
- }
+ block_data_len = blk->data_len;
if (blk->has_image)
{
- PGAlignedBlock buf;
- Page page;
- bytea *raw_page;
+ /* Block reference has an FPI, so prepare relevant output */
int bitcnt;
int cnt = 0;
Datum *flags;
- ArrayType *a;
page = (Page) buf.data;
-
- /* Full page image exists, so let's save it */
- if (!RestoreBlockImage(record, block_id, page))
+ if (!RestoreBlockImage(record, blockid, page))
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg_internal("%s", record->errormsg_buf)));
- /* Initialize bytea buffer to copy the FPI to */
- raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
- SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+ /* Save block_fpi_len */
+ block_fpi_len = blk->bimg_len;
- /* Take a verbatim copy of the FPI */
- memcpy(VARDATA(raw_page), page, BLCKSZ);
-
- values[i++] = PointerGetDatum(raw_page);
- values[i++] = UInt32GetDatum(blk->bimg_len);
-
- /* FPI flags */
+ /* Construct and save block_fpi_info */
bitcnt = pg_popcount((const char *) &blk->bimg_info,
sizeof(uint8));
- /* Build set of raw flags */
flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
-
if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) != 0)
flags[cnt++] = CStringGetTextDatum("HAS_HOLE");
if (blk->apply_image)
@@ -336,18 +324,85 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
flags[cnt++] = CStringGetTextDatum("COMPRESS_ZSTD");
Assert(cnt <= bitcnt);
- a = construct_array_builtin(flags, cnt, TEXTOID);
- values[i++] = PointerGetDatum(a);
+ block_fpi_info = construct_array_builtin(flags, cnt, TEXTOID);
+ }
+
+ /* start_lsn, end_lsn, prev_lsn, and blockid outputs */
+ values[i++] = LSNGetDatum(record->ReadRecPtr);
+ values[i++] = LSNGetDatum(record->EndRecPtr);
+ values[i++] = LSNGetDatum(XLogRecGetPrev(record));
+ values[i++] = Int16GetDatum(blockid);
+
+ /* relfile and block related outputs */
+ values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
+ values[i++] = Int16GetDatum(forknum);
+ values[i++] = Int64GetDatum((int64) blkno);
+
+ /* xid, resource_manager, and record_type outputs */
+ values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+ values[i++] = CStringGetTextDatum(desc.rm_name);
+ values[i++] = CStringGetTextDatum(record_type);
+
+ /*
+ * record_length, main_data_length, block_data_len, and
+ * block_fpi_length outputs
+ */
+ values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
+ values[i++] = UInt32GetDatum(block_data_len);
+ values[i++] = UInt32GetDatum(block_fpi_len);
+
+ /* block_fpi_info (text array) output */
+ if (block_fpi_info)
+ values[i++] = PointerGetDatum(block_fpi_info);
+ else
+ nulls[i++] = true;
+
+ /* description output (describes WAL record) */
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ else
+ nulls[i++] = true;
+
+ /* block_data output */
+ if (blk->has_data)
+ {
+ bytea *raw_data;
+
+ /* Initialize bytea buffer to copy the data to */
+ Assert(block_data_len > 0);
+ raw_data = (bytea *) palloc(block_data_len + VARHDRSZ);
+ SET_VARSIZE(raw_data, block_data_len + VARHDRSZ);
+
+ /* Copy the data */
+ memcpy(VARDATA(raw_data), blk->data, block_data_len);
+ values[i++] = PointerGetDatum(raw_data);
}
else
+ nulls[i++] = true;
+
+ /* block_fpi_data output */
+ if (blk->has_image)
{
- /* No full page image, so store NULLs for all its fields */
- memset(&nulls[i], true, 3 * sizeof(bool));
- i += 3;
+ bytea *raw_page;
+
+ /* Initialize bytea buffer to copy the FPI to */
+ raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+ SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+
+ /* Take a verbatim copy of the FPI */
+ memcpy(VARDATA(raw_page), page, BLCKSZ);
+
+ values[i++] = PointerGetDatum(raw_page);
}
+ else
+ nulls[i++] = true;
Assert(i == PG_GET_WAL_BLOCK_INFO_COLS);
+ /* Store a tuple for this block reference */
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
}
@@ -356,11 +411,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
}
/*
- * Get information about all the blocks saved in WAL records between start
- * and end LSNs. This produces information about the full page images with
- * their relation information, and the data saved in each block associated
- * to a record. Decompression is applied to the full page images, if
- * necessary.
+ * Get WAL record info, disaggregated on block reference
*/
Datum
pg_get_wal_block_info(PG_FUNCTION_ARGS)
@@ -484,7 +535,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn)
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
static void
GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
@@ -536,7 +587,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
Datum
pg_get_wal_records_info(PG_FUNCTION_ARGS)
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 2f0a909c1..0541e5fbf 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -78,7 +78,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
-- Check if we get block data from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_data IS NOT NULL;
-- Force full-page image on the next update.
SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
@@ -87,7 +87,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
- WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_fpi_data IS NOT NULL;
-- ===================================================================
-- Tests for permissions
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d..ffcb39bb2 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -94,9 +94,10 @@ block_ref | blkref #0: rel 1663/5/60221 fork main blk 2
<para>
Gets information of all the valid WAL records between
<replaceable>start_lsn</replaceable> and <replaceable>end_lsn</replaceable>.
- Returns one row per WAL record. If a future
- <replaceable>end_lsn</replaceable> (i.e. ahead of the current LSN of
- the server) is specified, it returns information until the end of WAL.
+ Returns one row per WAL record. If <replaceable>end_lsn</replaceable>
+ specified is in future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
The function raises an error if <replaceable>start_lsn</replaceable>
is not available. For example, usage of the function is as follows:
<screen>
@@ -133,11 +134,13 @@ block_ref |
<replaceable>end_lsn</replaceable>. By default, it returns one row per
<replaceable>resource_manager</replaceable> type. When
<replaceable>per_record</replaceable> is set to <literal>true</literal>,
- it returns one row per <replaceable>record_type</replaceable>. If a
- future <replaceable>end_lsn</replaceable> (i.e. ahead of the current
- LSN of the server) is specified, it returns statistics until the end
- of WAL. An error is raised if <replaceable>start_lsn</replaceable> is
- not available. For example, usage of the function is as follows:
+ it returns one row per <replaceable>record_type</replaceable>.
+ If <replaceable>end_lsn</replaceable> specified is in future (i.e. ahead
+ of the current LSN of the server) or <literal>FFFFFFFF/FFFFFFFF</literal>
+ (i.e. highest possible value for <type>pg_lsn</type> type), it returns
+ statistics until the end of WAL. An error is raised if
+ <replaceable>start_lsn</replaceable> is not available. For example, usage
+ of the function is as follows:
<screen>
postgres=# SELECT * FROM pg_get_wal_stats('0/1E847D00', '0/1E84F500')
WHERE count > 0 LIMIT 1 AND
@@ -164,38 +167,62 @@ combined_size_percentage | 2.8634072910530795
<listitem>
<para>
- Gets a copy of the block information stored in WAL records. This includes
- copies of the block data (<literal>NULL</literal> if none) and full page
- images as <type>bytea</type> values (after
- applying decompression when necessary, or <literal>NULL</literal> if none)
- and their information associated with all the valid WAL records between
- <replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per block registered
- in a WAL record. If a future <replaceable>end_lsn</replaceable> (i.e.
- ahead of the current LSN of the server) is specified, it returns
- statistics until the end of WAL. An error is raised if
- <replaceable>start_lsn</replaceable> is not available. For example,
- usage of the function is as follows:
+ Gets both WAL record and block information of all the valid WAL records
+ between <replaceable>start_lsn</replaceable> and
+ <replaceable>end_lsn</replaceable>. It omits WAL records that contain no
+ block references. The block information includes copies of the block data
+ (<literal>NULL</literal> if none) and full page images as
+ <type>bytea</type> values (after applying decompression when necessary,
+ or <literal>NULL</literal> if none). Returns one row per block registered
+ in a WAL record. If <replaceable>end_lsn</replaceable> specified is in
+ future (i.e. ahead of the current LSN of the server) or
+ <literal>FFFFFFFF/FFFFFFFF</literal> (i.e. highest possible value for
+ <type>pg_lsn</type> type), it returns information until the end of WAL.
+ An error is raised if <replaceable>start_lsn</replaceable> is not
+ available. For example, usage of the function is as follows:
<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
- relblocknumber, forkname,
- substring(blockdata for 24) as block_trimmed,
+postgres=# SELECT start_lsn, end_lsn, prev_lsn, reltablespace,
+ reldatabase, relfilenode, relblocknumber,
+ blockid, xid, resource_manager, record_type,
+ record_length, main_data_length, description,
+ forkname, substring(blockdata for 24) as block_trimmed,
substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
+ FROM pg_get_wal_block_info('0/14BF938', '0/14BFBA0');
+-[ RECORD 1 ]----+---------------------------------------------------
+start_lsn | 0/14BFA20
+end_lsn | 0/14BFB38
+prev_lsn | 0/14BF9A8
+reltablespace | 1663
+reldatabase | 5
+relfilenode | 16391
+relblocknumber | 9
+blockid | 0
+xid | 735
+resource_manager | Heap
+record_type | HOT_UPDATE
+record_length | 279
+main_data_length | 14
+description | off 1 xmax 735 flags 0x10 ; new off 5 xmax 0
+forkname | main
+block_trimmed | \x02800128180102000000
+fpi_trimmed | \x00000000d0f84b01000000002c00601f00200420df020000
+fpilen | 204
+fpiinfo | {HAS_HOLE,APPLY}
</screen>
</para>
+ <note>
+ <para>
+ Note that although <function>pg_get_wal_records_info</function>
+ and <function>pg_get_wal_block_info</function> functions return
+ information of WAL records in common, the
+ <function>pg_get_wal_records_info</function> outputs block information
+ (without block data and FPI data) as a text column
+ (<literal>NULL</literal> if none), and
+ <function>pg_get_wal_block_info</function> outputs one row for each block
+ (including block data and FPI data) associated with each WAL record, omitting
+ WAL records that contain no block references.
+ </para>
+ </note>
</listitem>
</varlistentry>
--
2.39.2
On Wed, Mar 29, 2023 at 12:47 PM Peter Geoghegan <pg@bowt.ie> wrote:
A couple of small tweaks to this appear in the attached revision, v8.
I spent some time on the documentation today, too. Attached is v9,
which seems pretty close to being committable. I hope to commit what I
have here (or something very close to it) in the next couple of days.
Note that I've relocated the documentation for pg_get_wal_block_info()
right after pg_get_wal_records_info(), despite getting some push back
on that before now. It just doesn't make sense to leave it where it
is, since the documentation now explains the new functionality by
directly comparing the two functions.
I also noticed that the docs were never updated following the end_lsn
changes in commit 5c1b6628 (they still said that you needed an end_lsn
before the server's current LSN). I've fixed that in passing, and
added a new "Tip" that advertises the permissive interpretation around
end_lsn values in a general sort of way (since it applies equally to
all but one of the pg_walinspect functions). I've also done a little
bit of restructuring of some of the other functions, to keep things
consistent with what I want to do with pg_get_wal_block_info.
--
Peter Geoghegan
Attachments:
v9-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchapplication/octet-stream; name=v9-0001-Emit-WAL-record-info-via-pg_get_wal_block_info.patchDownload
From 14fd9ab8000a300e6502a480b916818a534bf0f3 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 27 Mar 2023 19:09:05 -0700
Subject: [PATCH v9] Emit WAL record info via pg_get_wal_block_info
---
.../pg_walinspect/expected/pg_walinspect.out | 4 +-
.../pg_walinspect/pg_walinspect--1.0--1.1.sql | 21 +-
contrib/pg_walinspect/pg_walinspect.c | 207 +++++++++++-------
contrib/pg_walinspect/sql/pg_walinspect.sql | 4 +-
doc/src/sgml/pgwalinspect.sgml | 179 +++++++++------
5 files changed, 262 insertions(+), 153 deletions(-)
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index b9e2bb994..950f0e921 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -121,7 +121,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
-- Check if we get block data from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_data IS NOT NULL;
ok
----
t
@@ -134,7 +134,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
- WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_fpi_data IS NOT NULL;
ok
----
t
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index 586c3b446..8a314158e 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,17 +12,26 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
- OUT lsn pg_lsn,
+ OUT start_lsn pg_lsn,
+ OUT end_lsn pg_lsn,
+ OUT prev_lsn pg_lsn,
OUT blockid int2,
OUT reltablespace oid,
OUT reldatabase oid,
OUT relfilenode oid,
+ OUT relforknumber int2,
OUT relblocknumber int8,
- OUT forkname text,
- OUT blockdata bytea,
- OUT fpi bytea,
- OUT fpilen int4,
- OUT fpiinfo text[]
+ OUT xid xid,
+ OUT resource_manager text,
+ OUT record_type text,
+ OUT record_length int4,
+ OUT main_data_length int4,
+ OUT block_data_length int4,
+ OUT block_fpi_length int4,
+ OUT block_fpi_info text[],
+ OUT description text,
+ OUT block_data bytea,
+ OUT block_fpi_data bytea
)
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'pg_get_wal_block_info'
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 062e90dbc..324e771f1 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -176,13 +176,18 @@ ReadNextXLogRecord(XLogReaderState *xlogreader)
}
/*
- * Get a single WAL record info.
+ * Output values that make up a row describing caller's WAL record.
+ *
+ * This function leaks memory. Caller may need to use its own custom memory
+ * context.
+ *
+ * Keep this in sync with GetWALBlockInfo.
*/
static void
GetWALRecordInfo(XLogReaderState *record, Datum *values,
bool *nulls, uint32 ncols)
{
- const char *id;
+ const char *record_type;
RmgrData desc;
uint32 fpi_len = 0;
StringInfoData rec_desc;
@@ -190,10 +195,10 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
int i = 0;
desc = GetRmgr(XLogRecGetRmid(record));
- id = desc.rm_identify(XLogRecGetInfo(record));
+ record_type = desc.rm_identify(XLogRecGetInfo(record));
- if (id == NULL)
- id = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+ if (record_type == NULL)
+ record_type = psprintf("UNKNOWN (%x)", XLogRecGetInfo(record) & ~XLR_INFO_MASK);
initStringInfo(&rec_desc);
desc.rm_desc(&rec_desc, record);
@@ -209,7 +214,7 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
values[i++] = LSNGetDatum(XLogRecGetPrev(record));
values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
values[i++] = CStringGetTextDatum(desc.rm_name);
- values[i++] = CStringGetTextDatum(id);
+ values[i++] = CStringGetTextDatum(record_type);
values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
values[i++] = UInt32GetDatum(fpi_len);
@@ -229,101 +234,84 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
/*
- * Store a set of block information from a single record (FPI and block
- * information).
+ * Output one or more rows in rsinfo tuple store, each describing a single
+ * block reference from caller's WAL record. (Should only be called with
+ * records that have block references.)
+ *
+ * This function leaks memory. Caller may need to use its own custom memory
+ * context.
+ *
+ * Keep this in sync with GetWALRecordInfo.
*/
static void
GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
{
-#define PG_GET_WAL_BLOCK_INFO_COLS 11
- int block_id;
+#define PG_GET_WAL_BLOCK_INFO_COLS 20
+ int blockid;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ RmgrData desc;
+ const char *record_type;
+ StringInfoData rec_desc;
- for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ Assert(XLogRecHasAnyBlockRefs(record));
+
+ desc = GetRmgr(XLogRecGetRmid(record));
+ record_type = desc.rm_identify(XLogRecGetInfo(record));
+
+ if (record_type == NULL)
+ record_type = psprintf("UNKNOWN (%x)",
+ XLogRecGetInfo(record) & ~XLR_INFO_MASK);
+
+ initStringInfo(&rec_desc);
+ desc.rm_desc(&rec_desc, record);
+
+ for (blockid = 0; blockid <= XLogRecMaxBlockId(record); blockid++)
{
DecodedBkpBlock *blk;
BlockNumber blkno;
RelFileLocator rnode;
- ForkNumber fork;
+ ForkNumber forknum;
Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0};
+ uint32 block_data_len = 0,
+ block_fpi_len = 0;
+ PGAlignedBlock buf;
+ Page page;
+ ArrayType *block_fpi_info = NULL;
int i = 0;
- if (!XLogRecHasBlockRef(record, block_id))
+ if (!XLogRecHasBlockRef(record, blockid))
continue;
- blk = XLogRecGetBlock(record, block_id);
+ blk = XLogRecGetBlock(record, blockid);
- (void) XLogRecGetBlockTagExtended(record, block_id,
- &rnode, &fork, &blkno, NULL);
+ (void) XLogRecGetBlockTagExtended(record, blockid,
+ &rnode, &forknum, &blkno, NULL);
- values[i++] = LSNGetDatum(record->ReadRecPtr);
- values[i++] = Int16GetDatum(block_id);
- values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
- values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
- values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
- values[i++] = Int64GetDatum((int64) blkno);
-
- if (fork >= 0 && fork <= MAX_FORKNUM)
- values[i++] = CStringGetTextDatum(forkNames[fork]);
- else
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg_internal("invalid fork number: %u", fork)));
-
- /* Block data */
+ /* Save block_data_len */
if (blk->has_data)
- {
- bytea *raw_data;
-
- /* Initialize bytea buffer to copy the data to */
- raw_data = (bytea *) palloc(blk->data_len + VARHDRSZ);
- SET_VARSIZE(raw_data, blk->data_len + VARHDRSZ);
-
- /* Copy the data */
- memcpy(VARDATA(raw_data), blk->data, blk->data_len);
- values[i++] = PointerGetDatum(raw_data);
- }
- else
- {
- /* No data, so set this field to NULL */
- nulls[i++] = true;
- }
+ block_data_len = blk->data_len;
if (blk->has_image)
{
- PGAlignedBlock buf;
- Page page;
- bytea *raw_page;
+ /* Block reference has an FPI, so prepare relevant output */
int bitcnt;
int cnt = 0;
Datum *flags;
- ArrayType *a;
page = (Page) buf.data;
-
- /* Full page image exists, so let's save it */
- if (!RestoreBlockImage(record, block_id, page))
+ if (!RestoreBlockImage(record, blockid, page))
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg_internal("%s", record->errormsg_buf)));
- /* Initialize bytea buffer to copy the FPI to */
- raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
- SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+ /* Save block_fpi_len */
+ block_fpi_len = blk->bimg_len;
- /* Take a verbatim copy of the FPI */
- memcpy(VARDATA(raw_page), page, BLCKSZ);
-
- values[i++] = PointerGetDatum(raw_page);
- values[i++] = UInt32GetDatum(blk->bimg_len);
-
- /* FPI flags */
+ /* Construct and save block_fpi_info */
bitcnt = pg_popcount((const char *) &blk->bimg_info,
sizeof(uint8));
- /* Build set of raw flags */
flags = (Datum *) palloc0(sizeof(Datum) * bitcnt);
-
if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) != 0)
flags[cnt++] = CStringGetTextDatum("HAS_HOLE");
if (blk->apply_image)
@@ -336,18 +324,85 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
flags[cnt++] = CStringGetTextDatum("COMPRESS_ZSTD");
Assert(cnt <= bitcnt);
- a = construct_array_builtin(flags, cnt, TEXTOID);
- values[i++] = PointerGetDatum(a);
+ block_fpi_info = construct_array_builtin(flags, cnt, TEXTOID);
+ }
+
+ /* start_lsn, end_lsn, prev_lsn, and blockid outputs */
+ values[i++] = LSNGetDatum(record->ReadRecPtr);
+ values[i++] = LSNGetDatum(record->EndRecPtr);
+ values[i++] = LSNGetDatum(XLogRecGetPrev(record));
+ values[i++] = Int16GetDatum(blockid);
+
+ /* relfile and block related outputs */
+ values[i++] = ObjectIdGetDatum(blk->rlocator.spcOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.dbOid);
+ values[i++] = ObjectIdGetDatum(blk->rlocator.relNumber);
+ values[i++] = Int16GetDatum(forknum);
+ values[i++] = Int64GetDatum((int64) blkno);
+
+ /* xid, resource_manager, and record_type outputs */
+ values[i++] = TransactionIdGetDatum(XLogRecGetXid(record));
+ values[i++] = CStringGetTextDatum(desc.rm_name);
+ values[i++] = CStringGetTextDatum(record_type);
+
+ /*
+ * record_length, main_data_length, block_data_len, and
+ * block_fpi_length outputs
+ */
+ values[i++] = UInt32GetDatum(XLogRecGetTotalLen(record));
+ values[i++] = UInt32GetDatum(XLogRecGetDataLen(record));
+ values[i++] = UInt32GetDatum(block_data_len);
+ values[i++] = UInt32GetDatum(block_fpi_len);
+
+ /* block_fpi_info (text array) output */
+ if (block_fpi_info)
+ values[i++] = PointerGetDatum(block_fpi_info);
+ else
+ nulls[i++] = true;
+
+ /* description output (describes WAL record) */
+ if (rec_desc.len > 0)
+ values[i++] = CStringGetTextDatum(rec_desc.data);
+ else
+ nulls[i++] = true;
+
+ /* block_data output */
+ if (blk->has_data)
+ {
+ bytea *raw_data;
+
+ /* Initialize bytea buffer to copy the data to */
+ Assert(block_data_len > 0);
+ raw_data = (bytea *) palloc(block_data_len + VARHDRSZ);
+ SET_VARSIZE(raw_data, block_data_len + VARHDRSZ);
+
+ /* Copy the data */
+ memcpy(VARDATA(raw_data), blk->data, block_data_len);
+ values[i++] = PointerGetDatum(raw_data);
}
else
+ nulls[i++] = true;
+
+ /* block_fpi_data output */
+ if (blk->has_image)
{
- /* No full page image, so store NULLs for all its fields */
- memset(&nulls[i], true, 3 * sizeof(bool));
- i += 3;
+ bytea *raw_page;
+
+ /* Initialize bytea buffer to copy the FPI to */
+ raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+ SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+
+ /* Take a verbatim copy of the FPI */
+ memcpy(VARDATA(raw_page), page, BLCKSZ);
+
+ values[i++] = PointerGetDatum(raw_page);
}
+ else
+ nulls[i++] = true;
Assert(i == PG_GET_WAL_BLOCK_INFO_COLS);
+ /* Store a tuple for this block reference */
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
}
@@ -356,11 +411,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
}
/*
- * Get information about all the blocks saved in WAL records between start
- * and end LSNs. This produces information about the full page images with
- * their relation information, and the data saved in each block associated
- * to a record. Decompression is applied to the full page images, if
- * necessary.
+ * Get WAL record info, disaggregated on block reference
*/
Datum
pg_get_wal_block_info(PG_FUNCTION_ARGS)
@@ -484,7 +535,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn)
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
static void
GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
@@ -536,7 +587,7 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
}
/*
- * Get info and data of all WAL records between start LSN and end LSN.
+ * Get info of all WAL records between start LSN and end LSN.
*/
Datum
pg_get_wal_records_info(PG_FUNCTION_ARGS)
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 2f0a909c1..0541e5fbf 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -78,7 +78,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 1;
SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
-- Check if we get block data from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
- WHERE relfilenode = :'sample_tbl_oid' AND blockdata IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_data IS NOT NULL;
-- Force full-page image on the next update.
SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
@@ -87,7 +87,7 @@ UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
-- Check if we get FPI from WAL record.
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
- WHERE relfilenode = :'sample_tbl_oid' AND fpi IS NOT NULL;
+ WHERE relfilenode = :'sample_tbl_oid' AND block_fpi_data IS NOT NULL;
-- ===================================================================
-- Tests for permissions
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 9a0241a8d..82e421d21 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -25,22 +25,32 @@
All the functions of this module will try to find the first valid WAL record
that is at or after the given <replaceable>in_lsn</replaceable> or
<replaceable>start_lsn</replaceable> and will emit error if no such record
- is available. Similarly, the <replaceable>end_lsn</replaceable> must be
- available, and if it falls in the middle of a record, the entire record must
- be available.
+ is available.
</para>
<note>
<para>
- Some functions, such as <function><link
- linkend="pg-logical-emit-message">pg_logical_emit_message</link></function>,
- return the LSN <emphasis>after</emphasis> the record just
- inserted. Therefore, if you pass that LSN as
- <replaceable>in_lsn</replaceable> or <replaceable>start_lsn</replaceable>
- to one of these functions, it will return the <emphasis>next</emphasis>
- record.
+ The <filename>pg_walinspect</filename> functions are often called
+ using an LSN argument that specifies the location at which a known
+ WAL record of interest <emphasis>begins</emphasis>. However, some
+ functions, such as
+ <function><link linkend="pg-logical-emit-message">pg_logical_emit_message</link></function>,
+ return the LSN <emphasis>after</emphasis> the record that was just
+ inserted.
</para>
</note>
+ <tip>
+ <para>
+ All of the <filename>pg_walinspect</filename> functions with an
+ <replaceable>end_lsn</replaceable> argument are permissive about
+ accepting LSN values after the server's current LSN. It may be
+ convenient to provide an <replaceable>end_lsn</replaceable> of
+ <literal>FFFFFFFF/FFFFFFFF</literal> (the largest possible LSN) to
+ avoid having to determine the server's current LSN. The functions
+ will behave as if the server's current LSN had been specified
+ directly.
+ </para>
+ </tip>
<para>
By default, use of these functions is restricted to superusers and members of
the <literal>pg_read_server_files</literal> role. Access may be granted by
@@ -58,11 +68,9 @@
<listitem>
<para>
- Gets WAL record information of a given LSN. If the given LSN isn't
- at the start of a WAL record, it gives the information of the next
- available valid WAL record; or an error if no such record is found.
- For example, usage of the function is as
- follows:
+ Gets WAL record information about a record that is located at or
+ after the <replaceable>in_lsn</replaceable> argument. For
+ example:
<screen>
postgres=# SELECT * FROM pg_get_wal_record_info('0/1E826E98');
-[ RECORD 1 ]----+----------------------------------------------------
@@ -79,6 +87,12 @@ description | snapshotConflictHorizon 33748 nredirected 0 ndead 2
block_ref | blkref #0: rel 1663/5/60221 fork main blk 2
</screen>
</para>
+ <para>
+ If <replaceable>in_lsn</replaceable> isn't at the start of a WAL
+ record, information about the next valid WAL record is shown
+ instead. If there is no next valid WAL record, the function
+ raises an error.
+ </para>
</listitem>
</varlistentry>
@@ -94,11 +108,7 @@ block_ref | blkref #0: rel 1663/5/60221 fork main blk 2
<para>
Gets information of all the valid WAL records between
<replaceable>start_lsn</replaceable> and <replaceable>end_lsn</replaceable>.
- Returns one row per WAL record. If a future
- <replaceable>end_lsn</replaceable> (i.e. ahead of the current LSN of
- the server) is specified, it returns information until the end of WAL.
- The function raises an error if <replaceable>start_lsn</replaceable>
- is not available. For example, usage of the function is as follows:
+ Returns one row per WAL record. For example:
<screen>
postgres=# SELECT * FROM pg_get_wal_records_info('0/1E913618', '0/1E913740') LIMIT 1;
-[ RECORD 1 ]----+--------------------------------------------------------------
@@ -115,6 +125,87 @@ description | nextXid 33775 latestCompletedXid 33774 oldestRunningXid 33775
block_ref |
</screen>
</para>
+ <para>
+ The function raises an error if
+ <replaceable>start_lsn</replaceable> is not available.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>
+ <function>pg_get_wal_block_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
+ </term>
+
+ <listitem>
+ <para>
+ Gets information about each block reference from all the valid
+ WAL records between <replaceable>start_lsn</replaceable> and
+ <replaceable>end_lsn</replaceable> with one or more block
+ references. Returns one row per block reference per WAL record.
+ For example:
+<screen>
+postgres=# SELECT * FROM pg_get_wal_block_info('0/10E9D80', '0/10E9DC0') LIMIT 1;
+-[ RECORD 1 ]-----+-----------------------------------
+start_lsn | 0/10E9D80
+end_lsn | 0/10E9DC0
+prev_lsn | 0/10E9860
+blockid | 0
+reltablespace | 1663
+reldatabase | 1
+relfilenode | 2690
+relforknumber | 0
+relblocknumber | 5
+xid | 117
+resource_manager | Btree
+record_type | INSERT_LEAF
+record_length | 64
+main_data_length | 2
+block_data_length | 16
+block_fpi_length | 0
+block_fpi_info |
+description | off 14
+block_data | \x00005400020010001407000000000000
+block_fpi_data |
+</screen>
+ </para>
+ <para>
+ The output parameters show the same information as the
+ parameters from <function>pg_get_wal_records_info</function>
+ that have matching names, where applicable. However,
+ <function>pg_get_wal_block_info</function> provides a view of
+ WAL records that is <quote>disaggregated</quote> by block
+ reference, so there are subtle but important differences between
+ each of the two functions.
+ </para>
+ <para>
+ <function>pg_get_wal_block_info</function> shows more than one
+ row per WAL record for those records with more than a single
+ block reference, such as <literal>UPDATE</literal> records with
+ a successor version that goes onto a new, distinct heap page to
+ the original version (not shown here). Rows output by
+ <function>pg_get_wal_block_info</function> can be individually
+ identified by their unique combination of
+ <replaceable>start_lsn</replaceable> and
+ <replaceable>blockid</replaceable> values. Note that records
+ with no block references (e.g., <literal>COMMIT</literal> WAL
+ records) never have any rows returned.
+ </para>
+ <para>
+ The <structfield>reltablespace</structfield>,
+ <structfield>reldatabase</structfield>, and
+ <structfield>relfilenode</structfield> parameters reference
+ <link linkend="catalog-pg-tablespace"><structname>pg_tablespace</structname></link>.<structfield>oid</structfield>,
+ <link linkend="catalog-pg-database"><structname>pg_database</structname></link>.<structfield>oid</structfield>, and
+ <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>oid</structfield> respectively.
+ The <structfield>relforknumber</structfield> field is the fork
+ number within the relation;
+ see <filename>common/relpath.h</filename> for details.
+ </para>
+ <para>
+ The function raises an error if
+ <replaceable>start_lsn</replaceable> is not available.
+ </para>
</listitem>
</varlistentry>
@@ -133,11 +224,7 @@ block_ref |
<replaceable>end_lsn</replaceable>. By default, it returns one row per
<replaceable>resource_manager</replaceable> type. When
<replaceable>per_record</replaceable> is set to <literal>true</literal>,
- it returns one row per <replaceable>record_type</replaceable>. If a
- future <replaceable>end_lsn</replaceable> (i.e. ahead of the current
- LSN of the server) is specified, it returns statistics until the end
- of WAL. An error is raised if <replaceable>start_lsn</replaceable> is
- not available. For example, usage of the function is as follows:
+ it returns one row per <replaceable>record_type</replaceable>. For example:
<screen>
postgres=# SELECT * FROM pg_get_wal_stats('0/1E847D00', '0/1E84F500')
WHERE count > 0 LIMIT 1 AND
@@ -154,47 +241,9 @@ combined_size | 875
combined_size_percentage | 2.8634072910530795
</screen>
</para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>
- <function>pg_get_wal_block_info(start_lsn pg_lsn, end_lsn pg_lsn) returns setof record</function>
- </term>
-
- <listitem>
<para>
- Gets a copy of the block information stored in WAL records. This includes
- copies of the block data (<literal>NULL</literal> if none) and full page
- images as <type>bytea</type> values (after
- applying decompression when necessary, or <literal>NULL</literal> if none)
- and their information associated with all the valid WAL records between
- <replaceable>start_lsn</replaceable> and
- <replaceable>end_lsn</replaceable>. Returns one row per block registered
- in a WAL record. If a future <replaceable>end_lsn</replaceable> (i.e.
- ahead of the current LSN of the server) is specified, it returns
- statistics until the end of WAL. An error is raised if
- <replaceable>start_lsn</replaceable> is not available. For example,
- usage of the function is as follows:
-<screen>
-postgres=# SELECT lsn, blockid, reltablespace, reldatabase, relfilenode,
- relblocknumber, forkname,
- substring(blockdata for 24) as block_trimmed,
- substring(fpi for 24) as fpi_trimmed, fpilen, fpiinfo
- FROM pg_get_wal_block_info('0/1871080', '0/1871440');
--[ RECORD 1 ]--+---------------------------------------------------
-lsn | 0/18712F8
-blockid | 0
-reltablespace | 1663
-reldatabase | 16384
-relfilenode | 16392
-relblocknumber | 0
-forkname | main
-block_trimmed | \x02800128180164000000
-fpi_trimmed | \x0000000050108701000000002c00601f00200420e0020000
-fpilen | 204
-fpiinfo | {HAS_HOLE,APPLY}
-</screen>
+ The function raises an error if
+ <replaceable>start_lsn</replaceable> is not available.
</para>
</listitem>
</varlistentry>
--
2.39.2
On Thu, Mar 30, 2023 at 5:15 AM Peter Geoghegan <pg@bowt.ie> wrote:
On Wed, Mar 29, 2023 at 12:47 PM Peter Geoghegan <pg@bowt.ie> wrote:
A couple of small tweaks to this appear in the attached revision, v8.
I spent some time on the documentation today, too. Attached is v9,
which seems pretty close to being committable. I hope to commit what I
have here (or something very close to it) in the next couple of days.Note that I've relocated the documentation for pg_get_wal_block_info()
right after pg_get_wal_records_info(), despite getting some push back
on that before now. It just doesn't make sense to leave it where it
is, since the documentation now explains the new functionality by
directly comparing the two functions.I also noticed that the docs were never updated following the end_lsn
changes in commit 5c1b6628 (they still said that you needed an end_lsn
before the server's current LSN). I've fixed that in passing, and
added a new "Tip" that advertises the permissive interpretation around
end_lsn values in a general sort of way (since it applies equally to
all but one of the pg_walinspect functions). I've also done a little
bit of restructuring of some of the other functions, to keep things
consistent with what I want to do with pg_get_wal_block_info.
I took a look at v9 and LGTM.
--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Wed, Mar 29, 2023 at 8:28 PM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:
I took a look at v9 and LGTM.
Pushed, thanks.
There is still an outstanding question around the overhead of
outputting FPIs and even block data from pg_get_wal_block_info(). At
one point Melanie suggested that we'd need to do something about that,
and I tend to agree. Attached patch provides an optional parameter
that will make pg_get_wal_block_info return NULLs for both block_data
and block_fpi_data, no matter whether or not there is something to
show. Note that this only affects those two bytea columns; we'll still
show everything else, including valid block_data_length and
block_fpi_length values (so the metadata describing the on-disk size
of block_data and block_fpi_data is unaffected).
To test this patch, I ran pgbench for about 5 minutes, using a fairly
standard configuration with added indexes and with wal_log_hints
enabled. I ended up with the following WAL records afterwards:
pg@regression:5432 [1402115]=# SELECT
"resource_manager/record_type" t,
pg_size_pretty(combined_size) s,
fpi_size_percentage perc_fpi
FROM
pg_get_wal_Stats ('0/10E9D80', 'FFFFFFFF/FFFFFFFF', FALSE) where
combined_size > 0;
┌─[ RECORD 1 ]──────────────────┐
│ t │ XLOG │
│ s │ 1557 MB │
│ perc_fpi │ 22.029466865781302 │
├─[ RECORD 2 ]──────────────────┤
│ t │ Transaction │
│ s │ 49 MB │
│ perc_fpi │ 0 │
├─[ RECORD 3 ]──────────────────┤
│ t │ Storage │
│ s │ 13 kB │
│ perc_fpi │ 0 │
├─[ RECORD 4 ]──────────────────┤
│ t │ CLOG │
│ s │ 1380 bytes │
│ perc_fpi │ 0 │
├─[ RECORD 5 ]──────────────────┤
│ t │ Database │
│ s │ 118 bytes │
│ perc_fpi │ 0 │
├─[ RECORD 6 ]──────────────────┤
│ t │ RelMap │
│ s │ 565 bytes │
│ perc_fpi │ 0 │
├─[ RECORD 7 ]──────────────────┤
│ t │ Standby │
│ s │ 30 kB │
│ perc_fpi │ 0 │
├─[ RECORD 8 ]──────────────────┤
│ t │ Heap2 │
│ s │ 4235 MB │
│ perc_fpi │ 0.6731388657682449 │
├─[ RECORD 9 ]──────────────────┤
│ t │ Heap │
│ s │ 4482 MB │
│ perc_fpi │ 54.46811493602934 │
├─[ RECORD 10 ]─────────────────┤
│ t │ Btree │
│ s │ 1786 MB │
│ perc_fpi │ 22.829279332421116 │
└──────────┴────────────────────┘
Time: 3618.693 ms (00:03.619)
So about 12GB of WAL -- certainly enough to be a challenge for pg_walinspect.
I then ran the following query several times over the same LSN range
as before, with my patch applied, but with behavior equivalent to
current git HEAD (this is with outputting block_data and
block_fpi_data values still turned on):
pg@regression:5432 [1402115]=# SELECT
count(*)
FROM
pg_get_wal_block_info ('0/10E9D80', 'FFFFFFFF/FFFFFFFF', false);
┌─[ RECORD 1 ]───────┐
│ count │ 17,031,979 │
└───────┴────────────┘
Time: 35171.463 ms (00:35.171)
The time shown here is typical of what I saw.
And now the same query, but without any overhead for outputting
block_data and block_fpi_data values:
pg@regression:5432 [1402115]=# SELECT
count(*)
FROM
pg_get_wal_block_info ('0/10E9D80', 'FFFFFFFF/FFFFFFFF', true);
┌─[ RECORD 1 ]───────┐
│ count │ 17,031,979 │
└───────┴────────────┘
Time: 15235.499 ms (00:15.235)
This time is also typical of what I saw. The variance was fairly low,
so I won't bother describing it.
I think that this is a compelling reason to apply the patch. It would
be possible to get about 75% of the benefit shown here by just
suppressing block_fpi_data output, without suppressing block_data, but
I think that it makes sense to either suppress both or neither. Things
like page split records can write a fairly large amount of WAL in a
way that resembles an FPI, even though technically no FPI is involved.
If there are no objections, I'll move ahead with committing something
along the lines of this patch in the next couple of days.
--
Peter Geoghegan
Attachments:
v1-0001-pg_get_wal_block_info-suppress-block-data-outputs.patchapplication/octet-stream; name=v1-0001-pg_get_wal_block_info-suppress-block-data-outputs.patchDownload
From fa6ba8ec943bc06a7c7ca2008d9603b046191871 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Thu, 30 Mar 2023 13:14:25 -0700
Subject: [PATCH v1] pg_get_wal_block_info: suppress block data outputs.
Make block_data and block_fpi_data output NULLs in all cases when
pg_get_wal_block_info is called with suppress_block_data = true.
---
contrib/pg_walinspect/expected/oldextversions.out | 8 ++++----
contrib/pg_walinspect/expected/pg_walinspect.out | 10 +++++-----
contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql | 5 +++--
contrib/pg_walinspect/pg_walinspect.c | 13 ++++++++-----
contrib/pg_walinspect/sql/pg_walinspect.sql | 10 +++++-----
5 files changed, 25 insertions(+), 21 deletions(-)
diff --git a/contrib/pg_walinspect/expected/oldextversions.out b/contrib/pg_walinspect/expected/oldextversions.out
index ed80a0c62..89953af3d 100644
--- a/contrib/pg_walinspect/expected/oldextversions.out
+++ b/contrib/pg_walinspect/expected/oldextversions.out
@@ -50,10 +50,10 @@ ERROR: WAL start LSN must be less than current LSN
ALTER EXTENSION pg_walinspect UPDATE TO '1.1';
-- List what version 1.1 contains.
\dx+ pg_walinspect
- Objects in extension "pg_walinspect"
- Object description
---------------------------------------------------
- function pg_get_wal_block_info(pg_lsn,pg_lsn)
+ Objects in extension "pg_walinspect"
+ Object description
+-------------------------------------------------------
+ function pg_get_wal_block_info(pg_lsn,pg_lsn,boolean)
function pg_get_wal_record_info(pg_lsn)
function pg_get_wal_records_info(pg_lsn,pg_lsn)
function pg_get_wal_stats(pg_lsn,pg_lsn,boolean)
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index 950f0e921..a8f4c9106 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -166,7 +166,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- no
has_function_privilege
------------------------
f
@@ -196,7 +196,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -210,7 +210,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_record_info(pg_lsn)', 'EXECUTE'); -- yes
@@ -234,7 +234,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
(1 row)
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
has_function_privilege
------------------------
t
@@ -246,7 +246,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-- ===================================================================
-- Clean up
diff --git a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
index fdd75e8a7..ef4e6e8fd 100644
--- a/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
+++ b/contrib/pg_walinspect/pg_walinspect--1.0--1.1.sql
@@ -12,6 +12,7 @@ DROP FUNCTION pg_get_wal_stats_till_end_of_wal(pg_lsn, boolean);
--
CREATE FUNCTION pg_get_wal_block_info(IN start_lsn pg_lsn,
IN end_lsn pg_lsn,
+ IN no_bytea boolean DEFAULT false,
OUT start_lsn pg_lsn,
OUT end_lsn pg_lsn,
OUT prev_lsn pg_lsn,
@@ -37,5 +38,5 @@ RETURNS SETOF record
AS 'MODULE_PATHNAME', 'pg_get_wal_block_info'
LANGUAGE C STRICT PARALLEL SAFE;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) FROM PUBLIC;
-GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn) TO pg_read_server_files;
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn, boolean) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn, boolean) TO pg_read_server_files;
diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 1ff53c5ab..8c34577c5 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -59,7 +59,8 @@ static void GetWalStats(FunctionCallInfo fcinfo,
XLogRecPtr start_lsn,
XLogRecPtr end_lsn,
bool stats_per_record);
-static void GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record);
+static void GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record,
+ bool suppress_block_data);
/*
* Return the LSN up to which the server has WAL.
@@ -244,7 +245,8 @@ GetWALRecordInfo(XLogReaderState *record, Datum *values,
* Keep this in sync with GetWALRecordInfo.
*/
static void
-GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
+GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record,
+ bool suppress_block_data)
{
#define PG_GET_WAL_BLOCK_INFO_COLS 20
int block_id;
@@ -359,7 +361,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
nulls[i++] = true;
/* block_data output */
- if (blk->has_data)
+ if (blk->has_data && !suppress_block_data)
{
bytea *block_data;
@@ -372,7 +374,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record)
nulls[i++] = true;
/* block_fpi_data output */
- if (blk->has_image)
+ if (blk->has_image && !suppress_block_data)
{
PGAlignedBlock buf;
Page page;
@@ -410,6 +412,7 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
{
XLogRecPtr start_lsn = PG_GETARG_LSN(0);
XLogRecPtr end_lsn = PG_GETARG_LSN(1);
+ bool suppress_block_data = PG_GETARG_BOOL(2);
XLogReaderState *xlogreader;
MemoryContext old_cxt;
MemoryContext tmp_cxt;
@@ -435,7 +438,7 @@ pg_get_wal_block_info(PG_FUNCTION_ARGS)
/* Use the tmp context so we can clean up after each tuple is done */
old_cxt = MemoryContextSwitchTo(tmp_cxt);
- GetWALBlockInfo(fcinfo, xlogreader);
+ GetWALBlockInfo(fcinfo, xlogreader, suppress_block_data);
/* clean up and switch back */
MemoryContextSwitchTo(old_cxt);
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 0541e5fbf..f987ca31c 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -101,7 +101,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- no
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- no
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- no
-- Functions accessible by users with role pg_read_server_files.
GRANT pg_read_server_files TO regress_pg_walinspect;
@@ -113,7 +113,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
REVOKE pg_read_server_files FROM regress_pg_walinspect;
@@ -124,7 +124,7 @@ GRANT EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
TO regress_pg_walinspect;
GRANT EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
-GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
+GRANT EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn, boolean)
TO regress_pg_walinspect;
SELECT has_function_privilege('regress_pg_walinspect',
@@ -134,7 +134,7 @@ SELECT has_function_privilege('regress_pg_walinspect',
SELECT has_function_privilege('regress_pg_walinspect',
'pg_get_wal_stats(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
SELECT has_function_privilege('regress_pg_walinspect',
- 'pg_get_wal_block_info(pg_lsn, pg_lsn) ', 'EXECUTE'); -- yes
+ 'pg_get_wal_block_info(pg_lsn, pg_lsn, boolean) ', 'EXECUTE'); -- yes
REVOKE EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn)
FROM regress_pg_walinspect;
@@ -142,7 +142,7 @@ REVOKE EXECUTE ON FUNCTION pg_get_wal_records_info(pg_lsn, pg_lsn)
FROM regress_pg_walinspect;
REVOKE EXECUTE ON FUNCTION pg_get_wal_stats(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn)
+REVOKE EXECUTE ON FUNCTION pg_get_wal_block_info(pg_lsn, pg_lsn, boolean)
FROM regress_pg_walinspect;
-- ===================================================================
--
2.39.2
On Thu, Mar 30, 2023 at 2:41 PM Peter Geoghegan <pg@bowt.ie> wrote:
pg@regression:5432 [1402115]=# SELECT
count(*)
FROM
pg_get_wal_block_info ('0/10E9D80', 'FFFFFFFF/FFFFFFFF', true);
┌─[ RECORD 1 ]───────┐
│ count │ 17,031,979 │
└───────┴────────────┘Time: 15235.499 ms (00:15.235)
This time is also typical of what I saw. The variance was fairly low,
so I won't bother describing it.
If I rerun the same test case with pg_get_wal_records_info (same WAL
records, same system) then I find that it takes about 16 and a half
seconds. So my patch makes pg_get_wal_block_info a little bit faster
than pg_get_wal_records_info for this test case, and likely many
interesting cases (assuming that the user opts out of fetching
block_data and block_fpi_data values when running
pg_get_wal_block_info, per the patch).
This result closely matches what I was expecting. We're doing almost
the same amount of work when each function is called, so naturally the
runtime almost matches. Note that pg_get_wal_records_info does
slightly *more* work here, since it alone must output rows for commit
records. Unlike pg_get_wal_block_info, which (by design) never outputs
rows for WAL records that lack block references.
--
Peter Geoghegan
On Thu, Mar 30, 2023 at 2:41 PM Peter Geoghegan <pg@bowt.ie> wrote:
There is still an outstanding question around the overhead of
outputting FPIs and even block data from pg_get_wal_block_info(). At
one point Melanie suggested that we'd need to do something about that,
and I tend to agree. Attached patch provides an optional parameter
that will make pg_get_wal_block_info return NULLs for both block_data
and block_fpi_data, no matter whether or not there is something to
show. Note that this only affects those two bytea columns; we'll still
show everything else, including valid block_data_length and
block_fpi_length values (so the metadata describing the on-disk size
of block_data and block_fpi_data is unaffected).
I pushed this patch just now. Except that the final commited version
had the "suppress_block_data" output parameter name flipped. It was
inverted and renamed to "show_data" (and made "DEFAULT true"). This is
closer to how the pg_stat_statements() function handles a similar
issue with overly large query texts.
I'm very happy with the end result of the work on this thread. It
works a lot better for the sorts of queries I am interested in. Thanks
to all involved, particularly Bharath.
--
Peter Geoghegan