Provide a pg_truncate_freespacemap function

Started by Ronan Dunklaualmost 2 years ago6 messages
#1Ronan Dunklau
ronan.dunklau@aiven.io
1 attachment(s)

Hello,

As we are currently experiencing a FSM corruption issue [1]/messages/by-id/flat/ 1925490.taCxCBeP46%40aivenlaptop#7ace95c95cab17b6d92607e5362984ac, we need to
rebuild FSM when we detect it.

I noticed we have something to truncate a visibility map, but nothing for the
freespace map, so I propose the attached (liberally copied from the VM
counterpart) to allow to truncate a FSM without incurring downtime, as
currently our only options are to either VACUUM FULL the table or stop the
cluster and remove the FSM manually.

Does that seem correct ?

[1]: /messages/by-id/flat/ 1925490.taCxCBeP46%40aivenlaptop#7ace95c95cab17b6d92607e5362984ac
1925490.taCxCBeP46%40aivenlaptop#7ace95c95cab17b6d92607e5362984ac

--
Ronan Dunklau

Attachments:

0001-Provide-a-pg_truncate_freespacemap-function.patchtext/x-patch; charset=x-UTF_8J; name=0001-Provide-a-pg_truncate_freespacemap-function.patchDownload
From b3e28e4542311094ee7177b39cc9cdf3672d1b55 Mon Sep 17 00:00:00 2001
From: Ronan Dunklau <ronan.dunklau@aiven.io>
Date: Fri, 1 Mar 2024 08:57:49 +0100
Subject: [PATCH] Provide a pg_truncate_freespacemap function.

Similar to the pg_truncate_visibilitymap function, this one allows to
avoid downtime to fix an FSM in the case a corruption event happens
---
 contrib/pg_freespacemap/Makefile              |  5 +-
 .../pg_freespacemap--1.2--1.3.sql             | 13 +++++
 contrib/pg_freespacemap/pg_freespacemap.c     | 58 +++++++++++++++++++
 .../pg_freespacemap/pg_freespacemap.control   |  2 +-
 4 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql

diff --git a/contrib/pg_freespacemap/Makefile b/contrib/pg_freespacemap/Makefile
index b48e4b255bc..0f4b52f5812 100644
--- a/contrib/pg_freespacemap/Makefile
+++ b/contrib/pg_freespacemap/Makefile
@@ -6,8 +6,9 @@ OBJS = \
 	pg_freespacemap.o
 
 EXTENSION = pg_freespacemap
-DATA = pg_freespacemap--1.1.sql pg_freespacemap--1.1--1.2.sql \
-	pg_freespacemap--1.0--1.1.sql
+DATA = pg_freespacemap--1.1.sql pg_freespacemap--1.2--1.3.sql \
+			pg_freespacemap--1.1--1.2.sql \
+			pg_freespacemap--1.0--1.1.sql
 PGFILEDESC = "pg_freespacemap - monitoring of free space map"
 
 REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_freespacemap/pg_freespacemap.conf
diff --git a/contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql b/contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql
new file mode 100644
index 00000000000..1f25877a175
--- /dev/null
+++ b/contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql
@@ -0,0 +1,13 @@
+/* contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_freespacemap UPDATE TO '1.3'" to load this file. \quit
+
+CREATE FUNCTION pg_truncate_freespace_map(regclass)
+RETURNS void
+AS 'MODULE_PATHNAME', 'pg_truncate_freespace_map'
+LANGUAGE C STRICT
+PARALLEL UNSAFE;
+
+
+
diff --git a/contrib/pg_freespacemap/pg_freespacemap.c b/contrib/pg_freespacemap/pg_freespacemap.c
index b82cab2d97e..17f6a631ad8 100644
--- a/contrib/pg_freespacemap/pg_freespacemap.c
+++ b/contrib/pg_freespacemap/pg_freespacemap.c
@@ -9,8 +9,12 @@
 #include "postgres.h"
 
 #include "access/relation.h"
+#include "access/xloginsert.h"
+#include "catalog/storage_xlog.h"
 #include "funcapi.h"
 #include "storage/freespace.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
 
 PG_MODULE_MAGIC;
 
@@ -20,6 +24,9 @@ PG_MODULE_MAGIC;
  */
 PG_FUNCTION_INFO_V1(pg_freespace);
 
+/* Truncate the free space map, in case of corruption */
+PG_FUNCTION_INFO_V1(pg_truncate_freespace_map);
+
 Datum
 pg_freespace(PG_FUNCTION_ARGS)
 {
@@ -40,3 +47,54 @@ pg_freespace(PG_FUNCTION_ARGS)
 	relation_close(rel, AccessShareLock);
 	PG_RETURN_INT16(freespace);
 }
+
+
+Datum
+pg_truncate_freespace_map(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	Relation	rel;
+	ForkNumber	fork;
+	BlockNumber block;
+
+	rel = relation_open(relid, AccessExclusiveLock);
+
+	/* Only some relkinds have a freespacemap */
+	if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" is of wrong relation kind",
+						RelationGetRelationName(rel)),
+				 errdetail_relkind_not_supported(rel->rd_rel->relkind)));
+
+
+	/* Forcibly reset cached file size */
+	RelationGetSmgr(rel)->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
+
+	/* Just pretend we're going to wipeout the whole rel */
+	block = FreeSpaceMapPrepareTruncateRel(rel, 0);
+
+	if (BlockNumberIsValid(block))
+	{
+		fork = FSM_FORKNUM;
+		smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block);
+	}
+
+	if (RelationNeedsWAL(rel))
+	{
+		xl_smgr_truncate xlrec;
+
+		xlrec.blkno = 0;
+		xlrec.rlocator = rel->rd_locator;
+		xlrec.flags = SMGR_TRUNCATE_FSM;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+
+		XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+	}
+
+	relation_close(rel, AccessExclusiveLock);
+
+	PG_RETURN_VOID();
+}
diff --git a/contrib/pg_freespacemap/pg_freespacemap.control b/contrib/pg_freespacemap/pg_freespacemap.control
index ac8fc5050a9..1992320691b 100644
--- a/contrib/pg_freespacemap/pg_freespacemap.control
+++ b/contrib/pg_freespacemap/pg_freespacemap.control
@@ -1,5 +1,5 @@
 # pg_freespacemap extension
 comment = 'examine the free space map (FSM)'
-default_version = '1.2'
+default_version = '1.3'
 module_pathname = '$libdir/pg_freespacemap'
 relocatable = true
-- 
2.43.2

#2Stephen Frost
sfrost@snowman.net
In reply to: Ronan Dunklau (#1)
Re: Provide a pg_truncate_freespacemap function

Greetings,

* Ronan Dunklau (ronan.dunklau@aiven.io) wrote:

As we are currently experiencing a FSM corruption issue [1], we need to
rebuild FSM when we detect it.

Ideally, we'd figure out a way to pick up on this and address it without
the user needing to intervene, however ...

I noticed we have something to truncate a visibility map, but nothing for the
freespace map, so I propose the attached (liberally copied from the VM
counterpart) to allow to truncate a FSM without incurring downtime, as
currently our only options are to either VACUUM FULL the table or stop the
cluster and remove the FSM manually.

I agree that this would generally be a useful thing to have.

Does that seem correct ?

Definitely needs to have a 'REVOKE ALL ON FUNCTION' at the end of the
upgrade script, similar to what you'll find at the bottom of
pg_visibility--1.1.sql in the tree today, otherwise anyone could run it.

Beyond that, I'd suggest a function-level comment above the definition
of the function itself (which is where we tend to put those- not at the
point where we declare the function).

Thanks!

Stephen

#3Ronan Dunklau
ronan.dunklau@aiven.io
In reply to: Stephen Frost (#2)
1 attachment(s)
Re: Provide a pg_truncate_freespacemap function

Le mercredi 6 mars 2024, 20:28:44 CET Stephen Frost a écrit :

I agree that this would generally be a useful thing to have.

Thanks !

Does that seem correct ?

Definitely needs to have a 'REVOKE ALL ON FUNCTION' at the end of the
upgrade script, similar to what you'll find at the bottom of
pg_visibility--1.1.sql in the tree today, otherwise anyone could run it.

Beyond that, I'd suggest a function-level comment above the definition
of the function itself (which is where we tend to put those- not at the
point where we declare the function).

Thank you for the review. Here is an updated patch for both of those.

Best regards,

--
Ronan

Attachments:

0002-Provide-a-pg_truncate_freespacemap-function.patchtext/x-patch; charset=x-UTF_8J; name=0002-Provide-a-pg_truncate_freespacemap-function.patchDownload
From 812061b47443ce1052f65ba2867223f9db2cfa18 Mon Sep 17 00:00:00 2001
From: Ronan Dunklau <ronan.dunklau@aiven.io>
Date: Fri, 1 Mar 2024 08:57:49 +0100
Subject: [PATCH] Provide a pg_truncate_freespacemap function.

Similar to the pg_truncate_visibilitymap function, this one allows to
avoid downtime to fix an FSM in the case a corruption event happens
---
 contrib/pg_freespacemap/Makefile              |  5 +-
 .../pg_freespacemap--1.2--1.3.sql             | 12 ++++
 contrib/pg_freespacemap/pg_freespacemap.c     | 64 +++++++++++++++++++
 .../pg_freespacemap/pg_freespacemap.control   |  2 +-
 4 files changed, 80 insertions(+), 3 deletions(-)
 create mode 100644 contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql

diff --git a/contrib/pg_freespacemap/Makefile b/contrib/pg_freespacemap/Makefile
index b48e4b255bc..0f4b52f5812 100644
--- a/contrib/pg_freespacemap/Makefile
+++ b/contrib/pg_freespacemap/Makefile
@@ -6,8 +6,9 @@ OBJS = \
 	pg_freespacemap.o
 
 EXTENSION = pg_freespacemap
-DATA = pg_freespacemap--1.1.sql pg_freespacemap--1.1--1.2.sql \
-	pg_freespacemap--1.0--1.1.sql
+DATA = pg_freespacemap--1.1.sql pg_freespacemap--1.2--1.3.sql \
+			pg_freespacemap--1.1--1.2.sql \
+			pg_freespacemap--1.0--1.1.sql
 PGFILEDESC = "pg_freespacemap - monitoring of free space map"
 
 REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_freespacemap/pg_freespacemap.conf
diff --git a/contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql b/contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql
new file mode 100644
index 00000000000..cc94eccf2f6
--- /dev/null
+++ b/contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql
@@ -0,0 +1,12 @@
+/* contrib/pg_freespacemap/pg_freespacemap--1.2--1.3.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_freespacemap UPDATE TO '1.3'" to load this file. \quit
+
+CREATE FUNCTION pg_truncate_freespace_map(regclass)
+RETURNS void
+AS 'MODULE_PATHNAME', 'pg_truncate_freespace_map'
+LANGUAGE C STRICT
+PARALLEL UNSAFE;
+
+REVOKE ALL ON FUNCTION pg_truncate_freespace_map(regclass) FROM PUBLIC;
diff --git a/contrib/pg_freespacemap/pg_freespacemap.c b/contrib/pg_freespacemap/pg_freespacemap.c
index b82cab2d97e..fa492c18534 100644
--- a/contrib/pg_freespacemap/pg_freespacemap.c
+++ b/contrib/pg_freespacemap/pg_freespacemap.c
@@ -9,8 +9,12 @@
 #include "postgres.h"
 
 #include "access/relation.h"
+#include "access/xloginsert.h"
+#include "catalog/storage_xlog.h"
 #include "funcapi.h"
 #include "storage/freespace.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
 
 PG_MODULE_MAGIC;
 
@@ -20,6 +24,8 @@ PG_MODULE_MAGIC;
  */
 PG_FUNCTION_INFO_V1(pg_freespace);
 
+PG_FUNCTION_INFO_V1(pg_truncate_freespace_map);
+
 Datum
 pg_freespace(PG_FUNCTION_ARGS)
 {
@@ -40,3 +46,61 @@ pg_freespace(PG_FUNCTION_ARGS)
 	relation_close(rel, AccessShareLock);
 	PG_RETURN_INT16(freespace);
 }
+
+
+/*
+ * Remove the free space map for a relation.
+ *
+ * This is useful in case of corruption of the FSM, as the
+ * only other options are either triggering a VACUUM FULL or
+ * shutting down the server and removing the FSM on the filesystem.
+ */
+Datum
+pg_truncate_freespace_map(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	Relation	rel;
+	ForkNumber	fork;
+	BlockNumber block;
+
+	rel = relation_open(relid, AccessExclusiveLock);
+
+	/* Only some relkinds have a freespace map */
+	if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" is of wrong relation kind",
+						RelationGetRelationName(rel)),
+				 errdetail_relkind_not_supported(rel->rd_rel->relkind)));
+
+
+	/* Forcibly reset cached file size */
+	RelationGetSmgr(rel)->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
+
+	/* Just pretend we're going to wipeout the whole rel */
+	block = FreeSpaceMapPrepareTruncateRel(rel, 0);
+
+	if (BlockNumberIsValid(block))
+	{
+		fork = FSM_FORKNUM;
+		smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block);
+	}
+
+	if (RelationNeedsWAL(rel))
+	{
+		xl_smgr_truncate xlrec;
+
+		xlrec.blkno = 0;
+		xlrec.rlocator = rel->rd_locator;
+		xlrec.flags = SMGR_TRUNCATE_FSM;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+
+		XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+	}
+
+	relation_close(rel, AccessExclusiveLock);
+
+	PG_RETURN_VOID();
+}
diff --git a/contrib/pg_freespacemap/pg_freespacemap.control b/contrib/pg_freespacemap/pg_freespacemap.control
index ac8fc5050a9..1992320691b 100644
--- a/contrib/pg_freespacemap/pg_freespacemap.control
+++ b/contrib/pg_freespacemap/pg_freespacemap.control
@@ -1,5 +1,5 @@
 # pg_freespacemap extension
 comment = 'examine the free space map (FSM)'
-default_version = '1.2'
+default_version = '1.3'
 module_pathname = '$libdir/pg_freespacemap'
 relocatable = true
-- 
2.44.0

#4Fujii Masao
masao.fujii@oss.nttdata.com
In reply to: Ronan Dunklau (#3)
Re: Provide a pg_truncate_freespacemap function

On 2024/03/07 16:59, Ronan Dunklau wrote:

Le mercredi 6 mars 2024, 20:28:44 CET Stephen Frost a écrit :

I agree that this would generally be a useful thing to have.

Thanks !

Does that seem correct ?

Definitely needs to have a 'REVOKE ALL ON FUNCTION' at the end of the
upgrade script, similar to what you'll find at the bottom of
pg_visibility--1.1.sql in the tree today, otherwise anyone could run it.

Beyond that, I'd suggest a function-level comment above the definition
of the function itself (which is where we tend to put those- not at the
point where we declare the function).

Thank you for the review. Here is an updated patch for both of those.

Here are my review comments:

The documentation for pg_freespace needs updating.

A regression test for pg_truncate_freespace_map() should be added.

+	/* Only some relkinds have a freespace map */
+	if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" is of wrong relation kind",
+						RelationGetRelationName(rel)),
+				 errdetail_relkind_not_supported(rel->rd_rel->relkind)));

An index can have an FSM, but this code doesn't account for that.

+ smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block);

Shouldn't truncation be performed after WAL-logging due to the WAL rule?
I'm not sure if the current order might actually cause significant issues
in FSM truncation case, though.

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#5Tom Lane
tgl@sss.pgh.pa.us
In reply to: Fujii Masao (#4)
Re: Provide a pg_truncate_freespacemap function

Fujii Masao <masao.fujii@oss.nttdata.com> writes:

Le mercredi 6 mars 2024, 20:28:44 CET Stephen Frost a écrit :

I agree that this would generally be a useful thing to have.

Personally, I want to push back on whether this has any legitimate
use-case. Even if the FSM is corrupt, it should self-heal over
time, and I'm not seeing the argument why truncating it would
speed convergence towards correct values. Worse, in the interim
where you don't have any FSM, you will suffer table bloat because
insertions will be appended at the end of the table. So this
looks like a foot-gun, and the patch's lack of user-visible
documentation surely does nothing to make it safer.

(The analogy to pg_truncate_visibility_map seems forced.
If you are in a situation with a trashed visibility map,
you are probably getting wrong query answers, and truncating
the map will make that better. But a trashed FSM doesn't
result in incorrect output, and zeroing it will make things
worse not better.)

regards, tom lane

#6Ronan Dunklau
ronan.dunklau@aiven.io
In reply to: Tom Lane (#5)
Re: Provide a pg_truncate_freespacemap function

Le dimanche 21 juillet 2024, 07:39:13 UTC+2 Tom Lane a écrit :

Fujii Masao <masao.fujii@oss.nttdata.com> writes:

Le mercredi 6 mars 2024, 20:28:44 CET Stephen Frost a écrit :

I agree that this would generally be a useful thing to have.

Sorry for the late reply, as I was not available during the late summer.

Personally, I want to push back on whether this has any legitimate
use-case. Even if the FSM is corrupt, it should self-heal over
time, and I'm not seeing the argument why truncating it would
speed convergence towards correct values. Worse, in the interim
where you don't have any FSM, you will suffer table bloat because
insertions will be appended at the end of the table. So this
looks like a foot-gun, and the patch's lack of user-visible
documentation surely does nothing to make it safer.

(The analogy to pg_truncate_visibility_map seems forced.
If you are in a situation with a trashed visibility map,
you are probably getting wrong query answers, and truncating
the map will make that better. But a trashed FSM doesn't
result in incorrect output, and zeroing it will make things
worse not better.)

Now that the other patch for self-healing is in, I agree it may not be that
useful. I'm withdrawing the patch and will keep it in mind if we encounter
other FSM issues in the future.

Best regards,

--
Ronan Dunklau