[POC] Allow an extension to add data into Query and PlannedStmt nodes

Started by Andrey Lepikhovalmost 3 years ago3 messages

a.lepikhov@postgrespro.ru

almost 3 years ago

1 attachment(s)

Hi,

Previously, we read int this mailing list some controversial opinions on
queryid generation and Jumbling technique. Here we don't intend to solve
these problems but help an extension at least don't conflict with others
on the queryId value.

Extensions could need to pass some query-related data through all stages
of the query planning and execution. As a trivial example,
pg_stat_statements uses queryid at the end of execution to save some
statistics. One more reason - extensions now conflict on queryid value
and the logic of its changing. With this patch, it can be managed.

This patch introduces the structure 'ExtensionData' which allows to
manage of a list of entries with a couple of interface functions
addExtensionDataToNode() and GetExtensionData(). Keep in mind the
possible future hiding of this structure from the public interface.
An extension should invent a symbolic key to identify its data. It may
invent as many additional keys as it wants but the best option here - is
no more than one entry for each extension.
Usage of this machinery is demonstrated by the pg_stat_statements
example - here we introduced Bigint node just for natively storing of
queryId value.

Ruthless pgbench benchmark shows that we got some overhead:
1.6% - in default mode
4% - in prepared mode
~0.1% in extended mode.

An optimization that avoids copying of queryId by storing it into the
node pointer field directly allows to keep this overhead in a range of
%0.5 for all these modes but increases complexity. So here we
demonstrate not optimized variant.

Some questions still cause doubts:
- QueryRewrite: should we copy extension fields from the parent
parsetree to the rewritten ones?
- Are we need to invent a registration procedure to do away with the
names of entries and use some compact integer IDs?
- Do we need to optimize this structure to avoid a copy for simple data
types, for example, inventing something like A_Const?

All in all, in our opinion, this issue is tend to grow with an
increasing number of extensions that utilize planner and executor hooks
for some purposes. So, any thoughts will be useful.

--
Regards
Andrey Lepikhov
Postgres Professional

Attachments:

v0-0001-Add-on-more-field-into-Query-and-PlannedStmt.patchtext/x-patch; charset=UTF-8; name=v0-0001-Add-on-more-field-into-Query-and-PlannedStmt.patchDownload

From 944ce61d7ff934727240d90ee7620bfb69ad3a5a Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Wed, 22 Mar 2023 16:59:30 +0500
Subject: [PATCH] Add on more field into Query and PlannedStmt structures to
 allow an extension to pass some data across all the execution stages of
 specific instance of the query.

---
 .../pg_stat_statements/pg_stat_statements.c   |  57 ++++++++--
 src/backend/commands/extension.c              | 103 ++++++++++++++++++
 src/backend/executor/execParallel.c           |   1 +
 src/backend/nodes/value.c                     |  12 ++
 src/backend/optimizer/plan/planner.c          |   1 +
 src/backend/rewrite/rewriteHandler.c          |   6 +
 src/backend/tcop/postgres.c                   |   2 +
 src/include/commands/extension.h              |   4 +
 src/include/nodes/parsenodes.h                |  23 ++++
 src/include/nodes/plannodes.h                 |   2 +
 src/include/nodes/value.h                     |  10 ++
 11 files changed, 209 insertions(+), 12 deletions(-)

diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 5285c3f7fa..fc47661c86 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -49,6 +49,7 @@
 
 #include "access/parallel.h"
 #include "catalog/pg_authid.h"
+#include "commands/extension.h"
 #include "common/hashfn.h"
 #include "executor/instrument.h"
 #include "funcapi.h"
@@ -107,6 +108,14 @@ static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
 									!IsA(n, PrepareStmt) && \
 									!IsA(n, DeallocateStmt))
 
+#define GET_QUERYID(node) \
+	(Bigint *) GetExtensionData(node->ext_field, "pg_stat_statements")
+
+#define INSERT_QUERYID(node, queryid) \
+	castNode(Bigint, AddExtensionDataToNode((Node *) node, \
+											"pg_stat_statements", \
+											(Node *) makeBigint((int64) queryid), \
+											true))
 /*
  * Extension version number, for supporting older extension versions' objects
  */
@@ -821,6 +830,13 @@ error:
 static void
 pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
 {
+	Bigint *queryId;
+
+	if ((queryId = GET_QUERYID(query)) == NULL)
+		queryId = INSERT_QUERYID(query, query->queryId);
+
+	Assert(queryId);
+
 	if (prev_post_parse_analyze_hook)
 		prev_post_parse_analyze_hook(pstate, query, jstate);
 
@@ -837,7 +853,7 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
 	{
 		if (pgss_track_utility && !PGSS_HANDLED_UTILITY(query->utilityStmt))
 		{
-			query->queryId = UINT64CONST(0);
+			queryId->val = UINT64CONST(0);
 			return;
 		}
 	}
@@ -851,7 +867,7 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
 	 */
 	if (jstate && jstate->clocations_count > 0)
 		pgss_store(pstate->p_sourcetext,
-				   query->queryId,
+				   queryId->val,
 				   query->stmt_location,
 				   query->stmt_len,
 				   PGSS_INVALID,
@@ -873,7 +889,10 @@ pgss_planner(Query *parse,
 			 int cursorOptions,
 			 ParamListInfo boundParams)
 {
-	PlannedStmt *result;
+	PlannedStmt	   *result;
+	Bigint		   *queryId;
+
+	queryId = GET_QUERYID(parse);
 
 	/*
 	 * We can't process the query if no query_string is provided, as
@@ -889,7 +908,7 @@ pgss_planner(Query *parse,
 	 */
 	if (pgss_enabled(plan_nested_level + exec_nested_level)
 		&& pgss_track_planning && query_string
-		&& parse->queryId != UINT64CONST(0))
+		&& queryId && queryId->val != UINT64CONST(0))
 	{
 		instr_time	start;
 		instr_time	duration;
@@ -936,7 +955,7 @@ pgss_planner(Query *parse,
 		WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
 
 		pgss_store(query_string,
-				   parse->queryId,
+				   queryId->val,
 				   parse->stmt_location,
 				   parse->stmt_len,
 				   PGSS_PLAN,
@@ -966,17 +985,22 @@ pgss_planner(Query *parse,
 static void
 pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
 {
+	Bigint *queryId;
+
 	if (prev_ExecutorStart)
 		prev_ExecutorStart(queryDesc, eflags);
 	else
 		standard_ExecutorStart(queryDesc, eflags);
 
+	queryId = GET_QUERYID(queryDesc->plannedstmt);
+
 	/*
 	 * If query has queryId zero, don't track it.  This prevents double
 	 * counting of optimizable statements that are directly contained in
 	 * utility statements.
 	 */
-	if (pgss_enabled(exec_nested_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0))
+	if (pgss_enabled(exec_nested_level) &&
+		queryId && queryId->val != UINT64CONST(0))
 	{
 		/*
 		 * Set up to track total elapsed time in ExecutorRun.  Make sure the
@@ -1043,9 +1067,11 @@ pgss_ExecutorFinish(QueryDesc *queryDesc)
 static void
 pgss_ExecutorEnd(QueryDesc *queryDesc)
 {
-	uint64		queryId = queryDesc->plannedstmt->queryId;
+	Bigint *queryId;
+
+	queryId = GET_QUERYID(queryDesc->plannedstmt);
 
-	if (queryId != UINT64CONST(0) && queryDesc->totaltime &&
+	if (queryId && queryId->val != UINT64CONST(0) && queryDesc->totaltime &&
 		pgss_enabled(exec_nested_level))
 	{
 		/*
@@ -1055,7 +1081,7 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
 		InstrEndLoop(queryDesc->totaltime);
 
 		pgss_store(queryDesc->sourceText,
-				   queryId,
+				   queryId->val,
 				   queryDesc->plannedstmt->stmt_location,
 				   queryDesc->plannedstmt->stmt_len,
 				   PGSS_EXEC,
@@ -1084,10 +1110,17 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 					DestReceiver *dest, QueryCompletion *qc)
 {
 	Node	   *parsetree = pstmt->utilityStmt;
-	uint64		saved_queryId = pstmt->queryId;
+	uint64		saved_queryId;
+	Bigint	   *queryId;
 	int			saved_stmt_location = pstmt->stmt_location;
 	int			saved_stmt_len = pstmt->stmt_len;
 
+	if ((queryId = GET_QUERYID(pstmt)) == NULL)
+		queryId = INSERT_QUERYID(pstmt, pstmt->queryId);
+
+	Assert(queryId);
+	saved_queryId = queryId->val;
+
 	/*
 	 * Force utility statements to get queryId zero.  We do this even in cases
 	 * where the statement contains an optimizable statement for which a
@@ -1103,7 +1136,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 	 * only.
 	 */
 	if (pgss_enabled(exec_nested_level) && pgss_track_utility)
-		pstmt->queryId = UINT64CONST(0);
+		queryId->val = UINT64CONST(0);
 
 	/*
 	 * If it's an EXECUTE statement, we don't track it and don't increment the
@@ -1158,7 +1191,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 		 * freed.  We must copy everything we still need into local variables,
 		 * which we did above.
 		 *
-		 * For the same reason, we can't risk restoring pstmt->queryId to its
+		 * For the same reason, we can't risk restoring queryId to its
 		 * former value, which'd otherwise be a good idea.
 		 */
 
diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c
index 02ff4a9a7f..74f8a820cb 100644
--- a/src/backend/commands/extension.c
+++ b/src/backend/commands/extension.c
@@ -3395,3 +3395,106 @@ read_whole_file(const char *filename, int *length)
 	buf[*length] = '\0';
 	return buf;
 }
+
+static ExtensionData *
+_initial_actions(Node *node, const char *entryname)
+{
+	ExtensionData **ext_field;
+	ExtensionData  *scanner;
+	ExtensionData  *elem = NULL;
+
+	switch (nodeTag(node))
+	{
+		case T_Query:
+			ext_field = &((Query *) node)->ext_field;
+			break;
+		case T_PlannedStmt:
+			ext_field = &((PlannedStmt *) node)->ext_field;
+			break;
+		default:
+			/* TODO: use ereport */
+			elog(ERROR, "Unexpected node type: %d", nodeTag(node));
+	}
+
+	Assert(strlen(entryname) > 0);
+#ifdef USE_ASSERT_CHECKING
+	{
+		const char *c;
+		/* check name doesn't contain escapable symbols */
+		for (c = entryname; *c; c++)
+			Assert(isgraph(*c) && strchr("(){}\\\"", *c) == NULL);
+	}
+#endif
+
+	/* Check existing entry */
+	for (scanner = *ext_field; scanner != NULL; scanner = scanner->next)
+	{
+		if (strcmp(scanner->name, entryname) == 0)
+		{
+				elem = scanner;
+				break;
+		}
+	}
+
+	if (elem == NULL)
+	{
+		elem = makeNode(ExtensionData);
+		elem->next = *ext_field;
+		elem->name = pstrdup(entryname);
+		*ext_field = elem;
+	}
+
+	return elem;
+}
+
+Node *
+AddExtensionDataToNode(Node *node, const char *entryname, const Node *data,
+					   bool noCopy)
+{
+	MemoryContext	destCtx = GetMemoryChunkContext(node);
+	MemoryContext	oldctx;
+	ExtensionData  *elem;
+
+	/* Safely store in the Query memory context */
+	oldctx = MemoryContextSwitchTo(destCtx);
+
+	elem = _initial_actions(node, entryname);
+	elem->node = (Node *) (noCopy ? data : copyObject(data));
+
+	MemoryContextSwitchTo(oldctx);
+
+#ifdef USE_ASSERT_CHECKING
+	if (noCopy)
+	{
+		/* brute check val is copyable */
+		MemoryContext tmpctx;
+
+		tmpctx = AllocSetContextCreate(CurrentMemoryContext,
+									   "assert_checking",
+									   ALLOCSET_DEFAULT_SIZES);
+		oldctx = MemoryContextSwitchTo(tmpctx);
+		(void) copyObject(data);
+		MemoryContextSwitchTo(oldctx);
+		MemoryContextDelete(tmpctx);
+	}
+#endif
+
+	return elem->node;
+}
+
+/*
+ * Return the node, stored in extended area of Query or PlannedStmt or
+ * NULL, if not found.
+ */
+Node *
+GetExtensionData(ExtensionData *extdata, const char *entryname)
+{
+	while (extdata != NULL)
+	{
+		if (strcmp(entryname, extdata->name) == 0)
+			return extdata->node;
+
+		extdata = extdata->next;
+	}
+	return NULL;
+}
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index aa3f283453..ae0d67a419 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -176,6 +176,7 @@ ExecSerializePlan(Plan *plan, EState *estate)
 	pstmt = makeNode(PlannedStmt);
 	pstmt->commandType = CMD_SELECT;
 	pstmt->queryId = pgstat_get_my_query_id();
+	/* What about Extension nodes here? */
 	pstmt->hasReturning = false;
 	pstmt->hasModifyingCTE = false;
 	pstmt->canSetTag = true;
diff --git a/src/backend/nodes/value.c b/src/backend/nodes/value.c
index f11074970e..eb1a975e7d 100644
--- a/src/backend/nodes/value.c
+++ b/src/backend/nodes/value.c
@@ -28,6 +28,18 @@ makeInteger(int i)
 	return v;
 }
 
+/*
+ *	makeInteger
+ */
+Bigint *
+makeBigint(int64 i)
+{
+	Bigint    *v = makeNode(Bigint);
+
+	v->val = i;
+	return v;
+}
+
 /*
  *	makeFloat
  *
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index a1873ce26d..fb4ac7fd0a 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -517,6 +517,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 
 	result->commandType = parse->commandType;
 	result->queryId = parse->queryId;
+	result->ext_field = copyObject(parse->ext_field);
 	result->hasReturning = (parse->returningList != NIL);
 	result->hasModifyingCTE = parse->hasModifyingCTE;
 	result->canSetTag = parse->canSetTag;
diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c
index 980dc1816f..3403abfe42 100644
--- a/src/backend/rewrite/rewriteHandler.c
+++ b/src/backend/rewrite/rewriteHandler.c
@@ -4232,6 +4232,12 @@ QueryRewrite(Query *parsetree)
 
 		query->queryId = input_query_id;
 
+		/*
+		 * Should we copy extension fields here? Guess no, because it could be
+		 * quite different query. Much simpler to skip these fields and allow
+		 * extensions to realize that and re-generate data if needed.
+		 */
+
 		results = lappend(results, query);
 	}
 
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index cab709b07b..c6675b340c 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -823,6 +823,7 @@ pg_rewrite_query(Query *query)
 			 * here to avoid breaking pg_stat_statements.
 			 */
 			new_query->queryId = curr_query->queryId;
+			new_query->ext_field = copyObject(curr_query->ext_field);
 
 			new_list = lappend(new_list, new_query);
 			pfree(str);
@@ -956,6 +957,7 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions,
 			stmt->stmt_location = query->stmt_location;
 			stmt->stmt_len = query->stmt_len;
 			stmt->queryId = query->queryId;
+			stmt->ext_field = copyObject(query->ext_field);
 		}
 		else
 		{
diff --git a/src/include/commands/extension.h b/src/include/commands/extension.h
index 74ae391395..a652fdbf1b 100644
--- a/src/include/commands/extension.h
+++ b/src/include/commands/extension.h
@@ -53,4 +53,8 @@ extern bool extension_file_exists(const char *extensionName);
 extern ObjectAddress AlterExtensionNamespace(const char *extensionName, const char *newschema,
 											 Oid *oldschema);
 
+extern Node *AddExtensionDataToNode(Node *node, const char *entryname,
+									const Node *data, bool noCopy);
+extern Node *GetExtensionData(ExtensionData *extdata, const char *entryname);
+
 #endif							/* EXTENSION_H */
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 028588fb33..59a41f398d 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -101,6 +101,27 @@ typedef uint64 AclMode;			/* a bitmask of privilege bits */
 #define ACL_SELECT_FOR_UPDATE	ACL_UPDATE
 
 
+/*
+ * Structure definition should be hidden from an user, but current serialization
+ * techniques make it impossible.
+ * XXX: Instead of name, maybe better to invent some unique id for each loaded
+ * library, according to the position in file_list, as an example?
+ */
+typedef struct ExtensionData
+{
+	/*
+	 * Don't allow reading the field (remember, extension may be dynamically
+	 * loaded into a backend. Parallel workers still don't need it.
+	 * Allow writing it for debug purposes.
+	 */
+	pg_node_attr(no_equal, no_read, no_query_jumble)
+
+	NodeTag					type;
+	char				   *name;
+	Node				   *node;
+	struct ExtensionData   *next;
+} ExtensionData;
+
 /*****************************************************************************
  *	Query Tree
  *****************************************************************************/
@@ -236,6 +257,8 @@ typedef struct Query
 	int			stmt_location;
 	/* length in bytes; 0 means "rest of string" */
 	int			stmt_len pg_node_attr(query_jumble_ignore);
+
+	ExtensionData *ext_field pg_node_attr(query_jumble_ignore, read_write_ignore, equal_ignore, read_as(NULL));
 } Query;
 
 
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 659bd05c0c..038a77e57e 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -101,6 +101,8 @@ typedef struct PlannedStmt
 	/* statement location in source string (copied from Query) */
 	int			stmt_location;	/* start location, or -1 if unknown */
 	int			stmt_len;		/* length in bytes; 0 means "rest of string" */
+
+	ExtensionData *ext_field  pg_node_attr(query_jumble_ignore, read_write_ignore, equal_ignore, read_as(NULL));
 } PlannedStmt;
 
 /* macro for fetching the Plan associated with a SubPlan node */
diff --git a/src/include/nodes/value.h b/src/include/nodes/value.h
index b24c4c1afe..6774a0d9a7 100644
--- a/src/include/nodes/value.h
+++ b/src/include/nodes/value.h
@@ -33,6 +33,14 @@ typedef struct Integer
 	int			ival;
 } Integer;
 
+typedef struct Bigint
+{
+	pg_node_attr(special_read_write)
+
+	NodeTag		type;
+	int64		val;
+} Bigint;
+
 /*
  * Float is internally represented as string.  Using T_Float as the node type
  * simply indicates that the contents of the string look like a valid numeric
@@ -77,11 +85,13 @@ typedef struct BitString
 } BitString;
 
 #define intVal(v)		(castNode(Integer, v)->ival)
+#define bigintVal(v)	(castNode(Bigint, v)->val)
 #define floatVal(v)		atof(castNode(Float, v)->fval)
 #define boolVal(v)		(castNode(Boolean, v)->boolval)
 #define strVal(v)		(castNode(String, v)->sval)
 
 extern Integer *makeInteger(int i);
+extern Bigint *makeBigint(int64 i);
 extern Float *makeFloat(char *numericStr);
 extern Boolean *makeBoolean(bool val);
 extern String *makeString(char *str);
-- 
2.34.1

Julien Rouhaud

rjuju123@gmail.com

almost 3 years ago

In reply to: Andrey Lepikhov (#1)

Re: [POC] Allow an extension to add data into Query and PlannedStmt nodes

Hi,

On Wed, Mar 29, 2023 at 12:02:30PM +0500, Andrey Lepikhov wrote:

Previously, we read int this mailing list some controversial opinions on
queryid generation and Jumbling technique. Here we don't intend to solve
these problems but help an extension at least don't conflict with others on
the queryId value.

Extensions could need to pass some query-related data through all stages of
the query planning and execution. As a trivial example, pg_stat_statements
uses queryid at the end of execution to save some statistics. One more
reason - extensions now conflict on queryid value and the logic of its
changing. With this patch, it can be managed.

I just had a quick lookc at the patch, and IIUC it doesn't really help on that
side, as there's still a single official "queryid" that's computed, stored
everywhere and later used by pg_stat_statements (which does then store in
additionally to that official queryid).

You can currently change the main jumbling algorithm with a custom extension,
and all extensions will then use it as the source of truth, but I guess that
what you want is to e.g. have an additional and semantically different queryid,
and create multiple ecosystem of extensions, each using one or the other source
of queryid without changing the other ecosystem behavior.

This patch introduces the structure 'ExtensionData' which allows to manage
of a list of entries with a couple of interface functions
addExtensionDataToNode() and GetExtensionData(). Keep in mind the possible
future hiding of this structure from the public interface.
An extension should invent a symbolic key to identify its data. It may
invent as many additional keys as it wants but the best option here - is no
more than one entry for each extension.
Usage of this machinery is demonstrated by the pg_stat_statements example -
here we introduced Bigint node just for natively storing of queryId value.

Ruthless pgbench benchmark shows that we got some overhead:
1.6% - in default mode
4% - in prepared mode
~0.1% in extended mode.

That's a quite significant overhead. But the only reason to accept such a
change is to actually use it to store additional data, so it would be
interesting to see what the overhead is like once you store at least 2
different values there.

- Are we need to invent a registration procedure to do away with the names
of entries and use some compact integer IDs?

Note that the patch as proposed doesn't have any defense for two extensions
trying to register something with the same name, or update a stored value, as
AddExtensionDataToNode() simply prepend the new value to the list. You can
actually update the value by just storing the new value, but it will add a
significant overhead to every other extension that want to read another value.

The API is also quite limited as each stored value has a single identifier.
What if your extension needs to store multiple things? Since it's all based on
Node you can't really store some custom struct, so you probably have to end up
with things like "my_extension.my_val1", "my_extension.my_val2" which isn't
great.

Import Notes

Reply to msg id not found: e321eec2-b91c-1378-250a-e38dcf0ed827@postgrespro.rue321eec2-b91c-1378-250a-e38dcf0ed827@postgrespro.ru

Andrey Lepikhov

a.lepikhov@postgrespro.ru

almost 3 years ago

In reply to: Julien Rouhaud (#2)

1 attachment(s)

Re: [POC] Allow an extension to add data into Query and PlannedStmt nodes

On 30/3/2023 12:57, Julien Rouhaud wrote:

Extensions could need to pass some query-related data through all stages of
the query planning and execution. As a trivial example, pg_stat_statements
uses queryid at the end of execution to save some statistics. One more
reason - extensions now conflict on queryid value and the logic of its
changing. With this patch, it can be managed.

I just had a quick lookc at the patch, and IIUC it doesn't really help on that
side, as there's still a single official "queryid" that's computed, stored
everywhere and later used by pg_stat_statements (which does then store in
additionally to that official queryid).

Thank you for the attention!
This patch doesn't try to solve the problem of oneness of queryId. In
this patch we change pg_stat_statements and it doesn't set 0 into
queryId field according to its internal logic. And another extension
should do the same - use queryId on your own but not erase it - erase
your private copy in the ext_field.

Ruthless pgbench benchmark shows that we got some overhead:
1.6% - in default mode
4% - in prepared mode
~0.1% in extended mode.

That's a quite significant overhead. But the only reason to accept such a
change is to actually use it to store additional data, so it would be
interesting to see what the overhead is like once you store at least 2
different values there.

Yeah, but as I said earlier, it can be reduced to much smaller value
just with simple optimization. Here I intentionally avoid it to discuss
the core concept.

- Are we need to invent a registration procedure to do away with the names
of entries and use some compact integer IDs?

Note that the patch as proposed doesn't have any defense for two extensions
trying to register something with the same name, or update a stored value, as
AddExtensionDataToNode() simply prepend the new value to the list. You can
actually update the value by just storing the new value, but it will add a
significant overhead to every other extension that want to read another value.

Thanks a lot! Patch in attachment implements such an idea - extension
can allocate some entries and use these private IDs to add entries. I
hope, an extension would prefer to use only one entry for all the data
to manage overhead, but still.

The API is also quite limited as each stored value has a single identifier.
What if your extension needs to store multiple things? Since it's all based on
Node you can't really store some custom struct, so you probably have to end up
with things like "my_extension.my_val1", "my_extension.my_val2" which isn't
great.

Main idea here - if an extension holds custom struct and want to pass it
along all planning and execution stages it should use extensible node
with custom read/write/copy routines.

--
regards,
Andrey Lepikhov
Postgres Professional

Attachments:

v0-0002-Add-ExtendedData-entry-registration-routines-to-use-.patchtext/plain; charset=UTF-8; name=v0-0002-Add-ExtendedData-entry-registration-routines-to-use-.patchDownload

From ab101322330684e9839e46c26f70ad5462e40dac Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepikhov@postgrespro.ru>
Date: Thu, 30 Mar 2023 21:49:32 +0500
Subject: [PATCH 2/2] Add ExtendedData entry registration routines to use
 entryId instead of symbolic name.

---
 .../pg_stat_statements/pg_stat_statements.c   |  7 +-
 src/backend/commands/extension.c              | 69 +++++++++++++++----
 src/include/commands/extension.h              |  6 +-
 src/include/nodes/parsenodes.h                |  2 +-
 4 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index fc47661c86..5b21163365 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -109,11 +109,11 @@ static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
 									!IsA(n, DeallocateStmt))
 
 #define GET_QUERYID(node) \
-	(Bigint *) GetExtensionData(node->ext_field, "pg_stat_statements")
+	(Bigint *) GetExtensionData(node->ext_field, extendedEntryID)
 
 #define INSERT_QUERYID(node, queryid) \
 	castNode(Bigint, AddExtensionDataToNode((Node *) node, \
-											"pg_stat_statements", \
+											extendedEntryID, \
 											(Node *) makeBigint((int64) queryid), \
 											true))
 /*
@@ -298,6 +298,7 @@ static bool pgss_track_utility = true;	/* whether to track utility commands */
 static bool pgss_track_planning = false;	/* whether to track planning
 											 * duration */
 static bool pgss_save = true;	/* whether to save stats across shutdown */
+static int extendedEntryID = -1; /* Use to add queryId into the Query struct */
 
 
 #define pgss_enabled(level) \
@@ -482,6 +483,8 @@ _PG_init(void)
 	ExecutorEnd_hook = pgss_ExecutorEnd;
 	prev_ProcessUtility = ProcessUtility_hook;
 	ProcessUtility_hook = pgss_ProcessUtility;
+
+	extendedEntryID = RegisterExtensionDataEntry("pg_stat_statements");
 }
 
 /*
diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c
index 74f8a820cb..effaeada24 100644
--- a/src/backend/commands/extension.c
+++ b/src/backend/commands/extension.c
@@ -3396,8 +3396,53 @@ read_whole_file(const char *filename, int *length)
 	return buf;
 }
 
+#define EXTDATA_ENTRIES_NUM	(64)
+
+/*
+ * Storage for registered ExtensionData entries.
+ * Most common use case should be - registering one entry on extension startup.
+ * It would be better to store complex structures as a node tree in single
+ * extension entry, because it can affect performance and the extension only
+ * knows its data and should be responsible for additional overhead.
+ * So, we need quite limited number of entries.
+ */
+struct
+{
+	char *label;
+} ExtensionDataEntries [EXTDATA_ENTRIES_NUM] = {0};
+
+int
+RegisterExtensionDataEntry(const char *label)
+{
+	int num;
+
+	for (num = 0; num < EXTDATA_ENTRIES_NUM; num++)
+	{
+		MemoryContext oldmemctx;
+
+		if (ExtensionDataEntries[num].label != NULL)
+			continue;
+
+		oldmemctx = MemoryContextSwitchTo(TopMemoryContext);
+		ExtensionDataEntries[num].label = pstrdup(label);
+		MemoryContextSwitchTo(oldmemctx);
+		break;
+	}
+
+	return (num < EXTDATA_ENTRIES_NUM) ? num : -1;
+}
+
+void
+UnRegisterExtensionDataEntry(int num)
+{
+	if (ExtensionDataEntries[num].label == NULL)
+		elog(ERROR, "Extension data entry already freed");
+
+	pfree(ExtensionDataEntries[num].label);
+}
+
 static ExtensionData *
-_initial_actions(Node *node, const char *entryname)
+_initial_actions(Node *node, int entryId)
 {
 	ExtensionData **ext_field;
 	ExtensionData  *scanner;
@@ -3417,19 +3462,13 @@ _initial_actions(Node *node, const char *entryname)
 	}
 
 	Assert(strlen(entryname) > 0);
-#ifdef USE_ASSERT_CHECKING
-	{
-		const char *c;
-		/* check name doesn't contain escapable symbols */
-		for (c = entryname; *c; c++)
-			Assert(isgraph(*c) && strchr("(){}\\\"", *c) == NULL);
-	}
-#endif
+	Assert(entryId >= 0 && entryId < EXTDATA_ENTRIES_NUM &&
+		   ExtensionDataEntries[entryId].label != NULL);
 
 	/* Check existing entry */
 	for (scanner = *ext_field; scanner != NULL; scanner = scanner->next)
 	{
-		if (strcmp(scanner->name, entryname) == 0)
+		if (scanner->eid == entryId)
 		{
 				elem = scanner;
 				break;
@@ -3439,8 +3478,8 @@ _initial_actions(Node *node, const char *entryname)
 	if (elem == NULL)
 	{
 		elem = makeNode(ExtensionData);
+		elem->eid = entryId;
 		elem->next = *ext_field;
-		elem->name = pstrdup(entryname);
 		*ext_field = elem;
 	}
 
@@ -3448,7 +3487,7 @@ _initial_actions(Node *node, const char *entryname)
 }
 
 Node *
-AddExtensionDataToNode(Node *node, const char *entryname, const Node *data,
+AddExtensionDataToNode(Node *node, int entryId, const Node *data,
 					   bool noCopy)
 {
 	MemoryContext	destCtx = GetMemoryChunkContext(node);
@@ -3458,7 +3497,7 @@ AddExtensionDataToNode(Node *node, const char *entryname, const Node *data,
 	/* Safely store in the Query memory context */
 	oldctx = MemoryContextSwitchTo(destCtx);
 
-	elem = _initial_actions(node, entryname);
+	elem = _initial_actions(node, entryId);
 	elem->node = (Node *) (noCopy ? data : copyObject(data));
 
 	MemoryContextSwitchTo(oldctx);
@@ -3487,11 +3526,11 @@ AddExtensionDataToNode(Node *node, const char *entryname, const Node *data,
  * NULL, if not found.
  */
 Node *
-GetExtensionData(ExtensionData *extdata, const char *entryname)
+GetExtensionData(ExtensionData *extdata, int entryId)
 {
 	while (extdata != NULL)
 	{
-		if (strcmp(entryname, extdata->name) == 0)
+		if (extdata->eid == entryId)
 			return extdata->node;
 
 		extdata = extdata->next;
diff --git a/src/include/commands/extension.h b/src/include/commands/extension.h
index a652fdbf1b..c13ceee478 100644
--- a/src/include/commands/extension.h
+++ b/src/include/commands/extension.h
@@ -53,8 +53,10 @@ extern bool extension_file_exists(const char *extensionName);
 extern ObjectAddress AlterExtensionNamespace(const char *extensionName, const char *newschema,
 											 Oid *oldschema);
 
-extern Node *AddExtensionDataToNode(Node *node, const char *entryname,
+extern int RegisterExtensionDataEntry(const char *label);
+extern void UnRegisterExtensionDataEntry(int num);
+extern Node *AddExtensionDataToNode(Node *node, int entryId,
 									const Node *data, bool noCopy);
-extern Node *GetExtensionData(ExtensionData *extdata, const char *entryname);
+extern Node *GetExtensionData(ExtensionData *extdata, int entryId);
 
 #endif							/* EXTENSION_H */
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 59a41f398d..8996780450 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -117,7 +117,7 @@ typedef struct ExtensionData
 	pg_node_attr(no_equal, no_read, no_query_jumble)
 
 	NodeTag					type;
-	char				   *name;
+	int16					eid; /* entry id. See RegisterExtensionDataEntry */
 	Node				   *node;
 	struct ExtensionData   *next;
 } ExtensionData;
-- 
2.40.0