Add parameter jit_warn_above_fraction

Started by Magnus Haganderalmost 4 years ago38 messages

magnus@hagander.net

almost 4 years ago

1 attachment(s)

This patch adds a configuration parameter jit_warn_above_fraction that
will cause a warning to be logged if the fraction of time spent on
doing JIT is bigger than the specified one. For example, this can be
used to track down those cases where JIT ends up taking 90% of the
query runtime because of bad estimates...

--
Magnus Hagander
Me: https://www.hagander.net/
Work: https://www.redpill-linpro.com/

Attachments:

jit_warn_above_fraction.patchtext/x-patch; charset=US-ASCII; name=jit_warn_above_fraction.patchDownload

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 7ed8c82a9d..7c3bc56227 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -6592,6 +6592,21 @@ local0.*    /var/log/postgresql
        </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-jit-warn-above-fraction" xreflabel="jit_warn_above_fraction">
+      <term><varname>jit_warn_above_fraction</varname> (<type>floating point</type>)
+      <indexterm>
+       <primary><varname>jit_warn_above_fraction</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+       <listitem>
+        <para>
+         Causes a warning to be written to the log if the time spent on JIT (see <xref linkend="jit"/>)
+         goes above this fraction of the total query runtime.
+         A value of 0 (the default)disables the warning.
+        </para>
+       </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-log-startup-progress-interval" xreflabel="log_startup_progress_interval">
       <term><varname>log_startup_progress_interval</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/tcop/fastpath.c b/src/backend/tcop/fastpath.c
index d429aa4663..a1bff893a3 100644
--- a/src/backend/tcop/fastpath.c
+++ b/src/backend/tcop/fastpath.c
@@ -196,6 +196,7 @@ HandleFunctionRequest(StringInfo msgBuf)
 	struct fp_info *fip;
 	bool		callit;
 	bool		was_logged = false;
+	int			msecs;
 	char		msec_str[32];
 
 	/*
@@ -305,7 +306,7 @@ HandleFunctionRequest(StringInfo msgBuf)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 3c7d08209f..c0487ea67f 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -961,7 +961,9 @@ exec_simple_query(const char *query_string)
 	bool		save_log_statement_stats = log_statement_stats;
 	bool		was_logged = false;
 	bool		use_implicit_block;
+	int			msecs;
 	char		msec_str[32];
+	int64		jit_time = 0;
 
 	/*
 	 * Report query to various monitoring facilities.
@@ -1220,6 +1222,16 @@ exec_simple_query(const char *query_string)
 
 		receiver->rDestroy(receiver);
 
+		/* Collect JIT timings in case it's active */
+		if (jit_enabled && jit_warn_above_fraction > 0 && portal->queryDesc && portal->queryDesc->estate->es_jit)
+		{
+			jit_time +=
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+		}
+
 		PortalDrop(portal, false);
 
 		if (lnext(parsetree_list, parsetree_item) == NULL)
@@ -1290,7 +1302,7 @@ exec_simple_query(const char *query_string)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
@@ -1306,6 +1318,16 @@ exec_simple_query(const char *query_string)
 			break;
 	}
 
+	if (jit_enabled && jit_warn_above_fraction > 0)
+	{
+		if (jit_time > msecs * jit_warn_above_fraction)
+		{
+			ereport(WARNING,
+					(errmsg("JIT time was %ld ms of %d ms",
+							jit_time, msecs)));
+		}
+	}
+
 	if (save_log_statement_stats)
 		ShowUsage("QUERY STATISTICS");
 
@@ -1333,6 +1355,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	CachedPlanSource *psrc;
 	bool		is_named;
 	bool		save_log_statement_stats = log_statement_stats;
+	int			msecs;
 	char		msec_str[32];
 
 	/*
@@ -1548,7 +1571,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, false))
+	switch (check_log_duration(&msecs, msec_str, false))
 	{
 		case 1:
 			ereport(LOG,
@@ -1595,6 +1618,7 @@ exec_bind_message(StringInfo input_message)
 	MemoryContext oldContext;
 	bool		save_log_statement_stats = log_statement_stats;
 	bool		snapshot_set = false;
+	int			msecs;
 	char		msec_str[32];
 	ParamsErrorCbData params_data;
 	ErrorContextCallback params_errcxt;
@@ -2007,7 +2031,7 @@ exec_bind_message(StringInfo input_message)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, false))
+	switch (check_log_duration(&msecs, msec_str, false))
 	{
 		case 1:
 			ereport(LOG,
@@ -2053,6 +2077,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 	bool		is_xact_command;
 	bool		execute_is_fetch;
 	bool		was_logged = false;
+	int			msecs;
 	char		msec_str[32];
 	ParamsErrorCbData params_data;
 	ErrorContextCallback params_errcxt;
@@ -2244,7 +2269,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
@@ -2267,6 +2292,22 @@ exec_execute_message(const char *portal_name, long max_rows)
 			break;
 	}
 
+	if (jit_enabled && jit_warn_above_fraction > 0)
+	{
+		int64 jit_time =
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+
+		if (jit_time > msecs * jit_warn_above_fraction)
+		{
+			ereport(WARNING,
+					(errmsg("JIT time was %ld ms of %d ms",
+							jit_time, msecs)));
+		}
+	}
+
 	if (save_log_statement_stats)
 		ShowUsage("EXECUTE MESSAGE STATISTICS");
 
@@ -2313,6 +2354,10 @@ check_log_statement(List *stmt_list)
  *		1 if just the duration should be logged
  *		2 if duration and query details should be logged
  *
+ * The millisecond runtime of the query is stored in msecs if any logging parameter
+ * that requires the calculation of time is enabled, even if the result is not that
+ * the entry should be logged. In other cases, it is set to -1.
+ *
  * If logging is needed, the duration in msec is formatted into msec_str[],
  * which must be a 32-byte buffer.
  *
@@ -2320,14 +2365,13 @@ check_log_statement(List *stmt_list)
  * essentially prevents 2 from being returned).
  */
 int
-check_log_duration(char *msec_str, bool was_logged)
+check_log_duration(int *msecs, char *msec_str, bool was_logged)
 {
 	if (log_duration || log_min_duration_sample >= 0 ||
-		log_min_duration_statement >= 0 || xact_is_sampled)
+	log_min_duration_statement >= 0 || xact_is_sampled || jit_warn_above_fraction > 0)
 	{
 		long		secs;
 		int			usecs;
-		int			msecs;
 		bool		exceeded_duration;
 		bool		exceeded_sample_duration;
 		bool		in_sample = false;
@@ -2335,7 +2379,7 @@ check_log_duration(char *msec_str, bool was_logged)
 		TimestampDifference(GetCurrentStatementStartTimestamp(),
 							GetCurrentTimestamp(),
 							&secs, &usecs);
-		msecs = usecs / 1000;
+		*msecs = usecs / 1000;
 
 		/*
 		 * This odd-looking test for log_min_duration_* being exceeded is
@@ -2345,12 +2389,12 @@ check_log_duration(char *msec_str, bool was_logged)
 		exceeded_duration = (log_min_duration_statement == 0 ||
 							 (log_min_duration_statement > 0 &&
 							  (secs > log_min_duration_statement / 1000 ||
-							   secs * 1000 + msecs >= log_min_duration_statement)));
+							   secs * 1000 + *msecs >= log_min_duration_statement)));
 
 		exceeded_sample_duration = (log_min_duration_sample == 0 ||
 									(log_min_duration_sample > 0 &&
 									 (secs > log_min_duration_sample / 1000 ||
-									  secs * 1000 + msecs >= log_min_duration_sample)));
+									  secs * 1000 + *msecs >= log_min_duration_sample)));
 
 		/*
 		 * Do not log if log_statement_sample_rate = 0. Log a sample if
@@ -2365,13 +2409,17 @@ check_log_duration(char *msec_str, bool was_logged)
 		if (exceeded_duration || in_sample || log_duration || xact_is_sampled)
 		{
 			snprintf(msec_str, 32, "%ld.%03d",
-					 secs * 1000 + msecs, usecs % 1000);
+					 secs * 1000 + *msecs, usecs % 1000);
 			if ((exceeded_duration || in_sample || xact_is_sampled) && !was_logged)
 				return 2;
 			else
 				return 1;
 		}
 	}
+	else
+	{
+		*msecs = -1;
+	}
 
 	return 0;
 }
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 1e3650184b..1abcf90f14 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -608,6 +608,7 @@ int			log_parameter_max_length_on_error = 0;
 int			log_temp_files = -1;
 double		log_statement_sample_rate = 1.0;
 double		log_xact_sample_rate = 0;
+double		jit_warn_above_fraction = 0;
 int			trace_recovery_messages = LOG;
 char	   *backtrace_functions;
 char	   *backtrace_symbol_list;
@@ -3873,6 +3874,19 @@ static struct config_real ConfigureNamesReal[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"jit_warn_above_fraction", PGC_SUSET, LOGGING_WHEN,
+			gettext_noop("Sets the fraction of query time spent on JIT before writing"
+						 "a warning to the log."),
+			gettext_noop("Write a message tot he server log if more than this"
+						 "fraction of the query runtime is spent on JIT."
+						 "Zero turns off the warning.")
+		},
+		&jit_warn_above_fraction,
+		0.0, 0.0, 1.0,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 4a094bb38b..19795e5398 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -404,6 +404,8 @@
 #jit_optimize_above_cost = 500000	# use expensive JIT optimizations if
 					# query is more expensive than this;
 					# -1 disables
+#jit_warn_above_fraction = 0		# warn if JIT is more than this fraction of
+					# query runtime. 0 disables.
 
 # - Genetic Query Optimizer -
 
diff --git a/src/include/jit/jit.h b/src/include/jit/jit.h
index 707176d9ed..59b37d6dc0 100644
--- a/src/include/jit/jit.h
+++ b/src/include/jit/jit.h
@@ -89,6 +89,7 @@ extern bool jit_tuple_deforming;
 extern double jit_above_cost;
 extern double jit_inline_above_cost;
 extern double jit_optimize_above_cost;
+extern double jit_warn_above_fraction;
 
 
 extern void jit_reset_after_error(void);
diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h
index 15a11bc3ff..71457be9f7 100644
--- a/src/include/tcop/tcopprot.h
+++ b/src/include/tcop/tcopprot.h
@@ -82,7 +82,7 @@ extern void PostgresMain(const char *dbname,
 extern long get_stack_depth_rlimit(void);
 extern void ResetUsage(void);
 extern void ShowUsage(const char *title);
-extern int	check_log_duration(char *msec_str, bool was_logged);
+extern int	check_log_duration(int *msec, char *msec_str, bool was_logged);
 extern void set_debug_options(int debug_flag,
 							  GucContext context, GucSource source);
 extern bool set_plan_disabling_options(const char *arg,

Julien Rouhaud

rjuju123@gmail.com

almost 4 years ago

In reply to: Magnus Hagander (#1)

Re: Add parameter jit_warn_above_fraction

Hi,

On Fri, Feb 25, 2022 at 04:16:01PM +0100, Magnus Hagander wrote:

This patch adds a configuration parameter jit_warn_above_fraction that
will cause a warning to be logged if the fraction of time spent on
doing JIT is bigger than the specified one. For example, this can be
used to track down those cases where JIT ends up taking 90% of the
query runtime because of bad estimates...

I think that's tremendously useful, huge +1.

Just a few minor nit:

+ A value of 0 (the default)disables the warning.

missing space

+			ereport(WARNING,
+					(errmsg("JIT time was %ld ms of %d ms",
+							jit_time, msecs)));

"JIT time" may a bit obscure for users, how about "JIT total processing time"?"

+			gettext_noop("Sets the fraction of query time spent on JIT before writing"
+						 "a warning to the log."),
+			gettext_noop("Write a message tot he server log if more than this"
+						 "fraction of the query runtime is spent on JIT."
+						 "Zero turns off the warning.")

missing spaces in the concatenated strings.

The rest of the patch looks good to me.

Andres Freund

andres@anarazel.de

almost 4 years ago

In reply to: Magnus Hagander (#1)

Re: Add parameter jit_warn_above_fraction

Hi,

On 2022-02-25 16:16:01 +0100, Magnus Hagander wrote:

This patch adds a configuration parameter jit_warn_above_fraction that
will cause a warning to be logged if the fraction of time spent on
doing JIT is bigger than the specified one. For example, this can be
used to track down those cases where JIT ends up taking 90% of the
query runtime because of bad estimates...

Hm. Could it make sense to do this as a auto_explain feature?

Greetings,

Andres Freund

Justin Pryzby

pryzby@telsasoft.com

almost 4 years ago

In reply to: Magnus Hagander (#1)

Re: Add parameter jit_warn_above_fraction

On Fri, Feb 25, 2022 at 04:16:01PM +0100, Magnus Hagander wrote:

+	{
+		{"jit_warn_above_fraction", PGC_SUSET, LOGGING_WHEN,
+			gettext_noop("Sets the fraction of query time spent on JIT before writing"
+						 "a warning to the log."),
+			gettext_noop("Write a message tot he server log if more than this"
+						 "fraction of the query runtime is spent on JIT."
+						 "Zero turns off the warning.")
+		},
+		&jit_warn_above_fraction,
+		0.0, 0.0, 1.0,
+		NULL, NULL, NULL
+	},

Should be PGC_USERSET ?

+ gettext_noop("Write a message tot he server log if more than this"

to the

+       if (jit_enabled && jit_warn_above_fraction > 0)                                                                                                                                                                            
+       {                                                                                                                                                                                                                          
+               int64 jit_time =                                                                                                                                                                                                   
+                       INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +                                                                                                                     
+                       INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +                                                                                                                       
+                       INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +                                                                                                                   
+                       INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);                                                                                                                        
+                                                                                                                                                                                                                                  
+               if (jit_time > msecs * jit_warn_above_fraction)                                                                                                                                                                    
+               {                                                                                                                                                                                                                  
+                       ereport(WARNING,                                                                                                                                                                                           
+                                       (errmsg("JIT time was %ld ms of %d ms",                                                                                                                                                    
+                                                       jit_time, msecs)));                                                                                                                                                        
+               }                                                                                                                                                                                                                  
+       }

I think it should be a NOTICE (or less?)

Is it more useful if this is applied combined with log_min_duration_statement ?

It's easy to imagine a query for which the planner computes a high cost, but
actually runs quickly. You might get a bunch of WARNINGs that the query took
10 MS and JIT was 75% of that, even if you don't care about queries that take
less than 10 SEC.

I should say that this is already available by processing the output of
autoexplain.

--
Justin

Magnus Hagander

magnus@hagander.net

almost 4 years ago

In reply to: Andres Freund (#3)

Re: Add parameter jit_warn_above_fraction

On Fri, Feb 25, 2022 at 5:20 PM Andres Freund <andres@anarazel.de> wrote:

Hi,

On 2022-02-25 16:16:01 +0100, Magnus Hagander wrote:

This patch adds a configuration parameter jit_warn_above_fraction that
will cause a warning to be logged if the fraction of time spent on
doing JIT is bigger than the specified one. For example, this can be
used to track down those cases where JIT ends up taking 90% of the
query runtime because of bad estimates...

Hm. Could it make sense to do this as a auto_explain feature?

It could be. But I was looking for something a lot more "light weight"
than having to install an extension. But yes, if we wanted to, we
could certainly change jit_warn_above_fraction to be
auto_explain.log_min_jit_fraction or something like that, and do
basically the same thing. But then, we could also have
log_min_duration_statement be part of auto_explain instead, so it's
all about where to draw the line :)

--
Magnus Hagander
Me: https://www.hagander.net/
Work: https://www.redpill-linpro.com/

Andres Freund

andres@anarazel.de

almost 4 years ago

In reply to: Magnus Hagander (#5)

Re: Add parameter jit_warn_above_fraction

Hi,

On 2022-02-25 17:28:41 +0100, Magnus Hagander wrote:

On Fri, Feb 25, 2022 at 5:20 PM Andres Freund <andres@anarazel.de> wrote:

On 2022-02-25 16:16:01 +0100, Magnus Hagander wrote:

This patch adds a configuration parameter jit_warn_above_fraction that
will cause a warning to be logged if the fraction of time spent on
doing JIT is bigger than the specified one. For example, this can be
used to track down those cases where JIT ends up taking 90% of the
query runtime because of bad estimates...

Hm. Could it make sense to do this as a auto_explain feature?

It could be. But I was looking for something a lot more "light weight"
than having to install an extension. But yes, if we wanted to, we
could certainly change jit_warn_above_fraction to be
auto_explain.log_min_jit_fraction or something like that, and do
basically the same thing. But then, we could also have
log_min_duration_statement be part of auto_explain instead, so it's
all about where to draw the line :)

I guess it feels a tad on the "too narrow/specific" side of things for the
general code. We don't have log_min_duration_{parsing,planning,execution}
either. But I also get it. So I just wanted to raise it ;)

Greetings,

Andres Freund

Magnus Hagander

magnus@hagander.net

almost 4 years ago

In reply to: Andres Freund (#6)

Re: Add parameter jit_warn_above_fraction

On Fri, Feb 25, 2022 at 5:47 PM Andres Freund <andres@anarazel.de> wrote:

Hi,

On 2022-02-25 17:28:41 +0100, Magnus Hagander wrote:

On Fri, Feb 25, 2022 at 5:20 PM Andres Freund <andres@anarazel.de> wrote:

On 2022-02-25 16:16:01 +0100, Magnus Hagander wrote:

This patch adds a configuration parameter jit_warn_above_fraction that
will cause a warning to be logged if the fraction of time spent on
doing JIT is bigger than the specified one. For example, this can be
used to track down those cases where JIT ends up taking 90% of the
query runtime because of bad estimates...

Hm. Could it make sense to do this as a auto_explain feature?

It could be. But I was looking for something a lot more "light weight"
than having to install an extension. But yes, if we wanted to, we
could certainly change jit_warn_above_fraction to be
auto_explain.log_min_jit_fraction or something like that, and do
basically the same thing. But then, we could also have
log_min_duration_statement be part of auto_explain instead, so it's
all about where to draw the line :)

I guess it feels a tad on the "too narrow/specific" side of things for the
general code. We don't have log_min_duration_{parsing,planning,execution}
either. But I also get it. So I just wanted to raise it ;)

Oh it's absolutely a valid issue to raise :) In fact, one could
definitely argue that we should have a parameter for auto_explain *as
well*.

It's true we don't have those -- but it's also in my experience a lot
more common to be an issue with JIT. And unfortunately the current
"solution" people tend to apply is to globally disable JIT, and
there's no similar "globally disable parsing or "globally disable
planning" pattern, for obvious reasons.

--
Magnus Hagander
Me: https://www.hagander.net/
Work: https://www.redpill-linpro.com/

Magnus Hagander

magnus@hagander.net

almost 4 years ago

In reply to: Justin Pryzby (#4)

1 attachment(s)

Re: Add parameter jit_warn_above_fraction

On Fri, Feb 25, 2022 at 5:23 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

On Fri, Feb 25, 2022 at 04:16:01PM +0100, Magnus Hagander wrote:

+     {
+             {"jit_warn_above_fraction", PGC_SUSET, LOGGING_WHEN,
+                     gettext_noop("Sets the fraction of query time spent on JIT before writing"
+                                              "a warning to the log."),
+                     gettext_noop("Write a message tot he server log if more than this"
+                                              "fraction of the query runtime is spent on JIT."
+                                              "Zero turns off the warning.")
+             },
+             &jit_warn_above_fraction,
+             0.0, 0.0, 1.0,
+             NULL, NULL, NULL
+     },

Should be PGC_USERSET ?

Yes. Definitely. Copy/paste thinko.

+ gettext_noop("Write a message tot he server log if more than this"

to the

Yup.

+       if (jit_enabled && jit_warn_above_fraction > 0)
+       {
+               int64 jit_time =
+                       INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+                       INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+                       INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+                       INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+
+               if (jit_time > msecs * jit_warn_above_fraction)
+               {
+                       ereport(WARNING,
+                                       (errmsg("JIT time was %ld ms of %d ms",
+                                                       jit_time, msecs)));
+               }
+       }

I think it should be a NOTICE (or less?)

Hmm. I'm not so sure.

Other similar parameters often use LOG, but the downside of that is
that it won't be sent to the client.

The problem with using NOTICE is that it won't go to the log by
default. It needs to be at least warning to do that.

Is it more useful if this is applied combined with log_min_duration_statement ?

It's easy to imagine a query for which the planner computes a high cost, but
actually runs quickly. You might get a bunch of WARNINGs that the query took
10 MS and JIT was 75% of that, even if you don't care about queries that take
less than 10 SEC.

Yeah, that's definitely a problem. But which way would you want to tie
it to log_min_duration_statement? That a statement would both have to
take longer than log_min_duration_statement *and* have JIT above a
certain percentage? In my experience that is instead likely to miss
most of the interesting times. Maybe it would need a separate guc for
the timing, but I'd like to avoid that, I'm not sure it's a function
worth *that* much...

I should say that this is already available by processing the output of
autoexplain.

True. You just can't trigger it based on it. (and it can be somewhat
of a PITA to parse things out of auto_explain on busy systems, but it
does give a lot of very useful details)

Meanwhile here is an updated based on your other comments above, as
well as those from Julien.

--
Magnus Hagander
Me: https://www.hagander.net/
Work: https://www.redpill-linpro.com/

Attachments:

jit_warn_above_fraction_v2.patchtext/x-patch; charset=US-ASCII; name=jit_warn_above_fraction_v2.patchDownload

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 7ed8c82a9d..ec4308a480 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -6592,6 +6592,21 @@ local0.*    /var/log/postgresql
        </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-jit-warn-above-fraction" xreflabel="jit_warn_above_fraction">
+      <term><varname>jit_warn_above_fraction</varname> (<type>floating point</type>)
+      <indexterm>
+       <primary><varname>jit_warn_above_fraction</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+       <listitem>
+        <para>
+         Causes a warning to be written to the log if the time spent on JIT (see <xref linkend="jit"/>)
+         goes above this fraction of the total query runtime.
+         A value of 0 (the default) disables the warning.
+        </para>
+       </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-log-startup-progress-interval" xreflabel="log_startup_progress_interval">
       <term><varname>log_startup_progress_interval</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/tcop/fastpath.c b/src/backend/tcop/fastpath.c
index d429aa4663..a1bff893a3 100644
--- a/src/backend/tcop/fastpath.c
+++ b/src/backend/tcop/fastpath.c
@@ -196,6 +196,7 @@ HandleFunctionRequest(StringInfo msgBuf)
 	struct fp_info *fip;
 	bool		callit;
 	bool		was_logged = false;
+	int			msecs;
 	char		msec_str[32];
 
 	/*
@@ -305,7 +306,7 @@ HandleFunctionRequest(StringInfo msgBuf)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index d7e39aed64..fcf20428a0 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -1016,7 +1016,9 @@ exec_simple_query(const char *query_string)
 	bool		save_log_statement_stats = log_statement_stats;
 	bool		was_logged = false;
 	bool		use_implicit_block;
+	int			msecs;
 	char		msec_str[32];
+	int64		jit_time = 0;
 
 	/*
 	 * Report query to various monitoring facilities.
@@ -1275,6 +1277,16 @@ exec_simple_query(const char *query_string)
 
 		receiver->rDestroy(receiver);
 
+		/* Collect JIT timings in case it's active */
+		if (jit_enabled && jit_warn_above_fraction > 0 && portal->queryDesc && portal->queryDesc->estate->es_jit)
+		{
+			jit_time +=
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+		}
+
 		PortalDrop(portal, false);
 
 		if (lnext(parsetree_list, parsetree_item) == NULL)
@@ -1345,7 +1357,7 @@ exec_simple_query(const char *query_string)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
@@ -1361,6 +1373,16 @@ exec_simple_query(const char *query_string)
 			break;
 	}
 
+	if (jit_enabled && jit_warn_above_fraction > 0)
+	{
+		if (jit_time > msecs * jit_warn_above_fraction)
+		{
+			ereport(WARNING,
+					(errmsg("JIT total processing time was %ld ms of %d ms",
+							jit_time, msecs)));
+		}
+	}
+
 	if (save_log_statement_stats)
 		ShowUsage("QUERY STATISTICS");
 
@@ -1388,6 +1410,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	CachedPlanSource *psrc;
 	bool		is_named;
 	bool		save_log_statement_stats = log_statement_stats;
+	int			msecs;
 	char		msec_str[32];
 
 	/*
@@ -1581,7 +1604,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, false))
+	switch (check_log_duration(&msecs, msec_str, false))
 	{
 		case 1:
 			ereport(LOG,
@@ -1628,6 +1651,7 @@ exec_bind_message(StringInfo input_message)
 	MemoryContext oldContext;
 	bool		save_log_statement_stats = log_statement_stats;
 	bool		snapshot_set = false;
+	int			msecs;
 	char		msec_str[32];
 	ParamsErrorCbData params_data;
 	ErrorContextCallback params_errcxt;
@@ -2040,7 +2064,7 @@ exec_bind_message(StringInfo input_message)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, false))
+	switch (check_log_duration(&msecs, msec_str, false))
 	{
 		case 1:
 			ereport(LOG,
@@ -2086,6 +2110,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 	bool		is_xact_command;
 	bool		execute_is_fetch;
 	bool		was_logged = false;
+	int			msecs;
 	char		msec_str[32];
 	ParamsErrorCbData params_data;
 	ErrorContextCallback params_errcxt;
@@ -2277,7 +2302,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
@@ -2300,6 +2325,22 @@ exec_execute_message(const char *portal_name, long max_rows)
 			break;
 	}
 
+	if (jit_enabled && jit_warn_above_fraction > 0)
+	{
+		int64 jit_time =
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+
+		if (jit_time > msecs * jit_warn_above_fraction)
+		{
+			ereport(WARNING,
+					(errmsg("JIT total processing time was %ld ms of %d ms",
+							jit_time, msecs)));
+		}
+	}
+
 	if (save_log_statement_stats)
 		ShowUsage("EXECUTE MESSAGE STATISTICS");
 
@@ -2346,6 +2387,10 @@ check_log_statement(List *stmt_list)
  *		1 if just the duration should be logged
  *		2 if duration and query details should be logged
  *
+ * The millisecond runtime of the query is stored in msecs if any logging parameter
+ * that requires the calculation of time is enabled, even if the result is not that
+ * the entry should be logged. In other cases, it is set to -1.
+ *
  * If logging is needed, the duration in msec is formatted into msec_str[],
  * which must be a 32-byte buffer.
  *
@@ -2353,14 +2398,13 @@ check_log_statement(List *stmt_list)
  * essentially prevents 2 from being returned).
  */
 int
-check_log_duration(char *msec_str, bool was_logged)
+check_log_duration(int *msecs, char *msec_str, bool was_logged)
 {
 	if (log_duration || log_min_duration_sample >= 0 ||
-		log_min_duration_statement >= 0 || xact_is_sampled)
+	log_min_duration_statement >= 0 || xact_is_sampled || jit_warn_above_fraction > 0)
 	{
 		long		secs;
 		int			usecs;
-		int			msecs;
 		bool		exceeded_duration;
 		bool		exceeded_sample_duration;
 		bool		in_sample = false;
@@ -2368,7 +2412,7 @@ check_log_duration(char *msec_str, bool was_logged)
 		TimestampDifference(GetCurrentStatementStartTimestamp(),
 							GetCurrentTimestamp(),
 							&secs, &usecs);
-		msecs = usecs / 1000;
+		*msecs = usecs / 1000;
 
 		/*
 		 * This odd-looking test for log_min_duration_* being exceeded is
@@ -2378,12 +2422,12 @@ check_log_duration(char *msec_str, bool was_logged)
 		exceeded_duration = (log_min_duration_statement == 0 ||
 							 (log_min_duration_statement > 0 &&
 							  (secs > log_min_duration_statement / 1000 ||
-							   secs * 1000 + msecs >= log_min_duration_statement)));
+							   secs * 1000 + *msecs >= log_min_duration_statement)));
 
 		exceeded_sample_duration = (log_min_duration_sample == 0 ||
 									(log_min_duration_sample > 0 &&
 									 (secs > log_min_duration_sample / 1000 ||
-									  secs * 1000 + msecs >= log_min_duration_sample)));
+									  secs * 1000 + *msecs >= log_min_duration_sample)));
 
 		/*
 		 * Do not log if log_statement_sample_rate = 0. Log a sample if
@@ -2398,13 +2442,17 @@ check_log_duration(char *msec_str, bool was_logged)
 		if (exceeded_duration || in_sample || log_duration || xact_is_sampled)
 		{
 			snprintf(msec_str, 32, "%ld.%03d",
-					 secs * 1000 + msecs, usecs % 1000);
+					 secs * 1000 + *msecs, usecs % 1000);
 			if ((exceeded_duration || in_sample || xact_is_sampled) && !was_logged)
 				return 2;
 			else
 				return 1;
 		}
 	}
+	else
+	{
+		*msecs = -1;
+	}
 
 	return 0;
 }
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 6d11f9c71b..cefb05c653 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -608,6 +608,7 @@ int			log_parameter_max_length_on_error = 0;
 int			log_temp_files = -1;
 double		log_statement_sample_rate = 1.0;
 double		log_xact_sample_rate = 0;
+double		jit_warn_above_fraction = 0;
 int			trace_recovery_messages = LOG;
 char	   *backtrace_functions;
 char	   *backtrace_symbol_list;
@@ -3873,6 +3874,19 @@ static struct config_real ConfigureNamesReal[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"jit_warn_above_fraction", PGC_USERSET, LOGGING_WHEN,
+			gettext_noop("Sets the fraction of query time spent on JIT before writing "
+						 "a warning to the log."),
+			gettext_noop("Write a message to the server log if more than this "
+						 "fraction of the query runtime is spent on JIT. "
+						 "Zero turns off the warning.")
+		},
+		&jit_warn_above_fraction,
+		0.0, 0.0, 1.0,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 4a094bb38b..19795e5398 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -404,6 +404,8 @@
 #jit_optimize_above_cost = 500000	# use expensive JIT optimizations if
 					# query is more expensive than this;
 					# -1 disables
+#jit_warn_above_fraction = 0		# warn if JIT is more than this fraction of
+					# query runtime. 0 disables.
 
 # - Genetic Query Optimizer -
 
diff --git a/src/include/jit/jit.h b/src/include/jit/jit.h
index 707176d9ed..59b37d6dc0 100644
--- a/src/include/jit/jit.h
+++ b/src/include/jit/jit.h
@@ -89,6 +89,7 @@ extern bool jit_tuple_deforming;
 extern double jit_above_cost;
 extern double jit_inline_above_cost;
 extern double jit_optimize_above_cost;
+extern double jit_warn_above_fraction;
 
 
 extern void jit_reset_after_error(void);
diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h
index 92291a750d..3f12f7fe25 100644
--- a/src/include/tcop/tcopprot.h
+++ b/src/include/tcop/tcopprot.h
@@ -87,7 +87,7 @@ extern void PostgresMain(const char *dbname,
 extern long get_stack_depth_rlimit(void);
 extern void ResetUsage(void);
 extern void ShowUsage(const char *title);
-extern int	check_log_duration(char *msec_str, bool was_logged);
+extern int	check_log_duration(int *msec, char *msec_str, bool was_logged);
 extern void set_debug_options(int debug_flag,
 							  GucContext context, GucSource source);
 extern bool set_plan_disabling_options(const char *arg,

Justin Pryzby

pryzby@telsasoft.com

almost 4 years ago

In reply to: Magnus Hagander (#8)

Re: Add parameter jit_warn_above_fraction

On Mon, Mar 07, 2022 at 01:10:32PM +0100, Magnus Hagander wrote:

On Fri, Feb 25, 2022 at 5:23 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

I think it should be a NOTICE (or less?)

Hmm. I'm not so sure.

Other similar parameters often use LOG, but the downside of that is
that it won't be sent to the client.

The problem with using NOTICE is that it won't go to the log by
default. It needs to be at least warning to do that.

Anyone who uses this parameter is aleady going to be changing GUCs, so it
doesn't need to work by default. The people who are likely to enable this
already monitor their logs and have probably customized their logging
configuration.

Is it more useful if this is applied combined with log_min_duration_statement ?

It's easy to imagine a query for which the planner computes a high cost, but
actually runs quickly. You might get a bunch of WARNINGs that the query took
10 MS and JIT was 75% of that, even if you don't care about queries that take
less than 10 SEC.

Yeah, that's definitely a problem. But which way would you want to tie
it to log_min_duration_statement? That a statement would both have to
take longer than log_min_duration_statement *and* have JIT above a
certain percentage? In my experience that is instead likely to miss
most of the interesting times.

I don't understand - why doesn't it combine trivially with
log_min_duration_statement? Are you saying that the default / pre-existing min
duration may not log all of the intended queries ? I think that just means the
configuration needs to be changed. The GUC needs to allow/help finding these
JIT issues, but going to require an admin's interaction in any case. Avoiding
false positives may be important for it to be useful at all.

Hmm .. should it also combine with min_sample_rate ? Maybe that addresses your
concern.

--
Justin

#10

Magnus Hagander

magnus@hagander.net

almost 4 years ago

In reply to: Justin Pryzby (#9)

Re: Add parameter jit_warn_above_fraction

On Mon, Mar 7, 2022 at 2:09 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

On Mon, Mar 07, 2022 at 01:10:32PM +0100, Magnus Hagander wrote:

On Fri, Feb 25, 2022 at 5:23 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

I think it should be a NOTICE (or less?)

Hmm. I'm not so sure.

Other similar parameters often use LOG, but the downside of that is
that it won't be sent to the client.

The problem with using NOTICE is that it won't go to the log by
default. It needs to be at least warning to do that.

Anyone who uses this parameter is aleady going to be changing GUCs, so it
doesn't need to work by default. The people who are likely to enable this
already monitor their logs and have probably customized their logging
configuration.

Sure, but if you set log_min_messagse to NOTICE you are likely going
to flood your logs beyond usefulness. And the whole idea of this
parameter is you can keep it on during "normal times" to catch
outliers.

Is it more useful if this is applied combined with log_min_duration_statement ?

It's easy to imagine a query for which the planner computes a high cost, but
actually runs quickly. You might get a bunch of WARNINGs that the query took
10 MS and JIT was 75% of that, even if you don't care about queries that take
less than 10 SEC.

Yeah, that's definitely a problem. But which way would you want to tie
it to log_min_duration_statement? That a statement would both have to
take longer than log_min_duration_statement *and* have JIT above a
certain percentage? In my experience that is instead likely to miss
most of the interesting times.

I don't understand - why doesn't it combine trivially with
log_min_duration_statement? Are you saying that the default / pre-existing min
duration may not log all of the intended queries ? I think that just means the
configuration needs to be changed. The GUC needs to allow/help finding these
JIT issues, but going to require an admin's interaction in any case. Avoiding
false positives may be important for it to be useful at all.

Hmm .. should it also combine with min_sample_rate ? Maybe that addresses your
concern.

For one, what if I don't want to log any queries unless this is the
case? I leave log_min_duration_statement at -1...
Or what if I want to log say only queries taking more than 60 seconds.
Now I will miss all queries taking 30 seconds where 99.9% of the time
is spent on JIT which I definitely *would* want.

If all I wanted was to get the JIT information for all queries over a
certain amount of time, then as someone else mentioned as well, I can
just use auto_explain. The point here is I want to trigger the logging
for queries where the runtime is *shorter* than what I have in
log_min_duration_statement, but a large portion of the runtime is
spent on JIT.

I maen. If I have a bunch of queries that take 10ms and 75% is spent
in JIT (per your example above), that is actually a problem and I'd
like to know about it and fix it (by tuning the jit triggers either
globally or locally). Yes, at 10ms I *may* ignore it, and in general
I'd use pg_stat_statements for that. But at 100ms with 75% JIT it
starts becoming exactly the situation that I intended this patch to
help with.

--
Magnus Hagander
Me: https://www.hagander.net/
Work: https://www.redpill-linpro.com/

#11

Justin Pryzby

pryzby@telsasoft.com

almost 4 years ago

In reply to: Magnus Hagander (#10)

Re: Add parameter jit_warn_above_fraction

On Mon, Mar 07, 2022 at 04:02:16PM +0100, Magnus Hagander wrote:

On Mon, Mar 7, 2022 at 2:09 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

On Mon, Mar 07, 2022 at 01:10:32PM +0100, Magnus Hagander wrote:

On Fri, Feb 25, 2022 at 5:23 PM Justin Pryzby <pryzby@telsasoft.com> wrote:

I think it should be a NOTICE (or less?)

Hmm. I'm not so sure.

Other similar parameters often use LOG, but the downside of that is
that it won't be sent to the client.

The problem with using NOTICE is that it won't go to the log by
default. It needs to be at least warning to do that.

Anyone who uses this parameter is aleady going to be changing GUCs, so it
doesn't need to work by default. The people who are likely to enable this
already monitor their logs and have probably customized their logging
configuration.

Sure, but if you set log_min_messagse to NOTICE you are likely going
to flood your logs beyond usefulness. And the whole idea of this
parameter is you can keep it on during "normal times" to catch
outliers.

I do set log_min_messages - not to NOTICE, but to INFO ;)

Would NOTICEs really flood most people's logs ? From what ? They're aren't
that many, and most seem to be for DDL and utility commands. We do, uh, more
of that than most people would say is reasonable.

I see a couple hundred of these per day:
| PostGIS: Unable to compute statistics for...
And during deployment, a few hundred more for IF NOT EXIST commands.
That's it.

I still think this GUC should not be WARNING. If it's a warning, then *this*
will cause previously-quiet logs to be flooded - the opposite problem. Which
is not desirable for a new guc.

https://www.postgresql.org/docs/current/runtime-config-logging.html

INFO Provides information implicitly requested by the user, e.g., output from VACUUM VERBOSE. INFO INFORMATION
NOTICE Provides information that might be helpful to users, e.g., notice of truncation of long identifiers. NOTICE INFORMATION
WARNING Provides warnings of likely problems, e.g., COMMIT outside a transaction block. NOTICE WARNING

I don't understand - why doesn't it combine trivially with
log_min_duration_statement? Are you saying that the default / pre-existing min
duration may not log all of the intended queries ? I think that just means the
configuration needs to be changed. The GUC needs to allow/help finding these
JIT issues, but going to require an admin's interaction in any case. Avoiding
false positives may be important for it to be useful at all.

Hmm .. should it also combine with min_sample_rate ? Maybe that addresses your
concern.

For one, what if I don't want to log any queries unless this is the
case? I leave log_min_duration_statement at -1...

Then you will conclude that our logging is inadequate for your case.
You need to filter the lines which don't interest you.

Or what if I want to log say only queries taking more than 60 seconds.
Now I will miss all queries taking 30 seconds where 99.9% of the time
is spent on JIT which I definitely *would* want.

If you're unwilling to deal with log entries which are uninteresting, then
clearly you'll need a separate GUC just for log_min_duration_jit_above_fraction
(and maybe another one for jit_above_duration_log_level). Or implement generic
log filtering based on file/severity/message.

As I see it: most people won't set this GUC at all. Those who do might enable
it with a high threshold value of log_min_duration_statement (probably a
pre-existing, configured value) to see what there is to see - the worst issues.

A small fraction of people will enable it with a low threshold value for
log_min_duration_statement - maybe only temporarily or per-session. They'll be
willing to sift through the logs to look for interesting things, like queries
that take 10ms, 75% of which is in JIT, but run hundreds of times per second.

This feature intends to make it easier to identify queries affected by this
problem, but it doesn't need to be possible to do that without logging anything
else.

--
Justin

#12

Andres Freund

andres@anarazel.de

almost 4 years ago

In reply to: Magnus Hagander (#8)

Re: Add parameter jit_warn_above_fraction

Hi,

On 2022-03-07 13:10:32 +0100, Magnus Hagander wrote:

Meanwhile here is an updated based on your other comments above, as
well as those from Julien.

This fails on cfbot, due to compiler warnings:
https://cirrus-ci.com/task/5127667648299008?logs=mingw_cross_warning#L390

Greetings,

Andres Freund

#13

Magnus Hagander

magnus@hagander.net

almost 4 years ago

In reply to: Andres Freund (#12)

1 attachment(s)

Re: Add parameter jit_warn_above_fraction

On Tue, Mar 22, 2022 at 12:50 AM Andres Freund <andres@anarazel.de> wrote:

Hi,

On 2022-03-07 13:10:32 +0100, Magnus Hagander wrote:

Meanwhile here is an updated based on your other comments above, as
well as those from Julien.

This fails on cfbot, due to compiler warnings:
https://cirrus-ci.com/task/5127667648299008?logs=mingw_cross_warning#L390

Huh. That's annoying. I forgot int64 is %d on linux and %lld on Windows :/

PFA a fix for that.

--
Magnus Hagander
Me: https://www.hagander.net/ <http://www.hagander.net/>
Work: https://www.redpill-linpro.com/ <http://www.redpill-linpro.com/>

Attachments:

jit_warn_above_fraction_v3.patchtext/x-patch; charset=US-ASCII; name=jit_warn_above_fraction_v3.patchDownload

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 05df48131d..4831fc362c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -6618,6 +6618,21 @@ local0.*    /var/log/postgresql
        </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-jit-warn-above-fraction" xreflabel="jit_warn_above_fraction">
+      <term><varname>jit_warn_above_fraction</varname> (<type>floating point</type>)
+      <indexterm>
+       <primary><varname>jit_warn_above_fraction</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+       <listitem>
+        <para>
+         Causes a warning to be written to the log if the time spent on JIT (see <xref linkend="jit"/>)
+         goes above this fraction of the total query runtime.
+         A value of 0 (the default) disables the warning.
+        </para>
+       </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-log-startup-progress-interval" xreflabel="log_startup_progress_interval">
       <term><varname>log_startup_progress_interval</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/tcop/fastpath.c b/src/backend/tcop/fastpath.c
index d429aa4663..a1bff893a3 100644
--- a/src/backend/tcop/fastpath.c
+++ b/src/backend/tcop/fastpath.c
@@ -196,6 +196,7 @@ HandleFunctionRequest(StringInfo msgBuf)
 	struct fp_info *fip;
 	bool		callit;
 	bool		was_logged = false;
+	int			msecs;
 	char		msec_str[32];
 
 	/*
@@ -305,7 +306,7 @@ HandleFunctionRequest(StringInfo msgBuf)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index ba2fcfeb4a..cd7c076b7c 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -998,7 +998,9 @@ exec_simple_query(const char *query_string)
 	bool		save_log_statement_stats = log_statement_stats;
 	bool		was_logged = false;
 	bool		use_implicit_block;
+	int			msecs;
 	char		msec_str[32];
+	int64		jit_time = 0;
 
 	/*
 	 * Report query to various monitoring facilities.
@@ -1257,6 +1259,16 @@ exec_simple_query(const char *query_string)
 
 		receiver->rDestroy(receiver);
 
+		/* Collect JIT timings in case it's active */
+		if (jit_enabled && jit_warn_above_fraction > 0 && portal->queryDesc && portal->queryDesc->estate->es_jit)
+		{
+			jit_time +=
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+		}
+
 		PortalDrop(portal, false);
 
 		if (lnext(parsetree_list, parsetree_item) == NULL)
@@ -1327,7 +1339,7 @@ exec_simple_query(const char *query_string)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
@@ -1343,6 +1355,16 @@ exec_simple_query(const char *query_string)
 			break;
 	}
 
+	if (jit_enabled && jit_warn_above_fraction > 0)
+	{
+		if (jit_time > msecs * jit_warn_above_fraction)
+		{
+			ereport(WARNING,
+					(errmsg("JIT total processing time was %lld ms of %d ms",
+							(long long) jit_time, msecs)));
+		}
+	}
+
 	if (save_log_statement_stats)
 		ShowUsage("QUERY STATISTICS");
 
@@ -1370,6 +1392,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	CachedPlanSource *psrc;
 	bool		is_named;
 	bool		save_log_statement_stats = log_statement_stats;
+	int			msecs;
 	char		msec_str[32];
 
 	/*
@@ -1563,7 +1586,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, false))
+	switch (check_log_duration(&msecs, msec_str, false))
 	{
 		case 1:
 			ereport(LOG,
@@ -1610,6 +1633,7 @@ exec_bind_message(StringInfo input_message)
 	MemoryContext oldContext;
 	bool		save_log_statement_stats = log_statement_stats;
 	bool		snapshot_set = false;
+	int			msecs;
 	char		msec_str[32];
 	ParamsErrorCbData params_data;
 	ErrorContextCallback params_errcxt;
@@ -2022,7 +2046,7 @@ exec_bind_message(StringInfo input_message)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, false))
+	switch (check_log_duration(&msecs, msec_str, false))
 	{
 		case 1:
 			ereport(LOG,
@@ -2068,6 +2092,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 	bool		is_xact_command;
 	bool		execute_is_fetch;
 	bool		was_logged = false;
+	int			msecs;
 	char		msec_str[32];
 	ParamsErrorCbData params_data;
 	ErrorContextCallback params_errcxt;
@@ -2259,7 +2284,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
@@ -2282,6 +2307,22 @@ exec_execute_message(const char *portal_name, long max_rows)
 			break;
 	}
 
+	if (jit_enabled && jit_warn_above_fraction > 0)
+	{
+		int64 jit_time =
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+
+		if (jit_time > msecs * jit_warn_above_fraction)
+		{
+			ereport(WARNING,
+					(errmsg("JIT total processing time was %lld ms of %d ms",
+							(long long) jit_time, msecs)));
+		}
+	}
+
 	if (save_log_statement_stats)
 		ShowUsage("EXECUTE MESSAGE STATISTICS");
 
@@ -2328,6 +2369,10 @@ check_log_statement(List *stmt_list)
  *		1 if just the duration should be logged
  *		2 if duration and query details should be logged
  *
+ * The millisecond runtime of the query is stored in msecs if any logging parameter
+ * that requires the calculation of time is enabled, even if the result is not that
+ * the entry should be logged. In other cases, it is set to -1.
+ *
  * If logging is needed, the duration in msec is formatted into msec_str[],
  * which must be a 32-byte buffer.
  *
@@ -2335,14 +2380,13 @@ check_log_statement(List *stmt_list)
  * essentially prevents 2 from being returned).
  */
 int
-check_log_duration(char *msec_str, bool was_logged)
+check_log_duration(int *msecs, char *msec_str, bool was_logged)
 {
 	if (log_duration || log_min_duration_sample >= 0 ||
-		log_min_duration_statement >= 0 || xact_is_sampled)
+	log_min_duration_statement >= 0 || xact_is_sampled || jit_warn_above_fraction > 0)
 	{
 		long		secs;
 		int			usecs;
-		int			msecs;
 		bool		exceeded_duration;
 		bool		exceeded_sample_duration;
 		bool		in_sample = false;
@@ -2350,7 +2394,7 @@ check_log_duration(char *msec_str, bool was_logged)
 		TimestampDifference(GetCurrentStatementStartTimestamp(),
 							GetCurrentTimestamp(),
 							&secs, &usecs);
-		msecs = usecs / 1000;
+		*msecs = usecs / 1000;
 
 		/*
 		 * This odd-looking test for log_min_duration_* being exceeded is
@@ -2360,12 +2404,12 @@ check_log_duration(char *msec_str, bool was_logged)
 		exceeded_duration = (log_min_duration_statement == 0 ||
 							 (log_min_duration_statement > 0 &&
 							  (secs > log_min_duration_statement / 1000 ||
-							   secs * 1000 + msecs >= log_min_duration_statement)));
+							   secs * 1000 + *msecs >= log_min_duration_statement)));
 
 		exceeded_sample_duration = (log_min_duration_sample == 0 ||
 									(log_min_duration_sample > 0 &&
 									 (secs > log_min_duration_sample / 1000 ||
-									  secs * 1000 + msecs >= log_min_duration_sample)));
+									  secs * 1000 + *msecs >= log_min_duration_sample)));
 
 		/*
 		 * Do not log if log_statement_sample_rate = 0. Log a sample if
@@ -2380,13 +2424,17 @@ check_log_duration(char *msec_str, bool was_logged)
 		if (exceeded_duration || in_sample || log_duration || xact_is_sampled)
 		{
 			snprintf(msec_str, 32, "%ld.%03d",
-					 secs * 1000 + msecs, usecs % 1000);
+					 secs * 1000 + *msecs, usecs % 1000);
 			if ((exceeded_duration || in_sample || xact_is_sampled) && !was_logged)
 				return 2;
 			else
 				return 1;
 		}
 	}
+	else
+	{
+		*msecs = -1;
+	}
 
 	return 0;
 }
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b86137dc38..5cabc292ee 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -612,6 +612,7 @@ int			log_parameter_max_length_on_error = 0;
 int			log_temp_files = -1;
 double		log_statement_sample_rate = 1.0;
 double		log_xact_sample_rate = 0;
+double		jit_warn_above_fraction = 0;
 int			trace_recovery_messages = LOG;
 char	   *backtrace_functions;
 char	   *backtrace_symbol_list;
@@ -3889,6 +3890,19 @@ static struct config_real ConfigureNamesReal[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"jit_warn_above_fraction", PGC_USERSET, LOGGING_WHEN,
+			gettext_noop("Sets the fraction of query time spent on JIT before writing "
+						 "a warning to the log."),
+			gettext_noop("Write a message to the server log if more than this "
+						 "fraction of the query runtime is spent on JIT. "
+						 "Zero turns off the warning.")
+		},
+		&jit_warn_above_fraction,
+		0.0, 0.0, 1.0,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index b933fade8c..d04ace34c0 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -404,6 +404,8 @@
 #jit_optimize_above_cost = 500000	# use expensive JIT optimizations if
 					# query is more expensive than this;
 					# -1 disables
+#jit_warn_above_fraction = 0		# warn if JIT is more than this fraction of
+					# query runtime. 0 disables.
 
 # - Genetic Query Optimizer -
 
diff --git a/src/include/jit/jit.h b/src/include/jit/jit.h
index 707176d9ed..59b37d6dc0 100644
--- a/src/include/jit/jit.h
+++ b/src/include/jit/jit.h
@@ -89,6 +89,7 @@ extern bool jit_tuple_deforming;
 extern double jit_above_cost;
 extern double jit_inline_above_cost;
 extern double jit_optimize_above_cost;
+extern double jit_warn_above_fraction;
 
 
 extern void jit_reset_after_error(void);
diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h
index 92291a750d..3f12f7fe25 100644
--- a/src/include/tcop/tcopprot.h
+++ b/src/include/tcop/tcopprot.h
@@ -87,7 +87,7 @@ extern void PostgresMain(const char *dbname,
 extern long get_stack_depth_rlimit(void);
 extern void ResetUsage(void);
 extern void ShowUsage(const char *title);
-extern int	check_log_duration(char *msec_str, bool was_logged);
+extern int	check_log_duration(int *msec, char *msec_str, bool was_logged);
 extern void set_debug_options(int debug_flag,
 							  GucContext context, GucSource source);
 extern bool set_plan_disabling_options(const char *arg,

#14

Julien Rouhaud

rjuju123@gmail.com

almost 4 years ago

In reply to: Magnus Hagander (#13)

Re: Add parameter jit_warn_above_fraction

Hi,

On Mon, Mar 28, 2022 at 10:11:16PM +0200, Magnus Hagander wrote:

On Tue, Mar 22, 2022 at 12:50 AM Andres Freund <andres@anarazel.de> wrote:

This fails on cfbot, due to compiler warnings:
https://cirrus-ci.com/task/5127667648299008?logs=mingw_cross_warning#L390

Huh. That's annoying. I forgot int64 is %d on linux and %lld on Windows :/

PFA a fix for that.

The v3 is now stable on all platforms according to the cfbot, and apart from
the log level the patch looks good to me.

The last remaining thing is whether logging at WARNING level is the correct
choice. I'm personally fine with it, because the people who are going to use
it will probably use the same approach as for log_min_duration_statements:
enable it first with a high value, and check if that just lead to a massive log
spam. If not, see if there's anything that needs attention and fix it,
otherwise keep lowering it and keep going.

Also, jitting isn't particularly fast in general, so it's probably somewhat
unlikely to have a massive amount of queries that both have a cost higher than
the jit threshold cost and still run fast enough to spam the log without jit
slowing everything down.

I'm not sure what status the patch should be. I think it's the committer's
decision to chose which log level to use, and that won't affect the patch
enough to justify another review, so I'm switching the patch to ready for
committer. This doesn't prevent any further discussions for the log level.

#15

Robert Haas

robertmhaas@gmail.com

almost 4 years ago

In reply to: Julien Rouhaud (#14)

Re: Add parameter jit_warn_above_fraction

On Tue, Mar 29, 2022 at 6:09 AM Julien Rouhaud <rjuju123@gmail.com> wrote:

The last remaining thing is whether logging at WARNING level is the correct
choice. I'm personally fine with it, because the people who are going to use
it will probably use the same approach as for log_min_duration_statements:
enable it first with a high value, and check if that just lead to a massive log
spam. If not, see if there's anything that needs attention and fix it,
otherwise keep lowering it and keep going.

I think WARNING is fine. After all, the parameter is called
"jit_warn_above_fraction". Yeah, we could also rename the parameter,
but I think "jit_notice_above_fraction" would be harder to understand.
It feels very intuitive to just say "warn me if X thing happens" and I
don't see a reason why we shouldn't just do that.

--
Robert Haas
EDB: http://www.enterprisedb.com

#16

David Rowley

dgrowleyml@gmail.com

almost 4 years ago

In reply to: Robert Haas (#15)

Re: Add parameter jit_warn_above_fraction

On Wed, 30 Mar 2022 at 02:38, Robert Haas <robertmhaas@gmail.com> wrote:

I think WARNING is fine. After all, the parameter is called
"jit_warn_above_fraction".

I had a think about this patch. I guess it's a little similar to
checkpoint_warning. The good thing about the checkpoint_warning is
that in the LOG message we give instructions about how the DBA can fix
the issue, i.e increase max_wal_size.

With the proposed patch I see there is no hint about what might be
done to remove/reduce the warnings. I imagine that's because it's not
all that clear which GUC should be changed. In my view, likely
jit_above_cost is the most relevant but there is also
jit_inline_above_cost, jit_optimize_above_cost, jit_tuple_deforming
and jit_expressions which are relevant too.

If we go with this patch, the problem I see here is that the amount
of work the JIT compiler must do for a given query depends mostly on
the number of expressions that must be compiled in the query (also to
a lesser extent jit_inline_above_cost, jit_optimize_above_cost,
jit_tuple_deforming and jit_expressions). The DBA does not really have
much control over the number of expressions in the query. All he or
she can do to get rid of the warning is something like increase
jit_above_cost. After a few iterations of that, the end result is
that jit_above_cost is now high enough that JIT no longer triggers
for, say, that query to that table with 1000 partitions where no
plan-time pruning takes place. Is that really a good thing? It likely
means that we just rarely JIT anything at all!

I really believe that the main problem here is that JIT only enables
when the *total* plan cost reaches a certain threshold. The number of
expressions to be compiled is not a factor in the decision at all.
That means that even if the total execution time of a plan was a true
reflection of the total estimated plan cost, then the fraction of time
spent (as is measured by jit_warn_above_fraction) doing JIT would
entirely depend on the number of expressions to compile. Of course,
the planner's not that good, but does that not indicate that the JIT
costing should really account for the number of expressions and not
just the total plan cost?

Anyway, what I'm trying to indicate here is that JIT is pretty much
impossible to tune properly and I don't really see why adding a
warning about it not being tuned correctly would help anyone. I think
it would be better to focus on making improvements to how the JIT
costing works.

I did propose a patch to address this in [1]/messages/by-id/CAApHDvpQJqLrNOSi8P1JLM8YE2C+ksKFpSdZg=q6sTbtQ-v=aw@mail.gmail.com. It does need more work
and I do plan to come back to it for v16.

I'd much rather see us address the costing problem before adding some
warning, especially a warning where it's not clear how to make go
away.

David

[1]: /messages/by-id/CAApHDvpQJqLrNOSi8P1JLM8YE2C+ksKFpSdZg=q6sTbtQ-v=aw@mail.gmail.com

#17

Peter Geoghegan

pg@bowt.ie

almost 4 years ago

In reply to: David Rowley (#16)

Re: Add parameter jit_warn_above_fraction

On Tue, Mar 29, 2022 at 1:06 PM David Rowley <dgrowleyml@gmail.com> wrote:

That means that even if the total execution time of a plan was a true
reflection of the total estimated plan cost, then the fraction of time
spent (as is measured by jit_warn_above_fraction) doing JIT would
entirely depend on the number of expressions to compile. Of course,
the planner's not that good, but does that not indicate that the JIT
costing should really account for the number of expressions and not
just the total plan cost?

That's a good point. The difference between the actual cost of
executing the query with and without JIT'ing is what we care about,
for the most part. Maybe we could do a lot better just by inventing a
fairly crude model that captures the benefits of JIT'ing -- that's
what "counting the number of expressions" sounds like to me. This
model could probably assume that JIT'ing itself was free -- maybe
something this simple would work well.

The planner has traditionally used the cost units to determine the
cheapest plan; it compared total plan cost for plans that were taken
from the universe of possible plans for *one specific query*. That's
completely different to any model that expects plan costs to be
meaningful in an absolute sense. I'm not completely sure how much that
difference matters, but I suspect that the answer is: "it depends, but
often it matters a great deal".

--
Peter Geoghegan

#18

Tom Lane

tgl@sss.pgh.pa.us

almost 4 years ago

In reply to: Peter Geoghegan (#17)

Re: Add parameter jit_warn_above_fraction

Peter Geoghegan <pg@bowt.ie> writes:

On Tue, Mar 29, 2022 at 1:06 PM David Rowley <dgrowleyml@gmail.com> wrote:

That means that even if the total execution time of a plan was a true
reflection of the total estimated plan cost, then the fraction of time
spent (as is measured by jit_warn_above_fraction) doing JIT would
entirely depend on the number of expressions to compile. Of course,
the planner's not that good, but does that not indicate that the JIT
costing should really account for the number of expressions and not
just the total plan cost?

That's a good point.

I think David's questions are sufficiently cogent and difficult
that we should not add jit_warn_above_fraction at this time.
Maybe we'll eventually decide it's the best we can do; but I do
not think we have a problem there that's so pressing that we need
to rush out a partially-baked bandaid. Especially not at the
tail end of the development cycle, with not much time to gain
experience with it before we ship.

regards, tom lane

#19

Peter Geoghegan

pg@bowt.ie

almost 4 years ago

In reply to: Tom Lane (#18)

Re: Add parameter jit_warn_above_fraction

On Tue, Mar 29, 2022 at 3:04 PM Tom Lane <tgl@sss.pgh.pa.us> wrote:

I think David's questions are sufficiently cogent and difficult
that we should not add jit_warn_above_fraction at this time.

--
Peter Geoghegan

#20

Andres Freund

andres@anarazel.de

almost 4 years ago

In reply to: David Rowley (#16)

Re: Add parameter jit_warn_above_fraction

Hi,

On 2022-03-30 09:05:44 +1300, David Rowley wrote:

I really believe that the main problem here is that JIT only enables
when the *total* plan cost reaches a certain threshold.

Yes, that is/was a clear design mistake. It wasn't quite as bad back when it
was written - partitioning blows the problem up by an order of magnitude or
three. . The costing really needs to at least multiply the number of
to-be-compiled expressions with some cost to decide whether the cost of JITing
is worth it, rather than making "flat" decision.

I did propose a patch to address this in [1]. It does need more work
and I do plan to come back to it for v16.

FWIW, that doesn't seem quite right - won't it stop JITing e.g. on the inner
side of a nested loop, just because it's cheap, even though that's where the
bulk of the benefits comes from?

Greetings,

Andres Freund

#21

David Rowley

dgrowleyml@gmail.com

almost 4 years ago

In reply to: Andres Freund (#20)

Re: Add parameter jit_warn_above_fraction

On Wed, 30 Mar 2022 at 12:16, Andres Freund <andres@anarazel.de> wrote:

I did propose a patch to address this in [1]. It does need more work
and I do plan to come back to it for v16.

FWIW, that doesn't seem quite right - won't it stop JITing e.g. on the inner
side of a nested loop, just because it's cheap, even though that's where the
bulk of the benefits comes from?

Yeah, I think the total cost would need to be multiplied by the number
of times we expect to call that part of the plan. I've not yet sat
down to figure out if that's easy/cheap or hard/costly information to
obtain.

David

#22

Tom Lane

tgl@sss.pgh.pa.us

almost 4 years ago

In reply to: David Rowley (#21)

Re: Add parameter jit_warn_above_fraction

David Rowley <dgrowleyml@gmail.com> writes:

On Wed, 30 Mar 2022 at 12:16, Andres Freund <andres@anarazel.de> wrote:

FWIW, that doesn't seem quite right - won't it stop JITing e.g. on the inner
side of a nested loop, just because it's cheap, even though that's where the
bulk of the benefits comes from?

Yeah, I think the total cost would need to be multiplied by the number
of times we expect to call that part of the plan. I've not yet sat
down to figure out if that's easy/cheap or hard/costly information to
obtain.

As long as you don't need the info till the end of planning, it should be
reasonably simple to determine. I'm not sure if we actually record the
expected number of loops in the plan tree today, but the costing
mechanisms certainly estimate that along the way, so we could store it
if need be.

regards, tom lane

#23

Andres Freund

andres@anarazel.de

almost 4 years ago

In reply to: David Rowley (#21)

Re: Add parameter jit_warn_above_fraction

Hi,

On 2022-03-30 12:41:41 +1300, David Rowley wrote:

On Wed, 30 Mar 2022 at 12:16, Andres Freund <andres@anarazel.de> wrote:

I did propose a patch to address this in [1]. It does need more work
and I do plan to come back to it for v16.

FWIW, that doesn't seem quite right - won't it stop JITing e.g. on the inner
side of a nested loop, just because it's cheap, even though that's where the
bulk of the benefits comes from?

Yeah, I think the total cost would need to be multiplied by the number
of times we expect to call that part of the plan. I've not yet sat
down to figure out if that's easy/cheap or hard/costly information to
obtain.

I wonder whether it'd make sense to combine that with awareness of a few plan
types that can lead to large portions of child nodes never being executed. One
the case where the current behaviour is the worst is runtime partition pruning
in append - we compile expressions for whole subtrees that will never be
executed. We should be much more hesitant to compile there compared to a
cheap-ish node that we know will be executed as part of a large expensive part
of the plan tree.

- Andres

#24

David Rowley

dgrowleyml@gmail.com

almost 4 years ago

In reply to: Andres Freund (#23)

Re: Add parameter jit_warn_above_fraction

On Wed, 30 Mar 2022 at 13:20, Andres Freund <andres@anarazel.de> wrote:

I wonder whether it'd make sense to combine that with awareness of a few plan
types that can lead to large portions of child nodes never being executed. One
the case where the current behaviour is the worst is runtime partition pruning
in append - we compile expressions for whole subtrees that will never be
executed. We should be much more hesitant to compile there compared to a
cheap-ish node that we know will be executed as part of a large expensive part
of the plan tree.

I think that's also a problem but I think that might be better fixed
another way.

There is a patch [1]https://commitfest.postgresql.org/37/3071/ around that seems to change things to compile JIT
on-demand. I've not looked at the patch but imagine the overhead
might be kept minimal by initially setting the evalfunc to compile and
run, then set it to just run the compiled Expr for subsequent
executions. Maybe nodes below an Append/MergeAppend with run-time
pruning could compile on-demand and other nodes up-front. Or maybe
there's no problem with making everything on-demand.

David

[1]: https://commitfest.postgresql.org/37/3071/

#25

Andres Freund

andres@anarazel.de

almost 4 years ago

In reply to: David Rowley (#24)

Re: Add parameter jit_warn_above_fraction

Hi,

On 2022-03-30 14:30:32 +1300, David Rowley wrote:

On Wed, 30 Mar 2022 at 13:20, Andres Freund <andres@anarazel.de> wrote:

I wonder whether it'd make sense to combine that with awareness of a few plan
types that can lead to large portions of child nodes never being executed. One
the case where the current behaviour is the worst is runtime partition pruning
in append - we compile expressions for whole subtrees that will never be
executed. We should be much more hesitant to compile there compared to a
cheap-ish node that we know will be executed as part of a large expensive part
of the plan tree.

I think that's also a problem but I think that might be better fixed
another way.

There is a patch [1] around that seems to change things to compile JIT
on-demand.

That's a bad idea idea to do on a per-function basis. Emitting functions one
by one is considerably slower than doing so in larger chunks. Of course that
can be addressed to some degree by something like what you suggest below.

I've not looked at the patch but imagine the overhead might be kept minimal
by initially setting the evalfunc to compile and run, then set it to just
run the compiled Expr for subsequent executions.

That part I'm not worried about, there's such an indirection on the first call
either way IIRC.

Maybe nodes below an Append/MergeAppend with run-time pruning could compile
on-demand and other nodes up-front. Or maybe there's no problem with making
everything on-demand.

Yea, that could work. The expressions for one "partition query" would still
have to be emitted at once. For each such subtree we should make a separate
costing decision. But I think an additional "will be executed" sub-node is a
different story, the threshold shouldn't be done on a per-node basis. That
partitioning of the plan tree is kind of what I was trying to get at...

Greetings,

Andres Freund

#26

David Rowley

dgrowleyml@gmail.com

almost 4 years ago

In reply to: Andres Freund (#25)

Re: Add parameter jit_warn_above_fraction

On Wed, 30 Mar 2022 at 14:48, Andres Freund <andres@anarazel.de> wrote:

On 2022-03-30 14:30:32 +1300, David Rowley wrote:

Maybe nodes below an Append/MergeAppend with run-time pruning could compile
on-demand and other nodes up-front. Or maybe there's no problem with making
everything on-demand.

Yea, that could work. The expressions for one "partition query" would still
have to be emitted at once. For each such subtree we should make a separate
costing decision. But I think an additional "will be executed" sub-node is a
different story, the threshold shouldn't be done on a per-node basis. That
partitioning of the plan tree is kind of what I was trying to get at...

Maybe this point is moot if we get something like [1]https://commitfest.postgresql.org/37/3478/. It might mean
that run-time pruning would happen early enough that we could just JIT
compile non-pruned subnodes.

David

[1]: https://commitfest.postgresql.org/37/3478/

#27

Magnus Hagander

magnus@hagander.net

almost 4 years ago

In reply to: David Rowley (#16)

Re: Add parameter jit_warn_above_fraction

On Tue, Mar 29, 2022 at 10:06 PM David Rowley <dgrowleyml@gmail.com> wrote:

On Wed, 30 Mar 2022 at 02:38, Robert Haas <robertmhaas@gmail.com> wrote:

I think WARNING is fine. After all, the parameter is called
"jit_warn_above_fraction".

I had a think about this patch. I guess it's a little similar to
checkpoint_warning. The good thing about the checkpoint_warning is
that in the LOG message we give instructions about how the DBA can fix
the issue, i.e increase max_wal_size.

With the proposed patch I see there is no hint about what might be
done to remove/reduce the warnings. I imagine that's because it's not
all that clear which GUC should be changed. In my view, likely
jit_above_cost is the most relevant but there is also
jit_inline_above_cost, jit_optimize_above_cost, jit_tuple_deforming
and jit_expressions which are relevant too.

If we go with this patch, the problem I see here is that the amount
of work the JIT compiler must do for a given query depends mostly on
the number of expressions that must be compiled in the query (also to
a lesser extent jit_inline_above_cost, jit_optimize_above_cost,
jit_tuple_deforming and jit_expressions). The DBA does not really have
much control over the number of expressions in the query. All he or
she can do to get rid of the warning is something like increase
jit_above_cost. After a few iterations of that, the end result is
that jit_above_cost is now high enough that JIT no longer triggers
for, say, that query to that table with 1000 partitions where no
plan-time pruning takes place. Is that really a good thing? It likely
means that we just rarely JIT anything at all!

I don't agree with the conclusion of that.

What the parameter would be useful for is to be able to tune those costs
(or just turn it off) *for that individual query*. That doesn't mean you
"rarely JIT anything atll", it just means you rarely JIT that particular
query.

In fact, my goal is to specifically make people do that and *not* just turn
off JIT globally.

I'd much rather see us address the costing problem before adding some

warning, especially a warning where it's not clear how to make go
away.

The easiest way would be to add a HINT that says turn off jit for this
particular query or something?

I do agree that if we can make "spending too much time on JIT vs query
runtime" go away completely, then there is no need for a parameter like
this.

I still think the warning is useful. And I think it may stay useful even
after we have made the JIT costing smarter -- though that's not certain of
course.

--
Magnus Hagander
Me: https://www.hagander.net/ <http://www.hagander.net/>
Work: https://www.redpill-linpro.com/ <http://www.redpill-linpro.com/>

#28

Magnus Hagander

magnus@hagander.net

almost 4 years ago

In reply to: Magnus Hagander (#27)

1 attachment(s)

Re: Add parameter jit_warn_above_fraction

On Wed, Mar 30, 2022 at 3:04 PM Magnus Hagander <magnus@hagander.net> wrote:

On Tue, Mar 29, 2022 at 10:06 PM David Rowley <dgrowleyml@gmail.com>
wrote:

On Wed, 30 Mar 2022 at 02:38, Robert Haas <robertmhaas@gmail.com> wrote:

I think WARNING is fine. After all, the parameter is called
"jit_warn_above_fraction".

I had a think about this patch. I guess it's a little similar to
checkpoint_warning. The good thing about the checkpoint_warning is
that in the LOG message we give instructions about how the DBA can fix
the issue, i.e increase max_wal_size.

With the proposed patch I see there is no hint about what might be
done to remove/reduce the warnings. I imagine that's because it's not
all that clear which GUC should be changed. In my view, likely
jit_above_cost is the most relevant but there is also
jit_inline_above_cost, jit_optimize_above_cost, jit_tuple_deforming
and jit_expressions which are relevant too.

If we go with this patch, the problem I see here is that the amount
of work the JIT compiler must do for a given query depends mostly on
the number of expressions that must be compiled in the query (also to
a lesser extent jit_inline_above_cost, jit_optimize_above_cost,
jit_tuple_deforming and jit_expressions). The DBA does not really have
much control over the number of expressions in the query. All he or
she can do to get rid of the warning is something like increase
jit_above_cost. After a few iterations of that, the end result is
that jit_above_cost is now high enough that JIT no longer triggers
for, say, that query to that table with 1000 partitions where no
plan-time pruning takes place. Is that really a good thing? It likely
means that we just rarely JIT anything at all!

I don't agree with the conclusion of that.

What the parameter would be useful for is to be able to tune those costs
(or just turn it off) *for that individual query*. That doesn't mean you
"rarely JIT anything atll", it just means you rarely JIT that particular
query.

In fact, my goal is to specifically make people do that and *not* just
turn off JIT globally.

I'd much rather see us address the costing problem before adding some

warning, especially a warning where it's not clear how to make go
away.

The easiest way would be to add a HINT that says turn off jit for this
particular query or something?

I do agree that if we can make "spending too much time on JIT vs query
runtime" go away completely, then there is no need for a parameter like
this.

I still think the warning is useful. And I think it may stay useful even
after we have made the JIT costing smarter -- though that's not certain of
course.

This patch is still sitting at "ready for committer".

As an example, I have added such a hint in the attached.

I still stand by that this patch is better than nothing. Sure, I would
love for us to adapt the JIT costing model and algorithm to make this not a
problem. And once we've done that, we should remove the parameter again.

It's not on by default, and it's trivial to remove in the future.

Yes, we're right up at the deadline. I'd still like to get it in, so I'd
really appreciate some further voices :)

--
Magnus Hagander
Me: https://www.hagander.net/ <http://www.hagander.net/>
Work: https://www.redpill-linpro.com/ <http://www.redpill-linpro.com/>

Attachments:

jit_warn_above_fraction_v4.patchtext/x-patch; charset=US-ASCII; name=jit_warn_above_fraction_v4.patchDownload

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 6e3e27bed7..fa35fe04e8 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -6718,6 +6718,21 @@ local0.*    /var/log/postgresql
        </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-jit-warn-above-fraction" xreflabel="jit_warn_above_fraction">
+      <term><varname>jit_warn_above_fraction</varname> (<type>floating point</type>)
+      <indexterm>
+       <primary><varname>jit_warn_above_fraction</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+       <listitem>
+        <para>
+         Causes a warning to be written to the log if the time spent on JIT (see <xref linkend="jit"/>)
+         goes above this fraction of the total query runtime.
+         A value of 0 (the default) disables the warning.
+        </para>
+       </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-log-startup-progress-interval" xreflabel="log_startup_progress_interval">
       <term><varname>log_startup_progress_interval</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/tcop/fastpath.c b/src/backend/tcop/fastpath.c
index d429aa4663..a1bff893a3 100644
--- a/src/backend/tcop/fastpath.c
+++ b/src/backend/tcop/fastpath.c
@@ -196,6 +196,7 @@ HandleFunctionRequest(StringInfo msgBuf)
 	struct fp_info *fip;
 	bool		callit;
 	bool		was_logged = false;
+	int			msecs;
 	char		msec_str[32];
 
 	/*
@@ -305,7 +306,7 @@ HandleFunctionRequest(StringInfo msgBuf)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 95dc2e2c83..d1dd2273ad 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -998,7 +998,9 @@ exec_simple_query(const char *query_string)
 	bool		save_log_statement_stats = log_statement_stats;
 	bool		was_logged = false;
 	bool		use_implicit_block;
+	int			msecs;
 	char		msec_str[32];
+	int64		jit_time = 0;
 
 	/*
 	 * Report query to various monitoring facilities.
@@ -1257,6 +1259,16 @@ exec_simple_query(const char *query_string)
 
 		receiver->rDestroy(receiver);
 
+		/* Collect JIT timings in case it's active */
+		if (jit_enabled && jit_warn_above_fraction > 0 && portal->queryDesc && portal->queryDesc->estate->es_jit)
+		{
+			jit_time +=
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+				INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+		}
+
 		PortalDrop(portal, false);
 
 		if (lnext(parsetree_list, parsetree_item) == NULL)
@@ -1327,7 +1339,7 @@ exec_simple_query(const char *query_string)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
@@ -1343,6 +1355,17 @@ exec_simple_query(const char *query_string)
 			break;
 	}
 
+	if (jit_enabled && jit_warn_above_fraction > 0)
+	{
+		if (jit_time > msecs * jit_warn_above_fraction)
+		{
+			ereport(WARNING,
+					(errmsg("JIT total processing time was %lld ms of %d ms",
+							(long long) jit_time, msecs)),
+					errhint("Consider turning of JIT or changing the threshold for this particular query"));
+		}
+	}
+
 	if (save_log_statement_stats)
 		ShowUsage("QUERY STATISTICS");
 
@@ -1370,6 +1393,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	CachedPlanSource *psrc;
 	bool		is_named;
 	bool		save_log_statement_stats = log_statement_stats;
+	int			msecs;
 	char		msec_str[32];
 
 	/*
@@ -1563,7 +1587,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, false))
+	switch (check_log_duration(&msecs, msec_str, false))
 	{
 		case 1:
 			ereport(LOG,
@@ -1610,6 +1634,7 @@ exec_bind_message(StringInfo input_message)
 	MemoryContext oldContext;
 	bool		save_log_statement_stats = log_statement_stats;
 	bool		snapshot_set = false;
+	int			msecs;
 	char		msec_str[32];
 	ParamsErrorCbData params_data;
 	ErrorContextCallback params_errcxt;
@@ -2022,7 +2047,7 @@ exec_bind_message(StringInfo input_message)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, false))
+	switch (check_log_duration(&msecs, msec_str, false))
 	{
 		case 1:
 			ereport(LOG,
@@ -2068,6 +2093,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 	bool		is_xact_command;
 	bool		execute_is_fetch;
 	bool		was_logged = false;
+	int			msecs;
 	char		msec_str[32];
 	ParamsErrorCbData params_data;
 	ErrorContextCallback params_errcxt;
@@ -2259,7 +2285,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 	/*
 	 * Emit duration logging if appropriate.
 	 */
-	switch (check_log_duration(msec_str, was_logged))
+	switch (check_log_duration(&msecs, msec_str, was_logged))
 	{
 		case 1:
 			ereport(LOG,
@@ -2282,6 +2308,24 @@ exec_execute_message(const char *portal_name, long max_rows)
 			break;
 	}
 
+	if (jit_enabled && jit_warn_above_fraction > 0)
+	{
+		int64 jit_time =
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.generation_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.inlining_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.optimization_counter) +
+			INSTR_TIME_GET_MILLISEC(portal->queryDesc->estate->es_jit->instr.emission_counter);
+
+		if (jit_time > msecs * jit_warn_above_fraction)
+		{
+			ereport(WARNING,
+					(errmsg("JIT total processing time was %lld ms of %d ms",
+							(long long) jit_time, msecs)),
+					errhint("Consider turning of JIT or changing the threshold for this particular query"));
+
+		}
+	}
+
 	if (save_log_statement_stats)
 		ShowUsage("EXECUTE MESSAGE STATISTICS");
 
@@ -2328,6 +2372,10 @@ check_log_statement(List *stmt_list)
  *		1 if just the duration should be logged
  *		2 if duration and query details should be logged
  *
+ * The millisecond runtime of the query is stored in msecs if any logging parameter
+ * that requires the calculation of time is enabled, even if the result is not that
+ * the entry should be logged. In other cases, it is set to -1.
+ *
  * If logging is needed, the duration in msec is formatted into msec_str[],
  * which must be a 32-byte buffer.
  *
@@ -2335,14 +2383,13 @@ check_log_statement(List *stmt_list)
  * essentially prevents 2 from being returned).
  */
 int
-check_log_duration(char *msec_str, bool was_logged)
+check_log_duration(int *msecs, char *msec_str, bool was_logged)
 {
 	if (log_duration || log_min_duration_sample >= 0 ||
-		log_min_duration_statement >= 0 || xact_is_sampled)
+	log_min_duration_statement >= 0 || xact_is_sampled || jit_warn_above_fraction > 0)
 	{
 		long		secs;
 		int			usecs;
-		int			msecs;
 		bool		exceeded_duration;
 		bool		exceeded_sample_duration;
 		bool		in_sample = false;
@@ -2350,7 +2397,7 @@ check_log_duration(char *msec_str, bool was_logged)
 		TimestampDifference(GetCurrentStatementStartTimestamp(),
 							GetCurrentTimestamp(),
 							&secs, &usecs);
-		msecs = usecs / 1000;
+		*msecs = usecs / 1000;
 
 		/*
 		 * This odd-looking test for log_min_duration_* being exceeded is
@@ -2360,12 +2407,12 @@ check_log_duration(char *msec_str, bool was_logged)
 		exceeded_duration = (log_min_duration_statement == 0 ||
 							 (log_min_duration_statement > 0 &&
 							  (secs > log_min_duration_statement / 1000 ||
-							   secs * 1000 + msecs >= log_min_duration_statement)));
+							   secs * 1000 + *msecs >= log_min_duration_statement)));
 
 		exceeded_sample_duration = (log_min_duration_sample == 0 ||
 									(log_min_duration_sample > 0 &&
 									 (secs > log_min_duration_sample / 1000 ||
-									  secs * 1000 + msecs >= log_min_duration_sample)));
+									  secs * 1000 + *msecs >= log_min_duration_sample)));
 
 		/*
 		 * Do not log if log_statement_sample_rate = 0. Log a sample if
@@ -2380,13 +2427,17 @@ check_log_duration(char *msec_str, bool was_logged)
 		if (exceeded_duration || in_sample || log_duration || xact_is_sampled)
 		{
 			snprintf(msec_str, 32, "%ld.%03d",
-					 secs * 1000 + msecs, usecs % 1000);
+					 secs * 1000 + *msecs, usecs % 1000);
 			if ((exceeded_duration || in_sample || xact_is_sampled) && !was_logged)
 				return 2;
 			else
 				return 1;
 		}
 	}
+	else
+	{
+		*msecs = -1;
+	}
 
 	return 0;
 }
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 22b5571a70..7e9859a9e1 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -642,6 +642,7 @@ int			log_parameter_max_length_on_error = 0;
 int			log_temp_files = -1;
 double		log_statement_sample_rate = 1.0;
 double		log_xact_sample_rate = 0;
+double		jit_warn_above_fraction = 0;
 int			trace_recovery_messages = LOG;
 char	   *backtrace_functions;
 char	   *backtrace_symbol_list;
@@ -3943,6 +3944,19 @@ static struct config_real ConfigureNamesReal[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"jit_warn_above_fraction", PGC_USERSET, LOGGING_WHEN,
+			gettext_noop("Sets the fraction of query time spent on JIT before writing "
+						 "a warning to the log."),
+			gettext_noop("Write a message to the server log if more than this "
+						 "fraction of the query runtime is spent on JIT. "
+						 "Zero turns off the warning.")
+		},
+		&jit_warn_above_fraction,
+		0.0, 0.0, 1.0,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 94270eb0ec..82d0f5135b 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -411,6 +411,8 @@
 #jit_optimize_above_cost = 500000	# use expensive JIT optimizations if
 					# query is more expensive than this;
 					# -1 disables
+#jit_warn_above_fraction = 0		# warn if JIT is more than this fraction of
+					# query runtime. 0 disables.
 
 # - Genetic Query Optimizer -
 
diff --git a/src/include/jit/jit.h b/src/include/jit/jit.h
index 707176d9ed..59b37d6dc0 100644
--- a/src/include/jit/jit.h
+++ b/src/include/jit/jit.h
@@ -89,6 +89,7 @@ extern bool jit_tuple_deforming;
 extern double jit_above_cost;
 extern double jit_inline_above_cost;
 extern double jit_optimize_above_cost;
+extern double jit_warn_above_fraction;
 
 
 extern void jit_reset_after_error(void);
diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h
index 92291a750d..3f12f7fe25 100644
--- a/src/include/tcop/tcopprot.h
+++ b/src/include/tcop/tcopprot.h
@@ -87,7 +87,7 @@ extern void PostgresMain(const char *dbname,
 extern long get_stack_depth_rlimit(void);
 extern void ResetUsage(void);
 extern void ShowUsage(const char *title);
-extern int	check_log_duration(char *msec_str, bool was_logged);
+extern int	check_log_duration(int *msec, char *msec_str, bool was_logged);
 extern void set_debug_options(int debug_flag,
 							  GucContext context, GucSource source);
 extern bool set_plan_disabling_options(const char *arg,

#29

Stephen Frost

sfrost@snowman.net

almost 4 years ago

In reply to: Magnus Hagander (#28)

Re: Add parameter jit_warn_above_fraction

Greetings,

On Fri, Apr 8, 2022 at 07:27 Magnus Hagander <magnus@hagander.net> wrote:

On Wed, Mar 30, 2022 at 3:04 PM Magnus Hagander <magnus@hagander.net>
wrote:

On Tue, Mar 29, 2022 at 10:06 PM David Rowley <dgrowleyml@gmail.com>
wrote:

On Wed, 30 Mar 2022 at 02:38, Robert Haas <robertmhaas@gmail.com> wrote:

I think WARNING is fine. After all, the parameter is called
"jit_warn_above_fraction".

I had a think about this patch. I guess it's a little similar to
checkpoint_warning. The good thing about the checkpoint_warning is
that in the LOG message we give instructions about how the DBA can fix
the issue, i.e increase max_wal_size.

With the proposed patch I see there is no hint about what might be
done to remove/reduce the warnings. I imagine that's because it's not
all that clear which GUC should be changed. In my view, likely
jit_above_cost is the most relevant but there is also
jit_inline_above_cost, jit_optimize_above_cost, jit_tuple_deforming
and jit_expressions which are relevant too.

If we go with this patch, the problem I see here is that the amount
of work the JIT compiler must do for a given query depends mostly on
the number of expressions that must be compiled in the query (also to
a lesser extent jit_inline_above_cost, jit_optimize_above_cost,
jit_tuple_deforming and jit_expressions). The DBA does not really have
much control over the number of expressions in the query. All he or
she can do to get rid of the warning is something like increase
jit_above_cost. After a few iterations of that, the end result is
that jit_above_cost is now high enough that JIT no longer triggers
for, say, that query to that table with 1000 partitions where no
plan-time pruning takes place. Is that really a good thing? It likely
means that we just rarely JIT anything at all!

I don't agree with the conclusion of that.

What the parameter would be useful for is to be able to tune those costs
(or just turn it off) *for that individual query*. That doesn't mean you
"rarely JIT anything atll", it just means you rarely JIT that particular
query.

In fact, my goal is to specifically make people do that and *not* just
turn off JIT globally.

I'd much rather see us address the costing problem before adding some

warning, especially a warning where it's not clear how to make go
away.

The easiest way would be to add a HINT that says turn off jit for this
particular query or something?

I do agree that if we can make "spending too much time on JIT vs query
runtime" go away completely, then there is no need for a parameter like
this.

I still think the warning is useful. And I think it may stay useful even
after we have made the JIT costing smarter -- though that's not certain of
course.

This patch is still sitting at "ready for committer".

As an example, I have added such a hint in the attached.

I still stand by that this patch is better than nothing. Sure, I would
love for us to adapt the JIT costing model and algorithm to make this not a
problem. And once we've done that, we should remove the parameter again.

It's not on by default, and it's trivial to remove in the future.

Yes, we're right up at the deadline. I'd still like to get it in, so I'd
really appreciate some further voices :)

Looks reasonable to me, so +1. The default is has it off and so I seriously
doubt it’ll cause any issues and it’ll be very handy on large and busy
systems with lots of queries finding those that have a serious amount of
time being spent in JIT (and hopefully avoid folks just turning jit off
across the board, since that’s worse- we need more data on jit and need to
work on improving it, not ending up with everyone turning it off).

Thanks!

Stephen

Show quoted text

#30

David Rowley

dgrowleyml@gmail.com

almost 4 years ago

In reply to: Magnus Hagander (#28)

Re: Add parameter jit_warn_above_fraction

On Fri, 8 Apr 2022 at 23:27, Magnus Hagander <magnus@hagander.net> wrote:

On Wed, Mar 30, 2022 at 3:04 PM Magnus Hagander <magnus@hagander.net> wrote:

On Tue, Mar 29, 2022 at 10:06 PM David Rowley <dgrowleyml@gmail.com> wrote:

If we go with this patch, the problem I see here is that the amount
of work the JIT compiler must do for a given query depends mostly on
the number of expressions that must be compiled in the query (also to
a lesser extent jit_inline_above_cost, jit_optimize_above_cost,
jit_tuple_deforming and jit_expressions). The DBA does not really have
much control over the number of expressions in the query. All he or
she can do to get rid of the warning is something like increase
jit_above_cost. After a few iterations of that, the end result is
that jit_above_cost is now high enough that JIT no longer triggers
for, say, that query to that table with 1000 partitions where no
plan-time pruning takes place. Is that really a good thing? It likely
means that we just rarely JIT anything at all!

I don't agree with the conclusion of that.

What the parameter would be useful for is to be able to tune those costs (or just turn it off) *for that individual query*. That doesn't mean you "rarely JIT anything atll", it just means you rarely JIT that particular query.

I just struggle to imagine that anyone is going to spend much effort
tuning a warning parameter per query. I imagine they're far more
likely to just ramp it up to only catch some high percentile problems
or just (more likely) just not bother with it. It seems more likely
that if anyone was to tune anything per query here it would be
jit_above_cost, since that actually might have an affect on the
performance of the query, rather than if it spits out some warning
message or not. ISTM that if the user knows what to set it to per
query, then there's very little point in having a warning as we'd be
alerting them to something they already know about.

I looked in the -general list to see if we could get some common
explanations to give us an idea of the most common reason for high JIT
compilation time. It seems that the plans were never simple. [1]/messages/by-id/CAPL5KHq8zfWPzueCemXw4c+U568PoDfqo3wBDNm3KAyvybdaMQ@mail.gmail.com seems
due to a complex plan. I'm basing that off the "Functions: 167". I
didn't catch the full plan. From what I can tell, [2]/messages/by-id/CAHOFxGo5xJt02RmwAWrtv2K0jcqqxG-cDiR8FQbvb0WxdKhcgw@mail.gmail.com seems to be due
to "lots of empty tables", so assuming the clamping at 1 page is
causing issues there. I think both of those cases could be resolved
by building the costing the way I mentioned. I admit that 2 cases is
not a very large sample size.

David

[1]: /messages/by-id/CAPL5KHq8zfWPzueCemXw4c+U568PoDfqo3wBDNm3KAyvybdaMQ@mail.gmail.com
[2]: /messages/by-id/CAHOFxGo5xJt02RmwAWrtv2K0jcqqxG-cDiR8FQbvb0WxdKhcgw@mail.gmail.com

#31

Magnus Hagander

magnus@hagander.net

almost 4 years ago

In reply to: David Rowley (#30)

Re: Add parameter jit_warn_above_fraction

On Fri, Apr 8, 2022 at 2:19 PM David Rowley <dgrowleyml@gmail.com> wrote:

On Fri, 8 Apr 2022 at 23:27, Magnus Hagander <magnus@hagander.net> wrote:

On Wed, Mar 30, 2022 at 3:04 PM Magnus Hagander <magnus@hagander.net>

wrote:

On Tue, Mar 29, 2022 at 10:06 PM David Rowley <dgrowleyml@gmail.com>

wrote:

If we go with this patch, the problem I see here is that the amount
of work the JIT compiler must do for a given query depends mostly on
the number of expressions that must be compiled in the query (also to
a lesser extent jit_inline_above_cost, jit_optimize_above_cost,
jit_tuple_deforming and jit_expressions). The DBA does not really have
much control over the number of expressions in the query. All he or
she can do to get rid of the warning is something like increase
jit_above_cost. After a few iterations of that, the end result is
that jit_above_cost is now high enough that JIT no longer triggers
for, say, that query to that table with 1000 partitions where no
plan-time pruning takes place. Is that really a good thing? It likely
means that we just rarely JIT anything at all!

I don't agree with the conclusion of that.

What the parameter would be useful for is to be able to tune those

costs (or just turn it off) *for that individual query*. That doesn't mean
you "rarely JIT anything atll", it just means you rarely JIT that
particular query.

I just struggle to imagine that anyone is going to spend much effort
tuning a warning parameter per query. I imagine they're far more
likely to just ramp it up to only catch some high percentile problems
or just (more likely) just not bother with it. It seems more likely
that if anyone was to tune anything per query here it would be
jit_above_cost, since that actually might have an affect on the
performance of the query, rather than if it spits out some warning
message or not. ISTM that if the user knows what to set it to per
query, then there's very little point in having a warning as we'd be
alerting them to something they already know about.

I would not expect people to tune the *warning* at a query level. If
anything, then ys, they would tune the either jit_above_cost or just
jit=off. But the idea being you can do that on a per query level instead of
globally.

I looked in the -general list to see if we could get some common

explanations to give us an idea of the most common reason for high JIT
compilation time. It seems that the plans were never simple. [1] seems
due to a complex plan. I'm basing that off the "Functions: 167". I
didn't catch the full plan. From what I can tell, [2] seems to be due
to "lots of empty tables", so assuming the clamping at 1 page is
causing issues there. I think both of those cases could be resolved
by building the costing the way I mentioned. I admit that 2 cases is
not a very large sample size.

Again, I am very much for improvements of the costing model. This is in no
way intended to be a replacement for that. It's intended to be a stop-gap.

What I see much of today are things like
https://dba.stackexchange.com/questions/264955/handling-performance-problems-with-jit-in-postgres-12
or
https://dev.to/xenatisch/cascade-of-doom-jit-and-how-a-postgres-update-led-to-70-failure-on-a-critical-national-service-3f2a

The bottom line is that people end up with recommendations to turn off JIT
globally more or less by default. Because there's no real useful way today
to figure out when it causes problems vs when it helps.

The addition to pg_stat_statements I pushed a short while ago would help
with that. But I think having a warning like this would also be useful. As
a stop-gap measure, yes, but we really don't know when we will have an
improved costing model for it. I hope you're right and that we can have it
by 16, and then I will definitely advocate for removing the warning again
if it works.

--
Magnus Hagander
Me: https://www.hagander.net/ <http://www.hagander.net/>
Work: https://www.redpill-linpro.com/ <http://www.redpill-linpro.com/>

#32

Stephen Frost

sfrost@snowman.net

almost 4 years ago

In reply to: Magnus Hagander (#31)

Re: Add parameter jit_warn_above_fraction

Greetings,

* Magnus Hagander (magnus@hagander.net) wrote:

On Fri, Apr 8, 2022 at 2:19 PM David Rowley <dgrowleyml@gmail.com> wrote:

On Fri, 8 Apr 2022 at 23:27, Magnus Hagander <magnus@hagander.net> wrote:

On Wed, Mar 30, 2022 at 3:04 PM Magnus Hagander <magnus@hagander.net>

wrote:

On Tue, Mar 29, 2022 at 10:06 PM David Rowley <dgrowleyml@gmail.com>

wrote:

If we go with this patch, the problem I see here is that the amount
of work the JIT compiler must do for a given query depends mostly on
the number of expressions that must be compiled in the query (also to
a lesser extent jit_inline_above_cost, jit_optimize_above_cost,
jit_tuple_deforming and jit_expressions). The DBA does not really have
much control over the number of expressions in the query. All he or
she can do to get rid of the warning is something like increase
jit_above_cost. After a few iterations of that, the end result is
that jit_above_cost is now high enough that JIT no longer triggers
for, say, that query to that table with 1000 partitions where no
plan-time pruning takes place. Is that really a good thing? It likely
means that we just rarely JIT anything at all!

I don't agree with the conclusion of that.

What the parameter would be useful for is to be able to tune those

costs (or just turn it off) *for that individual query*. That doesn't mean
you "rarely JIT anything atll", it just means you rarely JIT that
particular query.

I just struggle to imagine that anyone is going to spend much effort
tuning a warning parameter per query. I imagine they're far more
likely to just ramp it up to only catch some high percentile problems
or just (more likely) just not bother with it. It seems more likely
that if anyone was to tune anything per query here it would be
jit_above_cost, since that actually might have an affect on the
performance of the query, rather than if it spits out some warning
message or not. ISTM that if the user knows what to set it to per
query, then there's very little point in having a warning as we'd be
alerting them to something they already know about.

I would not expect people to tune the *warning* at a query level. If
anything, then ys, they would tune the either jit_above_cost or just
jit=off. But the idea being you can do that on a per query level instead of
globally.

Yeah, exactly, this is about having a busy system and wanting to know
which queries are spending a lot of time doing JIT relative to the query
time, so that you can go adjust your JIT parameters or possibly disable
JIT for those queries (or maybe bring those cases to -hackers and try to
help make our costing better).

I looked in the -general list to see if we could get some common
explanations to give us an idea of the most common reason for high JIT
compilation time. It seems that the plans were never simple. [1] seems
due to a complex plan. I'm basing that off the "Functions: 167". I
didn't catch the full plan. From what I can tell, [2] seems to be due
to "lots of empty tables", so assuming the clamping at 1 page is
causing issues there. I think both of those cases could be resolved
by building the costing the way I mentioned. I admit that 2 cases is
not a very large sample size.

Again, I am very much for improvements of the costing model. This is in no
way intended to be a replacement for that. It's intended to be a stop-gap.

Not sure I'd say it's a 'stop-gap' as it's really very similar, imv
anyway, to log_min_duration_statement- you want to know what queries are
taking a lot of time but you can't log all of them.

What I see much of today are things like
https://dba.stackexchange.com/questions/264955/handling-performance-problems-with-jit-in-postgres-12
or
https://dev.to/xenatisch/cascade-of-doom-jit-and-how-a-postgres-update-led-to-70-failure-on-a-critical-national-service-3f2a

The bottom line is that people end up with recommendations to turn off JIT
globally more or less by default. Because there's no real useful way today
to figure out when it causes problems vs when it helps.

Yeah, that's frustrating.

The addition to pg_stat_statements I pushed a short while ago would help
with that. But I think having a warning like this would also be useful. As
a stop-gap measure, yes, but we really don't know when we will have an
improved costing model for it. I hope you're right and that we can have it
by 16, and then I will definitely advocate for removing the warning again
if it works.

Having this in pg_stat_statements is certainly helpful but having a
warning also is. I don't think we have to address this in only one way.
A lot faster to flip this guc and then look in the logs on a busy system
than to install pg_stat_statements, restart the cluster once you get
permission to do so, and then query it.

Thanks,

Stephen

#33

Julien Rouhaud

rjuju123@gmail.com

almost 4 years ago

In reply to: Stephen Frost (#32)

Re: Add parameter jit_warn_above_fraction

Hi,

On Fri, Apr 08, 2022 at 09:39:18AM -0400, Stephen Frost wrote:

* Magnus Hagander (magnus@hagander.net) wrote:

The addition to pg_stat_statements I pushed a short while ago would help
with that. But I think having a warning like this would also be useful. As
a stop-gap measure, yes, but we really don't know when we will have an
improved costing model for it. I hope you're right and that we can have it
by 16, and then I will definitely advocate for removing the warning again
if it works.

Having this in pg_stat_statements is certainly helpful but having a
warning also is. I don't think we have to address this in only one way.
A lot faster to flip this guc and then look in the logs on a busy system
than to install pg_stat_statements, restart the cluster once you get
permission to do so, and then query it.

+1, especially if you otherwise don't really need or want to have
pg_stat_statements enabled, as it's far from being free. Sure you could enable
it by default with pg_stat_statements.track = none, but that seems a lot more
troublesome than just dynamically enabling a GUC, possibly for a short time
and/or for a specific database/role.

#34

Tom Lane

tgl@sss.pgh.pa.us

almost 4 years ago

In reply to: Julien Rouhaud (#33)

Re: Add parameter jit_warn_above_fraction

Julien Rouhaud <rjuju123@gmail.com> writes:

On Fri, Apr 08, 2022 at 09:39:18AM -0400, Stephen Frost wrote:

Having this in pg_stat_statements is certainly helpful but having a
warning also is. I don't think we have to address this in only one way.
A lot faster to flip this guc and then look in the logs on a busy system
than to install pg_stat_statements, restart the cluster once you get
permission to do so, and then query it.

+1, especially if you otherwise don't really need or want to have
pg_stat_statements enabled, as it's far from being free. Sure you could enable
it by default with pg_stat_statements.track = none, but that seems a lot more
troublesome than just dynamically enabling a GUC, possibly for a short time
and/or for a specific database/role.

The arguments against the GUC were not about whether it's convenient.
They were about whether the information it gives you is actually going
to be of any use.

Also, good luck with "looking in the logs", because by default
WARNING-level messages don't go to the postmaster log. If that's
the intended use-case then the message ought to appear at LOG
level (which'd also change the desirable name for the GUC).

regards, tom lane

#35

Julien Rouhaud

rjuju123@gmail.com

almost 4 years ago

In reply to: Tom Lane (#34)

Re: Add parameter jit_warn_above_fraction

On Sat, Apr 09, 2022 at 10:42:12AM -0400, Tom Lane wrote:

Also, good luck with "looking in the logs", because by default
WARNING-level messages don't go to the postmaster log. If that's
the intended use-case then the message ought to appear at LOG
level (which'd also change the desirable name for the GUC).

I must be missing something because by default log_min_messages is WARNING, and
AFAICS I do get WARNING-level messages in some vanilla cluster logs, using
vanilla .psqlrc.

#36

Tom Lane

tgl@sss.pgh.pa.us

almost 4 years ago

In reply to: Julien Rouhaud (#35)

Re: Add parameter jit_warn_above_fraction

Julien Rouhaud <rjuju123@gmail.com> writes:

On Sat, Apr 09, 2022 at 10:42:12AM -0400, Tom Lane wrote:

Also, good luck with "looking in the logs", because by default
WARNING-level messages don't go to the postmaster log.

I must be missing something because by default log_min_messages is WARNING, and
AFAICS I do get WARNING-level messages in some vanilla cluster logs, using
vanilla .psqlrc.

Ah, sorry, I was looking at the wrong thing namely
log_min_error_statement. The point still holds though: if we emit this
at level WARNING, what will get logged is just the bare message and not
the statement that triggered it. How useful do you think that will be?

regards, tom lane

#37

Julien Rouhaud

rjuju123@gmail.com

almost 4 years ago

In reply to: Tom Lane (#36)

Re: Add parameter jit_warn_above_fraction

On Sat, Apr 09, 2022 at 12:31:23PM -0400, Tom Lane wrote:

Julien Rouhaud <rjuju123@gmail.com> writes:

On Sat, Apr 09, 2022 at 10:42:12AM -0400, Tom Lane wrote:

Also, good luck with "looking in the logs", because by default
WARNING-level messages don't go to the postmaster log.

I must be missing something because by default log_min_messages is WARNING, and
AFAICS I do get WARNING-level messages in some vanilla cluster logs, using
vanilla .psqlrc.

Ah, sorry, I was looking at the wrong thing namely
log_min_error_statement. The point still holds though: if we emit this
at level WARNING, what will get logged is just the bare message and not
the statement that triggered it. How useful do you think that will be?

Ah right, I did miss that "small detail". And of course I agree, as-is that
would be entirely useless and it should be LOG, like the rest of similar
features.

#38

Ibrar Ahmed

ibrar.ahmad@gmail.com

over 3 years ago

In reply to: Julien Rouhaud (#37)

Re: Add parameter jit_warn_above_fraction

On Sat, Apr 9, 2022 at 8:43 PM Julien Rouhaud <rjuju123@gmail.com> wrote:

On Sat, Apr 09, 2022 at 12:31:23PM -0400, Tom Lane wrote:

Julien Rouhaud <rjuju123@gmail.com> writes:

On Sat, Apr 09, 2022 at 10:42:12AM -0400, Tom Lane wrote:

Also, good luck with "looking in the logs", because by default
WARNING-level messages don't go to the postmaster log.

I must be missing something because by default log_min_messages is

WARNING, and

AFAICS I do get WARNING-level messages in some vanilla cluster logs,

using

vanilla .psqlrc.

Ah, sorry, I was looking at the wrong thing namely
log_min_error_statement. The point still holds though: if we emit this
at level WARNING, what will get logged is just the bare message and not
the statement that triggered it. How useful do you think that will be?

Ah right, I did miss that "small detail". And of course I agree, as-is
that
would be entirely useless and it should be LOG, like the rest of similar
features.

The patch does not apply successfully; please rebase the patch.

patching file src/backend/utils/misc/guc.c
Hunk #1 FAILED at 642.
Hunk #2 FAILED at 3943.
2 out of 2 hunks FAILED -- saving rejects to file
src/backend/utils/misc/guc.c.rej
patching file src/backend/utils/misc/postgresql.conf.sample
patching file src/include/jit/jit.h

--
Ibrar Ahmed