pgbench throttling latency limit

Started by Fabien COELHOover 11 years ago59 messages

coelho@cri.ensmp.fr

over 11 years ago

1 attachment(s)

Add --limit to limit latency under throttling

Under throttling, transactions are scheduled for execution at certain
times. Transactions may be far behind schedule and the system may catch up
with the load later. This option allows to change this behavior by
skipping transactions which are too far behind schedule, and count those
as skipped.

The idea is to help simulate a latency-constrained environment such as a
database used by a web server.

--
Fabien.

Attachments:

pgbench-limit-1.patchtext/x-diff; charset=us-ascii; name=pgbench-limit-1.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..3d43321 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,13 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * When under throttling, execution time slots which are more than
+ * this late (in us) are skipped, and the corresponding transaction
+ * will be counted as somehow aborted.
+ */
+int64		throttle_latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +245,7 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +258,7 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
 } TResult;
 
 /*
@@ -367,6 +376,8 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          under throttling (--rate), skip transactions that\n"
+		   "                           far behind schedule in ms (default: do not skip)\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -1016,6 +1027,24 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (throttle_latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - throttle_latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = (int64) (throttle_delay *
+					1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+			}
+		}
+
 		st->until = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -2351,7 +2380,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped)
 {
 	double		time_include,
 				tps_include,
@@ -2417,6 +2447,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
+		if (throttle_latency_limit)
+			printf("number of skipped transactions: " INT64_FORMAT " (%.3f %%)\n",
+				   throttle_latency_skipped,
+				   100.0 * throttle_latency_skipped / normal_xacts);
 	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
@@ -2505,6 +2539,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2534,6 +2569,7 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
 
 	int			i;
 
@@ -2775,6 +2811,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					throttle_latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -2895,6 +2943,12 @@ main(int argc, char **argv)
 		exit(1);
 	}
 
+	if (throttle_latency_limit != 0 && throttle_delay == 0)
+	{
+		fprintf(stderr, "latency limit (-L) can only be specified with throttling (-R)\n");
+		exit(1);
+	}
+
 	/*
 	 * is_latencies only works with multiple threads in thread-based
 	 * implementations, not fork-based ones, because it supposes that the
@@ -3070,6 +3124,7 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
 
 		if (is_latencies)
 		{
@@ -3144,6 +3199,7 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3166,7 +3222,7 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped);
 
 	return 0;
 }
@@ -3191,7 +3247,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3394,7 +3451,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3417,12 +3475,21 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
+					if (throttle_latency_limit)
+						fprintf(stderr,
+								"progress %d: %.1f s, %.1f tps, "
+								"lat %.3f ms stddev %.3f, lag %.3f ms, "
+								INT64_FORMAT " skipped\n",
+								thread->tid, total_run, tps, latency, stdev, lag,
+								skipped);
+					else
+						fprintf(stderr,
+								"progress %d: %.1f s, %.1f tps, "
+								"lat %.3f ms stddev %.3f, lag %.3f ms\n",
+								thread->tid, total_run, tps, latency, stdev, lag);
 				else
 					fprintf(stderr,
 							"progress %d: %.1f s, %.1f tps, "
@@ -3434,6 +3501,7 @@ threadRun(void *arg)
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3452,7 +3520,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3477,12 +3546,20 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
+					if (throttle_latency_limit)
+						fprintf(stderr,
+								"progress: %.1f s, %.1f tps, "
+								"lat %.3f ms stddev %.3f, lag %.3f ms, "
+								INT64_FORMAT " skipped\n",
+								total_run, tps, latency, stdev, lag, skipped);
+					else
+						fprintf(stderr,
+								"progress: %.1f s, %.1f tps, "
+								"lat %.3f ms stddev %.3f, lag %.3f ms\n",
+								total_run, tps, latency, stdev, lag);
 				else
 					fprintf(stderr,
 							"progress: %.1f s, %.1f tps, "
@@ -3494,6 +3571,7 @@ threadRun(void *arg)
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3514,6 +3592,7 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 1551686..6177d52 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,17 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind schedule
+        by this number of ms will be skipped and counted as somehow aborted.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -437,7 +448,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the average and maximum transaction

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Fabien COELHO (#1)

1 attachment(s)

Re: pgbench throttling latency limit

Add --limit to limit latency under throttling

Under throttling, transactions are scheduled for execution at certain times.
Transactions may be far behind schedule and the system may catch up with the
load later. This option allows to change this behavior by skipping
transactions which are too far behind schedule, and count those as skipped.

The idea is to help simulate a latency-constrained environment such as a
database used by a web server.

Find attached a new version:
- fix dropped percent computation in the final report
- simplify progress report code

--
Fabien.

Attachments:

pgbench-limit-2.patchtext/x-diff; name=pgbench-limit-2.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..37a4a8f 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,13 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * When under throttling, execution time slots which are more than
+ * this late (in us) are skipped, and the corresponding transaction
+ * will be counted as somehow aborted.
+ */
+int64		throttle_latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +245,7 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +258,7 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
 } TResult;
 
 /*
@@ -367,6 +376,8 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          under throttling (--rate), skip transactions that\n"
+		   "                           far behind schedule in ms (default: do not skip)\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -1016,6 +1027,24 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (throttle_latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - throttle_latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = (int64) (throttle_delay *
+					1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+			}
+		}
+
 		st->until = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -2351,7 +2380,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped)
 {
 	double		time_include,
 				tps_include,
@@ -2417,6 +2447,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
+		if (throttle_latency_limit)
+			printf("number of skipped transactions: " INT64_FORMAT " (%.3f %%)\n",
+				   throttle_latency_skipped,
+				   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
 	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
@@ -2505,6 +2539,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2534,6 +2569,7 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
 
 	int			i;
 
@@ -2775,6 +2811,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					throttle_latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -2895,6 +2943,12 @@ main(int argc, char **argv)
 		exit(1);
 	}
 
+	if (throttle_latency_limit != 0 && throttle_delay == 0)
+	{
+		fprintf(stderr, "latency limit (-L) can only be specified with throttling (-R)\n");
+		exit(1);
+	}
+
 	/*
 	 * is_latencies only works with multiple threads in thread-based
 	 * implementations, not fork-based ones, because it supposes that the
@@ -3070,6 +3124,7 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
 
 		if (is_latencies)
 		{
@@ -3144,6 +3199,7 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3166,7 +3222,7 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped);
 
 	return 0;
 }
@@ -3191,7 +3247,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3394,7 +3451,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3417,23 +3475,24 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+					fprintf(stderr, ", lag %.3f ms", lag);
+				if (throttle_latency_limit)
+					fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3452,7 +3511,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3477,23 +3537,24 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+					fprintf(stderr, ", lag %.3f ms", lag);
+				if (throttle_latency_limit)
+					fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3514,6 +3575,7 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 23bfa9e..43a5582 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,17 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind schedule
+        by this number of ms will be skipped and counted as somehow aborted.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -437,7 +448,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the average and maximum transaction

Rukh Meski

rukh.meski@gmail.com

over 11 years ago

In reply to: Fabien COELHO (#2)

Re: pgbench throttling latency limit

Hi Fabien,

On Sun, Aug 24, 2014 at 9:16 AM, Fabien COELHO <coelho@cri.ensmp.fr> wrote:

Find attached a new version:
- fix dropped percent computation in the final report
- simplify progress report code

I have reviewed this patch.

Is the patch in a patch format which has context?
Yes.
Does it apply cleanly to the current git master?
Yes.
Does it include reasonable tests, necessary doc patches, etc?
Yes.

Does the patch actually implement that?
Yes.
Do we want that?
I think we do, yes.
Do we already have it?
No.
Are there dangers?
None that I can see.

Does the feature work as advertised?
Almost, see below.
Are there corner cases the author has failed to consider?
None that I can see.
Are there any assertion failures or crashes?
No.

I can't make the -L option work at all. If I do this:
./pgbench -R 100 -L 1
I get:
pgbench: invalid option -- L
Which appears to be caused by the fact that the call to getopt_long()
has not been updated to reflect the new parameter.

Also this part:
+          "  -L, --limit=NUM          under throttling (--rate), skip
transactions that\n"
+          "                           far behind schedule in ms
(default: do not skip)\n"
I would suggest rewording this to something like "skip transactions
that are more than NUM milliseconds behind schedule (default: do not
skip)".

Marking Waiting for Author until these small issues have been fixed.

Thanks,

♜

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Rukh Meski (#3)

1 attachment(s)

Re: pgbench throttling latency limit

Hello Rukh,

I have reviewed this patch.

Thanks!

[...] I get: pgbench: invalid option -- L
Which appears to be caused by the fact that the call to getopt_long()
has not been updated to reflect the new parameter.

Indeed, I only tested/used it with the --limit= syntax.

Also this part:
+          "  -L, --limit=NUM          under throttling (--rate), skip
transactions that\n"
+          "                           far behind schedule in ms
(default: do not skip)\n"
I would suggest rewording this to something like "skip transactions
that are more than NUM milliseconds behind schedule (default: do not
skip)".

Done, with milliseconds written as "ms" to keep it short.

Marking Waiting for Author until these small issues have been fixed.

Please find attached a new version which fixes these two points.

--
Fabien.

Attachments:

pgbench-limit-3.patchtext/x-diff; name=pgbench-limit-3.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..de77817 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,13 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * When under throttling, execution time slots which are more than
+ * this late (in us) are skipped, and the corresponding transaction
+ * will be counted as somehow aborted.
+ */
+int64		throttle_latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +245,7 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +258,7 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
 } TResult;
 
 /*
@@ -367,6 +376,8 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          under throttling (--rate), skip transactions that\n"
+		   "                           are more than NUM ms behind schedule (default: do not skip)\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -1016,6 +1027,24 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (throttle_latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - throttle_latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = (int64) (throttle_delay *
+					1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+			}
+		}
+
 		st->until = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -2351,7 +2380,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped)
 {
 	double		time_include,
 				tps_include,
@@ -2417,6 +2447,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
+		if (throttle_latency_limit)
+			printf("number of skipped transactions: " INT64_FORMAT " (%.3f %%)\n",
+				   throttle_latency_skipped,
+				   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
 	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
@@ -2505,6 +2539,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2534,6 +2569,7 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
 
 	int			i;
 
@@ -2578,7 +2614,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2775,6 +2811,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					throttle_latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -2895,6 +2943,12 @@ main(int argc, char **argv)
 		exit(1);
 	}
 
+	if (throttle_latency_limit != 0 && throttle_delay == 0)
+	{
+		fprintf(stderr, "latency limit (-L) can only be specified with throttling (-R)\n");
+		exit(1);
+	}
+
 	/*
 	 * is_latencies only works with multiple threads in thread-based
 	 * implementations, not fork-based ones, because it supposes that the
@@ -3070,6 +3124,7 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
 
 		if (is_latencies)
 		{
@@ -3144,6 +3199,7 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3166,7 +3222,7 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped);
 
 	return 0;
 }
@@ -3191,7 +3247,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3394,7 +3451,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3417,23 +3475,24 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+					fprintf(stderr, ", lag %.3f ms", lag);
+				if (throttle_latency_limit)
+					fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3452,7 +3511,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3477,23 +3537,24 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+					fprintf(stderr, ", lag %.3f ms", lag);
+				if (throttle_latency_limit)
+					fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3514,6 +3575,7 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 23bfa9e..43a5582 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,17 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind schedule
+        by this number of ms will be skipped and counted as somehow aborted.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -437,7 +448,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the average and maximum transaction

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Rukh Meski (#3)

Re: pgbench throttling latency limit

Marking Waiting for Author until these small issues have been fixed.

I've put it back to "Needs review". Feel free to set it to "Ready" if it
is ok for you.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Rukh Meski

rukh.meski@gmail.com

over 11 years ago

In reply to: Fabien COELHO (#4)

Re: pgbench throttling latency limit

Hi Fabien,

On Tue, Aug 26, 2014 at 04:07 AM, Fabien COELHO <coelho@cri.ensmp.fr> wrote:

Please find attached a new version which fixes these two points.

Indeed it does. Marking the patch ready for a committer.

Thanks,

♜

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Rukh Meski (#6)

Re: pgbench throttling latency limit

On 08/27/2014 03:47 AM, Rukh Meski wrote:

Hi Fabien,

On Tue, Aug 26, 2014 at 04:07 AM, Fabien COELHO <coelho@cri.ensmp.fr> wrote:

Please find attached a new version which fixes these two points.

Indeed it does. Marking the patch ready for a committer.

I find the definition of the latency limit a bit strange. It's a limit
on how late a transaction can *start* compared to it's scheduled
starting time, not how long a query is allowed to last. How do figure
out what it should be set to?

That model might make some sense if you think e.g. of a web application,
where the web server has a timeout for how long it waits to get a
database connection from a pool, but once a query is started, the
transaction is considered a succeess no matter how long it takes. The
latency limit would be that timeout. But I think a more useful model is
that when the user clicks a button, he waits at most X seconds for the
result. If that deadline is exceeded, the web server will give a 404, or
the user will simply get bored and go away, and the transaction is
considered a failure.

So I think a more useful model is that new queries arrive at a given
rate, and each query is expected to finish in X milliseconds from its
arrival time (i.e the time the query is scheduled to begin, not the time
it was sent to the server) or it's counted as failed. If a transaction
cannot even be started by that deadline, because the connection is still
busy with the previous query, it's counted as failed without even
sending it to the server.

With that definition, it makes sense to specify the latency limit even
without --rate. In that case, it's simply a limit on how long each
query's execution is allowed to last until it's considered as failed.
IOW, each query's scheduled start time is when the previous query ends.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#7)

1 attachment(s)

Re: pgbench throttling latency limit

Hello Heikki,

I find the definition of the latency limit a bit strange. It's a limit on how
late a transaction can *start* compared to it's scheduled starting time, not
how long a query is allowed to last.

Yes. This is what can be done easily with pgbench under throttling. Note
that if transactions take long it is recorded (average, stddev...) so it
appears elsewhere.

How do figure out what it should be set to?

It is really just a simple tool to measure unresponsiveness under
throttling, which I'm testing and complaining about in another thread.

The underlying model I have in mind would some timeout from an
application, say a web server, or a pooling process which is handling a
queue of requests...

Now, if I describe that as "lag limit" instead if "latency limit", maybe
it is clearer and better?

That model might make some sense if you think e.g. of a web application,
[...]

Yep, that is what I had in mind, but the primary objective is really to
check whether pg is responsive or not.

So I think a more useful model is that new queries arrive at a given
rate, and each query is expected to finish in X milliseconds from its
arrival time (i.e the time the query is scheduled to begin, not the time
it was sent to the server) or it's counted as failed. If a transaction
cannot even be started by that deadline, because the connection is still
busy with the previous query, it's counted as failed without even
sending it to the server. With that definition, it makes sense to
specify the latency limit even without --rate.

Yep. But that is not what I'm doing here. It would be interesting as well.
It would be another patch.

In that case, it's simply a limit on how long each query's
execution is allowed to last until it's considered as failed. IOW, each
query's scheduled start time is when the previous query ends.

Not under --rate... that is the point of throttling! Under throttling,
the latency should really be computed wrt to the schedule start time and
not the actual start time which may be 10 seconds afterwards when things
are going bad... Also, there is the question of whether the "failed query"
is executed or not. Here I'm not executing them, in effect they were
aborted by the application. With your suggestion they would be executed
anyway but considered failed.

So what you are suggesting is another (interesting) functionnality, that
could indeed be named "latency limit" (count slow above a threshold
queries), what I'm doing here is "lag limit" (scheduled query could not
start on time and are skipped, this is really specific to --rate).

In the updated patch attached, I changed the explanations, documentation
and name to "lag limit" instead of "latency limit" to clarify this point.
It was really a misnommer.

--
Fabien.

Attachments:

pgbench-limit-4.patchtext/x-diff; name=pgbench-limit-4.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..d7ee1c9 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,13 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * When under throttling, execution time slots which are more than
+ * this late (in us) are skipped, and the corresponding transaction
+ * will be counted as somehow aborted.
+ */
+int64		throttle_lag_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +245,7 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_lag_skipped; /* lagging transactions skipped */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +258,7 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_lag_skipped;
 } TResult;
 
 /*
@@ -367,6 +376,9 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --lag-limit=NUM      under throttling (--rate), skip transactions that\n"
+		   "                           are lagging more than NUM ms behind schedule\n"
+		   "                           (default: do not skip)\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -1016,6 +1028,24 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (throttle_lag_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - throttle_lag_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = (int64) (throttle_delay *
+					1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));
+				thread->throttle_trigger += wait;
+				thread->throttle_lag_skipped ++;
+			}
+		}
+
 		st->until = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -2351,7 +2381,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_lag_skipped)
 {
 	double		time_include,
 				tps_include,
@@ -2417,6 +2448,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
+		if (throttle_lag_limit)
+			printf("number of skipped lag transactions: " INT64_FORMAT " (%.3f %%)\n",
+				   throttle_lag_skipped,
+				   100.0 * throttle_lag_skipped / (throttle_lag_skipped + normal_xacts));
 	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
@@ -2505,6 +2540,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"lag-limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2534,6 +2570,7 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_lag_skipped = 0;
 
 	int			i;
 
@@ -2578,7 +2615,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2775,6 +2812,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid lag limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					throttle_lag_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -2895,6 +2944,12 @@ main(int argc, char **argv)
 		exit(1);
 	}
 
+	if (throttle_lag_limit != 0 && throttle_delay == 0)
+	{
+		fprintf(stderr, "lag limit (-L) can only be specified with throttling (-R)\n");
+		exit(1);
+	}
+
 	/*
 	 * is_latencies only works with multiple threads in thread-based
 	 * implementations, not fork-based ones, because it supposes that the
@@ -3070,6 +3125,7 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_lag_skipped = 0;
 
 		if (is_latencies)
 		{
@@ -3144,6 +3200,7 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_lag_skipped += r->throttle_lag_skipped;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3166,7 +3223,7 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_lag_skipped);
 
 	return 0;
 }
@@ -3191,7 +3248,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3394,7 +3452,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3417,23 +3476,24 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_lag_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+					fprintf(stderr, ", lag %.3f ms", lag);
+				if (throttle_lag_limit)
+					fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_lag_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3452,7 +3512,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3477,23 +3538,24 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_lag_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+					fprintf(stderr, ", lag %.3f ms", lag);
+				if (throttle_lag_limit)
+					fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_lag_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3514,6 +3576,7 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_lag_skipped = thread->throttle_lag_skipped;
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 23bfa9e..3c637bc 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,18 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--lag-limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions lagging
+        behind schedule by this number of ms will be skipped and reported as
+        such.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -437,7 +449,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--lag-limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the average and maximum transaction

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#8)

Re: pgbench throttling latency limit

On 08/27/2014 12:41 PM, Fabien COELHO wrote:

Hello Heikki,

I find the definition of the latency limit a bit strange. It's a limit on how
late a transaction can *start* compared to it's scheduled starting time, not
how long a query is allowed to last.

Yes. This is what can be done easily with pgbench under throttling. Note
that if transactions take long it is recorded (average, stddev...) so it
appears elsewhere.

How do figure out what it should be set to?

It is really just a simple tool to measure unresponsiveness under
throttling, which I'm testing and complaining about in another thread.

Ok, but wouldn't the definition I gave be just as useful for that
purpose, and more useful in general?

You didn't really answer my question: How do you figure out what to set
it to? With a latency limit on when the query should finish, as opposed
to how late it can start, it's a lot easier to give a number. For
example, your requirements might state that a user must always get a
response to a click on a web page in 200 ms, so you set the limit to 200 ms.

So I think a more useful model is that new queries arrive at a given
rate, and each query is expected to finish in X milliseconds from its
arrival time (i.e the time the query is scheduled to begin, not the time
it was sent to the server) or it's counted as failed. If a transaction
cannot even be started by that deadline, because the connection is still
busy with the previous query, it's counted as failed without even
sending it to the server. With that definition, it makes sense to
specify the latency limit even without --rate.

Yep. But that is not what I'm doing here. It would be interesting as well.
It would be another patch.

Why is your patch more interesting than what I described? I'm pretty
sure we don't need both.

In that case, it's simply a limit on how long each query's
execution is allowed to last until it's considered as failed. IOW, each
query's scheduled start time is when the previous query ends.

Not under --rate... that is the point of throttling!

Right, I got that. With "in that case", I meant when you're not throttling.

Under throttling,
the latency should really be computed wrt to the schedule start time and
not the actual start time which may be 10 seconds afterwards when things
are going bad... Also, there is the question of whether the "failed query"
is executed or not. Here I'm not executing them, in effect they were
aborted by the application. With your suggestion they would be executed
anyway but considered failed.

I was thinking that if a query is already late when the connection
becomes free to execute it, it would not be executed. It would be
skipped, just as in your patch.

So what you are suggesting is another (interesting) functionnality, that
could indeed be named "latency limit" (count slow above a threshold
queries), what I'm doing here is "lag limit" (scheduled query could not
start on time and are skipped, this is really specific to --rate).

Ok, but *why* are you doing a "lag limit", and not a "latency limit"?
Under what circumstances is the lag limit a more useful setting?

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#10

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#9)

Re: pgbench throttling latency limit

Hello Heikki,

[...]
With a latency limit on when the query should finish, as opposed to how
late it can start, it's a lot easier to give a number. For example, your
requirements might state that a user must always get a response to a click on
a web page in 200 ms, so you set the limit to 200 ms.

Yep. See below for the details.

[...] Why is your patch more interesting than what I described?

It is more interesting because it exists, it is short and simple, it
works, and it is useful right now to test pg responsiveness and also to
model some timeout behavior on the client side?

I'm pretty sure we don't need both.

Why not? Testing performance is tricky enough, the tool must be flexible.

I'm pretty sure that I'm interested in testing pg responsiveness right
now, so I did the simpler one I need for that purpose. It somehow models
an application/pooler queue management timeout, that would anyway proceed
with what is already started.

[...]

I was thinking that if a query is already late when the connection becomes
free to execute it, it would not be executed. It would be skipped, just as in
your patch.

As for an actual "latency limit" under throttling, this is significantly
more tricky and invasive to implement... ISTM that it would mean:

- if the tx is not stated an the latency is already consummed, SKIP++.

- if the tx is after its schedule start time but under latency, then
start it, and maybe inject a "SET TIMEOUT...".

- if a tx is being processed but reaches its latency limit (after
schedule start time), abort it coldly, ROLLBACK++ (well if the tx is
really started, there could also be shell commands and \set stuff in a
pgbench script, which mean started is not really started, so it would
be INTERRUPT++ if no BEGIN was sent).

- if a tx is finished but the final commit returned after the latency
deadline, you cannot abort it anymore but it is late nevertheless,
LATE++.

This is doable but far beyond my current needs. Moreover, I'm not sure
that such a patch would pass because of invasiveness and complexity, so it
could be a total loss of time.

Ok, but *why* are you doing a "lag limit", and not a "latency limit"?

Because it is much simpler (see above) and is enough for testing pg
responsiveness issue, which is my current objective, and models some
client timeout behavior.

Under what circumstances is the lag limit a more useful setting?

It is not "more" useful" per se, it is what I'm using to test pg
unresponsivness with a simple to define and interpret measure wrt
throttling.

If I would do "latency limit" under throttling, it would be (1) more time
to develop, more complex, more invasive in the code (see above, + also the
implementation when not under throttling), (2) more complex to interpret,
with at least 5 possible outcomes (skipped, interrupted, committed on
time, committed but late, aborted), (3) this added information would not
be useful to me.

I've submitted this "simple" lag limit version because being able to
measure quickly and simply (un)responsiveness seems like a good idea,
especially given the current state of things.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#11

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#10)

Re: pgbench throttling latency limit

On 08/27/2014 02:37 PM, Fabien COELHO wrote:

As for an actual "latency limit" under throttling, this is significantly
more tricky and invasive to implement... ISTM that it would mean:

- if the tx is not stated an the latency is already consummed, SKIP++.

- if the tx is after its schedule start time but under latency, then
start it, and maybe inject a "SET TIMEOUT...".

- if a tx is being processed but reaches its latency limit (after
schedule start time), abort it coldly, ROLLBACK++ (well if the tx is
really started, there could also be shell commands and \set stuff in a
pgbench script, which mean started is not really started, so it would
be INTERRUPT++ if no BEGIN was sent).

- if a tx is finished but the final commit returned after the latency
deadline, you cannot abort it anymore but it is late nevertheless,
LATE++.

Yeah, something like that. I don't think it would be necessary to set
statement_timeout, you can inject that in your script or postgresql.conf
if you want. I don't think aborting a transaction that's already started
is necessary either. You could count it as LATE, but let it finish first.

This is doable but far beyond my current needs. Moreover, I'm not sure
that such a patch would pass because of invasiveness and complexity, so it
could be a total loss of time.

Ok, but *why* are you doing a "lag limit", and not a "latency limit"?

Because it is much simpler (see above) and is enough for testing pg
responsiveness issue, which is my current objective, and models some
client timeout behavior.

Under what circumstances is the lag limit a more useful setting?

It is not "more" useful" per se, it is what I'm using to test pg
unresponsivness with a simple to define and interpret measure wrt
throttling.

If I would do "latency limit" under throttling, it would be (1) more time
to develop, more complex, more invasive in the code (see above, + also the
implementation when not under throttling), (2) more complex to interpret,
with at least 5 possible outcomes (skipped, interrupted, committed on
time, committed but late, aborted), (3) this added information would not
be useful to me.

I've submitted this "simple" lag limit version because being able to
measure quickly and simply (un)responsiveness seems like a good idea,
especially given the current state of things.

Ok, fair enough. I don't think doing a "latency limit" would be
significantly harder, but I can't force you. I'll mark this as Returned
with Feedback then.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#12

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#11)

Re: pgbench throttling latency limit

As for an actual "latency limit" under throttling, this is significantly
more tricky and invasive to implement... ISTM that it would mean:
[...]

Yeah, something like that. I don't think it would be necessary to set
statement_timeout, you can inject that in your script or postgresql.conf if
you want. I don't think aborting a transaction that's already started is
necessary either. You could count it as LATE, but let it finish first.

If you remove all difficult cases from the spec, it is obviously much
simpler to implement:-) It seems that your simplified version of "latency
limit" would be just to distinguish LATE from ONTIME among the committed
ones, compared to the current version, and not to actually limit the
latency, which is the tricky part.

I've submitted this "simple" lag limit version because being able to
measure quickly and simply (un)responsiveness seems like a good idea,
especially given the current state of things.

Ok, fair enough. I don't think doing a "latency limit" would be significantly
harder, but I can't force you. I'll mark this as Returned with Feedback then.

Hmmm. I can distinguish just the two cases. Rather mark it as "waiting on
author", I may give it a go.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#13

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#12)

Re: pgbench throttling latency limit

On 08/27/2014 06:08 PM, Fabien COELHO wrote:

I've submitted this "simple" lag limit version because being able to
measure quickly and simply (un)responsiveness seems like a good idea,
especially given the current state of things.

Ok, fair enough. I don't think doing a "latency limit" would be significantly
harder, but I can't force you. I'll mark this as Returned with Feedback then.

Hmmm. I can distinguish just the two cases. Rather mark it as "waiting on
author", I may give it a go.

Feel free to mark it as such if you think you can get a new version
posted in the next few days.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#14

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#11)

1 attachment(s)

Re: pgbench throttling latency limit

[...]

Yeah, something like that. I don't think it would be necessary to set
statement_timeout, you can inject that in your script or postgresql.conf if
you want. I don't think aborting a transaction that's already started is
necessary either. You could count it as LATE, but let it finish first.

I've implemented something along these simplified lines. The latency is
not limited as such, but slow (over the limit) queries are counted and
reported.

--
Fabien.

Attachments:

pgbench-limit-5.patchtext/x-diff; name=pgbench-limit-5.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..96e5fb9 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,18 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * Transactions which take longer that this limit are counted as late
+ * and reported as such, although they are completed anyway.
+ *
+ * When under throttling: execution time slots which are more than
+ * this late (in us) are simply skipped, and the corresponding transaction
+ * is counted as such... it is not even started;
+ * otherwise above the limit transactions are counted as such, with the latency
+ * measured wrt the transaction schedule, not its actual start.
+ */
+int64		latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +250,8 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
+	int64		latency_late; /* late transactions */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +264,8 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
+	int64		latency_late;
 } TResult;
 
 /*
@@ -367,6 +383,10 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          count transactions lasting more than NUM ms.\n"
+		   "                           under throttling (--rate), transactions behind schedule\n"
+		   "                           more than NUM ms are skipped, and those finishing more\n"
+		   "                           than NUM ms after their scheduled start are counted.\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -1016,6 +1036,24 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = (int64) (throttle_delay *
+					1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+			}
+		}
+
 		st->until = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -1080,13 +1118,17 @@ top:
 			thread->exec_count[cnum]++;
 		}
 
-		/* transaction finished: record latency under progress or throttling */
-		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		/* transaction finished: record latency under progress or throttling,
+		 * ot if latency limit is set
+		 */
+		if ((progress || throttle_delay || latency_limit) &&
+			commands[st->state + 1] == NULL)
 		{
 			instr_time	diff;
-			int64		latency;
+			int64		latency, now;
 
 			INSTR_TIME_SET_CURRENT(diff);
+			now = INSTR_TIME_GET_MICROSEC(diff);
 			INSTR_TIME_SUBTRACT(diff, st->txn_begin);
 			latency = INSTR_TIME_GET_MICROSEC(diff);
 			st->txn_latencies += latency;
@@ -1099,6 +1141,19 @@ top:
 			 * would take 256 hours.
 			 */
 			st->txn_sqlats += latency * latency;
+
+			/* record over the limit transactions if needed.
+			 */
+			if (latency_limit)
+			{
+				/* Under throttling, late means wrt to the initial schedule,
+				 * not the actual transaction start
+				 */
+				if (throttle_delay)
+					latency = now - thread->throttle_trigger;
+				if (latency > latency_limit)
+					thread->latency_late++;
+			}
 		}
 
 		/*
@@ -1294,7 +1349,7 @@ top:
 	}
 
 	/* Record transaction start time under logging, progress or throttling */
-	if ((logfile || progress || throttle_delay) && st->state == 0)
+	if ((logfile || progress || throttle_delay || latency_limit) && st->state == 0)
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
 	/* Record statement start time if per-command latencies are requested */
@@ -2351,7 +2406,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped, int64 latency_late)
 {
 	double		time_include,
 				tps_include,
@@ -2407,6 +2463,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			   1000.0 * duration * nclients / normal_xacts);
 	}
 
+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT "\n",
+			   latency_limit / 1000.0, latency_late);
+
 	if (throttle_delay)
 	{
 		/*
@@ -2417,6 +2477,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
+		if (latency_limit)
+			printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n",
+				   throttle_latency_skipped,
+				   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
 	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
@@ -2505,6 +2569,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2534,6 +2599,8 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
+	int64		latency_late = 0;
 
 	int			i;
 
@@ -2578,7 +2645,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2775,6 +2842,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -3070,6 +3149,8 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
+		thread->latency_late = 0;
 
 		if (is_latencies)
 		{
@@ -3144,6 +3225,8 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
+			latency_late += r->latency_late;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3166,7 +3249,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped,
+				 latency_late);
 
 	return 0;
 }
@@ -3191,7 +3275,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3394,7 +3479,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3417,23 +3503,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3452,7 +3541,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3477,23 +3567,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3514,6 +3607,9 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
+	result->latency_late = thread->latency_late;
+
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 23bfa9e..f80116f 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,23 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under normal operations, transaction which last more than
+        <replaceable>limit</> milliseconds are counted and reported.
+       </para>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind
+        schedule by <replaceable>limit</> ms are simply skipped and counted
+        as such, then transactions finished later than the scheduled time plus
+        <replaceable>limit</> are counted and reported.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -437,7 +454,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the average and maximum transaction

#15

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#14)

Re: pgbench throttling latency limit

On 08/27/2014 08:05 PM, Fabien COELHO wrote:

[...]

Yeah, something like that. I don't think it would be necessary to set
statement_timeout, you can inject that in your script or postgresql.conf if
you want. I don't think aborting a transaction that's already started is
necessary either. You could count it as LATE, but let it finish first.

I've implemented something along these simplified lines. The latency is
not limited as such, but slow (over the limit) queries are counted and
reported.

Ok, thanks.

This now begs the question:

In --rate mode, shouldn't the reported transaction latency also be
calculated from the *scheduled* start time, not the time the transaction
actually started? Otherwise we're using two different definitions of
"latency", one for the purpose of the limit, and another for reporting.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#16

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#15)

1 attachment(s)

Re: pgbench throttling latency limit

Hello Heikki,

This now begs the question:

In --rate mode, shouldn't the reported transaction latency also be calculated
from the *scheduled* start time, not the time the transaction actually
started? Otherwise we're using two different definitions of "latency", one
for the purpose of the limit, and another for reporting.

It could. Find a small patch **on top of v5** which does that. I've tried
to update the documentation accordingly as well.

Note that the information is already there as the average lag time is
reported, ISTM that:

avg latency2 ~ avg lag + avg latency1

so it is just a matter of choice, both are ok somehow. I would be fine
with both.

--
Fabien.

Attachments:

pgbench-limit-5b.patchtext/x-diff; name=pgbench-limit-5b.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 96e5fb9..40427a3 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -1125,12 +1125,24 @@ top:
 			commands[st->state + 1] == NULL)
 		{
 			instr_time	diff;
-			int64		latency, now;
+			int64		latency;
 
 			INSTR_TIME_SET_CURRENT(diff);
-			now = INSTR_TIME_GET_MICROSEC(diff);
-			INSTR_TIME_SUBTRACT(diff, st->txn_begin);
-			latency = INSTR_TIME_GET_MICROSEC(diff);
+
+			if (throttle_delay)
+			{
+				/* Under throttling, compute latency wrt to the initial schedule,
+				 * not the actual transaction start.
+				 */
+				int64 now = INSTR_TIME_GET_MICROSEC(diff);
+				latency = now - thread->throttle_trigger;
+			}
+			else
+			{
+				INSTR_TIME_SUBTRACT(diff, st->txn_begin);
+				latency = INSTR_TIME_GET_MICROSEC(diff);
+			}
+
 			st->txn_latencies += latency;
 
 			/*
@@ -1144,16 +1156,8 @@ top:
 
 			/* record over the limit transactions if needed.
 			 */
-			if (latency_limit)
-			{
-				/* Under throttling, late means wrt to the initial schedule,
-				 * not the actual transaction start
-				 */
-				if (throttle_delay)
-					latency = now - thread->throttle_trigger;
-				if (latency > latency_limit)
-					thread->latency_late++;
-			}
+			if (latency_limit && latency > latency_limit)
+				thread->latency_late++;
 		}
 
 		/*
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index f80116f..453ae4b 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -420,8 +420,10 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         Show progress report every <literal>sec</> seconds.  The report
         includes the time since the beginning of the run, the tps since the
         last report, and the transaction latency average and standard
-        deviation since the last report.  Under throttling (<option>-R</>), it
-        also includes the average schedule lag time since the last report.
+        deviation since the last report.  Under throttling (<option>-R</>),
+        the latency is computed with respect to the transaction scheduled
+        start time, not the actual transaction beginning time, and it also
+        includes the average schedule lag time since the last report.
        </para>
       </listitem>
      </varlistentry>

#17

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Fabien COELHO (#16)

1 attachment(s)

Re: pgbench throttling latency limit

Hello Heikki,

[...] I would be fine with both.

After giving it some thought, ISTM better to choose consistency over
intuition, and have latency under throttling always defined wrt the
scheduled start time and not the actual start time, even if having a
latency of 10000 ms for an OLTP load might seem surprising to some.

The other one can be computed by substracting the average lag time.

I attached a v6 which is a consolidate patch of v5 + the small update for
the latency definition.

--
Fabien.

Attachments:

pgbench-limit-6.patchtext/x-diff; name=pgbench-limit-6.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..40427a3 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,18 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * Transactions which take longer that this limit are counted as late
+ * and reported as such, although they are completed anyway.
+ *
+ * When under throttling: execution time slots which are more than
+ * this late (in us) are simply skipped, and the corresponding transaction
+ * is counted as such... it is not even started;
+ * otherwise above the limit transactions are counted as such, with the latency
+ * measured wrt the transaction schedule, not its actual start.
+ */
+int64		latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +250,8 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
+	int64		latency_late; /* late transactions */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +264,8 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
+	int64		latency_late;
 } TResult;
 
 /*
@@ -367,6 +383,10 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          count transactions lasting more than NUM ms.\n"
+		   "                           under throttling (--rate), transactions behind schedule\n"
+		   "                           more than NUM ms are skipped, and those finishing more\n"
+		   "                           than NUM ms after their scheduled start are counted.\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -1016,6 +1036,24 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = (int64) (throttle_delay *
+					1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+			}
+		}
+
 		st->until = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -1080,15 +1118,31 @@ top:
 			thread->exec_count[cnum]++;
 		}
 
-		/* transaction finished: record latency under progress or throttling */
-		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		/* transaction finished: record latency under progress or throttling,
+		 * ot if latency limit is set
+		 */
+		if ((progress || throttle_delay || latency_limit) &&
+			commands[st->state + 1] == NULL)
 		{
 			instr_time	diff;
 			int64		latency;
 
 			INSTR_TIME_SET_CURRENT(diff);
-			INSTR_TIME_SUBTRACT(diff, st->txn_begin);
-			latency = INSTR_TIME_GET_MICROSEC(diff);
+
+			if (throttle_delay)
+			{
+				/* Under throttling, compute latency wrt to the initial schedule,
+				 * not the actual transaction start.
+				 */
+				int64 now = INSTR_TIME_GET_MICROSEC(diff);
+				latency = now - thread->throttle_trigger;
+			}
+			else
+			{
+				INSTR_TIME_SUBTRACT(diff, st->txn_begin);
+				latency = INSTR_TIME_GET_MICROSEC(diff);
+			}
+
 			st->txn_latencies += latency;
 
 			/*
@@ -1099,6 +1153,11 @@ top:
 			 * would take 256 hours.
 			 */
 			st->txn_sqlats += latency * latency;
+
+			/* record over the limit transactions if needed.
+			 */
+			if (latency_limit && latency > latency_limit)
+				thread->latency_late++;
 		}
 
 		/*
@@ -1294,7 +1353,7 @@ top:
 	}
 
 	/* Record transaction start time under logging, progress or throttling */
-	if ((logfile || progress || throttle_delay) && st->state == 0)
+	if ((logfile || progress || throttle_delay || latency_limit) && st->state == 0)
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
 	/* Record statement start time if per-command latencies are requested */
@@ -2351,7 +2410,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped, int64 latency_late)
 {
 	double		time_include,
 				tps_include,
@@ -2407,6 +2467,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			   1000.0 * duration * nclients / normal_xacts);
 	}
 
+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT "\n",
+			   latency_limit / 1000.0, latency_late);
+
 	if (throttle_delay)
 	{
 		/*
@@ -2417,6 +2481,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
+		if (latency_limit)
+			printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n",
+				   throttle_latency_skipped,
+				   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
 	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
@@ -2505,6 +2573,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2534,6 +2603,8 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
+	int64		latency_late = 0;
 
 	int			i;
 
@@ -2578,7 +2649,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2775,6 +2846,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -3070,6 +3153,8 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
+		thread->latency_late = 0;
 
 		if (is_latencies)
 		{
@@ -3144,6 +3229,8 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
+			latency_late += r->latency_late;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3166,7 +3253,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped,
+				 latency_late);
 
 	return 0;
 }
@@ -3191,7 +3279,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3394,7 +3483,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3417,23 +3507,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3452,7 +3545,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3477,23 +3571,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3514,6 +3611,9 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
+	result->latency_late = thread->latency_late;
+
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 23bfa9e..453ae4b 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,23 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under normal operations, transaction which last more than
+        <replaceable>limit</> milliseconds are counted and reported.
+       </para>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind
+        schedule by <replaceable>limit</> ms are simply skipped and counted
+        as such, then transactions finished later than the scheduled time plus
+        <replaceable>limit</> are counted and reported.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -403,8 +420,10 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         Show progress report every <literal>sec</> seconds.  The report
         includes the time since the beginning of the run, the tps since the
         last report, and the transaction latency average and standard
-        deviation since the last report.  Under throttling (<option>-R</>), it
-        also includes the average schedule lag time since the last report.
+        deviation since the last report.  Under throttling (<option>-R</>),
+        the latency is computed with respect to the transaction scheduled
+        start time, not the actual transaction beginning time, and it also
+        includes the average schedule lag time since the last report.
        </para>
       </listitem>
      </varlistentry>
@@ -437,7 +456,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the average and maximum transaction

#18

Andres Freund

andres@2ndquadrant.com

over 11 years ago

In reply to: Fabien COELHO (#17)

Re: pgbench throttling latency limit

Hi,

I generally want to say that having a feature like this feels *very*
helpful to me. Lots of pg development hasn't really paid attention to
anything but the final pgbench results...

On 2014-08-29 19:48:43 +0200, Fabien COELHO wrote:

+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT "\n",
+			   latency_limit / 1000.0, latency_late);
+

Any reason not to report a percentage here?

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#19

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Andres Freund (#18)

1 attachment(s)

Re: pgbench throttling latency limit

+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT "\n",
+			   latency_limit / 1000.0, latency_late);
+

Any reason not to report a percentage here?

Yes: I did not thought of it.

Here is a v7, with a percent. I also added a paragraph in the documenation
about how the latency is computed under throttling, and I tried to reorder
the reported stuff so that it is more logical.

--
Fabien.

Attachments:

pgbench-limit-7.patchtext/x-diff; name=pgbench-limit-7.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..c102ad7 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,18 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * Transactions which take longer that this limit are counted as late
+ * and reported as such, although they are completed anyway.
+ *
+ * When under throttling: execution time slots which are more than
+ * this late (in us) are simply skipped, and the corresponding transaction
+ * is counted as such... it is not even started;
+ * otherwise above the limit transactions are counted as such, with the latency
+ * measured wrt the transaction schedule, not its actual start.
+ */
+int64		latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +250,8 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
+	int64		latency_late; /* late transactions */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +264,8 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
+	int64		latency_late;
 } TResult;
 
 /*
@@ -367,6 +383,10 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          count transactions lasting more than NUM ms.\n"
+		   "                           under throttling (--rate), transactions behind schedule\n"
+		   "                           more than NUM ms are skipped, and those finishing more\n"
+		   "                           than NUM ms after their scheduled start are counted.\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -1016,6 +1036,24 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = (int64) (throttle_delay *
+					1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+			}
+		}
+
 		st->until = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -1080,15 +1118,31 @@ top:
 			thread->exec_count[cnum]++;
 		}
 
-		/* transaction finished: record latency under progress or throttling */
-		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		/* transaction finished: record latency under progress or throttling,
+		 * ot if latency limit is set
+		 */
+		if ((progress || throttle_delay || latency_limit) &&
+			commands[st->state + 1] == NULL)
 		{
 			instr_time	diff;
 			int64		latency;
 
 			INSTR_TIME_SET_CURRENT(diff);
-			INSTR_TIME_SUBTRACT(diff, st->txn_begin);
-			latency = INSTR_TIME_GET_MICROSEC(diff);
+
+			if (throttle_delay)
+			{
+				/* Under throttling, compute latency wrt to the initial schedule,
+				 * not the actual transaction start.
+				 */
+				int64 now = INSTR_TIME_GET_MICROSEC(diff);
+				latency = now - thread->throttle_trigger;
+			}
+			else
+			{
+				INSTR_TIME_SUBTRACT(diff, st->txn_begin);
+				latency = INSTR_TIME_GET_MICROSEC(diff);
+			}
+
 			st->txn_latencies += latency;
 
 			/*
@@ -1099,6 +1153,11 @@ top:
 			 * would take 256 hours.
 			 */
 			st->txn_sqlats += latency * latency;
+
+			/* record over the limit transactions if needed.
+			 */
+			if (latency_limit && latency > latency_limit)
+				thread->latency_late++;
 		}
 
 		/*
@@ -1294,7 +1353,7 @@ top:
 	}
 
 	/* Record transaction start time under logging, progress or throttling */
-	if ((logfile || progress || throttle_delay) && st->state == 0)
+	if ((logfile || progress || throttle_delay || latency_limit) && st->state == 0)
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
 	/* Record statement start time if per-command latencies are requested */
@@ -2351,7 +2410,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped, int64 latency_late)
 {
 	double		time_include,
 				tps_include,
@@ -2390,7 +2450,17 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			   normal_xacts);
 	}
 
-	if (throttle_delay || progress)
+	if (throttle_delay && latency_limit)
+		printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n",
+			   throttle_latency_skipped,
+			   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
+
+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT " (%.3f %%)\n",
+			   latency_limit / 1000.0, latency_late,
+			   100.0 * latency_late / (throttle_latency_skipped + normal_xacts));
+
+	if (throttle_delay || progress || latency_limit)
 	{
 		/* compute and show latency average and standard deviation */
 		double		latency = 0.001 * total_latencies / normal_xacts;
@@ -2408,7 +2478,6 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 	}
 
 	if (throttle_delay)
-	{
 		/*
 		 * Report average transaction lag under rate limit throttling.  This
 		 * is the delay between scheduled and actual start times for the
@@ -2417,7 +2486,6 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
-	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
 	printf("tps = %f (excluding connections establishing)\n", tps_exclude);
@@ -2505,6 +2573,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2534,6 +2603,8 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
+	int64		latency_late = 0;
 
 	int			i;
 
@@ -2578,7 +2649,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2775,6 +2846,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -3070,6 +3153,8 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
+		thread->latency_late = 0;
 
 		if (is_latencies)
 		{
@@ -3144,6 +3229,8 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
+			latency_late += r->latency_late;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3166,7 +3253,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped,
+				 latency_late);
 
 	return 0;
 }
@@ -3191,7 +3279,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3394,7 +3483,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3417,23 +3507,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3452,7 +3545,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3477,23 +3571,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3514,6 +3611,9 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
+	result->latency_late = thread->latency_late;
+
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 23bfa9e..1c0d20e 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,23 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under normal operations, transaction which last more than
+        <replaceable>limit</> milliseconds are counted and reported.
+       </para>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind
+        schedule by <replaceable>limit</> ms are simply skipped and counted
+        as such, then transactions finished later than the scheduled time plus
+        <replaceable>limit</> are counted and reported.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -403,8 +420,10 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         Show progress report every <literal>sec</> seconds.  The report
         includes the time since the beginning of the run, the tps since the
         last report, and the transaction latency average and standard
-        deviation since the last report.  Under throttling (<option>-R</>), it
-        also includes the average schedule lag time since the last report.
+        deviation since the last report.  Under throttling (<option>-R</>),
+        the latency is computed with respect to the transaction scheduled
+        start time, not the actual transaction beginning time, thus it also
+        includes the average schedule lag time.
        </para>
       </listitem>
      </varlistentry>
@@ -437,7 +456,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the average and maximum transaction
@@ -452,6 +473,17 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         output.
        </para>
        <para>
+        Also, when throttling is active, the transaction latency reported
+        at the end of the run is computed with respect to the scheduled start
+        time, not the actual transaction start time, so it includes the lag time.
+        The perspective is from the client point of view who wanted to perform
+        a transaction but could not even start it for some time.  Very high
+        latencies may be reported when the transactions are lagging behind
+        schedule.  The transaction latency with respect to the actual
+        transaction start time can be computed by substracting the transaction
+        lag time from the reported latency.
+       </para>
+       <para>
         High rate limit schedule lag values, that is lag values that are large
         compared to the actual transaction latency, indicate that something is
         amiss in the throttling process.  High schedule lag can highlight a subtle

#20

Jan Wieck

jan@wi3ck.info

over 11 years ago

In reply to: Heikki Linnakangas (#7)

Re: pgbench throttling latency limit

On 08/27/2014 04:08 AM, Heikki Linnakangas wrote:

That model might make some sense if you think e.g. of a web application,
where the web server has a timeout for how long it waits to get a
database connection from a pool, but once a query is started, the
transaction is considered a succeess no matter how long it takes. The
latency limit would be that timeout. But I think a more useful model is
that when the user clicks a button, he waits at most X seconds for the
result. If that deadline is exceeded, the web server will give a 404, or
the user will simply get bored and go away, and the transaction is
considered a failure.

Correct, the whole TPC-B model better fits an application where client
requests enter a queue at the specified TPS rate and that queue is
processed.

While we are at it,

Note that in the original TPC-B specification the transaction duration
measured is the time from receiving the client request (in current
pgbench under throttling that is for when the transaction is scheduled)
and when the request is answered. This is the client visible response
time, which has nothing to do with the database latency.

As per TPC-B, the entire test is only valid if 90% of all client
response times are within 2 seconds.

It would be useful if pgbench would

A) measure and report that client response time in the per transaction
log files and

B) report at the end what percentage of transactions finished within
a specified response time constraint (default 2 seconds).

Regards,
Jan

--
Jan Wieck
Senior Software Engineer
http://slony.info

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#21

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Jan Wieck (#20)

Re: pgbench throttling latency limit

That model might make some sense if you think e.g. of a web application,
where the web server has a timeout for how long it waits to get a
database connection from a pool, but once a query is started, the
transaction is considered a succeess no matter how long it takes. The
latency limit would be that timeout. But I think a more useful model is
that when the user clicks a button, he waits at most X seconds for the
result. If that deadline is exceeded, the web server will give a 404, or
the user will simply get bored and go away, and the transaction is
considered a failure.

Correct, the whole TPC-B model better fits an application where client
requests enter a queue at the specified TPS rate and that queue is processed.

While we are at it,

Note that in the original TPC-B specification the transaction duration
measured is the time from receiving the client request (in current pgbench
under throttling that is for when the transaction is scheduled) and when the
request is answered. This is the client visible response time, which has
nothing to do with the database latency.

Ok. This correspond to the definition used in the current patch. However
ISTM that the tpc-b bench is "as fast as possible", there is no underlying
schedule as with the throttled pgbench.

As per TPC-B, the entire test is only valid if 90% of all client response
times are within 2 seconds.

It would be useful if pgbench would

A) measure and report that client response time in the per transaction
log files and

I never used the per transaction log file. I think that it may already be
the case that it contains this information when not throttled. When under
throttling, I do not know.

B) report at the end what percentage of transactions finished within
a specified response time constraint (default 2 seconds).

What is currently reported is the complement (% of transactions completed
over the time limit).

Note that despite pg appaling latency performance, in may stay well over
the 90% limit, or even 100%: when things are going well a lot of
transaction run in about ms, while when things are going bad transactions
would take a long time (although possibly under or about 1s anyway), *but*
very few transactions are passed, the throughput is very small. The fact
that during 15 seconds only 30 transactions are processed is a detail that
does not show up in the metric.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#22

Jan Wieck

jan@wi3ck.info

over 11 years ago

In reply to: Fabien COELHO (#21)

Re: pgbench throttling latency limit

On 09/05/2014 10:12 AM, Fabien COELHO wrote:

Note that despite pg appaling latency performance, in may stay well over
the 90% limit, or even 100%: when things are going well a lot of
transaction run in about ms, while when things are going bad transactions
would take a long time (although possibly under or about 1s anyway), *but*
very few transactions are passed, the throughput is very small. The fact
that during 15 seconds only 30 transactions are processed is a detail that
does not show up in the metric.

I haven't used the real pgbench for a long time. I will have to look at
your patch and see what the current version actually does or does not.

What I have been using is a Python version of pgbench that I wrote for
myself when I started learning that language. That one does record both
values, the DB transaction latency and the client response time (time
from the request being entered into the Queue until transaction commit).
When I look at those results it is possible to have an utterly failing
run, with <60% of client response times being within 2 seconds, but all
the DB transactions are still in milliseconds.

As said, I'll have to take a look at it. Since I am on vacation next
week, getting ready for my first day at EnterpriseDB, this may actually
happen.

Jan

--
Jan Wieck
Senior Software Engineer
http://slony.info

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#23

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Jan Wieck (#22)

Re: pgbench throttling latency limit

On 09/05/2014 06:38 PM, Jan Wieck wrote:

On 09/05/2014 10:12 AM, Fabien COELHO wrote:

Note that despite pg appaling latency performance, in may stay well over
the 90% limit, or even 100%: when things are going well a lot of
transaction run in about ms, while when things are going bad transactions
would take a long time (although possibly under or about 1s anyway), *but*
very few transactions are passed, the throughput is very small. The fact
that during 15 seconds only 30 transactions are processed is a detail that
does not show up in the metric.

Yeah, it makes much more sense to measure the latency from the
"scheduled" time than the actual time.

I haven't used the real pgbench for a long time. I will have to look at
your patch and see what the current version actually does or does not.

What I have been using is a Python version of pgbench that I wrote for
myself when I started learning that language. That one does record both
values, the DB transaction latency and the client response time (time
from the request being entered into the Queue until transaction commit).
When I look at those results it is possible to have an utterly failing
run, with <60% of client response times being within 2 seconds, but all
the DB transactions are still in milliseconds.

I think we have to reconsider what we're reporting in 9.4, when --rate
is enabled, even though it's already very late in the release cycle.
It's a bad idea to change the definition of latency between 9.4 and 9.5,
so let's get it right in 9.4.

As said, I'll have to take a look at it. Since I am on vacation next
week, getting ready for my first day at EnterpriseDB, this may actually
happen.

Oh, congrats! :-)

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#24

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Heikki Linnakangas (#23)

1 attachment(s)

Re: pgbench throttling latency limit

On 09/09/2014 01:49 PM, Heikki Linnakangas wrote:

I think we have to reconsider what we're reporting in 9.4, when --rate
is enabled, even though it's already very late in the release cycle.
It's a bad idea to change the definition of latency between 9.4 and 9.5,
so let's get it right in 9.4.

As per the attached patch. I think we should commit this to 9.4. Any
objections?

The text this patch adds to the documentation needs some rewording,
though. As does this existing paragraph:

High rate limit schedule lag values, that is lag values that are
large compared to the actual transaction latency, indicate that
something is amiss in the throttling process. High schedule lag can
highlight a subtle problem there even if the target rate limit is met
in the end. One possible cause of schedule lag is insufficient
pgbench threads to handle all of the clients. To improve that,
consider reducing the number of clients, increasing the number of
threads in pgbench, or running pgbench on a separate host. Another
possibility is that the database is not keeping up with the load at
some point. When that happens, you will have to reduce the expected
transaction rate to lower schedule lag.

It took me ages to parse "high rate limit schedule lag values".

- Heikki

Attachments:

pgbench-calculate-latency-from-scheduled-time.patchtext/x-diff; name=pgbench-calculate-latency-from-scheduled-time.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..bf9c4ea 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -1087,8 +1087,22 @@ top:
 			int64		latency;
 
 			INSTR_TIME_SET_CURRENT(diff);
-			INSTR_TIME_SUBTRACT(diff, st->txn_begin);
-			latency = INSTR_TIME_GET_MICROSEC(diff);
+
+			if (throttle_delay)
+			{
+				/*
+				 * Under throttling, compute latency from the scheduled start
+				 * time, not the actual transaction start.
+				 */
+				int64 now = INSTR_TIME_GET_MICROSEC(diff);
+				latency = now - thread->throttle_trigger;
+			}
+			else
+			{
+				INSTR_TIME_SUBTRACT(diff, st->txn_begin);
+				latency = INSTR_TIME_GET_MICROSEC(diff);
+			}
+
 			st->txn_latencies += latency;
 
 			/*
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index b479105..75fd8ec 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -403,8 +403,10 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         Show progress report every <literal>sec</> seconds.  The report
         includes the time since the beginning of the run, the tps since the
         last report, and the transaction latency average and standard
-        deviation since the last report.  Under throttling (<option>-R</>), it
-        also includes the average schedule lag time since the last report.
+        deviation since the last report.  Under throttling (<option>-R</>),
+        the latency is computed with respect to the transaction scheduled
+        start time, not the actual transaction beginning time, thus it also
+        includes the average schedule lag time.
        </para>
       </listitem>
      </varlistentry>
@@ -452,6 +454,17 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         output.
        </para>
        <para>
+        Also, when throttling is active, the transaction latency reported
+        at the end of the run is computed from the scheduled start time, not
+        the actual transaction start time, so it includes the lag time.
+        The perspective is from the client point of view who wanted to perform
+        a transaction but could not even start it for some time.  Very high
+        latencies may be reported when the transactions are lagging behind
+        schedule.  The transaction latency with respect to the actual
+        transaction start time can be computed by substracting the transaction
+        lag time from the reported latency.
+       </para>
+       <para>
         High rate limit schedule lag values, that is lag values that are large
         compared to the actual transaction latency, indicate that something is
         amiss in the throttling process.  High schedule lag can highlight a subtle

#25

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#24)

Re: pgbench throttling latency limit

Hello Heikki,

I think we have to reconsider what we're reporting in 9.4, when --rate
is enabled, even though it's already very late in the release cycle.
It's a bad idea to change the definition of latency between 9.4 and 9.5,
so let's get it right in 9.4.

Indeed.

As per the attached patch. I think we should commit this to 9.4. Any
objections?

Ok for me. Some more propositions about the doc below.

The text this patch adds to the documentation needs some rewording, though.

Probably. Not sure how to improve.

As does this existing paragraph:

High rate limit schedule lag values, that is lag values that are
large compared to the actual transaction latency, indicate that
something is amiss in the throttling process. High schedule lag can
highlight a subtle problem there even if the target rate limit is met
in the end.

One possible cause of schedule lag is insufficient
pgbench threads to handle all of the clients. To improve that,
consider reducing the number of clients, increasing the number of
threads in pgbench, or running pgbench on a separate host. Another
possibility is that the database is not keeping up with the load at
some point. When that happens, you will have to reduce the expected
transaction rate to lower schedule lag.

It took me ages to parse "high rate limit schedule lag values".

Indeed, I'm not proud of that one... Moreover the first sentence becomes
false with the new latency computation, as the lag time is included.

I would suggest:

"When under throttling, the reported lag time measures the delay between
the scheduled start time for the transaction and its actual start time. A
high value, where the lag time represent most of the transaction latency,
may indicate that something is amiss in the throttling process itself,
even if the target rate is met in the end. One possible cause ..."

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#26

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#25)

1 attachment(s)

Re: pgbench throttling latency limit

On 09/09/2014 03:35 PM, Fabien COELHO wrote:

Hello Heikki,

I think we have to reconsider what we're reporting in 9.4, when --rate
is enabled, even though it's already very late in the release cycle.
It's a bad idea to change the definition of latency between 9.4 and 9.5,
so let's get it right in 9.4.

Indeed.

As per the attached patch. I think we should commit this to 9.4. Any
objections?

Ok for me. Some more propositions about the doc below.

I looked closer at the this, and per Jan's comments, realized that we
don't log the lag time in the per-transaction log file. I think that's a
serious omission; when --rate is used, the schedule lag time is
important information to make sense of the result. I think we have to
apply the attached patch, and backpatch to 9.4. (The documentation on
the log file format needs updating)

Also, this is bizarre:

/*
* Use inverse transform sampling to randomly generate a delay, such
* that the series of delays will approximate a Poisson distribution
* centered on the throttle_delay time.
*
* 10000 implies a 9.2 (-log(1/10000)) to 0.0 (log 1) delay
* multiplier, and results in a 0.055 % target underestimation bias:
*
* SELECT 1.0/AVG(-LN(i/10000.0)) FROM generate_series(1,10000) AS i;
* = 1.000552717032611116335474
*
* If transactions are too slow or a given wait is shorter than a
* transaction, the next transaction will start right away.
*/
int64 wait = (int64) (throttle_delay *
1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));

We're using getrand() to generate a uniformly distributed random value
between 1 and 10000, and then convert it to a double between 0.0 and
1.0. But getrand() is implemented by taking a double between 0.0 and 1.0
and converting it an integer, so we're just truncating the original
floating point number unnecessarily. I think we should add a new
function, getPoissonRand(), that uses pg_erand48() directly. We already
have similiar getGaussianRand() and getExponentialRand() functions.
Barring objections, I'll prepare another patch to do that, and backpatch
to 9.4.
- Heikki

Attachments:

pgbench-latency-calculations-1.patchtext/x-diff; name=pgbench-latency-calculations-1.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..f7a3a5f 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -210,10 +210,10 @@ typedef struct
 								 * sent */
 	int			sleeping;		/* 1 indicates that the client is napping */
 	bool		throttling;		/* whether nap is for throttling */
-	int64		until;			/* napping until (usec) */
 	Variable   *variables;		/* array of variable definitions */
 	int			nvariables;
-	instr_time	txn_begin;		/* used for measuring transaction latencies */
+	int64		txn_scheduled;	/* scheduled start time of transaction (usec) */
+	instr_time	txn_begin;		/* used for measuring schedule lag times */
 	instr_time	stmt_begin;		/* used for measuring statement latencies */
 	int64		txn_latencies;	/* cumulated latencies */
 	int64		txn_sqlats;		/* cumulated square latencies */
@@ -284,12 +284,17 @@ typedef struct
 
 	long		start_time;		/* when does the interval start */
 	int			cnt;			/* number of transactions */
-	double		min_duration;	/* min/max durations */
-	double		max_duration;
-	double		sum;			/* sum(duration), sum(duration^2) - for
+
+	double		min_latency;	/* min/max latencies */
+	double		max_latency;
+	double		sum_latency;	/* sum(latency), sum(latency^2) - for
 								 * estimates */
-	double		sum2;
+	double		sum2_latency;
 
+	double		min_lag;
+	double		max_lag;
+	double		sum_lag;		/* sum(lag) */
+	double		sum2_lag;		/* sum(lag*lag) */
 } AggVals;
 
 static Command **sql_files[MAX_FILES];	/* SQL script files */
@@ -968,12 +973,18 @@ agg_vals_init(AggVals *aggs, instr_time start)
 {
 	/* basic counters */
 	aggs->cnt = 0;				/* number of transactions */
-	aggs->sum = 0;				/* SUM(duration) */
-	aggs->sum2 = 0;				/* SUM(duration*duration) */
+	aggs->sum_latency = 0;		/* SUM(latency) */
+	aggs->sum2_latency = 0;				/* SUM(latency*latency) */
 
 	/* min and max transaction duration */
-	aggs->min_duration = 0;
-	aggs->max_duration = 0;
+	aggs->min_latency = 0;
+	aggs->max_latency = 0;
+
+	/* schedule lag counters */
+	aggs->sum_lag = 0;
+	aggs->sum2_lag = 0;
+	aggs->min_lag = 0;
+	aggs->max_lag = 0;
 
 	/* start of the current interval */
 	aggs->start_time = INSTR_TIME_GET_DOUBLE(start);
@@ -1016,7 +1027,7 @@ top:
 
 		thread->throttle_trigger += wait;
 
-		st->until = thread->throttle_trigger;
+		st->txn_scheduled = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
 		st->is_throttled = true;
@@ -1032,13 +1043,13 @@ top:
 
 		INSTR_TIME_SET_CURRENT(now);
 		now_us = INSTR_TIME_GET_MICROSEC(now);
-		if (st->until <= now_us)
+		if (st->txn_scheduled <= now_us)
 		{
 			st->sleeping = 0;	/* Done sleeping, go ahead with next command */
 			if (st->throttling)
 			{
 				/* Measure lag of throttled transaction relative to target */
-				int64		lag = now_us - st->until;
+				int64		lag = now_us - st->txn_scheduled;
 
 				thread->throttle_lag += lag;
 				if (lag > thread->throttle_lag_max)
@@ -1052,6 +1063,11 @@ top:
 
 	if (st->listen)
 	{							/* are we receiver? */
+		instr_time	now;
+		bool		now_valid = false;
+
+		INSTR_TIME_SET_ZERO(now); /* initialize to keep compiler quiet */
+
 		if (commands[st->state]->type == SQL_COMMAND)
 		{
 			if (debug)
@@ -1071,10 +1087,13 @@ top:
 		 */
 		if (is_latencies)
 		{
-			instr_time	now;
 			int			cnum = commands[st->state]->command_num;
 
-			INSTR_TIME_SET_CURRENT(now);
+			if (!now_valid)
+			{
+				INSTR_TIME_SET_CURRENT(now);
+				now_valid = true;
+			}
 			INSTR_TIME_ACCUM_DIFF(thread->exec_elapsed[cnum],
 								  now, st->stmt_begin);
 			thread->exec_count[cnum]++;
@@ -1083,12 +1102,16 @@ top:
 		/* transaction finished: record latency under progress or throttling */
 		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
 		{
-			instr_time	diff;
 			int64		latency;
 
-			INSTR_TIME_SET_CURRENT(diff);
-			INSTR_TIME_SUBTRACT(diff, st->txn_begin);
-			latency = INSTR_TIME_GET_MICROSEC(diff);
+			if (!now_valid)
+			{
+				INSTR_TIME_SET_CURRENT(now);
+				now_valid = true;
+			}
+
+			latency = INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled;
+
 			st->txn_latencies += latency;
 
 			/*
@@ -1106,9 +1129,8 @@ top:
 		 */
 		if (logfile && commands[st->state + 1] == NULL)
 		{
-			instr_time	now;
-			instr_time	diff;
-			double		usec;
+			double		lag;
+			double		latency;
 
 			/*
 			 * write the log entry if this row belongs to the random sample,
@@ -1117,10 +1139,13 @@ top:
 			if (sample_rate == 0.0 ||
 				pg_erand48(thread->random_state) <= sample_rate)
 			{
-				INSTR_TIME_SET_CURRENT(now);
-				diff = now;
-				INSTR_TIME_SUBTRACT(diff, st->txn_begin);
-				usec = (double) INSTR_TIME_GET_MICROSEC(diff);
+				if (!now_valid)
+				{
+					INSTR_TIME_SET_CURRENT(now);
+					now_valid = true;
+				}
+				latency = (double) (INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled);
+				lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
 
 				/* should we aggregate the results or not? */
 				if (agg_interval > 0)
@@ -1132,15 +1157,27 @@ top:
 					if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(now))
 					{
 						agg->cnt += 1;
-						agg->sum += usec;
-						agg->sum2 += usec * usec;
+						agg->sum_latency += latency;
+						agg->sum2_latency += latency * latency;
 
 						/* first in this aggregation interval */
-						if ((agg->cnt == 1) || (usec < agg->min_duration))
-							agg->min_duration = usec;
+						if ((agg->cnt == 1) || (latency < agg->min_latency))
+							agg->min_latency = latency;
+
+						if ((agg->cnt == 1) || (latency > agg->max_latency))
+							agg->max_latency = latency;
+
+						/* and the same for schedule lag */
+						if (throttle_delay)
+						{
+							agg->sum_lag += lag;
+							agg->sum2_lag += lag * lag;
 
-						if ((agg->cnt == 1) || (usec > agg->max_duration))
-							agg->max_duration = usec;
+							if ((agg->cnt == 1) || (lag < agg->min_lag))
+								agg->min_lag = lag;
+							if ((agg->cnt == 1) || (lag > agg->max_lag))
+								agg->max_lag = lag;
+						}
 					}
 					else
 					{
@@ -1156,23 +1193,32 @@ top:
 							 * ifdef in usage), so we don't need to handle
 							 * this in a special way (see below).
 							 */
-							fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f\n",
+							fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f %.0f %.0f %.0f %.0f\n",
 									agg->start_time,
 									agg->cnt,
-									agg->sum,
-									agg->sum2,
-									agg->min_duration,
-									agg->max_duration);
+									agg->sum_latency,
+									agg->sum2_latency,
+									agg->min_latency,
+									agg->max_latency,
+									agg->sum_lag,
+									agg->sum2_lag,
+									agg->min_lag,
+									agg->max_lag
+								);
 
 							/* move to the next inteval */
 							agg->start_time = agg->start_time + agg_interval;
 
 							/* reset for "no transaction" intervals */
 							agg->cnt = 0;
-							agg->min_duration = 0;
-							agg->max_duration = 0;
-							agg->sum = 0;
-							agg->sum2 = 0;
+							agg->min_latency = 0;
+							agg->max_latency = 0;
+							agg->sum_latency = 0;
+							agg->sum2_latency = 0;
+							agg->min_lag = 0;
+							agg->max_lag = 0;
+							agg->sum2_lag = 0;
+							agg->sum2_lag = 0;
 						}
 
 						/*
@@ -1180,10 +1226,14 @@ top:
 						 * current)
 						 */
 						agg->cnt = 1;
-						agg->min_duration = usec;
-						agg->max_duration = usec;
-						agg->sum = usec;
-						agg->sum2 = usec * usec;
+						agg->min_latency = latency;
+						agg->max_latency = latency;
+						agg->sum_latency = latency;
+						agg->sum2_latency = latency * latency;
+						agg->min_lag = lag;
+						agg->max_lag = lag;
+						agg->sum_lag = lag;
+						agg->sum2_lag = lag * lag;
 					}
 				}
 				else
@@ -1195,17 +1245,17 @@ top:
 					 * This is more than we really ought to know about
 					 * instr_time
 					 */
-					fprintf(logfile, "%d %d %.0f %d %ld %ld\n",
-							st->id, st->cnt, usec, st->use_file,
-							(long) now.tv_sec, (long) now.tv_usec);
+					fprintf(logfile, "%d %d %.0f %d %ld %ld %.0f\n",
+							st->id, st->cnt, latency, st->use_file,
+							(long) now.tv_sec, (long) now.tv_usec, lag);
 #else
 
 					/*
 					 * On Windows, instr_time doesn't provide a timestamp
 					 * anyway
 					 */
-					fprintf(logfile, "%d %d %.0f %d 0 0\n",
-							st->id, st->cnt, usec, st->use_file);
+					fprintf(logfile, "%d %d %.0f %d 0 0 %.0f\n",
+							st->id, st->cnt, usec, st->use_file, lag);
 #endif
 				}
 			}
@@ -1295,8 +1345,17 @@ top:
 
 	/* Record transaction start time under logging, progress or throttling */
 	if ((logfile || progress || throttle_delay) && st->state == 0)
+	{
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
+		/*
+		 * When not throttling, this is also the transaction's scheduled start
+		 * time.
+		 */
+		if (!throttle_delay)
+			st->txn_scheduled = INSTR_TIME_GET_MICROSEC(st->txn_begin);
+	}
+
 	/* Record statement start time if per-command latencies are requested */
 	if (is_latencies)
 		INSTR_TIME_SET_CURRENT(st->stmt_begin);
@@ -1620,7 +1679,7 @@ top:
 				usec *= 1000000;
 
 			INSTR_TIME_SET_CURRENT(now);
-			st->until = INSTR_TIME_GET_MICROSEC(now) + usec;
+			st->txn_scheduled = INSTR_TIME_GET_MICROSEC(now) + usec;
 			st->sleeping = 1;
 
 			st->listen = 1;
@@ -3309,7 +3368,7 @@ threadRun(void *arg)
 						now_usec = INSTR_TIME_GET_MICROSEC(now);
 					}
 
-					this_usec = st->until - now_usec;
+					this_usec = st->txn_scheduled - now_usec;
 					if (min_usec > this_usec)
 						min_usec = this_usec;
 				}
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index b479105..f293954 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -403,8 +403,10 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         Show progress report every <literal>sec</> seconds.  The report
         includes the time since the beginning of the run, the tps since the
         last report, and the transaction latency average and standard
-        deviation since the last report.  Under throttling (<option>-R</>), it
-        also includes the average schedule lag time since the last report.
+        deviation since the last report.  Under throttling (<option>-R</>),
+        the latency is computed with respect to the transaction scheduled
+        start time, not the actual transaction beginning time, thus it also
+        includes the average schedule lag time.
        </para>
       </listitem>
      </varlistentry>
@@ -440,29 +442,25 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         possible for later ones to catch up again.
        </para>
        <para>
-        When throttling is active, the average and maximum transaction
-        schedule lag time are reported in ms.  This is the delay between
-        the original scheduled transaction time and the actual transaction
-        start times.  The schedule lag shows whether a transaction was
-        started on time or late.  Once a client starts running behind its
-        schedule, every following transaction can continue to be penalized
-        for schedule lag.  If faster transactions are able to catch up, it's
-        possible for them to get back on schedule again.  The lag measurement
-        of every transaction is shown when pgbench is run with debugging
-        output.
+        When throttling is active, the transaction latency reported at the
+        end of the run is calculated from the scheduled start times, so it
+        includes the time each transaction had to wait for the previous
+        transaction to finish. The wait time is called the schedule lag time,
+        and its average and maximum are also reported separately. The
+        transaction latency with respect to the actual transaction start time,
+        i.e. the time spent executing the transaction in the database, can be
+        computed by subtracting the reported schedule lag time from the
+        latency.
        </para>
+
        <para>
-        High rate limit schedule lag values, that is lag values that are large
-        compared to the actual transaction latency, indicate that something is
-        amiss in the throttling process.  High schedule lag can highlight a subtle
-        problem there even if the target rate limit is met in the end.  One
-        possible cause of schedule lag is insufficient pgbench threads to
-        handle all of the clients.  To improve that, consider reducing the
-        number of clients, increasing the number of threads in pgbench, or
-        running pgbench on a separate host.  Another possibility is that the
-        database is not keeping up with the load at some point.  When that
-        happens, you will have to reduce the expected transaction rate to
-        lower schedule lag.
+        A high schedule lag time is an indication that the system cannot
+        process transactions at the specified rate, with the chosen number of
+        clients and threads. When the average transaction execution time is
+        longer than the scheduled interval between each transaction, each
+        successive transaction will fall further behind, and the schedule lag
+        time will keep increasing the longer the test run is. When that
+        happens, you will have to reduce the specified transaction rate.
        </para>
       </listitem>
      </varlistentry>

#27

Mitsumasa KONDO

kondo.mitsumasa@gmail.com

over 11 years ago

In reply to: Heikki Linnakangas (#26)

Re: pgbench throttling latency limit

Hi,

I find typo in your patch. Please confirm.

@line 239
- agg->sum2_lag = 0;
+  agg->sum_lag = 0;

And back patch is welcome for me.

Best Regards,
--
Mitsumasa KONDO

#28

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#26)

Re: pgbench throttling latency limit

Hello Heikki,

I looked closer at the this, and per Jan's comments, realized that we don't
log the lag time in the per-transaction log file. I think that's a serious
omission; when --rate is used, the schedule lag time is important information
to make sense of the result. I think we have to apply the attached patch, and
backpatch to 9.4. (The documentation on the log file format needs updating)

Indeed. I think that people do not like it to change. I remember that I
suggested to change timestamps to "xxxx.yyyyyy" instead of the unreadable
"xxxx yyy", and be told not to, because some people have tool which
process the output so the format MUST NOT CHANGE. So my behavior is not to
avoid touching anything in this area.

I'm fine if you do it, though:-) However I have not time to have a precise
look at your patch to cross-check it before Friday.

Also, this is bizarre:

int64 wait = (int64) (throttle_delay *
1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));

We're using getrand() to generate a uniformly distributed random value
between 1 and 10000, and then convert it to a double between 0.0 and 1.0.

The reason for this conversion is to have randomness but to still avoid
going to extreme multiplier values. The idea is to avoid a very large
multiplier which would generate (even if it is not often) a 0 tps when 100
tps is required. The 10000 granularity is basically random but the
multiplier stays bounded (max 9.2, so the minimum possible tps would be
around 11 for a target of 100 tps, bar issues from the database for
processing the transactions).

So although this feature can be discussed and amended, it is fully
voluntary. I think that it make sense so I would prefer to keep it as is.
Maybe the comments could be update to be clearer.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#29

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#28)

Re: pgbench throttling latency limit

On 09/10/2014 05:57 PM, Fabien COELHO wrote:

Hello Heikki,

I looked closer at the this, and per Jan's comments, realized that we don't
log the lag time in the per-transaction log file. I think that's a serious
omission; when --rate is used, the schedule lag time is important information
to make sense of the result. I think we have to apply the attached patch, and
backpatch to 9.4. (The documentation on the log file format needs updating)

Indeed. I think that people do not like it to change. I remember that I
suggested to change timestamps to "xxxx.yyyyyy" instead of the unreadable
"xxxx yyy", and be told not to, because some people have tool which
process the output so the format MUST NOT CHANGE. So my behavior is not to
avoid touching anything in this area.

I'm fine if you do it, though:-) However I have not time to have a precise
look at your patch to cross-check it before Friday.

This is different from changing "xxx yyy" to "xxx.yyy" in two ways.
First, this adds new information to the log that just isn't there
otherwise, it's not just changing the format for the sake of it. Second,
in this case it's easy to write a parser for the log format so that it
works with the old and new formats. Many such programs would probably
ignore any unexpected extra fields, as a matter of lazy programming,
while changing the separator from space to a dot would surely require
changing every parsing program.

We could leave out the lag fields, though, when --rate is not used.
Without --rate, the lag is always zero anyway. That would keep the file
format unchanged, when you don't use the new --rate feature. I'm not
sure if that would be better or worse for programs that might want to
parse the information.

Also, this is bizarre:

int64 wait = (int64) (throttle_delay *
1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));

We're using getrand() to generate a uniformly distributed random value
between 1 and 10000, and then convert it to a double between 0.0 and 1.0.

The reason for this conversion is to have randomness but to still avoid
going to extreme multiplier values. The idea is to avoid a very large
multiplier which would generate (even if it is not often) a 0 tps when 100
tps is required. The 10000 granularity is basically random but the
multiplier stays bounded (max 9.2, so the minimum possible tps would be
around 11 for a target of 100 tps, bar issues from the database for
processing the transactions).

So although this feature can be discussed and amended, it is fully
voluntary. I think that it make sense so I would prefer to keep it as is.
Maybe the comments could be update to be clearer.

Ok, yeah, the comments indeed didn't mention anything about that. I
don't think such clamping is necessary. With that 9.2x clamp on the
multiplier, the probability that any given transaction hits it is about
1/10000. And a delay 9.2 times the average is still quite reasonable,
IMHO. The maximum delay on my laptop, when pg_erand48() returns DBL_MIN,
seems to be about 700 x the average, which is still reasonable if you
run a decent number of transactions. And of course, the probability of
hitting such an extreme value is miniscule, so if you're just doing a
few quick test runs with a small total number of transactions, you won't
hit that.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#30

Jan Wieck

jan@wi3ck.info

over 11 years ago

In reply to: Heikki Linnakangas (#29)

Re: pgbench throttling latency limit

On 09/10/2014 11:28 AM, Heikki Linnakangas wrote:

On 09/10/2014 05:57 PM, Fabien COELHO wrote:

Hello Heikki,

I looked closer at the this, and per Jan's comments, realized that we don't
log the lag time in the per-transaction log file. I think that's a serious
omission; when --rate is used, the schedule lag time is important information
to make sense of the result. I think we have to apply the attached patch, and
backpatch to 9.4. (The documentation on the log file format needs updating)

Indeed. I think that people do not like it to change. I remember that I
suggested to change timestamps to "xxxx.yyyyyy" instead of the unreadable
"xxxx yyy", and be told not to, because some people have tool which
process the output so the format MUST NOT CHANGE. So my behavior is not to
avoid touching anything in this area.

I'm fine if you do it, though:-) However I have not time to have a precise
look at your patch to cross-check it before Friday.

This is different from changing "xxx yyy" to "xxx.yyy" in two ways.
First, this adds new information to the log that just isn't there
otherwise, it's not just changing the format for the sake of it. Second,
in this case it's easy to write a parser for the log format so that it
works with the old and new formats. Many such programs would probably
ignore any unexpected extra fields, as a matter of lazy programming,
while changing the separator from space to a dot would surely require
changing every parsing program.

We could leave out the lag fields, though, when --rate is not used.
Without --rate, the lag is always zero anyway. That would keep the file
format unchanged, when you don't use the new --rate feature. I'm not
sure if that would be better or worse for programs that might want to
parse the information.

We could also leave the default output format as is and introduce
another option with a % style format string.

Jan

Also, this is bizarre:

int64 wait = (int64) (throttle_delay *
1.00055271703 * -log(getrand(thread, 1, 10000) / 10000.0));

We're using getrand() to generate a uniformly distributed random value
between 1 and 10000, and then convert it to a double between 0.0 and 1.0.

The reason for this conversion is to have randomness but to still avoid
going to extreme multiplier values. The idea is to avoid a very large
multiplier which would generate (even if it is not often) a 0 tps when 100
tps is required. The 10000 granularity is basically random but the
multiplier stays bounded (max 9.2, so the minimum possible tps would be
around 11 for a target of 100 tps, bar issues from the database for
processing the transactions).

So although this feature can be discussed and amended, it is fully
voluntary. I think that it make sense so I would prefer to keep it as is.
Maybe the comments could be update to be clearer.

Ok, yeah, the comments indeed didn't mention anything about that. I
don't think such clamping is necessary. With that 9.2x clamp on the
multiplier, the probability that any given transaction hits it is about
1/10000. And a delay 9.2 times the average is still quite reasonable,
IMHO. The maximum delay on my laptop, when pg_erand48() returns DBL_MIN,
seems to be about 700 x the average, which is still reasonable if you
run a decent number of transactions. And of course, the probability of
hitting such an extreme value is miniscule, so if you're just doing a
few quick test runs with a small total number of transactions, you won't
hit that.

- Heikki

--
Jan Wieck
Senior Software Engineer
http://slony.info

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#31

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Mitsumasa KONDO (#27)

Re: pgbench throttling latency limit

On 09/10/2014 05:47 PM, Mitsumasa KONDO wrote:

Hi,

I find typo in your patch. Please confirm.
@line 239
- agg->sum2_lag = 0;
+  agg->sum_lag = 0;

Ah thanks, cood catch!

And back patch is welcome for me.

I've committed and backpatched this, as well as a patch to refactor the
way the Poisson delay is computed.

I kept the log file format unchanged when --rate is not used, so it now
has a different number of fields depending on whether --rate is used or not.

Please review the changes I made one more time, to double-check that I
mess up something.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#32

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#19)

Re: pgbench throttling latency limit

On 08/30/2014 07:16 PM, Fabien COELHO wrote:

+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT "\n",
+			   latency_limit / 1000.0, latency_late);
+
Any reason not to report a percentage here?
Yes: I did not thought of it.

Here is a v7, with a percent. I also added a paragraph in the documenation
about how the latency is computed under throttling, and I tried to reorder
the reported stuff so that it is more logical.

Now that I've finished the detour and committed and backpatched the
changes to the way latency is calculated, we can get back to this patch.
It needs to be rebased.

How should skipped transactions should be taken into account in the log
file output, with and without aggregation? I assume we'll want to have
some trace of skipped transactions in the logs.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#33

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#32)

Re: pgbench throttling latency limit

Hello Heikki,

Now that I've finished the detour and committed and backpatched the changes
to the way latency is calculated, we can get back to this patch. It needs to
be rebased.

Before rebasing, I think that there are a few small problems with the
modification applied to switch from an integer range to double.

(1) ISTM that the + 0.5 which remains in the PoissonRand computation comes
from the previous integer approach and is not needed here. If I'm not
mistaken the formula should be plain:

-log(uniform) * center

(2) I'm not sure of the name "center", I think that "lambda" or
"mean" would be more appropriate.

(3) I wish that the maximum implied multiplier could be explicitely
documented in the source code. From pg_rand48 source code, I think
that it is 33.27106466687737

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#34

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Fabien COELHO (#33)

1 attachment(s)

Re: pgbench throttling latency limit

(3) I wish that the maximum implied multiplier could be explicitely
documented in the source code. From pg_rand48 source code, I think
that it is 33.27106466687737

Small possibly buggy code attached, to show how I computed the above
figure.

--
Fabien.

#35

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#32)

1 attachment(s)

Re: pgbench throttling latency limit

Hello Heikki

Now that I've finished the detour and committed and backpatched the changes
to the way latency is calculated, we can get back to this patch. It needs to
be rebased.

Here is the rebase, which seems ok.

See also the small issues raised aboud getPoissonRand in another email.

--
Fabien.

Attachments:

pgbench-limit-8.patchtext/x-diff; name=pgbench-limit-8.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 4001a98..d36fa2c 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,18 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * Transactions which take longer that this limit are counted as late
+ * and reported as such, although they are completed anyway.
+ *
+ * When under throttling: execution time slots which are more than
+ * this late (in us) are simply skipped, and the corresponding transaction
+ * is counted as such... it is not even started;
+ * otherwise above the limit transactions are counted as such, with the latency
+ * measured wrt the transaction schedule, not its actual start.
+ */
+int64		latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +250,8 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
+	int64		latency_late; /* late transactions */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +264,8 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
+	int64		latency_late;
 } TResult;
 
 /*
@@ -372,6 +388,10 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          count transactions lasting more than NUM ms.\n"
+		   "                           under throttling (--rate), transactions behind schedule\n"
+		   "                           more than NUM ms are skipped, and those finishing more\n"
+		   "                           than NUM ms after their scheduled start are counted.\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -1038,6 +1058,23 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = getPoissonRand(thread, throttle_delay);
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+			}
+		}
+
 		st->txn_scheduled = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -1110,8 +1147,11 @@ top:
 			thread->exec_count[cnum]++;
 		}
 
-		/* transaction finished: record latency under progress or throttling */
-		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		/* transaction finished: record latency under progress or throttling,
+		 * or if latency limit is set
+		 */
+		if ((progress || throttle_delay || latency_limit) &&
+			commands[st->state + 1] == NULL)
 		{
 			int64		latency;
 
@@ -1133,6 +1173,11 @@ top:
 			 * would take 256 hours.
 			 */
 			st->txn_sqlats += latency * latency;
+
+			/* record over the limit transactions if needed.
+			 */
+			if (latency_limit && latency > latency_limit)
+				thread->latency_late++;
 		}
 
 		/*
@@ -1360,7 +1405,7 @@ top:
 	}
 
 	/* Record transaction start time under logging, progress or throttling */
-	if ((logfile || progress || throttle_delay) && st->state == 0)
+	if ((logfile || progress || throttle_delay || latency_limit) && st->state == 0)
 	{
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
@@ -2426,7 +2471,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped, int64 latency_late)
 {
 	double		time_include,
 				tps_include,
@@ -2465,7 +2511,17 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			   normal_xacts);
 	}
 
-	if (throttle_delay || progress)
+	if (throttle_delay && latency_limit)
+		printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n",
+			   throttle_latency_skipped,
+			   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
+
+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT " (%.3f %%)\n",
+			   latency_limit / 1000.0, latency_late,
+			   100.0 * latency_late / (throttle_latency_skipped + normal_xacts));
+
+	if (throttle_delay || progress || latency_limit)
 	{
 		/* compute and show latency average and standard deviation */
 		double		latency = 0.001 * total_latencies / normal_xacts;
@@ -2483,7 +2539,6 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 	}
 
 	if (throttle_delay)
-	{
 		/*
 		 * Report average transaction lag under rate limit throttling.  This
 		 * is the delay between scheduled and actual start times for the
@@ -2492,7 +2547,6 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
-	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
 	printf("tps = %f (excluding connections establishing)\n", tps_exclude);
@@ -2580,6 +2634,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2609,6 +2664,8 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
+	int64		latency_late = 0;
 
 	int			i;
 
@@ -2653,7 +2710,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2850,6 +2907,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -3145,6 +3214,8 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
+		thread->latency_late = 0;
 
 		if (is_latencies)
 		{
@@ -3219,6 +3290,8 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
+			latency_late += r->latency_late;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3241,7 +3314,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped,
+				 latency_late);
 
 	return 0;
 }
@@ -3266,7 +3340,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3469,7 +3544,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3492,23 +3568,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3527,7 +3606,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3552,23 +3632,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3589,6 +3672,9 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
+	result->latency_late = thread->latency_late;
+
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index c4e0cbd..026b02b 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,23 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under normal operations, transaction which last more than
+        <replaceable>limit</> milliseconds are counted and reported.
+       </para>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind
+        schedule by <replaceable>limit</> ms are simply skipped and counted
+        as such, then transactions finished later than the scheduled time plus
+        <replaceable>limit</> are counted and reported.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -439,7 +456,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the transaction latency reported at the

#36

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#32)

Re: pgbench throttling latency limit

How should skipped transactions should be taken into account in the log file
output, with and without aggregation? I assume we'll want to have some trace
of skipped transactions in the logs.

The problem with this point is that how to report something "not done" is
unclear, especially as the logic of the log is one line per performed
transaction.

Obviously we can log something, but as the transaction are not performed
the format would be different, which break the expectation of a simple and
homogeneous log file format that people like to process directly.

So bar any great idea, I would suggest not to log skipped transactions and
to wait for someone to need to have access to this detailed information
and for whom the final report is not enough.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#37

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#33)

Re: pgbench throttling latency limit

On 09/11/2014 03:36 PM, Fabien COELHO wrote:

Hello Heikki,

Now that I've finished the detour and committed and backpatched the changes
to the way latency is calculated, we can get back to this patch. It needs to
be rebased.

Before rebasing, I think that there are a few small problems with the
modification applied to switch from an integer range to double.

(1) ISTM that the + 0.5 which remains in the PoissonRand computation comes
from the previous integer approach and is not needed here. If I'm not
mistaken the formula should be plain:

-log(uniform) * center

No. The +0.5 is to round the result to the nearest integer, instead of
truncating it down.

(2) I'm not sure of the name "center", I think that "lambda" or
"mean" would be more appropriate.

(shrug), I guess. The comment says that it's the value the average of a
series values is centered on, so "center" doesn't seem too bad. I guess
the mathematically accurate term would be "expected value".

(3) I wish that the maximum implied multiplier could be explicitely
documented in the source code. From pg_rand48 source code, I think
that it is 33.27106466687737

Oh, ok. That's an even smaller multiplier than I got just by feeding
DBL_MIN to the formula. I don't think that's worth worrying about. That
might change if the implementation of pg_erand48() is changed, so I'm a
bit reluctant to state it explicitly.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#38

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#36)

Re: pgbench throttling latency limit

On 09/11/2014 05:16 PM, Fabien COELHO wrote:

How should skipped transactions should be taken into account in the log file
output, with and without aggregation? I assume we'll want to have some trace
of skipped transactions in the logs.

The problem with this point is that how to report something "not done" is
unclear, especially as the logic of the log is one line per performed
transaction.

Obviously we can log something, but as the transaction are not performed
the format would be different, which break the expectation of a simple and
homogeneous log file format that people like to process directly.

So bar any great idea, I would suggest not to log skipped transactions and
to wait for someone to need to have access to this detailed information
and for whom the final report is not enough.

We have to come up with something. The point of the log file is that it
contains all the information you need to build your own graphs, when
pgbench's built-in reporting features are not enough. If it doesn't
contain detailed information about skipped transactions, a report based
on the log file would be inaccurate, or at least misleading.

How about printing a line in the log for every skipped transaction, with
the string "skipped" in place of the latency. The "completion time" can
show the time when the transaction was skipped, and the lag can show the
difference between the scheduled time and the time it was skipped. Or
put another way, print a line as if the transaction completed
immediately, but with the "skipped" string in the latency field.

The "skipped" string will trip a program that doesn't expect that, but
since this is a new feature that you have to enable manually, that's OK.

The output would look something like this (modified from the manual's
example by hand, so the numbers don't add up):

0 199 2241 0 1175850568 995598 1020
0 200 2465 0 1175850568 998079 1010
0 201 skipped 1175850569 608 3011
0 202 skipped 1175850569 608 2400
0 203 skipped 1175850569 608 1000
0 204 2513 0 1175850569 608 500
0 205 2038 0 1175850569 2663 500

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#39

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#37)

Re: pgbench throttling latency limit

(1) ISTM that the + 0.5 which remains in the PoissonRand computation comes
from the previous integer approach and is not needed here. If I'm not
mistaken the formula should be plain:

-log(uniform) * center

No. The +0.5 is to round the result to the nearest integer, instead of
truncating it down.

Hmmm... probably ok. I'll have to think about it a bit.

In that case, it seems much clearer to do: "round(-log(uniform) * xxx)"
instead of relying on the truncation of the cast.

(2) I'm not sure of the name "center", I think that "lambda" or
"mean" would be more appropriate.

(shrug), I guess. The comment says that it's the value the average of a
series values is centered on, so "center" doesn't seem too bad. I guess the
mathematically accurate term would be "expected value".

The word "center" does not appear once of the wikipedia page about
"Poisson distribution". Its "mean" is called "lambda" (or rather οΏ½:-) all
over the place. I find "expected value" rather too generic, but it is
better than "center".

(3) I wish that the maximum implied multiplier could be explicitely
documented in the source code. From pg_rand48 source code, I think
that it is 33.27106466687737

Oh, ok. That's an even smaller multiplier than I got just by feeding DBL_MIN
to the formula. I don't think that's worth worrying about. That might change
if the implementation of pg_erand48() is changed, so I'm a bit reluctant to
state it explicitly.

I think that it is an important information, so it deserves to appear.

If pg_erand48 implementation changes, it should not be called "48",
because the max value and the above multiplier limit is completely linked
to the "16*3=48" structure of the random construction.

If the code change then the comments need be updated, that is life.

--
Fabien.
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#40

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#38)

Re: pgbench throttling latency limit

The output would look something like this (modified from the manual's example
by hand, so the numbers don't add up):

0 199 2241 0 1175850568 995598 1020
0 200 2465 0 1175850568 998079 1010
0 201 skipped 1175850569 608 3011
0 202 skipped 1175850569 608 2400
0 203 skipped 1175850569 608 1000
0 204 2513 0 1175850569 608 500
0 205 2038 0 1175850569 2663 500

My 0.02ï¿½: ISTM that the number of columns should stay the same whether it
is skipped or not, so the "file_no" should be kept. Maybe to keep it a
number would make sense (-1) or just a sign (-) which means "no value"
with gnuplot for instance. Or "skipped".

Basically I would be fine with that, but as I do not use the log file
feature I'm not sure that my opinion should count.

Note that there are also potential issues with the aggregate logging and
the sampling stuff.

--
Fabien.
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#41

Gregory Smith

gregsmithpgsql@gmail.com

over 11 years ago

In reply to: Fabien COELHO (#28)

Re: pgbench throttling latency limit

On 9/10/14, 10:57 AM, Fabien COELHO wrote:

Indeed. I think that people do not like it to change. I remember that
I suggested to change timestamps to "xxxx.yyyyyy" instead of the
unreadable "xxxx yyy", and be told not to, because some people have
tool which process the output so the format MUST NOT CHANGE. So my
behavior is not to avoid touching anything in this area.

That somewhat hysterical version of events isn't what I said. Heikki has
the right idea for backpatching, so let me expand on that rationale,
with an eye toward whether 9.5 is the right time to deal with this.

Not all software out there will process epoch timestamps with
milliseconds added as a fraction at the end. Being able to read an
epoch time in seconds as an integer is a well defined standard; the
fraction part is not.

Here's an example of the problem, from a Mac OS X system:

$ date -j -f "%a %b %d %T %Z %Y" "`date`" "+%s"
1410544903
$ date -r 1410544903
Fri Sep 12 14:01:43 EDT 2014
$ date -r 1410544903.532
usage: date [-jnu] [-d dst] [-r seconds] [-t west] [-v[+|-]val[ymwdHMS]] ...
[-f fmt date | [[[mm]dd]HH]MM[[cc]yy][.ss]] [+format]

The current file format allows any random shell script to use a tool
like cut to pull out the second resolution timestamp column as an epoch
integer field, then pass it through even a utility as simple as date to
reformat that. And for a lot of people, second resolution is perfectly
fine anyway.

The change you propose will make that job harder for some people, in
order to make the job you're interested in easier. I picked the
simplest possible example, but there are more. Whether epoch timestamps
can have millisecond parts depends on your time library in Java, in
Python some behavior depends on whether you have 2.6 or earlier, I don't
think gnuplot handles milllisecond ones at all yet; the list goes on and
on. Some people will just have to apply a second split for timestamp
string pgbench outputs, at the period and use the left side, where right
now they can just split the whole thing on a space.

What you want to do is actually fine with me--and as far as I know, I'm
the producer of the most popular pgbench latency parsing script
around--but it will be a new sort of headache. I just wanted the
benefit to outweigh that. Breaking the existing scripts and burning
compatibility with simple utilities like date was not worth the tiny
improvement you wanted in your personal workflow. That's just not how
we do things in PostgreSQL.

If there's a good case that the whole format needs to be changed anyway,
like adding a new field, then we might as well switch to fractional
epoch timestamps too now though. When I added timestamps to the latency
log in 8.3, parsers that handled milliseconds were even more rare.
Today it's still inconsistent, but the workarounds are good enough to me
now. There's a lot more people using things like Python instead of bash
pipelines here in 2014 too.

--
Greg Smith greg.smith@crunchydatasolutions.com
Chief PostgreSQL Evangelist - http://crunchydatasolutions.com/

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#42

Robert Haas

robertmhaas@gmail.com

over 11 years ago

In reply to: Gregory Smith (#41)

Re: pgbench throttling latency limit

On Fri, Sep 12, 2014 at 2:27 PM, Gregory Smith <gregsmithpgsql@gmail.com> wrote:

If there's a good case that the whole format needs to be changed anyway,
like adding a new field, then we might as well switch to fractional epoch
timestamps too now though. When I added timestamps to the latency log in
8.3, parsers that handled milliseconds were even more rare. Today it's
still inconsistent, but the workarounds are good enough to me now. There's
a lot more people using things like Python instead of bash pipelines here in
2014 too.

+1. s/\..*// is not an onerous requirement.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#43

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#40)

Re: pgbench throttling latency limit

On 09/12/2014 08:59 PM, Fabien COELHO wrote:

The output would look something like this (modified from the manual's example
by hand, so the numbers don't add up):

0 199 2241 0 1175850568 995598 1020
0 200 2465 0 1175850568 998079 1010
0 201 skipped 1175850569 608 3011
0 202 skipped 1175850569 608 2400
0 203 skipped 1175850569 608 1000
0 204 2513 0 1175850569 608 500
0 205 2038 0 1175850569 2663 500

My 0.02ï¿½: ISTM that the number of columns should stay the same whether it
is skipped or not, so the "file_no" should be kept.

Oh, sorry, I totally agree. I left file_no out by mistake.

Maybe to keep it a
number would make sense (-1) or just a sign (-) which means "no value"
with gnuplot for instance. Or "skipped".

Basically I would be fine with that, but as I do not use the log file
feature I'm not sure that my opinion should count.

Note that there are also potential issues with the aggregate logging and
the sampling stuff.

Yep.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#44

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#43)

1 attachment(s)

Re: pgbench throttling latency limit

[about logging...]

Here is an attempt at updating the log features, including the aggregate
and sampling stuff, with skipped transactions under throttling.

I moved the logging stuff into a function which is called when a
transaction is skipped or finished.

From a log file format perspective, I think that "-" would be better than
"skipped".

--
Fabien.

Attachments:

pgbench-limit-9.patchtext/x-diff; name=pgbench-limit-9.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 087e0d3..72f00d8 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,18 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * Transactions which take longer that this limit are counted as late
+ * and reported as such, although they are completed anyway.
+ *
+ * When under throttling: execution time slots which are more than
+ * this late (in us) are simply skipped, and the corresponding transaction
+ * is counted as such... it is not even started;
+ * otherwise above the limit transactions are counted as such, with the latency
+ * measured wrt the transaction schedule, not its actual start.
+ */
+int64		latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +250,8 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
+	int64		latency_late; /* late transactions */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +264,8 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
+	int64		latency_late;
 } TResult;
 
 /*
@@ -295,6 +311,8 @@ typedef struct
 	double		max_lag;
 	double		sum_lag;		/* sum(lag) */
 	double		sum2_lag;		/* sum(lag*lag) */
+
+	int			skipped;		/* skipped transactions under --rate --limit */
 } AggVals;
 
 static Command **sql_files[MAX_FILES];	/* SQL script files */
@@ -372,6 +390,10 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          count transactions lasting more than NUM ms.\n"
+		   "                           under throttling (--rate), transactions behind schedule\n"
+		   "                           more than NUM ms are skipped, and those finishing more\n"
+		   "                           than NUM ms after their scheduled start are counted.\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -991,13 +1013,13 @@ void
 agg_vals_init(AggVals *aggs, instr_time start)
 {
 	/* basic counters */
-	aggs->cnt = 0;				/* number of transactions */
+	aggs->cnt = 0;				/* number of transactions (some were possibly skipped) */
 	aggs->sum_latency = 0;		/* SUM(latency) */
 	aggs->sum2_latency = 0;				/* SUM(latency*latency) */
 
 	/* min and max transaction duration */
-	aggs->min_latency = 0;
-	aggs->max_latency = 0;
+	aggs->min_latency = -1.0;
+	aggs->max_latency = -1.0;
 
 	/* schedule lag counters */
 	aggs->sum_lag = 0;
@@ -1005,10 +1027,173 @@ agg_vals_init(AggVals *aggs, instr_time start)
 	aggs->min_lag = 0;
 	aggs->max_lag = 0;
 
+	/* skipped transactions under --rate & --limit */
+	aggs->skipped = 0;
+
 	/* start of the current interval */
 	aggs->start_time = INSTR_TIME_GET_DOUBLE(start);
 }
 
+/* generate log
+ */
+static void doLog(TState *thread, CState *st, FILE *logfile, AggVals *agg, bool skipped)
+{
+	/*
+	 * write the log entry if this row belongs to the random sample,
+	 * or no sampling rate was given which means log everything.
+	 */
+	if (sample_rate == 0.0 ||
+		pg_erand48(thread->random_state) <= sample_rate)
+	{
+		double		lag;
+		double		latency;
+		instr_time	now;
+
+		INSTR_TIME_SET_CURRENT(now);
+
+		latency = skipped? -1.0: (double) (INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled);
+		lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
+
+		/* should we aggregate the results or not? */
+		if (agg_interval > 0)
+		{
+			/*
+			 * are we still in the same interval? if yes, accumulate
+			 * the values (print them otherwise)
+			 */
+			if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(now))
+			{
+				agg->cnt += 1;
+				if (skipped)
+					agg->skipped += 1;
+				else
+				{
+					/* there is no latency to record if the transaction was skipped */
+					agg->sum_latency += latency;
+					agg->sum2_latency += latency * latency;
+
+					/* first non-skipped in this aggregation interval */
+					if ((agg->min_latency == -1.0) || (latency < agg->min_latency))
+						agg->min_latency = latency;
+
+					if ((agg->max_latency == -1.0) || (latency > agg->max_latency))
+						agg->max_latency = latency;
+				}
+
+				/* and the same for schedule lag */
+				if (throttle_delay)
+				{
+					agg->sum_lag += lag;
+					agg->sum2_lag += lag * lag;
+
+					if ((agg->cnt == 1) || (lag < agg->min_lag))
+						agg->min_lag = lag;
+					if ((agg->cnt == 1) || (lag > agg->max_lag))
+						agg->max_lag = lag;
+				}
+			}
+			else
+			{
+				/*
+				 * Loop until we reach the interval of the current
+				 * transaction (and print all the empty intervals in
+				 * between).
+				 */
+				while (agg->start_time + agg_interval < INSTR_TIME_GET_DOUBLE(now))
+				{
+					/*
+					 * This is a non-Windows branch (thanks to the
+					 * ifdef in usage), so we don't need to handle
+					 * this in a special way (see below).
+					 */
+					fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f",
+							agg->start_time,
+							agg->cnt,
+							agg->sum_latency,
+							agg->sum2_latency,
+							agg->min_latency,
+							agg->max_latency);
+					if (throttle_delay) {
+						fprintf(logfile, " %.0f %.0f %.0f %.0f",
+								agg->sum_lag,
+								agg->sum2_lag,
+								agg->min_lag,
+								agg->max_lag);
+						if (latency_limit)
+							fprintf(logfile, " %d", agg->skipped);
+					}
+					fputc('\n', logfile);
+
+					/* move to the next inteval */
+					agg->start_time = agg->start_time + agg_interval;
+
+					/* reset for "no transaction" intervals */
+					agg->cnt = 0;
+					agg->min_latency = -1.0;
+					agg->max_latency = -1.0;
+					agg->sum_latency = 0;
+					agg->sum2_latency = 0;
+					agg->min_lag = 0;
+					agg->max_lag = 0;
+					agg->sum_lag = 0;
+					agg->sum2_lag = 0;
+					agg->skipped = 0;
+				}
+
+				/*
+				 * and now update the reset values (include the
+				 * current)
+				 */
+				agg->cnt = 1;
+				agg->skipped = skipped? 1: 0;
+				agg->min_latency = latency;
+				agg->max_latency = latency;
+				agg->sum_latency = skipped? 0.0: latency;
+				agg->sum2_latency = skipped? 0.0: latency * latency;
+				agg->min_lag = lag;
+				agg->max_lag = lag;
+				agg->sum_lag = lag;
+				agg->sum2_lag = lag * lag;
+			}
+		}
+		else
+		{
+			/* no, print raw transactions */
+#ifndef WIN32
+
+			/*
+			 * This is more than we really ought to know about
+			 * instr_time
+			 */
+			if (skipped)
+				fprintf(logfile, "%d %d skipped %d %ld %ld",
+						st->id, st->cnt, st->use_file,
+						(long) now.tv_sec, (long) now.tv_usec);
+			else
+				fprintf(logfile, "%d %d %.0f %d %ld %ld",
+						st->id, st->cnt, latency, st->use_file,
+						(long) now.tv_sec, (long) now.tv_usec);
+#else
+
+			/*
+			 * On Windows, instr_time doesn't provide a timestamp
+			 * anyway
+			 */
+			if (skipped)
+				fprintf(logfile, "%d %d skipped %d 0 0",
+						st->id, st->cnt, st->use_file);
+			else
+				fprintf(logfile, "%d %d %.0f %d 0 0",
+						st->id, st->cnt, usec, st->use_file);
+#endif
+			if (throttle_delay)
+				fprintf(logfile, " %.0f", lag);
+			fputc('\n', logfile);
+		}
+	}
+}
+
+
 /* return false iff client should be disconnected */
 static bool
 doCustom(TState *thread, CState *st, instr_time *conn_time, FILE *logfile, AggVals *agg)
@@ -1038,6 +1223,27 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = getPoissonRand(thread, throttle_delay);
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+
+				/* record in log if required */
+				if (logfile)
+					doLog(thread, st, logfile, agg, true);
+			}
+		}
+
 		st->txn_scheduled = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -1110,8 +1316,11 @@ top:
 			thread->exec_count[cnum]++;
 		}
 
-		/* transaction finished: record latency under progress or throttling */
-		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		/* transaction finished: record latency under progress or throttling,
+		 * or if latency limit is set
+		 */
+		if ((progress || throttle_delay || latency_limit) &&
+			commands[st->state + 1] == NULL)
 		{
 			int64		latency;
 
@@ -1133,149 +1342,18 @@ top:
 			 * would take 256 hours.
 			 */
 			st->txn_sqlats += latency * latency;
+
+			/* record over the limit transactions if needed.
+			 */
+			if (latency_limit && latency > latency_limit)
+				thread->latency_late++;
 		}
 
 		/*
 		 * if transaction finished, record the time it took in the log
 		 */
 		if (logfile && commands[st->state + 1] == NULL)
-		{
-			double		lag;
-			double		latency;
-
-			/*
-			 * write the log entry if this row belongs to the random sample,
-			 * or no sampling rate was given which means log everything.
-			 */
-			if (sample_rate == 0.0 ||
-				pg_erand48(thread->random_state) <= sample_rate)
-			{
-				if (!now_valid)
-				{
-					INSTR_TIME_SET_CURRENT(now);
-					now_valid = true;
-				}
-				latency = (double) (INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled);
-				lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
-
-				/* should we aggregate the results or not? */
-				if (agg_interval > 0)
-				{
-					/*
-					 * are we still in the same interval? if yes, accumulate
-					 * the values (print them otherwise)
-					 */
-					if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(now))
-					{
-						agg->cnt += 1;
-						agg->sum_latency += latency;
-						agg->sum2_latency += latency * latency;
-
-						/* first in this aggregation interval */
-						if ((agg->cnt == 1) || (latency < agg->min_latency))
-							agg->min_latency = latency;
-
-						if ((agg->cnt == 1) || (latency > agg->max_latency))
-							agg->max_latency = latency;
-
-						/* and the same for schedule lag */
-						if (throttle_delay)
-						{
-							agg->sum_lag += lag;
-							agg->sum2_lag += lag * lag;
-
-							if ((agg->cnt == 1) || (lag < agg->min_lag))
-								agg->min_lag = lag;
-							if ((agg->cnt == 1) || (lag > agg->max_lag))
-								agg->max_lag = lag;
-						}
-					}
-					else
-					{
-						/*
-						 * Loop until we reach the interval of the current
-						 * transaction (and print all the empty intervals in
-						 * between).
-						 */
-						while (agg->start_time + agg_interval < INSTR_TIME_GET_DOUBLE(now))
-						{
-							/*
-							 * This is a non-Windows branch (thanks to the
-							 * ifdef in usage), so we don't need to handle
-							 * this in a special way (see below).
-							 */
-							fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f",
-									agg->start_time,
-									agg->cnt,
-									agg->sum_latency,
-									agg->sum2_latency,
-									agg->min_latency,
-									agg->max_latency);
-							if (throttle_delay)
-								fprintf(logfile, " %.0f %.0f %.0f %.0f",
-										agg->sum_lag,
-										agg->sum2_lag,
-										agg->min_lag,
-										agg->max_lag);
-							fputc('\n', logfile);
-
-							/* move to the next inteval */
-							agg->start_time = agg->start_time + agg_interval;
-
-							/* reset for "no transaction" intervals */
-							agg->cnt = 0;
-							agg->min_latency = 0;
-							agg->max_latency = 0;
-							agg->sum_latency = 0;
-							agg->sum2_latency = 0;
-							agg->min_lag = 0;
-							agg->max_lag = 0;
-							agg->sum_lag = 0;
-							agg->sum2_lag = 0;
-						}
-
-						/*
-						 * and now update the reset values (include the
-						 * current)
-						 */
-						agg->cnt = 1;
-						agg->min_latency = latency;
-						agg->max_latency = latency;
-						agg->sum_latency = latency;
-						agg->sum2_latency = latency * latency;
-						agg->min_lag = lag;
-						agg->max_lag = lag;
-						agg->sum_lag = lag;
-						agg->sum2_lag = lag * lag;
-					}
-				}
-				else
-				{
-					/* no, print raw transactions */
-#ifndef WIN32
-
-					/*
-					 * This is more than we really ought to know about
-					 * instr_time
-					 */
-					fprintf(logfile, "%d %d %.0f %d %ld %ld",
-							st->id, st->cnt, latency, st->use_file,
-							(long) now.tv_sec, (long) now.tv_usec);
-#else
-
-					/*
-					 * On Windows, instr_time doesn't provide a timestamp
-					 * anyway
-					 */
-					fprintf(logfile, "%d %d %.0f %d 0 0",
-							st->id, st->cnt, latency, st->use_file);
-#endif
-					if (throttle_delay)
-						fprintf(logfile, " %.0f", lag);
-					fputc('\n', logfile);
-				}
-			}
-		}
+			doLog(thread, st, logfile, agg, false);
 
 		if (commands[st->state]->type == SQL_COMMAND)
 		{
@@ -1360,7 +1438,7 @@ top:
 	}
 
 	/* Record transaction start time under logging, progress or throttling */
-	if ((logfile || progress || throttle_delay) && st->state == 0)
+	if ((logfile || progress || throttle_delay || latency_limit) && st->state == 0)
 	{
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
@@ -2426,7 +2504,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped, int64 latency_late)
 {
 	double		time_include,
 				tps_include,
@@ -2465,7 +2544,17 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			   normal_xacts);
 	}
 
-	if (throttle_delay || progress)
+	if (throttle_delay && latency_limit)
+		printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n",
+			   throttle_latency_skipped,
+			   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
+
+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT " (%.3f %%)\n",
+			   latency_limit / 1000.0, latency_late,
+			   100.0 * latency_late / (throttle_latency_skipped + normal_xacts));
+
+	if (throttle_delay || progress || latency_limit)
 	{
 		/* compute and show latency average and standard deviation */
 		double		latency = 0.001 * total_latencies / normal_xacts;
@@ -2483,7 +2572,6 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 	}
 
 	if (throttle_delay)
-	{
 		/*
 		 * Report average transaction lag under rate limit throttling.  This
 		 * is the delay between scheduled and actual start times for the
@@ -2492,7 +2580,6 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
-	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
 	printf("tps = %f (excluding connections establishing)\n", tps_exclude);
@@ -2580,6 +2667,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2609,6 +2697,8 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
+	int64		latency_late = 0;
 
 	int			i;
 
@@ -2653,7 +2743,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2850,6 +2940,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -3145,6 +3247,8 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
+		thread->latency_late = 0;
 
 		if (is_latencies)
 		{
@@ -3219,6 +3323,8 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
+			latency_late += r->latency_late;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3241,7 +3347,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped,
+				 latency_late);
 
 	return 0;
 }
@@ -3266,7 +3373,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3469,7 +3577,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3492,23 +3601,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3527,7 +3639,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3552,23 +3665,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3589,6 +3705,9 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
+	result->latency_late = thread->latency_late;
+
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index c4e0cbd..d3e8985 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,23 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under normal operations, transaction which last more than
+        <replaceable>limit</> milliseconds are counted and reported.
+       </para>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind
+        schedule by <replaceable>limit</> ms are simply skipped and counted
+        as such, then transactions finished later than the scheduled time plus
+        <replaceable>limit</> are counted and reported.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -439,7 +456,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the transaction latency reported at the
@@ -940,7 +959,7 @@ END;
    The format of the log is:
 
 <synopsis>
-<replaceable>client_id</> <replaceable>transaction_no</> <replaceable>time</> <replaceable>file_no</> <replaceable>time_epoch</> <replaceable>time_us</> [<replaceable>schedule_lag</replaceable>]
+<replaceable>client_id</> <replaceable>transaction_no</> <replaceable>time</> <replaceable>file_no</> <replaceable>time_epoch</> <replaceable>time_us</> [<replaceable>schedule_lag</replaceable> [<replaceable>skipped_transactions</>]]
 </synopsis>
 
    where <replaceable>time</> is the total elapsed transaction time in microseconds,
@@ -950,10 +969,13 @@ END;
    UNIX epoch format timestamp and an offset
    in microseconds (suitable for creating an ISO 8601
    timestamp with fractional seconds) showing when
-   the transaction completed. The last field, <replaceable>schedule_lag</>, is
-   the difference between the transaction's scheduled start time, and the
-   time it actually started, in microseconds. It is only present when the
-   <option>--rate</> option is used.
+   the transaction completed.
+   Field <replaceable>schedule_lag</> is the difference between the transaction's
+   scheduled start time, and the time it actually started, in microseconds. It is only present
+   when the <option>--rate</> option is used.
+   The last field <replaceable>skipped_transactions</> reports the number of transactions
+   skipped because they were too far behind schedule. It is only present when both options
+   <option>--rate</> and <option>--limit</> are used.
   </para>
 
   <para>
@@ -974,12 +996,12 @@ END;
 
  <refsect2>
   <title>Aggregated Logging</title>
-  
+
   <para>
    With the <option>--aggregate-interval</option> option, the logs use a bit different format:
 
 <synopsis>
-<replaceable>interval_start</> <replaceable>num_of_transactions</> <replaceable>latency_sum</> <replaceable>latency_2_sum</> <replaceable>min_latency</> <replaceable>max_latency</> [<replaceable>lag_sum</> <replaceable>lag_2_sum</> <replaceable>min_lag</> <replaceable>max_lag</>]
+<replaceable>interval_start</> <replaceable>num_of_transactions</> <replaceable>latency_sum</> <replaceable>latency_2_sum</> <replaceable>min_latency</> <replaceable>max_latency</> [<replaceable>lag_sum</> <replaceable>lag_2_sum</> <replaceable>min_lag</> <replaceable>max_lag</> [<replaceable>skipped_transactions</>]]
 </synopsis>
 
    where <replaceable>interval_start</> is the start of the interval (UNIX epoch
@@ -990,8 +1012,9 @@ END;
    <replaceable>latency_2_sum</> is a sum of 2nd powers of latencies. The last two
    fields are <replaceable>min_latency</> - a minimum latency within the interval, and
    <replaceable>max_latency</> - maximum latency within the interval. A transaction is
-   counted into the interval when it was committed. The last four fields, 
-   <replaceable>lag_sum</>, <replaceable>lag_2_sum</>, <replaceable>min_lag</>, and <replaceable>max_lag</>, are only present if the --rate option is used.
+   counted into the interval when it was committed. The fields in the end,
+   <replaceable>lag_sum</>, <replaceable>lag_2_sum</>, <replaceable>min_lag</>, and <replaceable>max_lag</>, are only present if the <option>--rate</> option is used.
+   The very last one <replaceable>skipped_transactions</> also requires option <option>--limit</>.
    They are calculated from the time each transaction had to wait for the
    previous one to finish, i.e. the difference between each transaction's
    scheduled start time and the time it actually started.

#45

Robert Haas

robertmhaas@gmail.com

over 11 years ago

In reply to: Fabien COELHO (#44)

Re: pgbench throttling latency limit

On Sat, Sep 13, 2014 at 4:25 AM, Fabien COELHO <coelho@cri.ensmp.fr> wrote:

[about logging...]

Here is an attempt at updating the log features, including the aggregate and
sampling stuff, with skipped transactions under throttling.

I moved the logging stuff into a function which is called when a transaction
is skipped or finished.

From a log file format perspective, I think that "-" would be better than
"skipped".

I like skipped. That seems a lot more clear, and less likely to get
parsed as a numeric value by a careless regex like [+-]\d*

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#46

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#44)

2 attachment(s)

Re: pgbench throttling latency limit

On 09/13/2014 11:25 AM, Fabien COELHO wrote:

[about logging...]

Here is an attempt at updating the log features, including the aggregate
and sampling stuff, with skipped transactions under throttling.

I moved the logging stuff into a function which is called when a
transaction is skipped or finished.

Makes sense.

I spent some time on this, and this is what I ended up with. Notable
changes:

* I split this into two patches for easier review. The first patch
contains the refactoring of the logging stuff, and the second patch
contains the new functionality.

* I renamed many of the variables, to be more consistent with existing
similar variables

* progress reporting was broken with !PTHREAD_FORK_EMULATION. Need to
collect the number of skipped xacts from all threads.

* I renamed the long option to --latency-limit. --limit is too generic.

Please have a look. I have not looked at the docs changes yet.

One thing that needs some thinking and changing is the progress
reporting. It currently looks like this:

progress: 1.0 s, 4863.0 tps, lat 3.491 ms stddev 2.487, lag 1.809 ms, 99
skipped
progress: 2.0 s, 5042.8 tps, lat 3.265 ms stddev 2.264, lag 1.584 ms, 16
skipped
progress: 3.0 s, 4926.1 tps, lat 2.731 ms stddev 2.371, lag 1.196 ms, 45
skipped
progress: 4.0 s, 4963.9 tps, lat 1.904 ms stddev 1.212, lag 0.429 ms, 0
skipped
progress: 5.0 s, 4971.2 tps, lat 2.592 ms stddev 1.722, lag 0.975 ms, 0
skipped

The absolute number of skipped transactions doesn't make much sense when
all the other numbers are averages, and tps is a 1/s value. If you don't
know the total number of transactions executed, the absolute number is
meaningless. (Although you can calculate the absolute number of
transactions executed by multiplying the TPS value with the interval). I
think a percentage would be better here.

Should we also print the number of late transactions here? I think that
would be an even more important detail than the number of skipped
transactions. It might be better to print only the percentage of late
transactions, including skipped ones. Or both, but it's difficult to
cram everything on a single line. This needs some further thinking..

- Heikki

Attachments:

pgbench-log-refactor-1.patchtext/x-diff; name=pgbench-log-refactor-1.patchDownload

commit 6f1158e9a7e77710164d503b40d77fd9dcde7a08
Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date:   Mon Sep 15 11:09:29 2014 +0300

    Refactor log-printing in pgbench to a separate function.
    
    The doCustom function was getting really unwieldy. Upcoming patches will add
    even more code there.

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 087e0d3..0daf92f 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -347,6 +347,9 @@ static char *select_only = {
 static void setalarm(int seconds);
 static void *threadRun(void *arg);
 
+static void doLog(TState *thread, CState *st, FILE *logfile, instr_time *now,
+				  AggVals *agg);
+
 static void
 usage(void)
 {
@@ -1016,6 +1019,16 @@ doCustom(TState *thread, CState *st, instr_time *conn_time, FILE *logfile, AggVa
 	PGresult   *res;
 	Command   **commands;
 	bool		trans_needs_throttle = false;
+	instr_time	now;
+
+	/*
+	 * gettimeofday() isn't free, so we get the current timestamp lazily the
+	 * first time it's needed, we reuse the same value throughout this
+	 * function after that. This also ensures that e.g. the calculated latency
+	 * reported in the log file and in the totals are the same. Zero means
+	 * "not set yet".
+	 */
+	INSTR_TIME_SET_ZERO(now);
 
 top:
 	commands = sql_files[st->use_file];
@@ -1049,10 +1062,10 @@ top:
 
 	if (st->sleeping)
 	{							/* are we sleeping? */
-		instr_time	now;
 		int64		now_us;
 
-		INSTR_TIME_SET_CURRENT(now);
+		if (INSTR_TIME_IS_ZERO(now))
+			INSTR_TIME_SET_CURRENT(now);
 		now_us = INSTR_TIME_GET_MICROSEC(now);
 		if (st->txn_scheduled <= now_us)
 		{
@@ -1074,11 +1087,6 @@ top:
 
 	if (st->listen)
 	{							/* are we receiver? */
-		instr_time	now;
-		bool		now_valid = false;
-
-		INSTR_TIME_SET_ZERO(now); /* initialize to keep compiler quiet */
-
 		if (commands[st->state]->type == SQL_COMMAND)
 		{
 			if (debug)
@@ -1100,181 +1108,40 @@ top:
 		{
 			int			cnum = commands[st->state]->command_num;
 
-			if (!now_valid)
-			{
+			if (INSTR_TIME_IS_ZERO(now))
 				INSTR_TIME_SET_CURRENT(now);
-				now_valid = true;
-			}
 			INSTR_TIME_ACCUM_DIFF(thread->exec_elapsed[cnum],
 								  now, st->stmt_begin);
 			thread->exec_count[cnum]++;
 		}
 
-		/* transaction finished: record latency under progress or throttling */
-		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		/* transaction finished: calculate latency and log the transaction */
+		if (commands[st->state + 1] == NULL)
 		{
-			int64		latency;
-
-			if (!now_valid)
+			/* only calculate latency if an option is used that needs it. */
+			if (progress || throttle_delay)
 			{
-				INSTR_TIME_SET_CURRENT(now);
-				now_valid = true;
-			}
-
-			latency = INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled;
+				int64		latency;
 
-			st->txn_latencies += latency;
-
-			/*
-			 * XXX In a long benchmark run of high-latency transactions, this
-			 * int64 addition eventually overflows.  For example, 100 threads
-			 * running 10s transactions will overflow it in 2.56 hours.  With
-			 * a more-typical OLTP workload of .1s transactions, overflow
-			 * would take 256 hours.
-			 */
-			st->txn_sqlats += latency * latency;
-		}
-
-		/*
-		 * if transaction finished, record the time it took in the log
-		 */
-		if (logfile && commands[st->state + 1] == NULL)
-		{
-			double		lag;
-			double		latency;
-
-			/*
-			 * write the log entry if this row belongs to the random sample,
-			 * or no sampling rate was given which means log everything.
-			 */
-			if (sample_rate == 0.0 ||
-				pg_erand48(thread->random_state) <= sample_rate)
-			{
-				if (!now_valid)
-				{
+				if (INSTR_TIME_IS_ZERO(now))
 					INSTR_TIME_SET_CURRENT(now);
-					now_valid = true;
-				}
-				latency = (double) (INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled);
-				lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
-
-				/* should we aggregate the results or not? */
-				if (agg_interval > 0)
-				{
-					/*
-					 * are we still in the same interval? if yes, accumulate
-					 * the values (print them otherwise)
-					 */
-					if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(now))
-					{
-						agg->cnt += 1;
-						agg->sum_latency += latency;
-						agg->sum2_latency += latency * latency;
-
-						/* first in this aggregation interval */
-						if ((agg->cnt == 1) || (latency < agg->min_latency))
-							agg->min_latency = latency;
-
-						if ((agg->cnt == 1) || (latency > agg->max_latency))
-							agg->max_latency = latency;
-
-						/* and the same for schedule lag */
-						if (throttle_delay)
-						{
-							agg->sum_lag += lag;
-							agg->sum2_lag += lag * lag;
-
-							if ((agg->cnt == 1) || (lag < agg->min_lag))
-								agg->min_lag = lag;
-							if ((agg->cnt == 1) || (lag > agg->max_lag))
-								agg->max_lag = lag;
-						}
-					}
-					else
-					{
-						/*
-						 * Loop until we reach the interval of the current
-						 * transaction (and print all the empty intervals in
-						 * between).
-						 */
-						while (agg->start_time + agg_interval < INSTR_TIME_GET_DOUBLE(now))
-						{
-							/*
-							 * This is a non-Windows branch (thanks to the
-							 * ifdef in usage), so we don't need to handle
-							 * this in a special way (see below).
-							 */
-							fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f",
-									agg->start_time,
-									agg->cnt,
-									agg->sum_latency,
-									agg->sum2_latency,
-									agg->min_latency,
-									agg->max_latency);
-							if (throttle_delay)
-								fprintf(logfile, " %.0f %.0f %.0f %.0f",
-										agg->sum_lag,
-										agg->sum2_lag,
-										agg->min_lag,
-										agg->max_lag);
-							fputc('\n', logfile);
-
-							/* move to the next inteval */
-							agg->start_time = agg->start_time + agg_interval;
-
-							/* reset for "no transaction" intervals */
-							agg->cnt = 0;
-							agg->min_latency = 0;
-							agg->max_latency = 0;
-							agg->sum_latency = 0;
-							agg->sum2_latency = 0;
-							agg->min_lag = 0;
-							agg->max_lag = 0;
-							agg->sum_lag = 0;
-							agg->sum2_lag = 0;
-						}
-
-						/*
-						 * and now update the reset values (include the
-						 * current)
-						 */
-						agg->cnt = 1;
-						agg->min_latency = latency;
-						agg->max_latency = latency;
-						agg->sum_latency = latency;
-						agg->sum2_latency = latency * latency;
-						agg->min_lag = lag;
-						agg->max_lag = lag;
-						agg->sum_lag = lag;
-						agg->sum2_lag = lag * lag;
-					}
-				}
-				else
-				{
-					/* no, print raw transactions */
-#ifndef WIN32
-
-					/*
-					 * This is more than we really ought to know about
-					 * instr_time
-					 */
-					fprintf(logfile, "%d %d %.0f %d %ld %ld",
-							st->id, st->cnt, latency, st->use_file,
-							(long) now.tv_sec, (long) now.tv_usec);
-#else
-
-					/*
-					 * On Windows, instr_time doesn't provide a timestamp
-					 * anyway
-					 */
-					fprintf(logfile, "%d %d %.0f %d 0 0",
-							st->id, st->cnt, latency, st->use_file);
-#endif
-					if (throttle_delay)
-						fprintf(logfile, " %.0f", lag);
-					fputc('\n', logfile);
-				}
+				latency = INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled;
+
+				st->txn_latencies += latency;
+
+				/*
+				 * XXX In a long benchmark run of high-latency transactions,
+				 * this int64 addition eventually overflows.  For example, 100
+				 * threads running 10s transactions will overflow it in 2.56
+				 * hours.  With a more-typical OLTP workload of .1s
+				 * transactions, overflow would take 256 hours.
+				 */
+				st->txn_sqlats += latency * latency;
 			}
+
+			/* record the time it took in the log */
+			if (logfile)
+				doLog(thread, st, logfile, &now, agg);
 		}
 
 		if (commands[st->state]->type == SQL_COMMAND)
@@ -1734,6 +1601,137 @@ top:
 	return true;
 }
 
+/*
+ * print log entry after completing one transaction.
+ */
+static void
+doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
+{
+	double		lag;
+	double		latency;
+
+	/*
+	 * Skip the log entry if sampling is enabled and this row doesn't belong
+	 * to the random sample.
+	 */
+	if (sample_rate != 0.0 &&
+		pg_erand48(thread->random_state) > sample_rate)
+		return;
+
+	if (INSTR_TIME_IS_ZERO(*now))
+		INSTR_TIME_SET_CURRENT(*now);
+
+	latency = (double) (INSTR_TIME_GET_MICROSEC(*now) - st->txn_scheduled);
+	lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
+
+	/* should we aggregate the results or not? */
+	if (agg_interval > 0)
+	{
+		/*
+		 * Are we still in the same interval? If yes, accumulate the values
+		 * (print them otherwise)
+		 */
+		if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(*now))
+		{
+			agg->cnt += 1;
+			agg->sum_latency += latency;
+			agg->sum2_latency += latency * latency;
+
+			/* first in this aggregation interval */
+			if ((agg->cnt == 1) || (latency < agg->min_latency))
+				agg->min_latency = latency;
+
+			if ((agg->cnt == 1) || (latency > agg->max_latency))
+				agg->max_latency = latency;
+
+			/* and the same for schedule lag */
+			if (throttle_delay)
+			{
+				agg->sum_lag += lag;
+				agg->sum2_lag += lag * lag;
+
+				if ((agg->cnt == 1) || (lag < agg->min_lag))
+					agg->min_lag = lag;
+				if ((agg->cnt == 1) || (lag > agg->max_lag))
+					agg->max_lag = lag;
+			}
+		}
+		else
+		{
+			/*
+			 * Loop until we reach the interval of the current transaction
+			 * (and print all the empty intervals in between).
+			 */
+			while (agg->start_time + agg_interval < INSTR_TIME_GET_DOUBLE(*now))
+			{
+				/*
+				 * This is a non-Windows branch (thanks to the
+				 * ifdef in usage), so we don't need to handle
+				 * this in a special way (see below).
+				 */
+				fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f",
+						agg->start_time,
+						agg->cnt,
+						agg->sum_latency,
+						agg->sum2_latency,
+						agg->min_latency,
+						agg->max_latency);
+				if (throttle_delay)
+					fprintf(logfile, " %.0f %.0f %.0f %.0f",
+							agg->sum_lag,
+							agg->sum2_lag,
+							agg->min_lag,
+							agg->max_lag);
+				fputc('\n', logfile);
+
+				/* move to the next inteval */
+				agg->start_time = agg->start_time + agg_interval;
+
+				/* reset for "no transaction" intervals */
+				agg->cnt = 0;
+				agg->min_latency = 0;
+				agg->max_latency = 0;
+				agg->sum_latency = 0;
+				agg->sum2_latency = 0;
+				agg->min_lag = 0;
+				agg->max_lag = 0;
+				agg->sum_lag = 0;
+				agg->sum2_lag = 0;
+			}
+
+			/* reset the values to include only the current transaction. */
+			agg->cnt = 1;
+			agg->min_latency = latency;
+			agg->max_latency = latency;
+			agg->sum_latency = latency;
+			agg->sum2_latency = latency * latency;
+			agg->min_lag = lag;
+			agg->max_lag = lag;
+			agg->sum_lag = lag;
+			agg->sum2_lag = lag * lag;
+		}
+	}
+	else
+	{
+		/* no, print raw transactions */
+#ifndef WIN32
+
+		/* This is more than we really ought to know about instr_time */
+		fprintf(logfile, "%d %d %.0f %d %ld %ld",
+				st->id, st->cnt, latency, st->use_file,
+				(long) now->tv_sec, (long) now->tv_usec);
+#else
+
+		/* On Windows, instr_time doesn't provide a timestamp anyway */
+		fprintf(logfile, "%d %d %.0f %d 0 0",
+				st->id, st->cnt, latency, st->use_file);
+#endif
+		if (throttle_delay)
+			fprintf(logfile, " %.0f", lag);
+		fputc('\n', logfile);
+	}
+}
+
 /* discard connections */
 static void
 disconnect_all(CState *state, int length)

pgbench-limit-9.patchtext/x-diff; name=pgbench-limit-9.patchDownload

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 087e0d3..72f00d8 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,18 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * Transactions which take longer that this limit are counted as late
+ * and reported as such, although they are completed anyway.
+ *
+ * When under throttling: execution time slots which are more than
+ * this late (in us) are simply skipped, and the corresponding transaction
+ * is counted as such... it is not even started;
+ * otherwise above the limit transactions are counted as such, with the latency
+ * measured wrt the transaction schedule, not its actual start.
+ */
+int64		latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +250,8 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
+	int64		latency_late; /* late transactions */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +264,8 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
+	int64		latency_late;
 } TResult;
 
 /*
@@ -295,6 +311,8 @@ typedef struct
 	double		max_lag;
 	double		sum_lag;		/* sum(lag) */
 	double		sum2_lag;		/* sum(lag*lag) */
+
+	int			skipped;		/* skipped transactions under --rate --limit */
 } AggVals;
 
 static Command **sql_files[MAX_FILES];	/* SQL script files */
@@ -372,6 +390,10 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --limit=NUM          count transactions lasting more than NUM ms.\n"
+		   "                           under throttling (--rate), transactions behind schedule\n"
+		   "                           more than NUM ms are skipped, and those finishing more\n"
+		   "                           than NUM ms after their scheduled start are counted.\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -991,13 +1013,13 @@ void
 agg_vals_init(AggVals *aggs, instr_time start)
 {
 	/* basic counters */
-	aggs->cnt = 0;				/* number of transactions */
+	aggs->cnt = 0;				/* number of transactions (some were possibly skipped) */
 	aggs->sum_latency = 0;		/* SUM(latency) */
 	aggs->sum2_latency = 0;				/* SUM(latency*latency) */
 
 	/* min and max transaction duration */
-	aggs->min_latency = 0;
-	aggs->max_latency = 0;
+	aggs->min_latency = -1.0;
+	aggs->max_latency = -1.0;
 
 	/* schedule lag counters */
 	aggs->sum_lag = 0;
@@ -1005,10 +1027,173 @@ agg_vals_init(AggVals *aggs, instr_time start)
 	aggs->min_lag = 0;
 	aggs->max_lag = 0;
 
+	/* skipped transactions under --rate & --limit */
+	aggs->skipped = 0;
+
 	/* start of the current interval */
 	aggs->start_time = INSTR_TIME_GET_DOUBLE(start);
 }
 
+/* generate log
+ */
+static void doLog(TState *thread, CState *st, FILE *logfile, AggVals *agg, bool skipped)
+{
+	/*
+	 * write the log entry if this row belongs to the random sample,
+	 * or no sampling rate was given which means log everything.
+	 */
+	if (sample_rate == 0.0 ||
+		pg_erand48(thread->random_state) <= sample_rate)
+	{
+		double		lag;
+		double		latency;
+		instr_time	now;
+
+		INSTR_TIME_SET_CURRENT(now);
+
+		latency = skipped? -1.0: (double) (INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled);
+		lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
+
+		/* should we aggregate the results or not? */
+		if (agg_interval > 0)
+		{
+			/*
+			 * are we still in the same interval? if yes, accumulate
+			 * the values (print them otherwise)
+			 */
+			if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(now))
+			{
+				agg->cnt += 1;
+				if (skipped)
+					agg->skipped += 1;
+				else
+				{
+					/* there is no latency to record if the transaction was skipped */
+					agg->sum_latency += latency;
+					agg->sum2_latency += latency * latency;
+
+					/* first non-skipped in this aggregation interval */
+					if ((agg->min_latency == -1.0) || (latency < agg->min_latency))
+						agg->min_latency = latency;
+
+					if ((agg->max_latency == -1.0) || (latency > agg->max_latency))
+						agg->max_latency = latency;
+				}
+
+				/* and the same for schedule lag */
+				if (throttle_delay)
+				{
+					agg->sum_lag += lag;
+					agg->sum2_lag += lag * lag;
+
+					if ((agg->cnt == 1) || (lag < agg->min_lag))
+						agg->min_lag = lag;
+					if ((agg->cnt == 1) || (lag > agg->max_lag))
+						agg->max_lag = lag;
+				}
+			}
+			else
+			{
+				/*
+				 * Loop until we reach the interval of the current
+				 * transaction (and print all the empty intervals in
+				 * between).
+				 */
+				while (agg->start_time + agg_interval < INSTR_TIME_GET_DOUBLE(now))
+				{
+					/*
+					 * This is a non-Windows branch (thanks to the
+					 * ifdef in usage), so we don't need to handle
+					 * this in a special way (see below).
+					 */
+					fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f",
+							agg->start_time,
+							agg->cnt,
+							agg->sum_latency,
+							agg->sum2_latency,
+							agg->min_latency,
+							agg->max_latency);
+					if (throttle_delay) {
+						fprintf(logfile, " %.0f %.0f %.0f %.0f",
+								agg->sum_lag,
+								agg->sum2_lag,
+								agg->min_lag,
+								agg->max_lag);
+						if (latency_limit)
+							fprintf(logfile, " %d", agg->skipped);
+					}
+					fputc('\n', logfile);
+
+					/* move to the next inteval */
+					agg->start_time = agg->start_time + agg_interval;
+
+					/* reset for "no transaction" intervals */
+					agg->cnt = 0;
+					agg->min_latency = -1.0;
+					agg->max_latency = -1.0;
+					agg->sum_latency = 0;
+					agg->sum2_latency = 0;
+					agg->min_lag = 0;
+					agg->max_lag = 0;
+					agg->sum_lag = 0;
+					agg->sum2_lag = 0;
+					agg->skipped = 0;
+				}
+
+				/*
+				 * and now update the reset values (include the
+				 * current)
+				 */
+				agg->cnt = 1;
+				agg->skipped = skipped? 1: 0;
+				agg->min_latency = latency;
+				agg->max_latency = latency;
+				agg->sum_latency = skipped? 0.0: latency;
+				agg->sum2_latency = skipped? 0.0: latency * latency;
+				agg->min_lag = lag;
+				agg->max_lag = lag;
+				agg->sum_lag = lag;
+				agg->sum2_lag = lag * lag;
+			}
+		}
+		else
+		{
+			/* no, print raw transactions */
+#ifndef WIN32
+
+			/*
+			 * This is more than we really ought to know about
+			 * instr_time
+			 */
+			if (skipped)
+				fprintf(logfile, "%d %d skipped %d %ld %ld",
+						st->id, st->cnt, st->use_file,
+						(long) now.tv_sec, (long) now.tv_usec);
+			else
+				fprintf(logfile, "%d %d %.0f %d %ld %ld",
+						st->id, st->cnt, latency, st->use_file,
+						(long) now.tv_sec, (long) now.tv_usec);
+#else
+
+			/*
+			 * On Windows, instr_time doesn't provide a timestamp
+			 * anyway
+			 */
+			if (skipped)
+				fprintf(logfile, "%d %d skipped %d 0 0",
+						st->id, st->cnt, st->use_file);
+			else
+				fprintf(logfile, "%d %d %.0f %d 0 0",
+						st->id, st->cnt, usec, st->use_file);
+#endif
+			if (throttle_delay)
+				fprintf(logfile, " %.0f", lag);
+			fputc('\n', logfile);
+		}
+	}
+}
+
+
 /* return false iff client should be disconnected */
 static bool
 doCustom(TState *thread, CState *st, instr_time *conn_time, FILE *logfile, AggVals *agg)
@@ -1038,6 +1223,27 @@ top:
 
 		thread->throttle_trigger += wait;
 
+		if (latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - latency_limit)
+			{
+				/* if too far behind, this slot is skipped, and we
+				 * iterate till the next nearly on time slot.
+				 */
+				int64 wait = getPoissonRand(thread, throttle_delay);
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped ++;
+
+				/* record in log if required */
+				if (logfile)
+					doLog(thread, st, logfile, agg, true);
+			}
+		}
+
 		st->txn_scheduled = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -1110,8 +1316,11 @@ top:
 			thread->exec_count[cnum]++;
 		}
 
-		/* transaction finished: record latency under progress or throttling */
-		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		/* transaction finished: record latency under progress or throttling,
+		 * or if latency limit is set
+		 */
+		if ((progress || throttle_delay || latency_limit) &&
+			commands[st->state + 1] == NULL)
 		{
 			int64		latency;
 
@@ -1133,149 +1342,18 @@ top:
 			 * would take 256 hours.
 			 */
 			st->txn_sqlats += latency * latency;
+
+			/* record over the limit transactions if needed.
+			 */
+			if (latency_limit && latency > latency_limit)
+				thread->latency_late++;
 		}
 
 		/*
 		 * if transaction finished, record the time it took in the log
 		 */
 		if (logfile && commands[st->state + 1] == NULL)
-		{
-			double		lag;
-			double		latency;
-
-			/*
-			 * write the log entry if this row belongs to the random sample,
-			 * or no sampling rate was given which means log everything.
-			 */
-			if (sample_rate == 0.0 ||
-				pg_erand48(thread->random_state) <= sample_rate)
-			{
-				if (!now_valid)
-				{
-					INSTR_TIME_SET_CURRENT(now);
-					now_valid = true;
-				}
-				latency = (double) (INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled);
-				lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
-
-				/* should we aggregate the results or not? */
-				if (agg_interval > 0)
-				{
-					/*
-					 * are we still in the same interval? if yes, accumulate
-					 * the values (print them otherwise)
-					 */
-					if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(now))
-					{
-						agg->cnt += 1;
-						agg->sum_latency += latency;
-						agg->sum2_latency += latency * latency;
-
-						/* first in this aggregation interval */
-						if ((agg->cnt == 1) || (latency < agg->min_latency))
-							agg->min_latency = latency;
-
-						if ((agg->cnt == 1) || (latency > agg->max_latency))
-							agg->max_latency = latency;
-
-						/* and the same for schedule lag */
-						if (throttle_delay)
-						{
-							agg->sum_lag += lag;
-							agg->sum2_lag += lag * lag;
-
-							if ((agg->cnt == 1) || (lag < agg->min_lag))
-								agg->min_lag = lag;
-							if ((agg->cnt == 1) || (lag > agg->max_lag))
-								agg->max_lag = lag;
-						}
-					}
-					else
-					{
-						/*
-						 * Loop until we reach the interval of the current
-						 * transaction (and print all the empty intervals in
-						 * between).
-						 */
-						while (agg->start_time + agg_interval < INSTR_TIME_GET_DOUBLE(now))
-						{
-							/*
-							 * This is a non-Windows branch (thanks to the
-							 * ifdef in usage), so we don't need to handle
-							 * this in a special way (see below).
-							 */
-							fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f",
-									agg->start_time,
-									agg->cnt,
-									agg->sum_latency,
-									agg->sum2_latency,
-									agg->min_latency,
-									agg->max_latency);
-							if (throttle_delay)
-								fprintf(logfile, " %.0f %.0f %.0f %.0f",
-										agg->sum_lag,
-										agg->sum2_lag,
-										agg->min_lag,
-										agg->max_lag);
-							fputc('\n', logfile);
-
-							/* move to the next inteval */
-							agg->start_time = agg->start_time + agg_interval;
-
-							/* reset for "no transaction" intervals */
-							agg->cnt = 0;
-							agg->min_latency = 0;
-							agg->max_latency = 0;
-							agg->sum_latency = 0;
-							agg->sum2_latency = 0;
-							agg->min_lag = 0;
-							agg->max_lag = 0;
-							agg->sum_lag = 0;
-							agg->sum2_lag = 0;
-						}
-
-						/*
-						 * and now update the reset values (include the
-						 * current)
-						 */
-						agg->cnt = 1;
-						agg->min_latency = latency;
-						agg->max_latency = latency;
-						agg->sum_latency = latency;
-						agg->sum2_latency = latency * latency;
-						agg->min_lag = lag;
-						agg->max_lag = lag;
-						agg->sum_lag = lag;
-						agg->sum2_lag = lag * lag;
-					}
-				}
-				else
-				{
-					/* no, print raw transactions */
-#ifndef WIN32
-
-					/*
-					 * This is more than we really ought to know about
-					 * instr_time
-					 */
-					fprintf(logfile, "%d %d %.0f %d %ld %ld",
-							st->id, st->cnt, latency, st->use_file,
-							(long) now.tv_sec, (long) now.tv_usec);
-#else
-
-					/*
-					 * On Windows, instr_time doesn't provide a timestamp
-					 * anyway
-					 */
-					fprintf(logfile, "%d %d %.0f %d 0 0",
-							st->id, st->cnt, latency, st->use_file);
-#endif
-					if (throttle_delay)
-						fprintf(logfile, " %.0f", lag);
-					fputc('\n', logfile);
-				}
-			}
-		}
+			doLog(thread, st, logfile, agg, false);
 
 		if (commands[st->state]->type == SQL_COMMAND)
 		{
@@ -1360,7 +1438,7 @@ top:
 	}
 
 	/* Record transaction start time under logging, progress or throttling */
-	if ((logfile || progress || throttle_delay) && st->state == 0)
+	if ((logfile || progress || throttle_delay || latency_limit) && st->state == 0)
 	{
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
@@ -2426,7 +2504,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped, int64 latency_late)
 {
 	double		time_include,
 				tps_include,
@@ -2465,7 +2544,17 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			   normal_xacts);
 	}
 
-	if (throttle_delay || progress)
+	if (throttle_delay && latency_limit)
+		printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n",
+			   throttle_latency_skipped,
+			   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
+
+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT " (%.3f %%)\n",
+			   latency_limit / 1000.0, latency_late,
+			   100.0 * latency_late / (throttle_latency_skipped + normal_xacts));
+
+	if (throttle_delay || progress || latency_limit)
 	{
 		/* compute and show latency average and standard deviation */
 		double		latency = 0.001 * total_latencies / normal_xacts;
@@ -2483,7 +2572,6 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 	}
 
 	if (throttle_delay)
-	{
 		/*
 		 * Report average transaction lag under rate limit throttling.  This
 		 * is the delay between scheduled and actual start times for the
@@ -2492,7 +2580,6 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
-	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
 	printf("tps = %f (excluding connections establishing)\n", tps_exclude);
@@ -2580,6 +2667,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2609,6 +2697,8 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
+	int64		latency_late = 0;
 
 	int			i;
 
@@ -2653,7 +2743,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2850,6 +2940,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -3145,6 +3247,8 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
+		thread->latency_late = 0;
 
 		if (is_latencies)
 		{
@@ -3219,6 +3323,8 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
+			latency_late += r->latency_late;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3241,7 +3347,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped,
+				 latency_late);
 
 	return 0;
 }
@@ -3266,7 +3373,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3469,7 +3577,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3492,23 +3601,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3527,7 +3639,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3552,23 +3665,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3589,6 +3705,9 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
+	result->latency_late = thread->latency_late;
+
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index c4e0cbd..d3e8985 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,23 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under normal operations, transaction which last more than
+        <replaceable>limit</> milliseconds are counted and reported.
+       </para>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind
+        schedule by <replaceable>limit</> ms are simply skipped and counted
+        as such, then transactions finished later than the scheduled time plus
+        <replaceable>limit</> are counted and reported.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -439,7 +456,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the transaction latency reported at the
@@ -940,7 +959,7 @@ END;
    The format of the log is:
 
 <synopsis>
-<replaceable>client_id</> <replaceable>transaction_no</> <replaceable>time</> <replaceable>file_no</> <replaceable>time_epoch</> <replaceable>time_us</> [<replaceable>schedule_lag</replaceable>]
+<replaceable>client_id</> <replaceable>transaction_no</> <replaceable>time</> <replaceable>file_no</> <replaceable>time_epoch</> <replaceable>time_us</> [<replaceable>schedule_lag</replaceable> [<replaceable>skipped_transactions</>]]
 </synopsis>
 
    where <replaceable>time</> is the total elapsed transaction time in microseconds,
@@ -950,10 +969,13 @@ END;
    UNIX epoch format timestamp and an offset
    in microseconds (suitable for creating an ISO 8601
    timestamp with fractional seconds) showing when
-   the transaction completed. The last field, <replaceable>schedule_lag</>, is
-   the difference between the transaction's scheduled start time, and the
-   time it actually started, in microseconds. It is only present when the
-   <option>--rate</> option is used.
+   the transaction completed.
+   Field <replaceable>schedule_lag</> is the difference between the transaction's
+   scheduled start time, and the time it actually started, in microseconds. It is only present
+   when the <option>--rate</> option is used.
+   The last field <replaceable>skipped_transactions</> reports the number of transactions
+   skipped because they were too far behind schedule. It is only present when both options
+   <option>--rate</> and <option>--limit</> are used.
   </para>
 
   <para>
@@ -974,12 +996,12 @@ END;
 
  <refsect2>
   <title>Aggregated Logging</title>
-  
+
   <para>
    With the <option>--aggregate-interval</option> option, the logs use a bit different format:
 
 <synopsis>
-<replaceable>interval_start</> <replaceable>num_of_transactions</> <replaceable>latency_sum</> <replaceable>latency_2_sum</> <replaceable>min_latency</> <replaceable>max_latency</> [<replaceable>lag_sum</> <replaceable>lag_2_sum</> <replaceable>min_lag</> <replaceable>max_lag</>]
+<replaceable>interval_start</> <replaceable>num_of_transactions</> <replaceable>latency_sum</> <replaceable>latency_2_sum</> <replaceable>min_latency</> <replaceable>max_latency</> [<replaceable>lag_sum</> <replaceable>lag_2_sum</> <replaceable>min_lag</> <replaceable>max_lag</> [<replaceable>skipped_transactions</>]]
 </synopsis>
 
    where <replaceable>interval_start</> is the start of the interval (UNIX epoch
@@ -990,8 +1012,9 @@ END;
    <replaceable>latency_2_sum</> is a sum of 2nd powers of latencies. The last two
    fields are <replaceable>min_latency</> - a minimum latency within the interval, and
    <replaceable>max_latency</> - maximum latency within the interval. A transaction is
-   counted into the interval when it was committed. The last four fields, 
-   <replaceable>lag_sum</>, <replaceable>lag_2_sum</>, <replaceable>min_lag</>, and <replaceable>max_lag</>, are only present if the --rate option is used.
+   counted into the interval when it was committed. The fields in the end,
+   <replaceable>lag_sum</>, <replaceable>lag_2_sum</>, <replaceable>min_lag</>, and <replaceable>max_lag</>, are only present if the <option>--rate</> option is used.
+   The very last one <replaceable>skipped_transactions</> also requires option <option>--limit</>.
    They are calculated from the time each transaction had to wait for the
    previous one to finish, i.e. the difference between each transaction's
    scheduled start time and the time it actually started.

#47

Robert Haas

robertmhaas@gmail.com

over 11 years ago

In reply to: Heikki Linnakangas (#46)

Re: pgbench throttling latency limit

On Mon, Sep 15, 2014 at 6:34 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Please have a look. I have not looked at the docs changes yet.

One thing that needs some thinking and changing is the progress reporting.
It currently looks like this:

progress: 1.0 s, 4863.0 tps, lat 3.491 ms stddev 2.487, lag 1.809 ms, 99
skipped
progress: 2.0 s, 5042.8 tps, lat 3.265 ms stddev 2.264, lag 1.584 ms, 16
skipped
progress: 3.0 s, 4926.1 tps, lat 2.731 ms stddev 2.371, lag 1.196 ms, 45
skipped
progress: 4.0 s, 4963.9 tps, lat 1.904 ms stddev 1.212, lag 0.429 ms, 0
skipped
progress: 5.0 s, 4971.2 tps, lat 2.592 ms stddev 1.722, lag 0.975 ms, 0
skipped

The absolute number of skipped transactions doesn't make much sense when all
the other numbers are averages, and tps is a 1/s value. If you don't know
the total number of transactions executed, the absolute number is
meaningless. (Although you can calculate the absolute number of transactions
executed by multiplying the TPS value with the interval). I think a
percentage would be better here.

Should we also print the number of late transactions here? I think that
would be an even more important detail than the number of skipped
transactions. It might be better to print only the percentage of late
transactions, including skipped ones. Or both, but it's difficult to cram
everything on a single line. This needs some further thinking..

I'm not sure I like the idea of printing a percentage. It might be
unclear what the denominator was if somebody feels the urge to work
back to the actual number of skipped transactions. I mean, I guess
it's probably just the value you passed to -R, so maybe that's easy
enough, but then why bother dividing in the first place? The user can
do that easily enough if they want the data that way.

I agree with you that it would be good to get some statistics on
late/skipped transactions, but it's not obvious what people will want.
Late transactions, straight up? Late by more than a threshold value?

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#48

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Robert Haas (#47)

Re: pgbench throttling latency limit

I'm not sure I like the idea of printing a percentage. It might be
unclear what the denominator was if somebody feels the urge to work
back to the actual number of skipped transactions. I mean, I guess
it's probably just the value you passed to -R, so maybe that's easy
enough, but then why bother dividing in the first place? The user can
do that easily enough if they want the data that way.

Indeed "skipped" and "late" per second may have an unclear denominator. If
you divide by the time, the unit would be "tps", but 120 tps performance
including 20 late tps, plus 10 skipped tps... I do not think it is that
clear. Reporting "tps" for transaction *not* performed looks strange.

Maybe late transactions could be given as a percentage of all processed
transactions in the interval. But for skipped the percentage of what? The
only number that would make sense is the total number of transactions
schedule in the interval, but that would mean that the denominator for
late would be different than the denominator for skipped, which is
basically uncomprehensible.

I agree with you that it would be good to get some statistics on
late/skipped transactions, but it's not obvious what people will want.
Late transactions, straight up? Late by more than a threshold value?

Yes.

Under throttling transaction are given a schedule start time. When the
transactions can actually start:

(1) if it is already more late (before even starting) than the latency
limit (a threshold), it is *NOT* started, but counted "skipped"

(2) otherwise it is started. When it finishes, it may be
(2a) out of the latency limit (scheduled time + limit)
=> it is counted as "late"
(2b) within the latency limit
=> all is well

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#49

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#48)

2 attachment(s)

Re: pgbench throttling latency limit

On 09/15/2014 08:46 PM, Fabien COELHO wrote:

I'm not sure I like the idea of printing a percentage. It might be
unclear what the denominator was if somebody feels the urge to work
back to the actual number of skipped transactions. I mean, I guess
it's probably just the value you passed to -R, so maybe that's easy
enough, but then why bother dividing in the first place? The user can
do that easily enough if they want the data that way.

Indeed "skipped" and "late" per second may have an unclear denominator. If
you divide by the time, the unit would be "tps", but 120 tps performance
including 20 late tps, plus 10 skipped tps... I do not think it is that
clear. Reporting "tps" for transaction *not* performed looks strange.

Maybe late transactions could be given as a percentage of all processed
transactions in the interval. But for skipped the percentage of what? The
only number that would make sense is the total number of transactions
schedule in the interval, but that would mean that the denominator for
late would be different than the denominator for skipped, which is
basically uncomprehensible.

Hmm. I guess the absolute number makes sense, if you expect that there
are normally zero skipped transactions, or at least a very small number.
It's more like a "good or no good" indicator. Ok, I'm fine with that.

The version I'm now working on prints output like this:

$ ./pgbench -T10 -P1 --rate=1600 --latency-limit=10
starting vacuum...end.
progress: 1.0 s, 1579.0 tps, lat 2.973 ms stddev 2.493, lag 2.414 ms, 4 skipped
progress: 2.0 s, 1570.0 tps, lat 2.140 ms stddev 1.783, lag 1.599 ms, 0 skipped
progress: 3.0 s, 1663.0 tps, lat 2.372 ms stddev 1.742, lag 1.843 ms, 4 skipped
progress: 4.0 s, 1603.2 tps, lat 2.435 ms stddev 2.247, lag 1.902 ms, 13 skipped
progress: 5.0 s, 1540.9 tps, lat 1.845 ms stddev 1.270, lag 1.303 ms, 0 skipped
progress: 6.0 s, 1588.0 tps, lat 1.630 ms stddev 1.003, lag 1.097 ms, 0 skipped
progress: 7.0 s, 1577.0 tps, lat 2.071 ms stddev 1.445, lag 1.517 ms, 0 skipped
progress: 8.0 s, 1669.9 tps, lat 2.375 ms stddev 1.917, lag 1.846 ms, 0 skipped
progress: 9.0 s, 1636.0 tps, lat 2.801 ms stddev 2.354, lag 2.250 ms, 5 skipped
progress: 10.0 s, 1606.1 tps, lat 2.751 ms stddev 2.117, lag 2.197 ms, 2 skipped
transaction type: TPC-B (sort of)
scaling factor: 5
query mode: simple
number of clients: 1
number of threads: 1
duration: 10 s
number of transactions actually processed: 16034
number of transactions skipped: 28 (0.174 %)
number of transactions above the 10.0 ms latency limit: 70 (0.436 %)
latency average: 2.343 ms
latency stddev: 1.940 ms
rate limit schedule lag: avg 1.801 (max 9.994) ms
tps = 1603.370819 (including connections establishing)
tps = 1603.619536 (excluding connections establishing)

Those progress lines are 79 or 80 characters wide, so they *just* fit in
a 80-char terminal. Of course, if any of the printed numbers were
higher, it would not fit. I don't see how to usefully make it more
terse, though. I think we can live with this - these days it shouldn't
be a huge problem to enlare the terminal to make the output fit.

Here are new patches, again the first one is just refactoring, and the
second one contains this feature. I'm planning to commit the first one
shortly, and the second one later after people have had a chance to look
at it.

Greg: As the author of pgbench-tools, what do you think of this patch?
The log file format, in particular.

- Heikki

Attachments:

0001-Refactor-pgbench-log-writing-code-to-a-separate-func.patchtext/x-diff; name=0001-Refactor-pgbench-log-writing-code-to-a-separate-func.patchDownload

>From 512fde5dc3fde5fc1368b3bf0c09e3ea8e022fad Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 2 Oct 2014 12:58:14 +0300
Subject: [PATCH 1/2] Refactor pgbench log-writing code to a separate function.

The doCustom function was incredibly long, this makes it a little bit more
readable.
---
 contrib/pgbench/pgbench.c | 340 +++++++++++++++++++++++-----------------------
 1 file changed, 169 insertions(+), 171 deletions(-)

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 087e0d3..c14a577 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -347,6 +347,9 @@ static char *select_only = {
 static void setalarm(int seconds);
 static void *threadRun(void *arg);
 
+static void doLog(TState *thread, CState *st, FILE *logfile, instr_time *now,
+	  AggVals *agg);
+
 static void
 usage(void)
 {
@@ -1016,6 +1019,16 @@ doCustom(TState *thread, CState *st, instr_time *conn_time, FILE *logfile, AggVa
 	PGresult   *res;
 	Command   **commands;
 	bool		trans_needs_throttle = false;
+	instr_time	now;
+
+	/*
+	 * gettimeofday() isn't free, so we get the current timestamp lazily the
+	 * first time it's needed, and reuse the same value throughout this
+	 * function after that. This also ensures that e.g. the calculated latency
+	 * reported in the log file and in the totals are the same. Zero means
+	 * "not set yet".
+	 */
+	INSTR_TIME_SET_ZERO(now);
 
 top:
 	commands = sql_files[st->use_file];
@@ -1049,10 +1062,10 @@ top:
 
 	if (st->sleeping)
 	{							/* are we sleeping? */
-		instr_time	now;
 		int64		now_us;
 
-		INSTR_TIME_SET_CURRENT(now);
+		if (INSTR_TIME_IS_ZERO(now))
+			INSTR_TIME_SET_CURRENT(now);
 		now_us = INSTR_TIME_GET_MICROSEC(now);
 		if (st->txn_scheduled <= now_us)
 		{
@@ -1074,11 +1087,6 @@ top:
 
 	if (st->listen)
 	{							/* are we receiver? */
-		instr_time	now;
-		bool		now_valid = false;
-
-		INSTR_TIME_SET_ZERO(now); /* initialize to keep compiler quiet */
-
 		if (commands[st->state]->type == SQL_COMMAND)
 		{
 			if (debug)
@@ -1100,181 +1108,40 @@ top:
 		{
 			int			cnum = commands[st->state]->command_num;
 
-			if (!now_valid)
-			{
+			if (INSTR_TIME_IS_ZERO(now))
 				INSTR_TIME_SET_CURRENT(now);
-				now_valid = true;
-			}
 			INSTR_TIME_ACCUM_DIFF(thread->exec_elapsed[cnum],
 								  now, st->stmt_begin);
 			thread->exec_count[cnum]++;
 		}
 
-		/* transaction finished: record latency under progress or throttling */
-		if ((progress || throttle_delay) && commands[st->state + 1] == NULL)
+		/* transaction finished: calculate latency and log the transaction */
+		if (commands[st->state + 1] == NULL)
 		{
-			int64		latency;
-
-			if (!now_valid)
+			/* only calculate latency if an option is used that needs it */
+			if (progress || throttle_delay)
 			{
-				INSTR_TIME_SET_CURRENT(now);
-				now_valid = true;
-			}
-
-			latency = INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled;
+				int64		latency;
 
-			st->txn_latencies += latency;
-
-			/*
-			 * XXX In a long benchmark run of high-latency transactions, this
-			 * int64 addition eventually overflows.  For example, 100 threads
-			 * running 10s transactions will overflow it in 2.56 hours.  With
-			 * a more-typical OLTP workload of .1s transactions, overflow
-			 * would take 256 hours.
-			 */
-			st->txn_sqlats += latency * latency;
-		}
-
-		/*
-		 * if transaction finished, record the time it took in the log
-		 */
-		if (logfile && commands[st->state + 1] == NULL)
-		{
-			double		lag;
-			double		latency;
-
-			/*
-			 * write the log entry if this row belongs to the random sample,
-			 * or no sampling rate was given which means log everything.
-			 */
-			if (sample_rate == 0.0 ||
-				pg_erand48(thread->random_state) <= sample_rate)
-			{
-				if (!now_valid)
-				{
+				if (INSTR_TIME_IS_ZERO(now))
 					INSTR_TIME_SET_CURRENT(now);
-					now_valid = true;
-				}
-				latency = (double) (INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled);
-				lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
-
-				/* should we aggregate the results or not? */
-				if (agg_interval > 0)
-				{
-					/*
-					 * are we still in the same interval? if yes, accumulate
-					 * the values (print them otherwise)
-					 */
-					if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(now))
-					{
-						agg->cnt += 1;
-						agg->sum_latency += latency;
-						agg->sum2_latency += latency * latency;
-
-						/* first in this aggregation interval */
-						if ((agg->cnt == 1) || (latency < agg->min_latency))
-							agg->min_latency = latency;
-
-						if ((agg->cnt == 1) || (latency > agg->max_latency))
-							agg->max_latency = latency;
-
-						/* and the same for schedule lag */
-						if (throttle_delay)
-						{
-							agg->sum_lag += lag;
-							agg->sum2_lag += lag * lag;
-
-							if ((agg->cnt == 1) || (lag < agg->min_lag))
-								agg->min_lag = lag;
-							if ((agg->cnt == 1) || (lag > agg->max_lag))
-								agg->max_lag = lag;
-						}
-					}
-					else
-					{
-						/*
-						 * Loop until we reach the interval of the current
-						 * transaction (and print all the empty intervals in
-						 * between).
-						 */
-						while (agg->start_time + agg_interval < INSTR_TIME_GET_DOUBLE(now))
-						{
-							/*
-							 * This is a non-Windows branch (thanks to the
-							 * ifdef in usage), so we don't need to handle
-							 * this in a special way (see below).
-							 */
-							fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f",
-									agg->start_time,
-									agg->cnt,
-									agg->sum_latency,
-									agg->sum2_latency,
-									agg->min_latency,
-									agg->max_latency);
-							if (throttle_delay)
-								fprintf(logfile, " %.0f %.0f %.0f %.0f",
-										agg->sum_lag,
-										agg->sum2_lag,
-										agg->min_lag,
-										agg->max_lag);
-							fputc('\n', logfile);
-
-							/* move to the next inteval */
-							agg->start_time = agg->start_time + agg_interval;
-
-							/* reset for "no transaction" intervals */
-							agg->cnt = 0;
-							agg->min_latency = 0;
-							agg->max_latency = 0;
-							agg->sum_latency = 0;
-							agg->sum2_latency = 0;
-							agg->min_lag = 0;
-							agg->max_lag = 0;
-							agg->sum_lag = 0;
-							agg->sum2_lag = 0;
-						}
-
-						/*
-						 * and now update the reset values (include the
-						 * current)
-						 */
-						agg->cnt = 1;
-						agg->min_latency = latency;
-						agg->max_latency = latency;
-						agg->sum_latency = latency;
-						agg->sum2_latency = latency * latency;
-						agg->min_lag = lag;
-						agg->max_lag = lag;
-						agg->sum_lag = lag;
-						agg->sum2_lag = lag * lag;
-					}
-				}
-				else
-				{
-					/* no, print raw transactions */
-#ifndef WIN32
-
-					/*
-					 * This is more than we really ought to know about
-					 * instr_time
-					 */
-					fprintf(logfile, "%d %d %.0f %d %ld %ld",
-							st->id, st->cnt, latency, st->use_file,
-							(long) now.tv_sec, (long) now.tv_usec);
-#else
-
-					/*
-					 * On Windows, instr_time doesn't provide a timestamp
-					 * anyway
-					 */
-					fprintf(logfile, "%d %d %.0f %d 0 0",
-							st->id, st->cnt, latency, st->use_file);
-#endif
-					if (throttle_delay)
-						fprintf(logfile, " %.0f", lag);
-					fputc('\n', logfile);
-				}
+				latency = INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled;
+
+				st->txn_latencies += latency;
+
+				/*
+				 * XXX In a long benchmark run of high-latency transactions,
+				 * this int64 addition eventually overflows.  For example, 100
+				 * threads running 10s transactions will overflow it in 2.56
+				 * hours.  With a more-typical OLTP workload of .1s
+				 * transactions, overflow would take 256 hours.
+				 */
+				st->txn_sqlats += latency * latency;
 			}
+
+			/* record the time it took in the log */
+			if (logfile)
+				doLog(thread, st, logfile, &now, agg);
 		}
 
 		if (commands[st->state]->type == SQL_COMMAND)
@@ -1734,6 +1601,137 @@ top:
 	return true;
 }
 
+/*
+ * print log entry after completing one transaction.
+ */
+static void
+doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
+{
+	double		lag;
+	double		latency;
+
+	/*
+	 * Skip the log entry if sampling is enabled and this row doesn't belong
+	 * to the random sample.
+	 */
+	if (sample_rate != 0.0 &&
+		pg_erand48(thread->random_state) > sample_rate)
+		return;
+
+	if (INSTR_TIME_IS_ZERO(*now))
+		INSTR_TIME_SET_CURRENT(*now);
+
+	latency = (double) (INSTR_TIME_GET_MICROSEC(*now) - st->txn_scheduled);
+	lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
+
+	/* should we aggregate the results or not? */
+	if (agg_interval > 0)
+	{
+		/*
+		 * Are we still in the same interval? If yes, accumulate the values
+		 * (print them otherwise)
+		 */
+		if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(*now))
+		{
+			agg->cnt += 1;
+			agg->sum_latency += latency;
+			agg->sum2_latency += latency * latency;
+
+			/* first in this aggregation interval */
+			if ((agg->cnt == 1) || (latency < agg->min_latency))
+				agg->min_latency = latency;
+
+			if ((agg->cnt == 1) || (latency > agg->max_latency))
+				agg->max_latency = latency;
+
+			/* and the same for schedule lag */
+			if (throttle_delay)
+			{
+				agg->sum_lag += lag;
+				agg->sum2_lag += lag * lag;
+
+				if ((agg->cnt == 1) || (lag < agg->min_lag))
+					agg->min_lag = lag;
+				if ((agg->cnt == 1) || (lag > agg->max_lag))
+					agg->max_lag = lag;
+			}
+		}
+		else
+		{
+			/*
+			 * Loop until we reach the interval of the current transaction
+			 * (and print all the empty intervals in between).
+			 */
+			while (agg->start_time + agg_interval < INSTR_TIME_GET_DOUBLE(*now))
+			{
+				/*
+				 * This is a non-Windows branch (thanks to the
+				 * ifdef in usage), so we don't need to handle
+				 * this in a special way (see below).
+				 */
+				fprintf(logfile, "%ld %d %.0f %.0f %.0f %.0f",
+						agg->start_time,
+						agg->cnt,
+						agg->sum_latency,
+						agg->sum2_latency,
+						agg->min_latency,
+						agg->max_latency);
+				if (throttle_delay)
+					fprintf(logfile, " %.0f %.0f %.0f %.0f",
+							agg->sum_lag,
+							agg->sum2_lag,
+							agg->min_lag,
+							agg->max_lag);
+				fputc('\n', logfile);
+
+				/* move to the next inteval */
+				agg->start_time = agg->start_time + agg_interval;
+
+				/* reset for "no transaction" intervals */
+				agg->cnt = 0;
+				agg->min_latency = 0;
+				agg->max_latency = 0;
+				agg->sum_latency = 0;
+				agg->sum2_latency = 0;
+				agg->min_lag = 0;
+				agg->max_lag = 0;
+				agg->sum_lag = 0;
+				agg->sum2_lag = 0;
+			}
+
+			/* reset the values to include only the current transaction. */
+			agg->cnt = 1;
+			agg->min_latency = latency;
+			agg->max_latency = latency;
+			agg->sum_latency = latency;
+			agg->sum2_latency = latency * latency;
+			agg->min_lag = lag;
+			agg->max_lag = lag;
+			agg->sum_lag = lag;
+			agg->sum2_lag = lag * lag;
+		}
+	}
+	else
+	{
+		/* no, print raw transactions */
+#ifndef WIN32
+
+		/* This is more than we really ought to know about instr_time */
+		fprintf(logfile, "%d %d %.0f %d %ld %ld",
+				st->id, st->cnt, latency, st->use_file,
+				(long) now->tv_sec, (long) now->tv_usec);
+#else
+
+		/* On Windows, instr_time doesn't provide a timestamp anyway */
+		fprintf(logfile, "%d %d %.0f %d 0 0",
+				st->id, st->cnt, latency, st->use_file);
+#endif
+		if (throttle_delay)
+			fprintf(logfile, " %.0f", lag);
+		fputc('\n', logfile);
+	}
+}
+
 /* discard connections */
 static void
 disconnect_all(CState *state, int length)
-- 
2.1.1

0002-Add-latency-limit-option-to-pgbench.patchtext/x-diff; name=0002-Add-latency-limit-option-to-pgbench.patchDownload

>From 834cc53a95bda4710e3364643e56772451170d38 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 2 Oct 2014 13:42:13 +0300
Subject: [PATCH 2/2] Add --latency-limit option to pgbench.

This allows transactions that take longer than specified limit to be counted
separately. With --rate, transactions that are already late by the time we
get to execute them are skipped altogether. Using --latency-limit with
--rate allows you to "catch up" more quickly, if there's a hickup in the
server causing a lot of transactions to stall momentarily.

Fabien COELHO, reviewed by Rukh Meski and heavily refactored by me.
---
 contrib/pgbench/pgbench.c | 228 +++++++++++++++++++++++++++++++++++-----------
 doc/src/sgml/pgbench.sgml |  43 +++++++--
 2 files changed, 207 insertions(+), 64 deletions(-)

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index c14a577..1ed412c 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,18 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * Transactions which take longer that this limit are counted as late
+ * and reported as such, although they are completed anyway.
+ *
+ * When under throttling: execution time slots which are more than
+ * this late (in us) are simply skipped, and the corresponding transaction
+ * is counted as such... it is not even started;
+ * otherwise above the limit transactions are counted as such, with the latency
+ * measured wrt the transaction schedule, not its actual start.
+ */
+int64		latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +250,8 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
+	int64		latency_late;	/* late transactions */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +264,8 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
+	int64		latency_late;
 } TResult;
 
 /*
@@ -284,6 +300,8 @@ typedef struct
 
 	long		start_time;		/* when does the interval start */
 	int			cnt;			/* number of transactions */
+	int			skipped;		/* number of transactions skipped under
+								 * --rate and --latency-limit */
 
 	double		min_latency;	/* min/max latencies */
 	double		max_latency;
@@ -348,7 +366,7 @@ static void setalarm(int seconds);
 static void *threadRun(void *arg);
 
 static void doLog(TState *thread, CState *st, FILE *logfile, instr_time *now,
-	  AggVals *agg);
+	  AggVals *agg, bool skipped);
 
 static void
 usage(void)
@@ -375,6 +393,10 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --latency-limit=NUM  count transactions lasting more than NUM ms.\n"
+		   "                           under throttling (--rate), transactions behind schedule\n"
+		   "                           more than NUM ms are skipped, and those finishing more\n"
+		   "                           than NUM ms after their scheduled start are counted.\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -994,7 +1016,9 @@ void
 agg_vals_init(AggVals *aggs, instr_time start)
 {
 	/* basic counters */
-	aggs->cnt = 0;				/* number of transactions */
+	aggs->cnt = 0;				/* number of transactions (includes skipped) */
+	aggs->skipped = 0;			/* xacts skipped under --rate --latency-limit */
+
 	aggs->sum_latency = 0;		/* SUM(latency) */
 	aggs->sum2_latency = 0;				/* SUM(latency*latency) */
 
@@ -1050,6 +1074,30 @@ top:
 		int64		wait = getPoissonRand(thread, throttle_delay);
 
 		thread->throttle_trigger += wait;
+		
+		/*
+		 * If this --lateny-limit is used, and this slot is already late so
+		 * that the transaction will miss the latency limit even if it
+		 * completed immediately, we skip this time slot and iterate till the
+		 * next slot that isn't late yet.
+		 */
+		if (latency_limit)
+		{
+			int64		now_us;
+
+			if (INSTR_TIME_IS_ZERO(now))
+				INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - latency_limit)
+			{
+				wait = getPoissonRand(thread, throttle_delay);
+				thread->throttle_trigger += wait;
+				thread->throttle_latency_skipped++;
+
+				if (logfile)
+					doLog(thread, st, logfile, &now, agg, true);
+			}
+		}
 
 		st->txn_scheduled = thread->throttle_trigger;
 		st->sleeping = 1;
@@ -1119,12 +1167,13 @@ top:
 		if (commands[st->state + 1] == NULL)
 		{
 			/* only calculate latency if an option is used that needs it */
-			if (progress || throttle_delay)
+			if (progress || throttle_delay || latency_limit)
 			{
 				int64		latency;
 
 				if (INSTR_TIME_IS_ZERO(now))
 					INSTR_TIME_SET_CURRENT(now);
+
 				latency = INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled;
 
 				st->txn_latencies += latency;
@@ -1137,11 +1186,15 @@ top:
 				 * transactions, overflow would take 256 hours.
 				 */
 				st->txn_sqlats += latency * latency;
+
+				/* record over the limit transactions if needed. */
+				if (latency_limit && latency > latency_limit)
+					thread->latency_late++;
 			}
 
 			/* record the time it took in the log */
 			if (logfile)
-				doLog(thread, st, logfile, &now, agg);
+				doLog(thread, st, logfile, &now, agg, false);
 		}
 
 		if (commands[st->state]->type == SQL_COMMAND)
@@ -1227,7 +1280,7 @@ top:
 	}
 
 	/* Record transaction start time under logging, progress or throttling */
-	if ((logfile || progress || throttle_delay) && st->state == 0)
+	if ((logfile || progress || throttle_delay || latency_limit) && st->state == 0)
 	{
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
@@ -1605,7 +1658,8 @@ top:
  * print log entry after completing one transaction.
  */
 static void
-doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
+doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg,
+	  bool skipped)
 {
 	double		lag;
 	double		latency;
@@ -1634,26 +1688,34 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 		if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(*now))
 		{
 			agg->cnt += 1;
-			agg->sum_latency += latency;
-			agg->sum2_latency += latency * latency;
+			if (skipped)
+			{
+				/* there is no latency to record if the transaction was skipped */
+				agg->skipped += 1;
+			}
+			else
+			{
+				agg->sum_latency += latency;
+				agg->sum2_latency += latency * latency;
 
-			/* first in this aggregation interval */
-			if ((agg->cnt == 1) || (latency < agg->min_latency))
-				agg->min_latency = latency;
+				/* first in this aggregation interval */
+				if ((agg->cnt == 1) || (latency < agg->min_latency))
+					agg->min_latency = latency;
 
-			if ((agg->cnt == 1) || (latency > agg->max_latency))
-				agg->max_latency = latency;
+				if ((agg->cnt == 1) || (latency > agg->max_latency))
+					agg->max_latency = latency;
 
-			/* and the same for schedule lag */
-			if (throttle_delay)
-			{
-				agg->sum_lag += lag;
-				agg->sum2_lag += lag * lag;
+				/* and the same for schedule lag */
+				if (throttle_delay)
+				{
+					agg->sum_lag += lag;
+					agg->sum2_lag += lag * lag;
 
-				if ((agg->cnt == 1) || (lag < agg->min_lag))
-					agg->min_lag = lag;
-				if ((agg->cnt == 1) || (lag > agg->max_lag))
-					agg->max_lag = lag;
+					if ((agg->cnt == 1) || (lag < agg->min_lag))
+						agg->min_lag = lag;
+					if ((agg->cnt == 1) || (lag > agg->max_lag))
+						agg->max_lag = lag;
+				}
 			}
 		}
 		else
@@ -1677,11 +1739,15 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 						agg->min_latency,
 						agg->max_latency);
 				if (throttle_delay)
+				{
 					fprintf(logfile, " %.0f %.0f %.0f %.0f",
 							agg->sum_lag,
 							agg->sum2_lag,
 							agg->min_lag,
 							agg->max_lag);
+					if (latency_limit)
+						fprintf(logfile, " %d", agg->skipped);
+				}
 				fputc('\n', logfile);
 
 				/* move to the next inteval */
@@ -1689,6 +1755,7 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 
 				/* reset for "no transaction" intervals */
 				agg->cnt = 0;
+				agg->skipped = 0;
 				agg->min_latency = 0;
 				agg->max_latency = 0;
 				agg->sum_latency = 0;
@@ -1701,10 +1768,11 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 
 			/* reset the values to include only the current transaction. */
 			agg->cnt = 1;
+			agg->skipped = skipped ? 1 : 0;
 			agg->min_latency = latency;
 			agg->max_latency = latency;
-			agg->sum_latency = latency;
-			agg->sum2_latency = latency * latency;
+			agg->sum_latency = skipped ? 0.0 : latency;
+			agg->sum2_latency = skipped ? 0.0 : latency * latency;
 			agg->min_lag = lag;
 			agg->max_lag = lag;
 			agg->sum_lag = lag;
@@ -1717,14 +1785,23 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 #ifndef WIN32
 
 		/* This is more than we really ought to know about instr_time */
-		fprintf(logfile, "%d %d %.0f %d %ld %ld",
-				st->id, st->cnt, latency, st->use_file,
-				(long) now->tv_sec, (long) now->tv_usec);
+		if (skipped)
+			fprintf(logfile, "%d %d skipped %d %ld %ld",
+					st->id, st->cnt, st->use_file,
+					(long) now->tv_sec, (long) now->tv_usec);
+		else
+			fprintf(logfile, "%d %d %.0f %d %ld %ld",
+					st->id, st->cnt, latency, st->use_file,
+					(long) now->tv_sec, (long) now->tv_usec);
 #else
 
 		/* On Windows, instr_time doesn't provide a timestamp anyway */
-		fprintf(logfile, "%d %d %.0f %d 0 0",
-				st->id, st->cnt, latency, st->use_file);
+		if (skipped)
+			fprintf(logfile, "%d %d skipped %d 0 0",
+					st->id, st->cnt, st->use_file);
+		else
+			fprintf(logfile, "%d %d %.0f %d 0 0",
+					st->id, st->cnt, latency, st->use_file);
 #endif
 		if (throttle_delay)
 			fprintf(logfile, " %.0f", lag);
@@ -2424,7 +2501,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped, int64 latency_late)
 {
 	double		time_include,
 				tps_include,
@@ -2463,7 +2541,17 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			   normal_xacts);
 	}
 
-	if (throttle_delay || progress)
+	if (throttle_delay && latency_limit)
+		printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n",
+			   throttle_latency_skipped,
+			   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
+
+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT " (%.3f %%)\n",
+			   latency_limit / 1000.0, latency_late,
+			   100.0 * latency_late / (throttle_latency_skipped + normal_xacts));
+
+	if (throttle_delay || progress || latency_limit)
 	{
 		/* compute and show latency average and standard deviation */
 		double		latency = 0.001 * total_latencies / normal_xacts;
@@ -2578,6 +2666,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"latency-limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2607,6 +2696,8 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
+	int64		latency_late = 0;
 
 	int			i;
 
@@ -2651,7 +2742,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2848,6 +2939,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -3143,6 +3246,8 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
+		thread->latency_late = 0;
 
 		if (is_latencies)
 		{
@@ -3217,6 +3322,8 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
+			latency_late += r->latency_late;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3239,7 +3346,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped,
+				 latency_late);
 
 	return 0;
 }
@@ -3264,7 +3372,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3467,7 +3576,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3490,23 +3600,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", skipped " INT64_FORMAT, skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3525,7 +3638,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3550,23 +3664,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3587,6 +3704,9 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
+	result->latency_late = thread->latency_late;
+
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index c4e0cbd..b6770c6 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,23 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--latency-limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Under normal operations, transaction which last more than
+        <replaceable>limit</> milliseconds are counted and reported.
+       </para>
+       <para>
+        Under throttling option <option>--rate=...</>, transactions behind
+        schedule by <replaceable>limit</> ms are simply skipped and counted
+        as such, then transactions finished later than the scheduled time plus
+        <replaceable>limit</> are counted and reported.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -439,7 +456,9 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. It can be changed with
+        option <option>--latency-limit</> which skips and counts transactions
+        late by more than this delay.
        </para>
        <para>
         When throttling is active, the transaction latency reported at the
@@ -940,7 +959,7 @@ END;
    The format of the log is:
 
 <synopsis>
-<replaceable>client_id</> <replaceable>transaction_no</> <replaceable>time</> <replaceable>file_no</> <replaceable>time_epoch</> <replaceable>time_us</> [<replaceable>schedule_lag</replaceable>]
+<replaceable>client_id</> <replaceable>transaction_no</> <replaceable>time</> <replaceable>file_no</> <replaceable>time_epoch</> <replaceable>time_us</> [<replaceable>schedule_lag</replaceable> [<replaceable>skipped_transactions</>]]
 </synopsis>
 
    where <replaceable>time</> is the total elapsed transaction time in microseconds,
@@ -950,10 +969,13 @@ END;
    UNIX epoch format timestamp and an offset
    in microseconds (suitable for creating an ISO 8601
    timestamp with fractional seconds) showing when
-   the transaction completed. The last field, <replaceable>schedule_lag</>, is
-   the difference between the transaction's scheduled start time, and the
-   time it actually started, in microseconds. It is only present when the
-   <option>--rate</> option is used.
+   the transaction completed.
+   Field <replaceable>schedule_lag</> is the difference between the transaction's
+   scheduled start time, and the time it actually started, in microseconds. It is only present
+   when the <option>--rate</> option is used.
+   The last field <replaceable>skipped_transactions</> reports the number of transactions
+   skipped because they were too far behind schedule. It is only present when both options
+   <option>--rate</> and <option>--latency-limit</> are used.
   </para>
 
   <para>
@@ -974,12 +996,12 @@ END;
 
  <refsect2>
   <title>Aggregated Logging</title>
-  
+
   <para>
    With the <option>--aggregate-interval</option> option, the logs use a bit different format:
 
 <synopsis>
-<replaceable>interval_start</> <replaceable>num_of_transactions</> <replaceable>latency_sum</> <replaceable>latency_2_sum</> <replaceable>min_latency</> <replaceable>max_latency</> [<replaceable>lag_sum</> <replaceable>lag_2_sum</> <replaceable>min_lag</> <replaceable>max_lag</>]
+<replaceable>interval_start</> <replaceable>num_of_transactions</> <replaceable>latency_sum</> <replaceable>latency_2_sum</> <replaceable>min_latency</> <replaceable>max_latency</> [<replaceable>lag_sum</> <replaceable>lag_2_sum</> <replaceable>min_lag</> <replaceable>max_lag</> [<replaceable>skipped_transactions</>]]
 </synopsis>
 
    where <replaceable>interval_start</> is the start of the interval (UNIX epoch
@@ -990,8 +1012,9 @@ END;
    <replaceable>latency_2_sum</> is a sum of 2nd powers of latencies. The last two
    fields are <replaceable>min_latency</> - a minimum latency within the interval, and
    <replaceable>max_latency</> - maximum latency within the interval. A transaction is
-   counted into the interval when it was committed. The last four fields, 
-   <replaceable>lag_sum</>, <replaceable>lag_2_sum</>, <replaceable>min_lag</>, and <replaceable>max_lag</>, are only present if the --rate option is used.
+   counted into the interval when it was committed. The fields in the end,
+   <replaceable>lag_sum</>, <replaceable>lag_2_sum</>, <replaceable>min_lag</>, and <replaceable>max_lag</>, are only present if the <option>--rate</> option is used.
+   The very last one <replaceable>skipped_transactions</> also requires option <option>--latency-limit</>.
    They are calculated from the time each transaction had to wait for the
    previous one to finish, i.e. the difference between each transaction's
    scheduled start time and the time it actually started.
-- 
2.1.1

#50

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#49)

Re: pgbench throttling latency limit

Hello Heikki,

Here are new patches, again the first one is just refactoring, and the second
one contains this feature. I'm planning to commit the first one shortly, and
the second one later after people have had a chance to look at it.

I looked at it. It looks ok, but for a few spurious spacing changes here
and there. No big deal.

I tested it, everything I tested behaved as expected, so it is ok for me.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#51

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#50)

1 attachment(s)

Re: pgbench throttling latency limit

On 10/05/2014 10:43 AM, Fabien COELHO wrote:

Hello Heikki,

Here are new patches, again the first one is just refactoring, and the second
one contains this feature. I'm planning to commit the first one shortly, and
the second one later after people have had a chance to look at it.

I looked at it. It looks ok, but for a few spurious spacing changes here
and there. No big deal.

I tested it, everything I tested behaved as expected, so it is ok for me.

Thanks!

I committed the refactoring patch earlier, and just went through the
second patch again. I wordsmithed the documentation and comments, and
fixed the documentation on the log format. I also fixed the logging of
skipped transactions so that the schedule lag is reported correctly for
them.

One thing bothers me with the log format. Here's an example:

0 81 4621 0 1412881037 912698 3005
0 82 6173 0 1412881037 914578 4304
0 83 skipped 0 1412881037 914578 5217
0 83 skipped 0 1412881037 914578 5099
0 83 4722 0 1412881037 916203 3108
0 84 4142 0 1412881037 918023 2333
0 85 2465 0 1412881037 919759 740

Note how the transaction counter is not incremented for skipped
transactions. That's understandable, since we're not including skipped
transactions in the number of transactions executed, but it means that
the skipped transactions don't have a unique ID. That's annoying.

Here's a new version of the patch. I'll sleep over it before committing,
but I think it's fine now, except maybe for the unique ID thing.

- Heikki

Attachments:

0002-Add-latency-limit-option-to-pgbench-2.patchtext/x-diff; name=0002-Add-latency-limit-option-to-pgbench-2.patchDownload

>From 79dc4fba57e4c8a52a074ab302aee72eb78ce6fe Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 9 Oct 2014 22:04:27 +0300
Subject: [PATCH 1/1] Add --latency-limit option to pgbench.

This allows transactions that take longer than specified limit to be counted
separately. With --rate, transactions that are already late by the time we
get to execute them are skipped altogether. Using --latency-limit with
--rate allows you to "catch up" more quickly, if there's a hickup in the
server causing a lot of transactions to stall momentarily.

Fabien COELHO, reviewed by Rukh Meski and heavily refactored by me.
---
 contrib/pgbench/pgbench.c | 231 +++++++++++++++++++++++++++++++++++-----------
 doc/src/sgml/pgbench.sgml |  70 +++++++++++---
 2 files changed, 233 insertions(+), 68 deletions(-)

diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index e9431ee..5506346 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,14 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * Transactions which take longer than this limit (in usec) are counted as
+ * late, and reported as such, although they are completed anyway. When
+ * throttling is enabled, execution time slots that are more than this late
+ * are skipped altogether, and counted separately.
+ */
+int64		latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +246,8 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
+	int64		latency_late;	/* late transactions */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +260,8 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
+	int64		latency_late;
 } TResult;
 
 /*
@@ -284,6 +296,8 @@ typedef struct
 
 	long		start_time;		/* when does the interval start */
 	int			cnt;			/* number of transactions */
+	int			skipped;		/* number of transactions skipped under
+								 * --rate and --latency-limit */
 
 	double		min_latency;	/* min/max latencies */
 	double		max_latency;
@@ -348,7 +362,7 @@ static void setalarm(int seconds);
 static void *threadRun(void *arg);
 
 static void doLog(TState *thread, CState *st, FILE *logfile, instr_time *now,
-	  AggVals *agg);
+	  AggVals *agg, bool skipped);
 
 static void
 usage(void)
@@ -375,6 +389,8 @@ usage(void)
 		 "  -f, --file=FILENAME      read transaction script from FILENAME\n"
 		   "  -j, --jobs=NUM           number of threads (default: 1)\n"
 		   "  -l, --log                write transaction times to log file\n"
+		   "  -L, --latency-limit=NUM  count transactions lasting more than NUM ms\n"
+		   "                           as late.\n"
 		   "  -M, --protocol=simple|extended|prepared\n"
 		   "                           protocol for submitting queries (default: simple)\n"
 		   "  -n, --no-vacuum          do not run VACUUM before tests\n"
@@ -994,7 +1010,9 @@ void
 agg_vals_init(AggVals *aggs, instr_time start)
 {
 	/* basic counters */
-	aggs->cnt = 0;				/* number of transactions */
+	aggs->cnt = 0;				/* number of transactions (includes skipped) */
+	aggs->skipped = 0;			/* xacts skipped under --rate --latency-limit */
+
 	aggs->sum_latency = 0;		/* SUM(latency) */
 	aggs->sum2_latency = 0;				/* SUM(latency*latency) */
 
@@ -1050,8 +1068,34 @@ top:
 		int64		wait = getPoissonRand(thread, throttle_delay);
 
 		thread->throttle_trigger += wait;
-
 		st->txn_scheduled = thread->throttle_trigger;
+
+		/*
+		 * If this --lateny-limit is used, and this slot is already late so
+		 * that the transaction will miss the latency limit even if it
+		 * completed immediately, we skip this time slot and iterate till the
+		 * next slot that isn't late yet.
+		 */
+		if (latency_limit)
+		{
+			int64		now_us;
+
+			if (INSTR_TIME_IS_ZERO(now))
+				INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - latency_limit)
+			{
+				thread->throttle_latency_skipped++;
+
+				if (logfile)
+					doLog(thread, st, logfile, &now, agg, true);
+
+				wait = getPoissonRand(thread, throttle_delay);
+				thread->throttle_trigger += wait;
+				st->txn_scheduled = thread->throttle_trigger;
+			}
+		}
+
 		st->sleeping = 1;
 		st->throttling = true;
 		st->is_throttled = true;
@@ -1119,12 +1163,13 @@ top:
 		if (commands[st->state + 1] == NULL)
 		{
 			/* only calculate latency if an option is used that needs it */
-			if (progress || throttle_delay)
+			if (progress || throttle_delay || latency_limit)
 			{
 				int64		latency;
 
 				if (INSTR_TIME_IS_ZERO(now))
 					INSTR_TIME_SET_CURRENT(now);
+
 				latency = INSTR_TIME_GET_MICROSEC(now) - st->txn_scheduled;
 
 				st->txn_latencies += latency;
@@ -1137,11 +1182,15 @@ top:
 				 * transactions, overflow would take 256 hours.
 				 */
 				st->txn_sqlats += latency * latency;
+
+				/* record over the limit transactions if needed. */
+				if (latency_limit && latency > latency_limit)
+					thread->latency_late++;
 			}
 
 			/* record the time it took in the log */
 			if (logfile)
-				doLog(thread, st, logfile, &now, agg);
+				doLog(thread, st, logfile, &now, agg, false);
 		}
 
 		if (commands[st->state]->type == SQL_COMMAND)
@@ -1227,7 +1276,7 @@ top:
 	}
 
 	/* Record transaction start time under logging, progress or throttling */
-	if ((logfile || progress || throttle_delay) && st->state == 0)
+	if ((logfile || progress || throttle_delay || latency_limit) && st->state == 0)
 	{
 		INSTR_TIME_SET_CURRENT(st->txn_begin);
 
@@ -1605,7 +1654,8 @@ top:
  * print log entry after completing one transaction.
  */
 static void
-doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
+doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg,
+	  bool skipped)
 {
 	double		lag;
 	double		latency;
@@ -1622,7 +1672,10 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 		INSTR_TIME_SET_CURRENT(*now);
 
 	latency = (double) (INSTR_TIME_GET_MICROSEC(*now) - st->txn_scheduled);
-	lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
+	if (skipped)
+		lag = latency;
+	else
+		lag = (double) (INSTR_TIME_GET_MICROSEC(st->txn_begin) - st->txn_scheduled);
 
 	/* should we aggregate the results or not? */
 	if (agg_interval > 0)
@@ -1634,26 +1687,34 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 		if (agg->start_time + agg_interval >= INSTR_TIME_GET_DOUBLE(*now))
 		{
 			agg->cnt += 1;
-			agg->sum_latency += latency;
-			agg->sum2_latency += latency * latency;
+			if (skipped)
+			{
+				/* there is no latency to record if the transaction was skipped */
+				agg->skipped += 1;
+			}
+			else
+			{
+				agg->sum_latency += latency;
+				agg->sum2_latency += latency * latency;
 
-			/* first in this aggregation interval */
-			if ((agg->cnt == 1) || (latency < agg->min_latency))
-				agg->min_latency = latency;
+				/* first in this aggregation interval */
+				if ((agg->cnt == 1) || (latency < agg->min_latency))
+					agg->min_latency = latency;
 
-			if ((agg->cnt == 1) || (latency > agg->max_latency))
-				agg->max_latency = latency;
+				if ((agg->cnt == 1) || (latency > agg->max_latency))
+					agg->max_latency = latency;
 
-			/* and the same for schedule lag */
-			if (throttle_delay)
-			{
-				agg->sum_lag += lag;
-				agg->sum2_lag += lag * lag;
+				/* and the same for schedule lag */
+				if (throttle_delay)
+				{
+					agg->sum_lag += lag;
+					agg->sum2_lag += lag * lag;
 
-				if ((agg->cnt == 1) || (lag < agg->min_lag))
-					agg->min_lag = lag;
-				if ((agg->cnt == 1) || (lag > agg->max_lag))
-					agg->max_lag = lag;
+					if ((agg->cnt == 1) || (lag < agg->min_lag))
+						agg->min_lag = lag;
+					if ((agg->cnt == 1) || (lag > agg->max_lag))
+						agg->max_lag = lag;
+				}
 			}
 		}
 		else
@@ -1677,11 +1738,15 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 						agg->min_latency,
 						agg->max_latency);
 				if (throttle_delay)
+				{
 					fprintf(logfile, " %.0f %.0f %.0f %.0f",
 							agg->sum_lag,
 							agg->sum2_lag,
 							agg->min_lag,
 							agg->max_lag);
+					if (latency_limit)
+						fprintf(logfile, " %d", agg->skipped);
+				}
 				fputc('\n', logfile);
 
 				/* move to the next inteval */
@@ -1689,6 +1754,7 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 
 				/* reset for "no transaction" intervals */
 				agg->cnt = 0;
+				agg->skipped = 0;
 				agg->min_latency = 0;
 				agg->max_latency = 0;
 				agg->sum_latency = 0;
@@ -1701,10 +1767,11 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 
 			/* reset the values to include only the current transaction. */
 			agg->cnt = 1;
+			agg->skipped = skipped ? 1 : 0;
 			agg->min_latency = latency;
 			agg->max_latency = latency;
-			agg->sum_latency = latency;
-			agg->sum2_latency = latency * latency;
+			agg->sum_latency = skipped ? 0.0 : latency;
+			agg->sum2_latency = skipped ? 0.0 : latency * latency;
 			agg->min_lag = lag;
 			agg->max_lag = lag;
 			agg->sum_lag = lag;
@@ -1717,14 +1784,23 @@ doLog(TState *thread, CState *st, FILE *logfile, instr_time *now, AggVals *agg)
 #ifndef WIN32
 
 		/* This is more than we really ought to know about instr_time */
-		fprintf(logfile, "%d %d %.0f %d %ld %ld",
-				st->id, st->cnt, latency, st->use_file,
-				(long) now->tv_sec, (long) now->tv_usec);
+		if (skipped)
+			fprintf(logfile, "%d %d skipped %d %ld %ld",
+					st->id, st->cnt, st->use_file,
+					(long) now->tv_sec, (long) now->tv_usec);
+		else
+			fprintf(logfile, "%d %d %.0f %d %ld %ld",
+					st->id, st->cnt, latency, st->use_file,
+					(long) now->tv_sec, (long) now->tv_usec);
 #else
 
 		/* On Windows, instr_time doesn't provide a timestamp anyway */
-		fprintf(logfile, "%d %d %.0f %d 0 0",
-				st->id, st->cnt, latency, st->use_file);
+		if (skipped)
+			fprintf(logfile, "%d %d skipped %d 0 0",
+					st->id, st->cnt, st->use_file);
+		else
+			fprintf(logfile, "%d %d %.0f %d 0 0",
+					st->id, st->cnt, latency, st->use_file);
 #endif
 		if (throttle_delay)
 			fprintf(logfile, " %.0f", lag);
@@ -2424,7 +2500,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped, int64 latency_late)
 {
 	double		time_include,
 				tps_include,
@@ -2463,7 +2540,17 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			   normal_xacts);
 	}
 
-	if (throttle_delay || progress)
+	if (throttle_delay && latency_limit)
+		printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n",
+			   throttle_latency_skipped,
+			   100.0 * throttle_latency_skipped / (throttle_latency_skipped + normal_xacts));
+
+	if (latency_limit)
+		printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT " (%.3f %%)\n",
+			   latency_limit / 1000.0, latency_late,
+			   100.0 * latency_late / (throttle_latency_skipped + normal_xacts));
+
+	if (throttle_delay || progress || latency_limit)
 	{
 		/* compute and show latency average and standard deviation */
 		double		latency = 0.001 * total_latencies / normal_xacts;
@@ -2578,6 +2665,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"latency-limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2607,6 +2695,8 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
+	int64		latency_late = 0;
 
 	int			i;
 
@@ -2651,7 +2741,7 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
-	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "ih:nvp:dqSNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1)
 	{
 		switch (c)
 		{
@@ -2848,6 +2938,18 @@ main(int argc, char **argv)
 					throttle_delay = (int64) (1000000.0 / throttle_value);
 				}
 				break;
+			case 'L':
+				{
+					double limit_ms = atof(optarg);
+					if (limit_ms <= 0.0)
+					{
+						fprintf(stderr, "invalid latency limit: %s\n", optarg);
+						exit(1);
+					}
+					benchmarking_option_set = true;
+					latency_limit = (int64) (limit_ms * 1000);
+				}
+				break;
 			case 0:
 				/* This covers long options which take no argument. */
 				if (foreign_keys || unlogged_tables)
@@ -3143,6 +3245,8 @@ main(int argc, char **argv)
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
+		thread->throttle_latency_skipped = 0;
+		thread->latency_late = 0;
 
 		if (is_latencies)
 		{
@@ -3217,6 +3321,8 @@ main(int argc, char **argv)
 			total_latencies += r->latencies;
 			total_sqlats += r->sqlats;
 			throttle_lag += r->throttle_lag;
+			throttle_latency_skipped += r->throttle_latency_skipped;
+			latency_late += r->latency_late;
 			if (r->throttle_lag_max > throttle_lag_max)
 				throttle_lag_max = r->throttle_lag_max;
 			INSTR_TIME_ADD(conn_total_time, r->conn_time);
@@ -3239,7 +3345,8 @@ main(int argc, char **argv)
 	INSTR_TIME_SUBTRACT(total_time, start_time);
 	printResults(ttype, total_xacts, nclients, threads, nthreads,
 				 total_time, conn_total_time, total_latencies, total_sqlats,
-				 throttle_lag, throttle_lag_max);
+				 throttle_lag, throttle_lag_max, throttle_latency_skipped,
+				 latency_late);
 
 	return 0;
 }
@@ -3264,7 +3371,8 @@ threadRun(void *arg)
 	int64		last_count = 0,
 				last_lats = 0,
 				last_sqlats = 0,
-				last_lags = 0;
+				last_lags = 0,
+				last_skipped = 0;
 
 	AggVals		aggs;
 
@@ -3467,7 +3575,8 @@ threadRun(void *arg)
 				/* generate and show report */
 				int64		count = 0,
 							lats = 0,
-							sqlats = 0;
+							sqlats = 0,
+							skipped = 0;
 				int64		lags = thread->throttle_lag;
 				int64		run = now - last_report;
 				double		tps,
@@ -3490,23 +3599,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress %d: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						thread->tid, total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							thread->tid, total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress %d: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							thread->tid, total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", skipped " INT64_FORMAT, skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3525,7 +3637,8 @@ threadRun(void *arg)
 				int64		count = 0,
 							lats = 0,
 							sqlats = 0,
-							lags = 0;
+							lags = 0,
+							skipped = 0;
 				int64		run = now - last_report;
 				double		tps,
 							total_run,
@@ -3550,23 +3663,26 @@ threadRun(void *arg)
 				sqlat = 1.0 * (sqlats - last_sqlats) / (count - last_count);
 				stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
 				lag = 0.001 * (lags - last_lags) / (count - last_count);
+				skipped = thread->throttle_latency_skipped - last_skipped;
 
+				fprintf(stderr,
+						"progress: %.1f s, %.1f tps, "
+						"lat %.3f ms stddev %.3f",
+						total_run, tps, latency, stdev);
 				if (throttle_delay)
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f, lag %.3f ms\n",
-							total_run, tps, latency, stdev, lag);
-				else
-					fprintf(stderr,
-							"progress: %.1f s, %.1f tps, "
-							"lat %.3f ms stddev %.3f\n",
-							total_run, tps, latency, stdev);
+				{
+					fprintf(stderr, ", lag %.3f ms", lag);
+					if (latency_limit)
+						fprintf(stderr, ", " INT64_FORMAT " skipped", skipped);
+				}
+				fprintf(stderr, "\n");
 
 				last_count = count;
 				last_lats = lats;
 				last_sqlats = sqlats;
 				last_lags = lags;
 				last_report = now;
+				last_skipped = thread->throttle_latency_skipped;
 				next_report += (int64) progress *1000000;
 			}
 		}
@@ -3587,6 +3703,9 @@ done:
 	}
 	result->throttle_lag = thread->throttle_lag;
 	result->throttle_lag_max = thread->throttle_lag_max;
+	result->throttle_latency_skipped = thread->throttle_latency_skipped;
+	result->latency_late = thread->latency_late;
+
 	INSTR_TIME_SET_CURRENT(end);
 	INSTR_TIME_ACCUM_DIFF(result->conn_time, end, start);
 	if (logfile)
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index c4e0cbd..a084fe3 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -345,6 +345,24 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </varlistentry>
 
      <varlistentry>
+      <term><option>-L</option> <replaceable>limit</></term>
+      <term><option>--latency-limit=</option><replaceable>limit</></term>
+      <listitem>
+       <para>
+        Transaction which last more than <replaceable>limit</> milliseconds
+        are counted and reported separately, as <firstterm>late</>.
+       </para>
+       <para>
+        When throttling is used (<option>--rate=...</>), transactions that
+        lag behind schedule by more than <replaceable>limit</> ms, and thus
+        have no hope of meeting the latency limit, are not sent to the server
+        at all. They and are counted and reported separately as
+        <firstterm>skipped</>.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <term><option>--protocol=</option><replaceable>querymode</></term>
       <listitem>
@@ -439,7 +457,7 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
         schedule moves forward based on when the client first started, not
         when the previous transaction ended.  That approach means that when
         transactions go past their original scheduled end time, it is
-        possible for later ones to catch up again.
+        possible for later ones to catch up again. 
        </para>
        <para>
         When throttling is active, the transaction latency reported at the
@@ -454,6 +472,15 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
        </para>
 
        <para>
+        If <option>--latency-limit</> is used together with <option>--rate</>,
+        a transaction can lag behind so much that it is already over the
+        latency limit when the previous transaction ends, because the latency
+        is calculated from the scheduled start time. Such transactions are
+        not sent to the server, but are skipped altogether and counted
+        separately.
+       </para>
+
+       <para>
         A high schedule lag time is an indication that the system cannot
         process transactions at the specified rate, with the chosen number of
         clients and threads. When the average transaction execution time is
@@ -940,7 +967,7 @@ END;
    The format of the log is:
 
 <synopsis>
-<replaceable>client_id</> <replaceable>transaction_no</> <replaceable>time</> <replaceable>file_no</> <replaceable>time_epoch</> <replaceable>time_us</> [<replaceable>schedule_lag</replaceable>]
+<replaceable>client_id</> <replaceable>transaction_no</> <replaceable>time</> <replaceable>file_no</> <replaceable>time_epoch</> <replaceable>time_us</> [<replaceable>schedule_lag</replaceable>
 </synopsis>
 
    where <replaceable>time</> is the total elapsed transaction time in microseconds,
@@ -950,20 +977,38 @@ END;
    UNIX epoch format timestamp and an offset
    in microseconds (suitable for creating an ISO 8601
    timestamp with fractional seconds) showing when
-   the transaction completed. The last field, <replaceable>schedule_lag</>, is
-   the difference between the transaction's scheduled start time, and the
-   time it actually started, in microseconds. It is only present when the
-   <option>--rate</> option is used.
+   the transaction completed.
+   Field <replaceable>schedule_lag</> is the difference between the transaction's
+   scheduled start time, and the time it actually started, in microseconds. It is only present
+   when the <option>--rate</> option is used.
+   The last field <replaceable>skipped_transactions</> reports the number of transactions
+   skipped because they were too far behind schedule. It is only present when both options
+   <option>--rate</> and <option>--latency-limit</> are used.
   </para>
 
   <para>
-   Here are example outputs:
+   Here is a snippet of the log file generated:
 <screen>
  0 199 2241 0 1175850568 995598
  0 200 2465 0 1175850568 998079
  0 201 2513 0 1175850569 608
  0 202 2038 0 1175850569 2663
-</screen></para>
+</screen>
+
+  Another example with --rate=100 and --latency-limit=5 (note the additional <replaceable>schedule_lag</> column):
+<screen>
+ 0 81 4621 0 1412881037 912698 3005
+ 0 82 6173 0 1412881037 914578 4304
+ 0 83 skipped 0 1412881037 914578 5217
+ 0 83 skipped 0 1412881037 914578 5099
+ 0 83 4722 0 1412881037 916203 3108
+ 0 84 4142 0 1412881037 918023 2333
+ 0 85 2465 0 1412881037 919759 740
+</screen>
+  In this example, transaction 82 was late, because it's latency (6.173 ms) was
+  over the 5 ms limit. The next two transactions were skipped, because they
+  were already late before they were even started.
+  </para>
 
   <para>
    When running a long test on hardware that can handle a lot of transactions,
@@ -974,12 +1019,12 @@ END;
 
  <refsect2>
   <title>Aggregated Logging</title>
-  
+
   <para>
    With the <option>--aggregate-interval</option> option, the logs use a bit different format:
 
 <synopsis>
-<replaceable>interval_start</> <replaceable>num_of_transactions</> <replaceable>latency_sum</> <replaceable>latency_2_sum</> <replaceable>min_latency</> <replaceable>max_latency</> [<replaceable>lag_sum</> <replaceable>lag_2_sum</> <replaceable>min_lag</> <replaceable>max_lag</>]
+<replaceable>interval_start</> <replaceable>num_of_transactions</> <replaceable>latency_sum</> <replaceable>latency_2_sum</> <replaceable>min_latency</> <replaceable>max_latency</> [<replaceable>lag_sum</> <replaceable>lag_2_sum</> <replaceable>min_lag</> <replaceable>max_lag</> [<replaceable>skipped_transactions</>]]
 </synopsis>
 
    where <replaceable>interval_start</> is the start of the interval (UNIX epoch
@@ -990,8 +1035,9 @@ END;
    <replaceable>latency_2_sum</> is a sum of 2nd powers of latencies. The last two
    fields are <replaceable>min_latency</> - a minimum latency within the interval, and
    <replaceable>max_latency</> - maximum latency within the interval. A transaction is
-   counted into the interval when it was committed. The last four fields, 
-   <replaceable>lag_sum</>, <replaceable>lag_2_sum</>, <replaceable>min_lag</>, and <replaceable>max_lag</>, are only present if the --rate option is used.
+   counted into the interval when it was committed. The fields in the end,
+   <replaceable>lag_sum</>, <replaceable>lag_2_sum</>, <replaceable>min_lag</>, and <replaceable>max_lag</>, are only present if the <option>--rate</> option is used.
+   The very last one <replaceable>skipped_transactions</> also requires option <option>--latency-limit</>.
    They are calculated from the time each transaction had to wait for the
    previous one to finish, i.e. the difference between each transaction's
    scheduled start time and the time it actually started.
-- 
2.1.1

#52

Fabien COELHO

coelho@cri.ensmp.fr

over 11 years ago

In reply to: Heikki Linnakangas (#51)

Re: pgbench throttling latency limit

One thing bothers me with the log format. Here's an example:

0 81 4621 0 1412881037 912698 3005
0 82 6173 0 1412881037 914578 4304
0 83 skipped 0 1412881037 914578 5217
0 83 skipped 0 1412881037 914578 5099
0 83 4722 0 1412881037 916203 3108
0 84 4142 0 1412881037 918023 2333
0 85 2465 0 1412881037 919759 740

Note how the transaction counter is not incremented for skipped transactions.
That's understandable, since we're not including skipped transactions in the
number of transactions executed, but it means that the skipped transactions
don't have a unique ID. That's annoying.

Indeed. As transactions were not done, it does not make much sense to
identify them. Otherwise it should report "intended" transactions and
"performed" transactions, which would not help clarify the matter much.

My idea of "skipped" transactions, which are not transactions as such, is
just a health quality measurement for both the throttling process and the
database latency, so I would really let it as it is.

Here's a new version of the patch. I'll sleep over it before committing, but
I think it's fine now, except maybe for the unique ID thing.

I saw a typo in a comment: "lateny" -> "latency". Otherwise it looks ok,
and the documentation seems indeed clearer than before.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#53

Heikki Linnakangas

hlinnakangas@vmware.com

over 11 years ago

In reply to: Fabien COELHO (#52)

Re: pgbench throttling latency limit

On 10/09/2014 10:39 PM, Fabien COELHO wrote:

One thing bothers me with the log format. Here's an example:

0 81 4621 0 1412881037 912698 3005
0 82 6173 0 1412881037 914578 4304
0 83 skipped 0 1412881037 914578 5217
0 83 skipped 0 1412881037 914578 5099
0 83 4722 0 1412881037 916203 3108
0 84 4142 0 1412881037 918023 2333
0 85 2465 0 1412881037 919759 740

Note how the transaction counter is not incremented for skipped transactions.
That's understandable, since we're not including skipped transactions in the
number of transactions executed, but it means that the skipped transactions
don't have a unique ID. That's annoying.

Indeed. As transactions were not done, it does not make much sense to
identify them. Otherwise it should report "intended" transactions and
"performed" transactions, which would not help clarify the matter much.

My idea of "skipped" transactions, which are not transactions as such, is
just a health quality measurement for both the throttling process and the
database latency, so I would really let it as it is.

Hmm. I wonder if this is going to be a problem for programs that might
try to load the log file into a database table. No using transaction ID
as a unique key. Then again, you'll have to somehow deal with "skipped"
anyway.

Here's a new version of the patch. I'll sleep over it before committing, but
I think it's fine now, except maybe for the unique ID thing.

I saw a typo in a comment: "lateny" -> "latency". Otherwise it looks ok,
and the documentation seems indeed clearer than before.

Ok, committed after a few more typo-fixes.

Greg Smith, I'd still appreciate it if you could take a look at this, to
check how this will work for pgbench-tools.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#54

Gregory Smith

gregsmithpgsql@gmail.com

over 11 years ago

In reply to: Heikki Linnakangas (#53)

Re: pgbench throttling latency limit

On 10/13/14, 1:54 PM, Heikki Linnakangas wrote:

Greg Smith, I'd still appreciate it if you could take a look at this,
to check how this will work for pgbench-tools.

I'll do a QA pass on the committed version looking for issues, and
update the toolchain I publish to be compatible with it along the way too.

--
Greg Smith greg.smith@crunchydatasolutions.com
Chief PostgreSQL Evangelist - http://crunchydatasolutions.com/

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#55

Andres Freund

andres@anarazel.de

about 10 years ago

In reply to: Fabien COELHO (#1)

Re: pgbench throttling latency limit

On 2014-08-14 15:01:53 +0200, Fabien COELHO wrote:

Add --limit to limit latency under throttling

Under throttling, transactions are scheduled for execution at certain times.
Transactions may be far behind schedule and the system may catch up with the
load later. This option allows to change this behavior by skipping
transactions which are too far behind schedule, and count those as skipped.

The idea is to help simulate a latency-constrained environment such as a
database used by a web server.

I was just trying to run tests with this, but as far as I can see it
doesn't really work:

pgbench postgres -M prepared -c 72 -j 72 -P 5 -T 3600 -R40000 -L100
...
progress: 240.0 s, 40191.8 tps, lat 1.250 ms stddev 0.965, lag 0.501 ms, 0 skipped
progress: 245.0 s, 39722.1 tps, lat 1.128 ms stddev 0.946, lag 0.435 ms, 0 skipped
progress: 250.0 s, 40074.5 tps, lat 1.059 ms stddev 0.745, lag 0.391 ms, 0 skipped
progress: 255.0 s, 40001.4 tps, lat 1.038 ms stddev 0.680, lag 0.377 ms, 0 skipped
progress: 260.0 s, 40147.6 tps, lat 1.161 ms stddev 0.950, lag 0.448 ms, 0 skipped
progress: 265.0 s, 39980.1 tps, lat 1.186 ms stddev 0.862, lag 0.457 ms, 0 skipped
progress: 270.0 s, 40090.9 tps, lat 1.292 ms stddev 1.239, lag 0.544 ms, 0 skipped
progress: 275.0 s, 33847.8 tps, lat 26.617 ms stddev 41.681, lag 25.317 ms, 26698 skipped
progress: 280.0 s, 20237.9 tps, lat 96.041 ms stddev 11.983, lag 92.510 ms, 97745 skipped
progress: 285.0 s, 24385.0 tps, lat 94.490 ms stddev 10.865, lag 91.514 ms, 80944 skipped
progress: 290.0 s, 27349.6 tps, lat 92.755 ms stddev 10.905, lag 90.136 ms, 62268 skipped
progress: 295.0 s, 28382.1 tps, lat 92.752 ms stddev 10.238, lag 90.212 ms, 58253 skipped
progress: 300.0 s, 28798.3 tps, lat 92.673 ms stddev 10.506, lag 90.172 ms, 56741 skipped
progress: 305.0 s, 29346.6 tps, lat 91.659 ms stddev 10.982, lag 89.210 ms, 53163 skipped
progress: 310.0 s, 30072.9 tps, lat 91.190 ms stddev 11.071, lag 88.802 ms, 48370 skipped
progress: 315.0 s, 30733.2 tps, lat 90.893 ms stddev 11.312, lag 88.548 ms, 47020 skipped
progress: 320.0 s, 31170.9 tps, lat 89.498 ms stddev 12.132, lag 87.192 ms, 43403 skipped
progress: 325.0 s, 33399.0 tps, lat 85.795 ms stddev 15.196, lag 83.639 ms, 32923 skipped
progress: 330.0 s, 22969.8 tps, lat 91.929 ms stddev 14.762, lag 88.805 ms, 84780 skipped
progress: 335.0 s, 18913.3 tps, lat 95.236 ms stddev 14.523, lag 91.444 ms, 104960 skipped
progress: 340.0 s, 20061.2 tps, lat 95.258 ms stddev 13.284, lag 91.660 ms, 100396 skipped
progress: 345.0 s, 20405.3 tps, lat 94.781 ms stddev 13.794, lag 91.255 ms, 98510 skipped
progress: 350.0 s, 20596.0 tps, lat 94.661 ms stddev 13.345, lag 91.189 ms, 95426 skipped
progress: 355.0 s, 13635.7 tps, lat 96.998 ms stddev 38.039, lag 91.691 ms, 132598 skipped
progress: 360.0 s, 16648.0 tps, lat 95.138 ms stddev 26.329, lag 90.809 ms, 117129 skipped
progress: 365.0 s, 18392.1 tps, lat 94.857 ms stddev 23.917, lag 90.980 ms, 106244 skipped

100k skipped transactions at a rate limit of 40k? That doesn't seem right.

(that's master server/pgbench as of 5be94a9eb15a)

Regards,

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#56

Andres Freund

andres@anarazel.de

about 10 years ago

In reply to: Andres Freund (#55)

Re: pgbench throttling latency limit

On 2015-10-20 20:55:46 +0200, Andres Freund wrote:

On 2014-08-14 15:01:53 +0200, Fabien COELHO wrote:

Add --limit to limit latency under throttling

Under throttling, transactions are scheduled for execution at certain times.
Transactions may be far behind schedule and the system may catch up with the
load later. This option allows to change this behavior by skipping
transactions which are too far behind schedule, and count those as skipped.

The idea is to help simulate a latency-constrained environment such as a
database used by a web server.

I was just trying to run tests with this, but as far as I can see it
doesn't really work:

pgbench postgres -M prepared -c 72 -j 72 -P 5 -T 3600 -R40000 -L100

progress: 365.0 s, 18392.1 tps, lat 94.857 ms stddev 23.917, lag 90.980 ms, 106244 skipped

100k skipped transactions at a rate limit of 40k? That doesn't seem right.

Argh. It's just because I used -P5. It's a bit confusing that the other
options are per second, and this is per interval...

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#57

Fabien COELHO

coelho@cri.ensmp.fr

about 10 years ago

In reply to: Andres Freund (#56)

Re: pgbench throttling latency limit

Hello Andres,

pgbench postgres -M prepared -c 72 -j 72 -P 5 -T 3600 -R40000 -L100

progress: 365.0 s, 18392.1 tps, lat 94.857 ms stddev 23.917, lag 90.980 ms, 106244 skipped

100k skipped transactions at a rate limit of 40k? That doesn't seem right.

Argh. It's just because I used -P5. It's a bit confusing that the other
options are per second, and this is per interval...

I agree, but I'm unsure of a fix, beyond what is already done which is to
show units next to the figures...

ISTM that people expect "tps" for performance, even on several seconds.
When it comes to skipped transactions, a count seemed more natural. I
really just see this as an indicator that things are not going smoothly.

Maybe it could be shown as a percentage of scheduled transactions,
possibly with an option?

A mitigation is to always run with -P 1 :-).

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#58

Amit Langote

Langote_Amit_f8@lab.ntt.co.jp

about 10 years ago

In reply to: Fabien COELHO (#57)

Re: pgbench throttling latency limit

On 2015/10/22 18:20, Fabien COELHO wrote:

progress: 365.0 s, 18392.1 tps, lat 94.857 ms stddev 23.917, lag 90.980
ms, 106244 skipped

100k skipped transactions at a rate limit of 40k? That doesn't seem right.

Argh. It's just because I used -P5. It's a bit confusing that the other
options are per second, and this is per interval...

I agree, but I'm unsure of a fix, beyond what is already done which is to
show units next to the figures...

ISTM that people expect "tps" for performance, even on several seconds.
When it comes to skipped transactions, a count seemed more natural. I
really just see this as an indicator that things are not going smoothly.

Maybe it could be shown as a percentage of scheduled transactions,
possibly with an option?

A mitigation is to always run with -P 1 :-).

Wouldn't printing average (per second) over the interval work?

Thanks,
Amit

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#59

Fabien COELHO

coelho@cri.ensmp.fr

about 10 years ago

In reply to: Amit Langote (#58)

Re: pgbench throttling latency limit

Argh. It's just because I used -P5. It's a bit confusing that the other
options are per second, and this is per interval...

I agree, but I'm unsure of a fix, beyond what is already done which is to
show units next to the figures...

ISTM that people expect "tps" for performance, even on several seconds.
When it comes to skipped transactions, a count seemed more natural. I
really just see this as an indicator that things are not going smoothly.

Maybe it could be shown as a percentage of scheduled transactions,
possibly with an option?

A mitigation is to always run with -P 1 :-).

Wouldn't printing average (per second) over the interval work?

Yes it would. That would be "skipped tps". Why not. The percentage seems
also attractive to me, because it does not matter whether you get big
figures or small figures as it is relative.

--
Fabien.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers