External Sort timing debug statements
The following patch implements a fairly light set of timing statements
aimed at understanding external sort performance. There is no attempt to
alter the algorithms.
Each major point in the algorithms is marked as shown in this example:
postgres=# set debug_sort=true;
SET
postgres=# explain analyze select * from test2 order by col1,col2;
NOTICE: tuplesort begin work_mem= 1024
NOTICE: +0 secs heap sort nkeys= 2
NOTICE: +0 secs switching to external sort
NOTICE: +1129 secs starting build of next run
NOTICE: +2229 secs run building complete nruns= 2
NOTICE: +2229 secs merging runs with 6 tapes
...
NOTICE: +3036 secs starting final merge
I'll add other information, as requested.
The "6 tapes" is currently hardcoded, though is included in expectation
of implementing variable numbers of tapes.
I'm not sure if I got the header file correct for full portability of
gettimeofday(). Please correct me, if this is the case.
Please post sort performance data back via this post.
Best Regards, Simon Riggs
Attachments:
debug_sort.patchtext/x-patch; charset=UTF-8; name=debug_sort.patchDownload
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.289
diff -c -c -r1.289 guc.c
*** src/backend/utils/misc/guc.c 24 Sep 2005 22:54:39 -0000 1.289
--- src/backend/utils/misc/guc.c 2 Oct 2005 18:00:45 -0000
***************
*** 92,97 ****
--- 92,98 ----
extern int CommitSiblings;
extern char *default_tablespace;
extern bool fullPageWrites;
+ extern bool debug_sort;
static const char *assign_log_destination(const char *value,
bool doit, GucSource source);
***************
*** 764,770 ****
false, NULL, NULL
},
#endif
!
{
{"log_hostname", PGC_SIGHUP, LOGGING_WHAT,
gettext_noop("Logs the host name in the connection logs."),
--- 765,779 ----
false, NULL, NULL
},
#endif
! {
! {"debug_sort", PGC_USERSET, DEVELOPER_OPTIONS,
! gettext_noop("no description available"),
! NULL,
! GUC_NOT_IN_SAMPLE
! },
! &debug_sort,
! false, NULL, NULL
! },
{
{"log_hostname", PGC_SIGHUP, LOGGING_WHAT,
gettext_noop("Logs the host name in the connection logs."),
Index: src/backend/utils/sort/tuplesort.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v
retrieving revision 1.50
diff -c -c -r1.50 tuplesort.c
*** src/backend/utils/sort/tuplesort.c 23 Sep 2005 15:36:57 -0000 1.50
--- src/backend/utils/sort/tuplesort.c 2 Oct 2005 18:00:46 -0000
***************
*** 394,399 ****
--- 394,402 ----
*/
static Tuplesortstate *qsort_tuplesortstate;
+ /* debug variables */
+ bool debug_sort = false;
+ struct timeval start_t, current_t;
/*
* tuplesort_begin_xxx
***************
*** 422,427 ****
--- 425,437 ----
state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
+ if (debug_sort)
+ {
+ ereport(NOTICE,
+ (errmsg("tuplesort begin work_mem= %u", workMem)));
+ gettimeofday(&start_t, NULL);
+ }
+
state->status = TSS_INITIAL;
state->randomAccess = randomAccess;
state->availMem = workMem * 1024L;
***************
*** 454,459 ****
--- 464,473 ----
Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
int i;
+ if (debug_sort)
+ ereport(NOTICE,
+ (errmsg("+0 secs heap sort nkeys= %u", nkeys)));
+
AssertArg(nkeys > 0);
state->comparetup = comparetup_heap;
***************
*** 499,504 ****
--- 513,522 ----
{
Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+ if (debug_sort)
+ ereport(NOTICE,
+ (errmsg("+0 secs index sort")));
+
state->comparetup = comparetup_index;
state->copytup = copytup_index;
state->writetup = writetup_index;
***************
*** 666,671 ****
--- 684,697 ----
/*
* Nope; time to switch to tape-based operation.
*/
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs switching to external sort",
+ (long) (current_t.tv_sec - start_t.tv_sec),
+ state->currentRun)));
+ }
inittapes(state);
/*
***************
*** 1042,1047 ****
--- 1068,1082 ----
int j;
int a;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs starting build of next run",
+ (long) (current_t.tv_sec - start_t.tv_sec),
+ state->currentRun)));
+ }
+
/* Step D3: advance j (destTape) */
if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
{
***************
*** 1082,1087 ****
--- 1117,1131 ----
Assert(state->status == TSS_BUILDRUNS);
Assert(state->memtupcount == 0);
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs run building complete nruns= %u",
+ (long) (current_t.tv_sec - start_t.tv_sec),
+ state->currentRun)));
+ }
+
/*
* If we produced only one initial run (quite likely if the total data
* volume is between 1X and 2X workMem), we can just use that tape as
***************
*** 1093,1100 ****
--- 1137,1158 ----
/* must freeze and rewind the finished output tape */
LogicalTapeFreeze(state->tapeset, state->result_tape);
state->status = TSS_SORTEDONTAPE;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs no final merge required",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
return;
}
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs merging runs with 6 tapes",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
/* End of step D2: rewind all output tapes to prepare for merging */
for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
***************
*** 1126,1131 ****
--- 1184,1196 ----
/* Initialize for the final merge pass */
beginmerge(state);
state->status = TSS_FINALMERGE;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs starting final merge",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
return;
}
if (allDummy)
***************
*** 1177,1182 ****
--- 1242,1254 ----
state->result_tape = state->tp_tapenum[TAPERANGE];
LogicalTapeFreeze(state->tapeset, state->result_tape);
state->status = TSS_SORTEDONTAPE;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs starting final merge",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
}
/*
***************
*** 1243,1248 ****
--- 1315,1327 ----
*/
markrunend(state, destTape);
state->tp_runs[TAPERANGE]++;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs intermediate merge",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
}
/*
On Sun, 2005-10-02 at 19:43 +0100, Simon Riggs wrote:
The following patch implements a fairly light set of timing statements
aimed at understanding external sort performance. There is no attempt to
alter the algorithms.
Minor update of patch, use this version please.
Show quoted text
Best Regards, Simon Riggs
Attachments:
debug_sort.patchtext/x-patch; charset=UTF-8; name=debug_sort.patchDownload
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.289
diff -c -c -r1.289 guc.c
*** src/backend/utils/misc/guc.c 24 Sep 2005 22:54:39 -0000 1.289
--- src/backend/utils/misc/guc.c 2 Oct 2005 19:58:02 -0000
***************
*** 92,97 ****
--- 92,98 ----
extern int CommitSiblings;
extern char *default_tablespace;
extern bool fullPageWrites;
+ extern bool debug_sort;
static const char *assign_log_destination(const char *value,
bool doit, GucSource source);
***************
*** 764,770 ****
false, NULL, NULL
},
#endif
!
{
{"log_hostname", PGC_SIGHUP, LOGGING_WHAT,
gettext_noop("Logs the host name in the connection logs."),
--- 765,779 ----
false, NULL, NULL
},
#endif
! {
! {"debug_sort", PGC_USERSET, DEVELOPER_OPTIONS,
! gettext_noop("no description available"),
! NULL,
! GUC_NOT_IN_SAMPLE
! },
! &debug_sort,
! false, NULL, NULL
! },
{
{"log_hostname", PGC_SIGHUP, LOGGING_WHAT,
gettext_noop("Logs the host name in the connection logs."),
Index: src/backend/utils/sort/tuplesort.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v
retrieving revision 1.50
diff -c -c -r1.50 tuplesort.c
*** src/backend/utils/sort/tuplesort.c 23 Sep 2005 15:36:57 -0000 1.50
--- src/backend/utils/sort/tuplesort.c 2 Oct 2005 19:58:02 -0000
***************
*** 85,90 ****
--- 85,92 ----
#include "postgres.h"
+ #include <sys/time.h>
+
#include "access/heapam.h"
#include "access/nbtree.h"
#include "catalog/pg_amop.h"
***************
*** 394,399 ****
--- 396,404 ----
*/
static Tuplesortstate *qsort_tuplesortstate;
+ /* debug variables */
+ bool debug_sort = false;
+ struct timeval start_t, current_t;
/*
* tuplesort_begin_xxx
***************
*** 422,427 ****
--- 427,439 ----
state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
+ if (debug_sort)
+ {
+ ereport(NOTICE,
+ (errmsg("tuplesort begin work_mem= %u", workMem)));
+ gettimeofday(&start_t, NULL);
+ }
+
state->status = TSS_INITIAL;
state->randomAccess = randomAccess;
state->availMem = workMem * 1024L;
***************
*** 454,459 ****
--- 466,475 ----
Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
int i;
+ if (debug_sort)
+ ereport(NOTICE,
+ (errmsg("+0 secs heap sort nkeys= %u", nkeys)));
+
AssertArg(nkeys > 0);
state->comparetup = comparetup_heap;
***************
*** 499,504 ****
--- 515,524 ----
{
Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+ if (debug_sort)
+ ereport(NOTICE,
+ (errmsg("+0 secs index sort")));
+
state->comparetup = comparetup_index;
state->copytup = copytup_index;
state->writetup = writetup_index;
***************
*** 666,671 ****
--- 686,698 ----
/*
* Nope; time to switch to tape-based operation.
*/
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs switching to external sort",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
inittapes(state);
/*
***************
*** 1042,1047 ****
--- 1069,1082 ----
int j;
int a;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs starting build of next run",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
+
/* Step D3: advance j (destTape) */
if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
{
***************
*** 1082,1087 ****
--- 1117,1131 ----
Assert(state->status == TSS_BUILDRUNS);
Assert(state->memtupcount == 0);
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs run building complete nruns= %u",
+ (long) (current_t.tv_sec - start_t.tv_sec),
+ state->currentRun)));
+ }
+
/*
* If we produced only one initial run (quite likely if the total data
* volume is between 1X and 2X workMem), we can just use that tape as
***************
*** 1093,1100 ****
--- 1137,1158 ----
/* must freeze and rewind the finished output tape */
LogicalTapeFreeze(state->tapeset, state->result_tape);
state->status = TSS_SORTEDONTAPE;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs no final merge required",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
return;
}
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs merging runs with 6 tapes",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
/* End of step D2: rewind all output tapes to prepare for merging */
for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
***************
*** 1126,1131 ****
--- 1184,1196 ----
/* Initialize for the final merge pass */
beginmerge(state);
state->status = TSS_FINALMERGE;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs starting final merge",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
return;
}
if (allDummy)
***************
*** 1177,1182 ****
--- 1242,1254 ----
state->result_tape = state->tp_tapenum[TAPERANGE];
LogicalTapeFreeze(state->tapeset, state->result_tape);
state->status = TSS_SORTEDONTAPE;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs starting final merge",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
}
/*
***************
*** 1243,1248 ****
--- 1315,1327 ----
*/
markrunend(state, destTape);
state->tp_runs[TAPERANGE]++;
+ if (debug_sort)
+ {
+ gettimeofday(¤t_t, NULL);
+ ereport(NOTICE,
+ (errmsg("+%ld secs intermediate merge",
+ (long) (current_t.tv_sec - start_t.tv_sec))));
+ }
}
/*
Simon Riggs <simon@2ndquadrant.com> writes:
The following patch implements a fairly light set of timing statements
aimed at understanding external sort performance. There is no attempt to
alter the algorithms.
What do people think about putting something like this into 8.1?
Strictly speaking it's a new feature, but the patch seems pretty
noninvasive, and we'd be much more likely to get data points if the
code exists in the mainline release than if people have to patch
their copies.
postgres=# set debug_sort=true;
I'm a bit inclined to call it trace_sort instead, and to document it
under "Developer Options". Comments?
regards, tom lane
I'm not averse to it. I think it's a good option and I support trace_sort
(it really is more of a trace).
On 10/3/05, Tom Lane <tgl@sss.pgh.pa.us> wrote:
Simon Riggs <simon@2ndquadrant.com> writes:
The following patch implements a fairly light set of timing statements
aimed at understanding external sort performance. There is no attempt to
alter the algorithms.What do people think about putting something like this into 8.1?
Strictly speaking it's a new feature, but the patch seems pretty
noninvasive, and we'd be much more likely to get data points if the
code exists in the mainline release than if people have to patch
their copies.postgres=# set debug_sort=true;
I'm a bit inclined to call it trace_sort instead, and to document it
under "Developer Options". Comments?regards, tom lane
---------------------------(end of broadcast)---------------------------
TIP 2: Don't 'kill -9' the postmaster
--
Respectfully,
Jonah H. Harris, Database Internals Architect
EnterpriseDB Corporation
http://www.enterprisedb.com/
-----Original Message-----
From: pgsql-patches-owner@postgresql.org
[mailto:pgsql-patches-owner@postgresql.org] On Behalf Of Tom Lane
Sent: 03 October 2005 15:37
To: Simon Riggs
Cc: pgsql-hackers@postgresql.org; pgsql-patches@postgresql.org
Subject: Re: [PATCHES] [HACKERS] External Sort timing debug
statementsSimon Riggs <simon@2ndquadrant.com> writes:
The following patch implements a fairly light set of timing
statements
aimed at understanding external sort performance. There is
no attempt to
alter the algorithms.
What do people think about putting something like this into 8.1?
Strictly speaking it's a new feature, but the patch seems pretty
noninvasive, and we'd be much more likely to get data points if the
code exists in the mainline release than if people have to patch
their copies.
Agreed.
postgres=# set debug_sort=true;
I'm a bit inclined to call it trace_sort instead, and to document it
under "Developer Options". Comments?
Sounds reasonable to me.
Regards, Dave.
Import Notes
Resolved by subject fallback
On Mon, Oct 03, 2005 at 10:36:57AM -0400, Tom Lane wrote:
Simon Riggs <simon@2ndquadrant.com> writes:
The following patch implements a fairly light set of timing
statements aimed at understanding external sort performance. There
is no attempt to alter the algorithms.What do people think about putting something like this into 8.1?
Strictly speaking it's a new feature, but the patch seems pretty
noninvasive, and we'd be much more likely to get data points if the
code exists in the mainline release than if people have to patch
their copies.
This would be good :)
postgres=# set debug_sort=true;
I'm a bit inclined to call it trace_sort instead, and to document it
under "Developer Options". Comments?
+1
Cheers,
D
--
David Fetter david@fetter.org http://fetter.org/
phone: +1 510 893 6100 mobile: +1 415 235 3778
Remember to vote!
On Mon, 2005-10-03 at 10:36 -0400, Tom Lane wrote:
Simon Riggs <simon@2ndquadrant.com> writes:
The following patch implements a fairly light set of timing statements
aimed at understanding external sort performance. There is no attempt to
alter the algorithms.What do people think about putting something like this into 8.1?
Strictly speaking it's a new feature, but the patch seems pretty
noninvasive, and we'd be much more likely to get data points if the
code exists in the mainline release than if people have to patch
their copies.postgres=# set debug_sort=true;
I'm a bit inclined to call it trace_sort instead, and to document it
under "Developer Options". Comments?
Sounds good to me.
Any other requests for info in the trace output.... shout now
Best Regards, Simon Riggs
Simon Riggs <simon@2ndquadrant.com> writes:
The following patch implements a fairly light set of timing statements
aimed at understanding external sort performance. There is no attempt to
alter the algorithms.
Minor update of patch, use this version please.
Applied with revisions: I made it use the VacRUsage code so that we
could see both CPU and elapsed time, and moved the report points around
a bit. The output with trace_sort enabled looks like this:
NOTICE: begin tuple sort: nkeys = 1, workMem = 1024, randomAccess = t
NOTICE: switching to external sort: CPU 0.05s/0.10u sec elapsed 0.15 sec
NOTICE: finished writing run 1: CPU 0.14s/0.83u sec elapsed 0.99 sec
NOTICE: finished writing run 2: CPU 0.25s/1.67u sec elapsed 1.94 sec
NOTICE: finished writing run 3: CPU 0.37s/2.51u sec elapsed 2.90 sec
NOTICE: finished writing run 4: CPU 0.48s/3.36u sec elapsed 3.86 sec
...
NOTICE: finished writing run 45: CPU 5.06s/38.26u sec elapsed 43.55 sec
NOTICE: performsort starting: CPU 5.10s/38.62u sec elapsed 43.95 sec
NOTICE: finished writing run 46: CPU 5.11s/38.84u sec elapsed 44.18 sec
NOTICE: finished writing final run 47: CPU 5.11s/38.88u sec elapsed 44.22 sec
NOTICE: finished merge step: CPU 5.12s/39.02u sec elapsed 44.37 sec
NOTICE: finished merge step: CPU 5.13s/39.16u sec elapsed 44.53 sec
...
NOTICE: finished merge step: CPU 6.57s/67.78u sec elapsed 74.83 sec
NOTICE: performsort done: CPU 6.57s/67.78u sec elapsed 74.84 sec
NOTICE: sort ended: CPU 10.80s/74.73u sec elapsed 86.21 sec
regards, tom lane
On Mon, 2005-10-03 at 18:59 -0400, Tom Lane wrote:
Simon Riggs <simon@2ndquadrant.com> writes:
The following patch implements a fairly light set of timing statements
aimed at understanding external sort performance. There is no attempt to
alter the algorithms.Minor update of patch, use this version please.
Applied with revisions: I made it use the VacRUsage code so that we
could see both CPU and elapsed time, and moved the report points around
a bit. The output with trace_sort enabled looks like this:NOTICE: begin tuple sort: nkeys = 1, workMem = 1024, randomAccess = t
NOTICE: switching to external sort: CPU 0.05s/0.10u sec elapsed 0.15 sec
NOTICE: finished writing run 1: CPU 0.14s/0.83u sec elapsed 0.99 sec
NOTICE: finished writing run 2: CPU 0.25s/1.67u sec elapsed 1.94 sec
NOTICE: finished writing run 3: CPU 0.37s/2.51u sec elapsed 2.90 sec
NOTICE: finished writing run 4: CPU 0.48s/3.36u sec elapsed 3.86 sec
...
NOTICE: finished writing run 45: CPU 5.06s/38.26u sec elapsed 43.55 sec
NOTICE: performsort starting: CPU 5.10s/38.62u sec elapsed 43.95 sec
NOTICE: finished writing run 46: CPU 5.11s/38.84u sec elapsed 44.18 sec
NOTICE: finished writing final run 47: CPU 5.11s/38.88u sec elapsed 44.22 sec
NOTICE: finished merge step: CPU 5.12s/39.02u sec elapsed 44.37 sec
NOTICE: finished merge step: CPU 5.13s/39.16u sec elapsed 44.53 sec
...
NOTICE: finished merge step: CPU 6.57s/67.78u sec elapsed 74.83 sec
NOTICE: performsort done: CPU 6.57s/67.78u sec elapsed 74.84 sec
NOTICE: sort ended: CPU 10.80s/74.73u sec elapsed 86.21 sec
Many thanks,
Best Regards, Simon Riggs
Tom Lane <tgl@sss.pgh.pa.us> writes:
Applied with revisions: I made it use the VacRUsage code so that we
could see both CPU and elapsed time, and moved the report points around
a bit. The output with trace_sort enabled looks like this:NOTICE: begin tuple sort: nkeys = 1, workMem = 1024, randomAccess = t
NOTICE: switching to external sort: CPU 0.05s/0.10u sec elapsed 0.15 sec
NOTICE: finished writing run 1: CPU 0.14s/0.83u sec elapsed 0.99 sec
NOTICE: finished writing run 2: CPU 0.25s/1.67u sec elapsed 1.94 sec
NOTICE: finished writing run 3: CPU 0.37s/2.51u sec elapsed 2.90 sec
NOTICE: finished writing run 4: CPU 0.48s/3.36u sec elapsed 3.86 sec
I'm not actually involved in this so maybe I'm completely off base here. But
wouldn't you want to know how many tuples are being sorted and how many data
are being written in these runs in order to be able to actually make sense of
these timing measurements?
Otherwise you would never know whether the results people will send you are
really comparable. And even if you're testing things in a well controlled
environments you'll never know if you missed something and you're comparing
two incomparable results.
--
greg
Greg Stark <gsstark@mit.edu> writes:
I'm not actually involved in this so maybe I'm completely off base here. But
wouldn't you want to know how many tuples are being sorted and how many data
are being written in these runs in order to be able to actually make sense of
these timing measurements?
In all plausible test cases you should be able to know the total number
of tuples being sorted independently. Counting the tuples processed in
each run would require extra per-tuple overhead, which I'd rather avoid
until proven necessary.
The total-data-volume aspect may or may not be interesting, not sure
yet. Let's see what we can learn from the present low-impact patch.
regards, tom lane