advance local xmin more aggressively

Started by Jeff Davis on 2009-02-10 · 25 messages
#1 Jeff Davis
pgsql@j-davis.com
1 attachment(s)

With the new snapshot maintenance code, it looks like we can advance the
xmin more aggressively.

For instance:

S1:
INSERT INTO foo VALUES(1);

S2:
BEGIN;
DECLARE c1 CURSOR FOR SELECT i FROM foo;

S1:
DELETE FROM foo;

S2:
DECLARE c2 CURSOR FOR SELECT i FROM foo;
CLOSE c1;

S1:
VACUUM VERBOSE foo;

The VACUUM should be able to clean up the deleted tuple, because it's no
longer visible to anyone.

Attached a small patch to accomplish this. I don't expect it to be put
in 8.4, but it's small enough that I thought I should at least send it
in just in case.

Regards,
Jeff Davis

Attachments:

advance_xmin.patch (text/x-patch)
diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c
index 7b6e15b..06bf425 100644
--- a/src/backend/utils/resowner/resowner.c
+++ b/src/backend/utils/resowner/resowner.c
@@ -21,6 +21,7 @@
 #include "postgres.h"
 
 #include "access/hash.h"
+#include "access/transam.h"
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
 #include "utils/memutils.h"
@@ -1026,6 +1027,47 @@ ResourceOwnerForgetSnapshot(ResourceOwner owner, Snapshot snapshot)
 }
 
 /*
+ * Find the smallest xmin among all ResourceOwners under owner.
+ */
+TransactionId
+ResourceOwnerSnapshotsMinXmin(ResourceOwner owner)
+{
+	ResourceOwner		 child;
+	Snapshot			*snapshots = owner->snapshots;
+	int					 ns1	   = owner->nsnapshots - 1;
+	int					 i;
+	TransactionId		 min_xmin  = InvalidTransactionId;
+
+	for (i = ns1; i >= 0; i--)
+	{
+		TransactionId xmin = snapshots[i]->xmin;
+
+		if (!TransactionIdIsValid(xmin))
+			continue;
+
+		if (!TransactionIdIsValid(min_xmin) ||
+			TransactionIdPrecedes(xmin, min_xmin))
+			min_xmin = xmin;
+	}
+
+	/* Recurse to handle descendants */
+	for (child = owner->firstchild; child != NULL; child = child->nextchild)
+	{
+		TransactionId xmin = ResourceOwnerSnapshotsMinXmin(child);
+
+		if (!TransactionIdIsValid(xmin))
+			continue;
+
+		if (!TransactionIdIsValid(min_xmin) ||
+			TransactionIdPrecedes(xmin, min_xmin))
+			min_xmin = xmin;
+	}
+
+	return min_xmin;
+}
+
+
+/*
  * Debugging subroutine
  */
 static void
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 9992895..b7b0506 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -107,6 +107,7 @@ static bool				registered_serializable = false;
 static Snapshot CopySnapshot(Snapshot snapshot);
 static void FreeSnapshot(Snapshot snapshot);
 static void	SnapshotResetXmin(void);
+static TransactionId GetTrueLocalXmin(void);
 
 
 /*
@@ -432,8 +433,53 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
 static void
 SnapshotResetXmin(void)
 {
+	TransactionId local_xmin;
+
 	if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
+	{
 		MyProc->xmin = InvalidTransactionId;
+		return;
+	}
+
+	/*
+	 * The transaction may have a snapshot but no xmin during abort
+	 * when the transaction has a registered snapshot that is not
+	 * active.
+	 */
+	if (!TransactionIdIsValid(MyProc->xmin))
+		return;
+
+	local_xmin = GetTrueLocalXmin();
+	if (!TransactionIdIsValid(local_xmin) ||
+		TransactionIdPrecedes(MyProc->xmin, local_xmin))
+		MyProc->xmin = local_xmin;
+}
+
+/*
+ * Returns the smallest xmin value in use by any of the active
+ * snapshots in the current process.
+ */
+static TransactionId
+GetTrueLocalXmin(void)
+{
+	TransactionId		 min_xmin = InvalidTransactionId;
+	ActiveSnapshotElt	*active_elt;
+
+	min_xmin = ResourceOwnerSnapshotsMinXmin(TopTransactionResourceOwner);
+
+	for (active_elt = ActiveSnapshot; active_elt != NULL; active_elt = active_elt->as_next)
+	{
+		TransactionId xmin = active_elt->as_snap->xmin;
+
+		if (!TransactionIdIsValid(xmin))
+			continue;
+
+		if (!TransactionIdIsValid(min_xmin) ||
+			TransactionIdPrecedes(xmin, min_xmin))
+			min_xmin = xmin;
+	}
+
+	return min_xmin;
 }
 
 /*
@@ -458,7 +504,7 @@ AtSubCommit_Snapshot(int level)
 
 /*
  * AtSubAbort_Snapshot
- * 		Clean up snapshots after a subtransaction abort
+ *		Clean up snapshots after a subtransaction abort
  */
 void
 AtSubAbort_Snapshot(int level)
diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h
index 3f05bf4..f4e051e 100644
--- a/src/include/utils/resowner.h
+++ b/src/include/utils/resowner.h
@@ -128,5 +128,6 @@ extern void ResourceOwnerRememberSnapshot(ResourceOwner owner,
 							  Snapshot snapshot);
 extern void ResourceOwnerForgetSnapshot(ResourceOwner owner,
 							Snapshot snapshot);
+extern TransactionId ResourceOwnerSnapshotsMinXmin(ResourceOwner owner);
 
 #endif   /* RESOWNER_H */
#2 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Jeff Davis (#1)
Re: advance local xmin more aggressively

Jeff Davis <pgsql@j-davis.com> writes:

With the new snapshot maintenance code, it looks like we can advance the
xmin more aggressively.

The original design for that contemplated having snapmgr.c track
all the snapshots (cf the comment for RegisteredSnapshots). I don't
particularly care for having it assume that it can find all the resource
owners.

But really the more important problem is to demonstrate that you
actually get a benefit commensurate with the additional cycles spent.
IIRC the reason the code is the way it is is that we concluded that for
typical usage patterns there wouldn't be any win from tracking things
more aggressively. As somebody pointed out recently, SnapshotResetXmin
is called quite a lot; if it's expensive it's going to be a problem.

regards, tom lane

#3 Alvaro Herrera
alvherre@commandprompt.com
In reply to: Tom Lane (#2)
Re: advance local xmin more aggressively

Tom Lane wrote:

Jeff Davis <pgsql@j-davis.com> writes:

With the new snapshot maintenance code, it looks like we can advance the
xmin more aggressively.

The original design for that contemplated having snapmgr.c track
all the snapshots (cf the comment for RegisteredSnapshots). I don't
particularly care for having it assume that it can find all the resource
owners.

But really the more important problem is to demonstrate that you
actually get a benefit commensurate with the additional cycles spent.
IIRC the reason the code is the way it is is that we concluded that for
typical usage patterns there wouldn't be any win from tracking things
more aggressively. As somebody pointed out recently, SnapshotResetXmin
is called quite a lot; if it's expensive it's going to be a problem.

I think Jeff is coming from the Truviso point of view: they have very
long running transactions, and they normally have a number of snapshots
that are always being updated, but it's rare that there are no snapshots
at all. So this optimization actually buys them a chance to update Xmin at
all; with the current code, they keep the same xmin all the time because
there's always some snapshot.

I'm not sure if the best answer is to just state that Truviso should
keep maintaining this patch privately. It would, of course, be much
better to come up with a cheaper way to keep track of this.

For example, maybe we could keep track of counts of snapshots removed
since the last xmin calculation, and only run this routine if the number
is different from zero (or some small positive integer).

#4 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Alvaro Herrera (#3)
Re: advance local xmin more aggressively

Alvaro Herrera <alvherre@commandprompt.com> writes:

For example, maybe we could keep track of counts of snapshots removed
since the last xmin calculation, and only run this routine if the number
is different from zero (or some small positive integer).

I think most of the callers of SnapshotResetXmin already know they
removed something.

It might be interesting for FreeSnapshot or something nearby to note
whether the snapshot being killed has xmin = proc's xmin, and only do
the update calculation if so.
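
A minimal sketch of that idea, just to make it concrete (the helper name is
invented; in reality the check would live in UnregisterSnapshotFromOwner or
thereabouts):

static void
UnregisterSnapshotInternal(Snapshot snapshot)	/* hypothetical helper */
{
	/* Is this snapshot possibly the one holding our advertised xmin back? */
	bool	may_advance = TransactionIdEquals(snapshot->xmin, MyProc->xmin);

	RegisteredSnapshots--;
	if (--snapshot->regd_count == 0 && snapshot->active_count == 0)
		FreeSnapshot(snapshot);

	/* Only pay for the recomputation when it could actually move our xmin. */
	if (may_advance)
		SnapshotResetXmin();
}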

I still dislike the assumption that all resource owners are children of
a known owner. I suspect in fact that it's demonstrably wrong right
now, let alone in future (cf comments in PortalRun). If we're going to
do this then snapmgr.c needs to track the snapshots for itself. Of
course that's going to make the "is it worth it" question even more
pressing.

regards, tom lane

#5 Robert Haas
robertmhaas@gmail.com
In reply to: Jeff Davis (#1)
Re: advance local xmin more aggressively

On Tue, Feb 10, 2009 at 3:06 PM, Jeff Davis <pgsql@j-davis.com> wrote:

With the new snapshot maintenance code, it looks like we can advance the
xmin more aggressively.

Can you clarify the circumstances in which this patch would show a
benefit over the current system?

...Robert

#6 Jeff Davis
pgsql@j-davis.com
In reply to: Robert Haas (#5)
Re: advance local xmin more aggressively

On Wed, 2009-02-11 at 10:20 -0500, Robert Haas wrote:

On Tue, Feb 10, 2009 at 3:06 PM, Jeff Davis <pgsql@j-davis.com> wrote:

With the new snapshot maintenance code, it looks like we can advance the
xmin more aggressively.

Can you clarify the circumstances in which this patch would show a
benefit over the current system?

In the current code, if the process is always holding at least one
snapshot, the process' xmin never advances. That means VACUUM will never
be able to reclaim tuples that were still visible to the first snapshot
taken in the transaction.

With the patch, as long as snapshots are being released, the process'
xmin will keep advancing to reflect the oldest snapshot currently held
by that process.

In order to accomplish that, every time a snapshot is released I have to
look at every snapshot that the process still holds to find the new
local minimum xmin. The current code will only change the process' xmin
if there are no snapshots at all.

As Tom pointed out, one of the assumptions I made writing the patch is
not always true. I am still trying to determine the implications of
that.

Regards,
Jeff Davis

#7 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Jeff Davis (#6)
Re: advance local xmin more aggressively

Jeff Davis <pgsql@j-davis.com> writes:

On Wed, 2009-02-11 at 10:20 -0500, Robert Haas wrote:

Can you clarify the circumstances in which this patch would show a
benefit over the current system?

In the current code, if the process is always holding at least one
snapshot, the process' xmin never advances.

Right, and the question is the scope of the circumstances in which
that's the case and your patch makes things better. I believe that

* a process outside a transaction has no snapshots, so your patch
won't change anything

* a process in a serializable transaction will hold the serializable
snapshot till end of xact, so your patch won't change anything

* a process in a read-committed transaction will typically hold
snapshot(s) for the duration of each query, and then release them
all, so your patch won't change anything

You pointed out the case of opening a cursor in a read-committed
transaction, and then closing it before end of transaction, as a
place where your patch could hope to improve matters. Are there
others? Could your application be made to close that cursor before
opening another one (so that its set of open snapshots momentarily
goes to zero)? It seems like the use case for more complex
bookkeeping for open snapshots is a tad narrow.

regards, tom lane

#8 Jeff Davis
pgsql@j-davis.com
In reply to: Tom Lane (#7)
Re: advance local xmin more aggressively

On Wed, 2009-02-11 at 12:20 -0500, Tom Lane wrote:

You pointed out the case of opening a cursor in a read-committed
transaction, and then closing it before end of transaction, as a
place where your patch could hope to improve matters. Are there
others? Could your application be made to close that cursor before
opening another one (so that its set of open snapshots momentarily
goes to zero)? It seems like the use case for more complex
bookkeeping for open snapshots is a tad narrow.

I'm approaching this from the perspective of our system at Truviso. I
used the cursor example to illustrate the issue in normal postgresql,
but our problem isn't directly related to cursors.

I don't have a compelling use case right now for normal postgresql,
because of the reasons you mention. However, at Truviso, we have to come
up with some kind of solution to this anyway. Ideally, I'd like to make
something that's acceptable to postgres in general (meaning no
performance impact or code ugliness).

I could imagine some situations in which this could be more useful in the
future. For instance, Hot Standby will increase the consequences of not
advancing xmin when it's possible to do so.

Regards,
Jeff Davis

#9 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Jeff Davis (#8)
Re: advance local xmin more aggressively

Jeff Davis <pgsql@j-davis.com> writes:

I could imagine some situations in which this could be more useful in the
future. For instance, Hot Standby will increase the consequences of not
advancing xmin when it's possible to do so.

The question wasn't really about the consequences; it was about whether
there was any hope of this patch being able to advance xmin more than
the code that's there, for common usage patterns.

regards, tom lane

#10 Robert Haas
robertmhaas@gmail.com
In reply to: Tom Lane (#4)
Re: advance local xmin more aggressively

[ reviving a very old thread ]

On Tue, Feb 10, 2009 at 4:11 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Alvaro Herrera <alvherre@commandprompt.com> writes:

For example, maybe we could keep track of counts of snapshots removed
since the last xmin calculation, and only run this routine if the number
is different from zero (or some small positive integer).

I think most of the callers of SnapshotResetXmin already know they
removed something.

It might be interesting for FreeSnapshot or something nearby to note
whether the snapshot being killed has xmin = proc's xmin, and only do
the update calculation if so.

I still dislike the assumption that all resource owners are children of
a known owner. I suspect in fact that it's demonstrably wrong right
now, let alone in future (cf comments in PortalRun). If we're going to
do this then snapmgr.c needs to track the snapshots for itself. Of
course that's going to make the "is it worth it" question even more
pressing.

I've run into essentially the same problem Jeff originally complained
about with a large customer who has long-running transactions that
make extensive use of cursors. Cursors are opened and closed over
time but it is rare for the number open to reach exactly zero, so what
ends up happening is that the backend xmin does not advance. As you
can imagine, that doesn't work out well.

The approach I came up with initially was similar to Jeff's: start at
the topmost resource owner and traverse them all, visiting every
snapshot along the way. But then I found this thread and saw the
comment that this might be "demonstrably wrong", referring to the
comments in PortalRun. Having reviewed those comments, which don't
seem to have changed much in the last five years, I can't understand
how they relate to this issue. It's true that the TopTransaction
resource owner could get swapped out under us during an internal
commit, but why should SnapshotResetXmin() have to care? It just
traverses the one that is in effect at the time it gets called. The
only real danger I see here is that there could be *more than one*
toplevel resource owner. I wonder if we could solve that problem by
adding a registry of active toplevel resource owners, so that if we
have a forest rather than a tree we can still find everything.

The problem I see with having snapmgr.c track the snapshots for itself
is that it mostly duplicates bookkeeping which is already being
done. Since this problem doesn't affect the majority of users, it's
not desirable to add a lot of extra bookkeeping to cater to it - but
even if it did affect a lot of users, we still want it to be as cheap
as possible, and reusing the tracking that resource owners are already
doing seems like the way to get there.

I think there are a couple of things we can do to make this cheaper.
Suppose we keep track of the oldest xmin of any registered snapshot
and the number of registered snapshots that have that particular xmin.
Every time we deregister a snapshot, we check whether it is one of the
ones with the minimum xmin; if it is, we decrement the count. When
the count reaches 0, we know that a traversal of all registered
snapshots is guaranteed to find a newer value to advertise in
MyProc->xmin; that way, we can arrange to do the work only when it's
likely to pay off. In most cases this won't happen until the last
snapshot is unregistered, because our snapshots normally form a stack,
with newer snapshots having been taken later. But if somebody
unregisters the oldest snapshot we'll immediately notice and
recalculate.
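
In sketch form (the names here are invented for illustration, and the real
bookkeeping would also need to be refreshed whenever we rescan):

/* Oldest xmin among registered snapshots, and how many snapshots share it. */
static TransactionId OldestRegisteredXmin = InvalidTransactionId;
static int	OldestRegisteredXminCount = 0;

static void
NoteSnapshotUnregistered(Snapshot snapshot)
{
	if (TransactionIdEquals(snapshot->xmin, OldestRegisteredXmin) &&
		--OldestRegisteredXminCount == 0)
	{
		/*
		 * Nothing registered still has the advertised xmin, so a full scan
		 * of the remaining snapshots is guaranteed to find something newer
		 * to put in MyProc->xmin.  The scan would also recompute the two
		 * variables above.
		 */
		SnapshotResetXmin();
	}
}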

Thoughts?

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#11 Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Robert Haas (#10)
Re: advance local xmin more aggressively

On 12/05/2014 05:05 PM, Robert Haas wrote:

[ reviving a very old thread ]

On Tue, Feb 10, 2009 at 4:11 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Alvaro Herrera <alvherre@commandprompt.com> writes:

For example, maybe we could keep track of counts of snapshots removed
since the last xmin calculation, and only run this routine if the number
is different from zero (or some small positive integer).

I think most of the callers of SnapshotResetXmin already know they
removed something.

It might be interesting for FreeSnapshot or something nearby to note
whether the snapshot being killed has xmin = proc's xmin, and only do
the update calculation if so.

I still dislike the assumption that all resource owners are children of
a known owner. I suspect in fact that it's demonstrably wrong right
now, let alone in future (cf comments in PortalRun). If we're going to
do this then snapmgr.c needs to track the snapshots for itself. Of
course that's going to make the "is it worth it" question even more
pressing.

I've run into essentially the same problem Jeff originally complained
about with a large customer who has long-running transactions that
make extensive use of cursors. Cursors are opened and closed over
time but it is rare for the number open to reach exactly zero, so what
ends up happening is that the backend xmin does not advance. As you
can imagine, that doesn't work out well.

The approach I came up with initially was similar to Jeff's: start at
the topmost resource owner and traverse them all, visiting every
snapshot along the way. But then I found this thread and saw the
comment that this might be "demonstrably wrong" and referring to the
comments in PortalRun. Having reviewed those comments, which don't
seem to have changed much in the last five years, I can't understand
how they related to this issue. It's true that the TopTransaction
resource owner could get swapped out under us during an internal
commit, but why should SnapshotResetXmin() have to care? It just
traverses the one that is in effect at the time it gets called. The
only real danger I see here is that there could be *more than one*
toplevel resource owner. I wonder if we could solve that problem by
adding a registry of active toplevel resource owners, so that if we
have a forest rather than a tree we can still find everything.

I don't immediately see the problem either, but I have to say that
grovelling through all the resource owners seems ugly anyway. Resource
owners are not meant to be traversed like that. And there could be a lot
of them, and you have to visit every one of them. That could get
expensive if there are a lot of resource owners.

BTW, you could easily detect that you haven't seen all the registered
snapshots, after traversing the resource owner, as we keep the counter
of them. So you could just fall back to not advancing the xmin if it
happens.
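
Concretely, the fallback could be as simple as this (a sketch; snapshots_seen
and min_xmin stand in for locals computed during the walk):

	/* After walking all the resource owners: */
	if (snapshots_seen != RegisteredSnapshots)
		return;					/* missed some snapshots; leave xmin alone */

	MyPgXact->xmin = min_xmin;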

The problem I see with having snapmgr.c track the snapshots for itself
is that it mostly duplicates bookkeeping which is already being
done. Since this problem doesn't affect the majority of users, it's
not desirable to add a lot of extra bookkeeping to cater to it - but
even if it did affect a lot of users, we still want it to be as cheap
as possible, and reusing the tracking that resource owners are already
doing seems like the way to get there.

I would prefer doing separate bookkeeping in snapmgr.c. It seems like it
wouldn't be difficult to do. It has to be cheap, but I don't see any
reason to believe that it'd be more expensive than traversing through
all resource owners. A binary heap or some other priority queue
implementation should work pretty well for this.
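
For illustration, the simplest form of that bookkeeping might look like the
following (a sketch only: it assumes a dlist_node field, here called "node",
added to SnapshotData, and a priority queue would make the minimum lookup
cheaper than this O(n) scan):

#include "lib/ilist.h"

/* All registered snapshots, tracked by snapmgr.c itself. */
static dlist_head RegisteredSnapshotList =
	DLIST_STATIC_INIT(RegisteredSnapshotList);

static void
RememberRegisteredSnapshot(Snapshot snapshot)
{
	dlist_push_tail(&RegisteredSnapshotList, &snapshot->node);
}

static void
ForgetRegisteredSnapshot(Snapshot snapshot)
{
	dlist_delete(&snapshot->node);
}

static TransactionId
OldestRegisteredSnapshotXmin(void)
{
	TransactionId	result = InvalidTransactionId;
	dlist_iter		iter;

	dlist_foreach(iter, &RegisteredSnapshotList)
	{
		Snapshot	snap = dlist_container(SnapshotData, node, iter.cur);

		if (!TransactionIdIsValid(result) ||
			TransactionIdPrecedes(snap->xmin, result))
			result = snap->xmin;
	}

	return result;
}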

I think there are a couple of things we can do to make this cheaper.
Suppose we keep track of the oldest xmin of any registered snapshot
and the number of registered snapshots that have that particular xmin.
Every time we deregister a snapshot, we check whether it is one of the
ones with the minimum xmin; if it is, we decrement the count. When
the count reaches 0, we know that a traversal of all registered
snapshots is guaranteed to find a newer value to advertise in
MyProc->xmin; that way, we can arrange to do the work only when it's
likely to pay off. In most cases this won't happen until the last
snapshot is unregistered, because our snapshots normally form a stack,
with newer snapshots having been taken later. But if somebody
unregisters the oldest snapshot we'll immediately notice and
recalculate.

Yeah, that's a reasonable optimization. It's a reasonable optimization
even if you do the bookkeeping in snapmgr.c.

And that optimization won't save you in the cases where it doesn't
apply. For example, what if snapshots happen to form a queue, rather
than a stack:

DECLARE c1 CURSOR FOR ...;
DECLARE c2 CURSOR FOR ...;
DECLARE c3 CURSOR FOR ...;
...
DECLARE c1000 CURSOR FOR ...;

CLOSE c1;
CLOSE c2;
CLOSE c3;
...
CLOSE c1000;

It's not hard to imagine an application doing that. Sure, you could
rewrite it to close the cursors in different order, but on the face of
it that's not an unreasonable thing for an application to do. I think we
should avoid the O(n^2) behaviour in that case.

(But +1 for addressing this problem, with the extra bookkeeping in
snapmgr.c)

- Heikki


#12 Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#11)
Re: advance local xmin more aggressively

On Mon, Dec 8, 2014 at 4:56 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

I don't immediately see the problem either, but I have to say that
grovelling through all the resource owners seems ugly anyway. Resource
owners are not meant to be traversed like that. And there could be a lot of
them, and you have to visit every one of them. That could get expensive if
there are a lot of resource owners.

1. I don't really see why resource owners shouldn't be traversed like
that. They are clearly intended to form a hierarchy, and there's
existing code that recurses through the hierarchy from a given level
downward. What's ugly about that?

2. If you have a lot of resource owners, you probably have a lot of
snapshots, so walking a list will be expensive, too. It will be
disproportionately expensive to walk the resource owner tree only if
there are lots of resource owners but very few of them have any
snapshots. But I don't think that can really happen. If you've got
lots of resource owners and each one has a snapshot, you'll traverse
~3 pointers per snapshot: ~1 to find the next ResourceOwner, 1 to find
the snapshot array, and 1 to reach the snapshot itself. A non-inlined
list would traverse only 2 pointers per snapshot, but that doesn't
seem like enough of a difference to get excited about.

BTW, you could easily detect that you haven't seen all the registered
snapshots, after traversing the resource owner, as we keep the counter of
them. So you could just fall back to not advancing the xmin if it happens.

Not a bad idea. Or we could elog(FATAL) or fail an assertion if we
don't see them all, and then if it happens we call it a bug and fix
it.

I would prefer doing separate bookkeeping in snapmgr.c. It seems like it
wouldn't be difficult to do. It has to be cheap, but I don't see any reason
to believe that it'd be more expensive than traversing through all resource
owners. A binary heap or some other priority queue implementation should
work pretty well for this.

That's optimizing for making the xmin recomputation cheap, but we
don't expect xmin recomputation to happen very often, so I'm not sure
that's the right trade-off.

And that optimization won't save you in the cases where it doesn't apply.
For example, what if snapshots happen to form a queue, rather than a stack:

DECLARE c1 CURSOR FOR ...;
DECLARE c2 CURSOR FOR ...;
DECLARE c3 CURSOR FOR ...;
...
DECLARE c1000 CURSOR FOR ...;

CLOSE c1;
CLOSE c2;
CLOSE c3;
...
CLOSE c1000;

It's not hard to imagine an application doing that. Sure, you could rewrite
it to close the cursors in different order, but on the face of it that's not
an unreasonable thing for an application to do. I think we should avoid the
O(n^2) behaviour in that case.

You won't actually get O(n^2) behavior here unless those DECLARE
CURSOR statements all get snapshots with different xmins; if many of
them have snapshots that share an xmin, then the optimization of
recomputing only when there are no snaps with a given xmin will save
you. That's a bit pathological, but maybe we should try to cater to
it.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#13 Robert Haas
robertmhaas@gmail.com
In reply to: Robert Haas (#12)
1 attachment(s)
Re: advance local xmin more aggressively

On Mon, Dec 8, 2014 at 9:31 AM, Robert Haas <robertmhaas@gmail.com> wrote:

On Mon, Dec 8, 2014 at 4:56 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

I don't immediately see the problem either, but I have to say that
grovelling through all the resource owners seems ugly anyway. Resource
owners are not meant to be traversed like that. And there could be a lot of
them, and you have to visit every one of them. That could get expensive if
there are a lot of resource owners.

1. I don't really see why resource owners shouldn't be traversed like
that. They are clearly intended to form a hierarchy, and there's
existing code that recurses through the hierarchy from a given level
downward. What's ugly about that?

Here's a patch. I looked at the issue of tracking parent-less
resource owners a bit more closely. It turns out that resource owners
are always created in TopMemoryContext "since they should only be
freed explicitly" (cf. resowner.c). I was a bit worried about that,
because it would be bad to keep a list of all parent-less resource
owners if list elements could vanish in a context reset. But that
doesn't seem to be a concern. So I stole the nextchild links of
parent-less resource owners, which are not used for anything
currently, to keep a list of such resource owners.

It occurred to me that there's probably not much value in recomputing
xmin when the active snapshot stack is non-empty. It's not impossible
that a PL/pgsql function could close a cursor with an old xmin and
then do lots of other work (or just sleep for a long time) before
returning to the top-level, but it is pretty unlikely. So the
attached patch only considers recomputing the advertised xmin when the
active snapshot stack is empty. That'll happen at the end of each
statement, which seems soon enough.

Upthread, I suggested keeping a tally of the number of snapshots with
the advertised xmin and recomputing the xmin to advertise only when it
reaches 0. This patch doesn't implement that optimization, but
it does have code that aborts the traversal of the resource owner
hierarchy as soon as we see an xmin that will preclude advancing our
advertised xmin. Releasing N resource owners could therefore cost
O(N^2) in the worst case, but note that releasing N resource owners is
*already* an O(N^2) operation in the worst case, because the list of
children of a particular parent resource owner is singly linked, and
thus deleting a resource owner is O(N). It's been that way for an
awfully long time without anyone complaining, probably because (a)
it's not particularly common to have large numbers of cursors open
simultaneously and (b) even if you do have that, the constant factor
is pretty low.

Following Heikki's previous suggestion, the patch contains checks to
make sure that we find exactly the number of registered snapshots that
we expect to find. We could consider demoting these to asserts or
something, but this is more likely to catch bugs if there are any.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

Attachments:

advance-xmin-more.patch (text/x-patch)
diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c
index 0955bcc..186fa91 100644
--- a/src/backend/utils/resowner/resowner.c
+++ b/src/backend/utils/resowner/resowner.c
@@ -113,6 +113,7 @@ typedef struct ResourceOwnerData
 ResourceOwner CurrentResourceOwner = NULL;
 ResourceOwner CurTransactionResourceOwner = NULL;
 ResourceOwner TopTransactionResourceOwner = NULL;
+ResourceOwner FirstRootResourceOwner = NULL;
 
 /*
  * List of add-on callbacks for resource releasing
@@ -167,6 +168,11 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name)
 		owner->nextchild = parent->firstchild;
 		parent->firstchild = owner;
 	}
+	else
+	{
+		owner->nextchild = FirstRootResourceOwner;
+		FirstRootResourceOwner = owner;
+	}
 
 	return owner;
 }
@@ -442,6 +448,8 @@ ResourceOwnerDelete(ResourceOwner owner)
 	 * the owner tree.  Better a leak than a crash.
 	 */
 	ResourceOwnerNewParent(owner, NULL);
+	Assert(FirstRootResourceOwner == owner);
+	FirstRootResourceOwner = owner->nextchild;
 
 	/* And free the object. */
 	if (owner->buffers)
@@ -502,6 +510,14 @@ ResourceOwnerNewParent(ResourceOwner owner,
 			}
 		}
 	}
+	else
+	{
+		ResourceOwner *link = &FirstRootResourceOwner;
+
+		while (*link != owner)
+			link = &((*link)->nextchild);
+		*link = owner->nextchild;
+	}
 
 	if (newparent)
 	{
@@ -513,7 +529,8 @@ ResourceOwnerNewParent(ResourceOwner owner,
 	else
 	{
 		owner->parent = NULL;
-		owner->nextchild = NULL;
+		owner->nextchild = FirstRootResourceOwner;
+		FirstRootResourceOwner = owner;
 	}
 }
 
@@ -1161,6 +1178,50 @@ ResourceOwnerForgetSnapshot(ResourceOwner owner, Snapshot snapshot)
 }
 
 /*
+ * Invoke a caller-supplied function on every snapshot registered with this
+ * resource owner or any descendent resource owner.  The callback can abort
+ * the traversal by returning true.
+ */
+bool
+ResourceOwnerWalkSnapshots(ResourceOwner owner,
+						   ResourceWalkSnapshotCallback callback,
+						   void *arg)
+{
+	ResourceOwner child;
+	int		i;
+
+	/* Invoke the callback for each of our snapshots */
+	for (i = 0; i < owner->nsnapshots; ++i)
+		if (callback(owner->snapshots[i], arg))
+			return true;
+
+	/* Recurse to handle descendants */
+	for (child = owner->firstchild; child != NULL; child = child->nextchild)
+		if (ResourceOwnerWalkSnapshots(child, callback, arg))
+			return true;
+
+	return false;
+}
+
+/*
+ * Invoke a caller-supplied function on every snapshot registered with any
+ * resource owner in the system.
+ */
+bool
+ResourceOwnerWalkAllSnapshots(ResourceWalkSnapshotCallback callback, void *arg)
+{
+	ResourceOwner	owner;
+
+	for (owner = FirstRootResourceOwner;
+		 owner != NULL;
+		 owner = owner->nextchild)
+		if (ResourceOwnerWalkSnapshots(owner, callback, arg))
+			return true;
+
+	return false;
+}
+
+/*
  * Debugging subroutine
  */
 static void
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index d601efe..50e348c 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -122,13 +122,15 @@ typedef struct ActiveSnapshotElt
 /* Top of the stack of active snapshots */
 static ActiveSnapshotElt *ActiveSnapshot = NULL;
 
-/*
- * How many snapshots is resowner.c tracking for us?
- *
- * Note: for now, a simple counter is enough.  However, if we ever want to be
- * smarter about advancing our MyPgXact->xmin we will need to be more
- * sophisticated about this, perhaps keeping our own list of snapshots.
- */
+/* Data for recomputing xmin */
+typedef struct SnapshotXminData
+{
+	TransactionId	previous_xmin;
+	TransactionId	new_xmin;
+	int				snapshots_remaining;
+} SnapshotXminData;
+
+/* How many snapshots is resowner.c tracking for us? */
 static int	RegisteredSnapshots = 0;
 
 /* first GetTransactionSnapshot call in a transaction? */
@@ -153,6 +155,7 @@ static List *exportedSnapshots = NIL;
 
 static Snapshot CopySnapshot(Snapshot snapshot);
 static void FreeSnapshot(Snapshot snapshot);
+static bool SnapshotResetXminWorker(Snapshot snapshot, void *data);
 static void SnapshotResetXmin(void);
 
 
@@ -688,12 +691,65 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
  * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid.
  * Note we can do this without locking because we assume that storing an Xid
  * is atomic.
+ *
+ * Even if there are some remaining snapshots, we may be able to advance our
+ * PGXACT->xmin to some degree.  This typically happens when a portal is
+ * dropped.  For efficiency, we only consider recomputing PGXACT->xmin when
+ * the active snapshot stack is empty.
  */
 static void
 SnapshotResetXmin(void)
 {
-	if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
+	SnapshotXminData	sxd;
+	ListCell	   *lc;
+
+	if (ActiveSnapshot != NULL)
+		return;
+
+	if (RegisteredSnapshots == 0)
+	{
 		MyPgXact->xmin = InvalidTransactionId;
+		return;
+	}
+
+	sxd.previous_xmin = MyPgXact->xmin;
+	sxd.new_xmin = InvalidTransactionId;
+	sxd.snapshots_remaining = RegisteredSnapshots;
+
+	if (FirstXactSnapshot != NULL &&
+		SnapshotResetXminWorker(FirstXactSnapshot, &sxd))
+		return;
+
+	foreach(lc, exportedSnapshots)
+	{
+		Snapshot	snapshot = lfirst(lc);
+
+		if (SnapshotResetXminWorker(snapshot, &sxd))
+			return;
+	}
+
+	if (ResourceOwnerWalkAllSnapshots(SnapshotResetXminWorker, &sxd))
+		return;
+
+	if (sxd.snapshots_remaining > 0)
+		elog(FATAL, "failed to traverse all snapshots");
+
+	MyPgXact->xmin = sxd.new_xmin;
+}
+
+static bool
+SnapshotResetXminWorker(Snapshot snapshot, void *data)
+{
+	SnapshotXminData   *sxd = data;
+
+	if (!TransactionIdIsValid(sxd->new_xmin)
+		|| TransactionIdPrecedes(snapshot->xmin, sxd->new_xmin))
+		sxd->new_xmin = snapshot->xmin;
+
+	if (--sxd->snapshots_remaining < 0)
+		elog(FATAL, "traversed too many snapshots");
+
+	return !TransactionIdPrecedes(sxd->previous_xmin, sxd->new_xmin);
 }
 
 /*
diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h
index b8fd1a9..aabbb07 100644
--- a/src/include/utils/resowner_private.h
+++ b/src/include/utils/resowner_private.h
@@ -73,6 +73,11 @@ extern void ResourceOwnerRememberSnapshot(ResourceOwner owner,
 							  Snapshot snapshot);
 extern void ResourceOwnerForgetSnapshot(ResourceOwner owner,
 							Snapshot snapshot);
+typedef bool (*ResourceWalkSnapshotCallback) (Snapshot snapshot, void *arg);
+extern bool ResourceOwnerWalkSnapshots(ResourceOwner resowner,
+		ResourceWalkSnapshotCallback callback, void *arg);
+extern bool ResourceOwnerWalkAllSnapshots(ResourceWalkSnapshotCallback callback,
+							  void *arg);
 
 /* support for temporary file management */
 extern void ResourceOwnerEnlargeFiles(ResourceOwner owner);
#14 Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Robert Haas (#13)
1 attachment(s)
Re: advance local xmin more aggressively

On 12/09/2014 10:35 PM, Robert Haas wrote:

On Mon, Dec 8, 2014 at 9:31 AM, Robert Haas <robertmhaas@gmail.com> wrote:

On Mon, Dec 8, 2014 at 4:56 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

I don't immediately see the problem either, but I have to say that
grovelling through all the resource owners seems ugly anyway. Resource
owners are not meant to be traversed like that. And there could be a lot of
them, and you have to visit every one of them. That could get expensive if
there are a lot of resource owners.

1. I don't really see why resource owners shouldn't be traversed like
that. They are clearly intended to form a hierarchy, and there's
existing code that recurses through the hierarchy from a given level
downward. What's ugly about that?

I can't exactly point my finger on it, but it just feels wrong from a
modularity point of view. Their purpose is to make sure that we don't
leak resources on abort, by allowing an easy "release everything"
operation. It's not designed for finding objects based on some other
criteria.

There is similar double bookkeeping of many other things that are
tracked by resource owners. Heavy-weight locks are tracked by LOCALLOCK
structs, buffer pins in PrivateRefCount array etc. Those things existed
before resource owners were invented, but if we were starting from
scratch, that design would still make sense, as different objects have
different access criteria. fd.c needs to be able to find the
least-recently-used open file, for example, and you need to find the
snapshot with the lowest xmin.

Upthread, I suggested keeping a tally of the number of snapshots with
the advertised xmin and recomputing the xmin to advertise only when it
reaches 0. This patch doesn't implement that optimization, but
it does have code that aborts the traversal of the resource owner
hierarchy as soon as we see an xmin that will preclude advancing our
advertised xmin. Releasing N resource owners could therefore cost
O(N^2) in the worst case, but note that releasing N resource owners is
*already* an O(N^2) operation in the worst case, because the list of
children of a particular parent resource owner is singly linked, and
thus deleting a resource owner is O(N). It's been that way for an
awfully long time without anyone complaining, probably because (a)
it's not particularly common to have large numbers of cursors open
simultaneously and (b) even if you do have that, the constant factor
is pretty low.

I think you're confusing two different N's above. It's true that deleting
a resource owner is O(M), where M is the number of children of that
resource owner. It does not follow that releasing N resource owners is
O(N^2), where N is the number of resource owners released. Calling
ResourceOwnerDelete(x) will only visit each resource owner in that tree
once, so it's O(N), where N is the total number of resource owners in
the tree.

I did some testing of the worst case scenario. The attached script first
creates a lot of cursors, then a lot of savepoints, and finally closes
the cursors in FIFO order. When the number of savepoints is high enough,
this actually segfaults with your patch, because you run out of stack
space when recursing into the subxact resource owners. That's hardly this
patch's fault; I'm actually surprised it doesn't crash without it,
because we recurse into all resource owners in ResourceOwnerRelease too.
Apparently the subxacts are closed in LIFO order at commit, but there
might be other cases where you could trigger that. In any case, a
stack-depth check would be nice.
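
For what it's worth, wiring in the suggested guard is a one-liner in the
recursive walker from your patch; something like this (check_stack_depth()
is declared in miscadmin.h):

bool
ResourceOwnerWalkSnapshots(ResourceOwner owner,
						   ResourceWalkSnapshotCallback callback,
						   void *arg)
{
	ResourceOwner child;
	int			i;

	check_stack_depth();		/* error out cleanly instead of overflowing */

	for (i = 0; i < owner->nsnapshots; ++i)
		if (callback(owner->snapshots[i], arg))
			return true;

	for (child = owner->firstchild; child != NULL; child = child->nextchild)
		if (ResourceOwnerWalkSnapshots(child, callback, arg))
			return true;

	return false;
}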

- Heikki

Attachments:

cursors.sh (application/x-sh)
#15 Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#14)
Re: advance local xmin more aggressively

On Wed, Dec 10, 2014 at 7:34 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

1. I don't really see why resource owners shouldn't be traversed like
that. They are clearly intended to form a hierarchy, and there's
existing code that recurses through the hierarchy from a given level
downward. What's ugly about that?

I can't exactly point my finger on it, but it just feels wrong from a
modularity point of view. Their purpose is to make sure that we don't leak
resources on abort, by allowing an easy "release everything" operation. It's
not designed for finding objects based on some other criteria.

There is similar double bookkeeping of many other things that are tracked by
resource owners. Heavy-weight locks are tracked by LOCALLOCK structs, buffer
pins in PrivateRefCount array etc. Those things existed before resource
owners were invented, but if we were starting from scratch, that design
would still make sense, as different objects have different access criteria.
fd.c needs to be able to find the least-recently-used open file, for
example, and you need to find the snapshot with the lowest xmin.

I think all of these are performance trade-offs rather than questions
of style. LOCALLOCK structs and private buffer pin tracking existed
before resource owners because we need to do lookups of locks by lock
tag or buffers by buffer number frequently enough that, if we had to
grovel trough all the locks or buffer pins in the system without any
indexing, it would be slow. We *could* do it that way now that we
have resource owners, but it's pretty obvious that it would suck.

That's less obvious here. If we throw all of the registered snapshots
into a binary heap, finding the lowest xmin will be cheaper every time
we do it, but we'll have the overhead of maintaining that binary heap
even if it never gets used.

I think you're confusing two different N's above. It's true that deleting a
resource owner is O(M), where M is the number of children of that
resource owner. It does not follow that releasing N resource owners is
O(N^2), where N is the number of resource owners released. Calling
ResourceOwnerDelete(x) will only visit each resource owner in that tree
once, so it's O(N), where N is the total number of resource owners in the
tree.

Not if the portals are dropped in separate commands with an
unpredictable ordering. Then you have N separate calls to
ResourceOwnerDelete.

I did some testing of the worst case scenario. The attached script first
creates a lot of cursors, then a lot of savepoints, and finally closes the
cursors in FIFO order. When the number of savepoints is high enough, this
actually segfaults with your patch, because you run out of stack space when
recursing into the subxact resource owners. That's hardly this patch's fault; I'm
actually surprised it doesn't crash without it, because we recurse into all
resource owners in ResourceOwnerRelease too. Apparently the subxacts are
closed in LIFO order at commit, but there might be other cases where you
could trigger that. In any case, a stack-depth check would be nice.

Thanks for the test case; that's helpful.

I added a stack depth check, but it doesn't help much:

ERROR: stack depth limit exceeded
HINT: Increase the configuration parameter "max_stack_depth"
(currently 2048kB), after ensuring the platform's stack depth limit is
adequate.
STATEMENT: CLOSE cur0;
ERROR: stack depth limit exceeded
HINT: Increase the configuration parameter "max_stack_depth"
(currently 2048kB), after ensuring the platform's stack depth limit is
adequate.
WARNING: AbortSubTransaction while in ABORT state
ERROR: stack depth limit exceeded
HINT: Increase the configuration parameter "max_stack_depth"
(currently 2048kB), after ensuring the platform's stack depth limit is
adequate.
WARNING: AbortSubTransaction while in ABORT state
ERROR: stack depth limit exceeded
HINT: Increase the configuration parameter "max_stack_depth"
(currently 2048kB), after ensuring the platform's stack depth limit is
adequate.
WARNING: AbortSubTransaction while in ABORT state
ERROR: stack depth limit exceeded
HINT: Increase the configuration parameter "max_stack_depth"
(currently 2048kB), after ensuring the platform's stack depth limit is
adequate.
PANIC: ERRORDATA_STACK_SIZE exceeded

That's at 100k subtransactions. I took some performance data for
lower numbers of subtransactions:

-- at 1000 subtransactions --
master 0.156s 0.157s 0.154s
advance-xmin 0.177s 0.175s 0.177s

-- at 10000 subtransactions --
master 0.458s 0.457s 0.456s
advance-xmin 0.639s 0.637s 0.633

I guess this bears some further thought. I certainly don't like the
fact that this makes the whole system crap out at a lower number of
subtransactions than presently. The actual performance numbers don't
bother me very much; I'm comfortable with the possibility that closing
a cursor will be some modest percentage slower if you've got thousands
of active savepoints.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#16 Robert Haas
robertmhaas@gmail.com
In reply to: Robert Haas (#15)
1 attachment(s)
Re: advance local xmin more aggressively

On Wed, Dec 10, 2014 at 9:49 AM, Robert Haas <robertmhaas@gmail.com> wrote:

I guess this bears some further thought. I certainly don't like the
fact that this makes the whole system crap out at a lower number of
subtransactions than presently. The actual performance numbers don't
bother me very much; I'm comfortable with the possibility that closing
a cursor will be some modest percentage slower if you've got thousands
of active savepoints.

Here's a new version with two changes:

1. I changed the traversal of the resource owner tree to iterate
instead of recurse. It now does a depth-first, pre-order traversal of
the tree; when we reach the last child of a node, we follow its parent
pointer to get back to where we were. That way, we don't need to keep
anything on the stack. That fixed the crash at 100k cursors, but it
was still 4x slower.

2. Instead of traversing the tree until we find an xmin equal to the
one we're currently advertising, the code now traverses the entire
tree each time it runs. However, it also keeps a record of how many
times the oldest xmin occurred in the tree, which is decremented each
time we unregister a snapshot with that xmin; the traversal doesn't
run again until that count reaches 0. That fixed the performance
regression on your test case.

With a million subtransactions:

master 34.464s 33.742s 34.317s
advance-xmin 34.516s 34.069s 34.196s

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

Attachments:

advance-xmin-more-v2.patch (text/x-patch)
diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c
index 0955bcc..529209f 100644
--- a/src/backend/utils/resowner/resowner.c
+++ b/src/backend/utils/resowner/resowner.c
@@ -21,6 +21,7 @@
 #include "postgres.h"
 
 #include "access/hash.h"
+#include "miscadmin.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
 #include "utils/memutils.h"
@@ -113,6 +114,7 @@ typedef struct ResourceOwnerData
 ResourceOwner CurrentResourceOwner = NULL;
 ResourceOwner CurTransactionResourceOwner = NULL;
 ResourceOwner TopTransactionResourceOwner = NULL;
+ResourceOwner FirstRootResourceOwner = NULL;
 
 /*
  * List of add-on callbacks for resource releasing
@@ -167,6 +169,11 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name)
 		owner->nextchild = parent->firstchild;
 		parent->firstchild = owner;
 	}
+	else
+	{
+		owner->nextchild = FirstRootResourceOwner;
+		FirstRootResourceOwner = owner;
+	}
 
 	return owner;
 }
@@ -442,6 +449,8 @@ ResourceOwnerDelete(ResourceOwner owner)
 	 * the owner tree.  Better a leak than a crash.
 	 */
 	ResourceOwnerNewParent(owner, NULL);
+	Assert(FirstRootResourceOwner == owner);
+	FirstRootResourceOwner = owner->nextchild;
 
 	/* And free the object. */
 	if (owner->buffers)
@@ -502,6 +511,14 @@ ResourceOwnerNewParent(ResourceOwner owner,
 			}
 		}
 	}
+	else
+	{
+		ResourceOwner *link = &FirstRootResourceOwner;
+
+		while (*link != owner)
+			link = &((*link)->nextchild);
+		*link = owner->nextchild;
+	}
 
 	if (newparent)
 	{
@@ -513,7 +530,8 @@ ResourceOwnerNewParent(ResourceOwner owner,
 	else
 	{
 		owner->parent = NULL;
-		owner->nextchild = NULL;
+		owner->nextchild = FirstRootResourceOwner;
+		FirstRootResourceOwner = owner;
 	}
 }
 
@@ -1161,6 +1179,59 @@ ResourceOwnerForgetSnapshot(ResourceOwner owner, Snapshot snapshot)
 }
 
 /*
+ * Invoke a caller-supplied function on every snapshot registered with any
+ * resource owner in the system.  The callback can abort the traversal by
+ * returning true.
+ */
+bool
+ResourceOwnerWalkAllSnapshots(ResourceWalkSnapshotCallback callback, void *arg)
+{
+	ResourceOwner	owner = FirstRootResourceOwner;
+	bool		visited = false;
+
+	/*
+	 * We do this traversal non-recursively in order to avoid running out of
+	 * stack space, which can otherwise happen with large numbers of nested
+	 * subtransactions.  The easiest way to do that is to search depth-first,
+	 * so that we visit all of a given node's descendents before reaching its
+	 * right sibling.  When we've visited all of the node's descendents, we'll
+	 * follow the last child's parent pointer back to that node, but visited
+	 * will be set to true at that point, so we'll advance to the right
+	 * sibling instead of traversing it again.
+	 */
+	while (owner != NULL)
+	{
+		if (!visited)
+		{
+			int	i;
+
+			for (i = 0; i < owner->nsnapshots; ++i)
+				if (callback(owner->snapshots[i], arg))
+					return true;
+
+			if (owner->firstchild != NULL)
+			{
+				owner = owner->firstchild;
+				continue;
+			}
+		}
+
+		if (owner->nextchild != NULL)
+		{
+			owner = owner->nextchild;
+			visited = false;
+		}
+		else
+		{
+			owner = owner->parent;
+			visited = true;
+		}
+	}
+
+	return false;
+}
+
+/*
  * Debugging subroutine
  */
 static void
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index d601efe..1fd73c8 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -122,14 +122,28 @@ typedef struct ActiveSnapshotElt
 /* Top of the stack of active snapshots */
 static ActiveSnapshotElt *ActiveSnapshot = NULL;
 
+/* Data for recomputing xmin */
+typedef struct SnapshotXminData
+{
+	TransactionId	new_xmin;
+	int				new_xmin_count;
+	int				snapshots_remaining;
+} SnapshotXminData;
+
+/* How many snapshots is resowner.c tracking for us? */
+static int	RegisteredSnapshots = 0;
+
 /*
- * How many snapshots is resowner.c tracking for us?
- *
- * Note: for now, a simple counter is enough.  However, if we ever want to be
- * smarter about advancing our MyPgXact->xmin we will need to be more
- * sophisticated about this, perhaps keeping our own list of snapshots.
+ * When a cursor is released, it can be advantageous to recompute our
+ * advertised xmin to avoid system-wide impacts.  However, we don't want
+ * to do this recomputation needlessly.  After the first xmin recomputation
+ * in a particular transaction, these variables will indicate the result of
+ * the previous recomputation and the number of snapshots which at that time
+ * had that xmin.  The actual number of snapshots with that xmin can be higher
+ * than the value indicated here, but cannot be lower.
  */
-static int	RegisteredSnapshots = 0;
+static TransactionId RegisteredSnapshotOldestXmin = InvalidTransactionId;
+static int	RegisteredSnapshotOldestXminCount = 0;
 
 /* first GetTransactionSnapshot call in a transaction? */
 bool		FirstSnapshotSet = false;
@@ -153,6 +167,7 @@ static List *exportedSnapshots = NIL;
 
 static Snapshot CopySnapshot(Snapshot snapshot);
 static void FreeSnapshot(Snapshot snapshot);
+static bool SnapshotResetXminWorker(Snapshot snapshot, void *data);
 static void SnapshotResetXmin(void);
 
 
@@ -675,6 +690,8 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
 
 	ResourceOwnerForgetSnapshot(owner, snapshot);
 	RegisteredSnapshots--;
+	if (TransactionIdEquals(snapshot->xmin, RegisteredSnapshotOldestXmin))
+		RegisteredSnapshotOldestXminCount--;
 	if (--snapshot->regd_count == 0 && snapshot->active_count == 0)
 	{
 		FreeSnapshot(snapshot);
@@ -688,12 +705,77 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
  * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid.
  * Note we can do this without locking because we assume that storing an Xid
  * is atomic.
+ *
+ * Even if there are some remaining snapshots, we may be able to advance our
+ * PGXACT->xmin to some degree.  This typically happens when a portal is
+ * dropped.  For efficiency, we only consider recomputing PGXACT->xmin when
+ * the active snapshot stack is empty.
  */
 static void
 SnapshotResetXmin(void)
 {
-	if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
+	SnapshotXminData	sxd;
+	ListCell	   *lc;
+
+	if (ActiveSnapshot != NULL)
+		return;
+
+	if (RegisteredSnapshots == 0)
+	{
 		MyPgXact->xmin = InvalidTransactionId;
+		RegisteredSnapshotOldestXmin = InvalidTransactionId;
+		RegisteredSnapshotOldestXminCount = 0;
+		return;
+	}
+
+	if (RegisteredSnapshotOldestXminCount > 0)
+		return;
+
+	sxd.new_xmin = InvalidTransactionId;
+	sxd.new_xmin_count = 0;
+	sxd.snapshots_remaining = RegisteredSnapshots;
+
+	if (FirstXactSnapshot != NULL)
+		SnapshotResetXminWorker(FirstXactSnapshot, &sxd);
+
+	foreach(lc, exportedSnapshots)
+	{
+		Snapshot	snapshot = lfirst(lc);
+
+		SnapshotResetXminWorker(snapshot, &sxd);
+	}
+
+	ResourceOwnerWalkAllSnapshots(SnapshotResetXminWorker, &sxd);
+
+	if (sxd.snapshots_remaining > 0)
+		elog(FATAL, "failed to traverse all snapshots");
+
+	MyPgXact->xmin = sxd.new_xmin;
+	RegisteredSnapshotOldestXmin = sxd.new_xmin;
+	RegisteredSnapshotOldestXminCount = sxd.new_xmin_count;
+}
+
+static bool
+SnapshotResetXminWorker(Snapshot snapshot, void *data)
+{
+	SnapshotXminData   *sxd = data;
+
+	if (TransactionIdEquals(snapshot->xmin, sxd->new_xmin))
+	{
+		Assert(TransactionIdIsNormal(snapshot->xmin));
+		sxd->new_xmin_count++;
+	}
+	else if (!TransactionIdIsValid(sxd->new_xmin)
+		|| TransactionIdPrecedes(snapshot->xmin, sxd->new_xmin))
+	{
+		sxd->new_xmin = snapshot->xmin;
+		sxd->new_xmin_count = 1;
+	}
+
+	if (--sxd->snapshots_remaining < 0)
+		elog(FATAL, "traversed too many snapshots");
+
+	return false;
 }
 
 /*
@@ -830,6 +912,8 @@ AtEOXact_Snapshot(bool isCommit)
 	 */
 	ActiveSnapshot = NULL;
 	RegisteredSnapshots = 0;
+	RegisteredSnapshotOldestXmin = InvalidTransactionId;
+	RegisteredSnapshotOldestXminCount = 0;
 
 	CurrentSnapshot = NULL;
 	SecondarySnapshot = NULL;
diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h
index b8fd1a9..9944144 100644
--- a/src/include/utils/resowner_private.h
+++ b/src/include/utils/resowner_private.h
@@ -73,6 +73,9 @@ extern void ResourceOwnerRememberSnapshot(ResourceOwner owner,
 							  Snapshot snapshot);
 extern void ResourceOwnerForgetSnapshot(ResourceOwner owner,
 							Snapshot snapshot);
+typedef bool (*ResourceWalkSnapshotCallback) (Snapshot snapshot, void *arg);
+extern bool ResourceOwnerWalkAllSnapshots(ResourceWalkSnapshotCallback callback,
+							  void *arg);
 
 /* support for temporary file management */
 extern void ResourceOwnerEnlargeFiles(ResourceOwner owner);
#17 Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Robert Haas (#16)
Re: advance local xmin more aggressively

On 12/10/2014 06:56 PM, Robert Haas wrote:

On Wed, Dec 10, 2014 at 9:49 AM, Robert Haas <robertmhaas@gmail.com> wrote:

I guess this bears some further thought. I certainly don't like the
fact that this makes the whole system crap out at a lower number of
subtransactions than presently. The actual performance numbers don't
bother me very much; I'm comfortable with the possibility that closing
a cursor will be some modest percentage slower if you've got thousands
of active savepoints.

Here's a new version with two changes:

1. I changed the traversal of the resource owner tree to iterate
instead of recurse. It now does a depth-first, pre-order traversal of
the tree; when we reach the last child of a node, we follow its parent
pointer to get back to where we were. That way, we don't need to keep
anything on the stack. That fixed the crash at 100k cursors, but it
was still 4x slower.

Clever. Could we use that method in ResourceOwnerReleaseInternal and
ResourceOwnerDelete, too? Might be best to have a
ResourceOwnerWalk(resowner, callback) function for walking all resource
owners in a tree, instead of one for walking the snapshots in them.

2. Instead of traversing the tree until we find an xmin equal to the
one we're currently advertising, the code now traverses the entire
tree each time it runs. However, it also keeps a record of how many
times the oldest xmin occurred in the tree, which is decremented each
time we unregister a snapshot with that xmin; the traversal doesn't
run again until that count reaches 0. That fixed the performance
regression on your test case.

With a million subtransactions:

master 34.464s 33.742s 34.317s
advance-xmin 34.516s 34.069s 34.196s
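
The bookkeeping described in point 2 boils down to caching the current minimum together with its multiplicity, and re-scanning only when the last holder of that minimum goes away. A toy illustration of the idea, with made-up names and plain ints standing in for snapshot xmins (this is not the patch code):

#include <limits.h>

/*
 * Toy model: vals[] stands in for the xmins of the registered snapshots;
 * cached_min/cached_min_count play the role of the advertised xmin and the
 * count kept by the patch.  All names here are invented.
 */
static int  vals[100];
static int  nvals = 0;
static int  cached_min = INT_MAX;
static int  cached_min_count = 0;

/* Full scan: recompute the minimum and how many entries share it. */
static void
rescan_min(void)
{
    int     i;

    cached_min = INT_MAX;
    cached_min_count = 0;
    for (i = 0; i < nvals; i++)
    {
        if (vals[i] < cached_min)
        {
            cached_min = vals[i];
            cached_min_count = 1;
        }
        else if (vals[i] == cached_min)
            cached_min_count++;
    }
}

/*
 * Unregister entry 'idx'.  Only when the last holder of the minimum
 * disappears can the minimum advance, so only then do we pay for a re-scan.
 */
static void
forget_val(int idx)
{
    int     gone = vals[idx];

    vals[idx] = vals[--nvals];
    if (gone == cached_min && --cached_min_count == 0)
        rescan_min();
}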

Well, you can still get the slowness back by running other stuff in the
background. I admit that it's a very obscure case, probably fine in
practice. I would still feel better if snapmgr.c did its own
bookkeeping, from an aesthetic point of view. In a heap, or even just a
linked list, if the performance characteristics of that are acceptable.

It occurs to me that the pairing heap I just posted in another thread
(/messages/by-id/54886BB8.9040000@vmware.com) would
be a good fit for this. It's extremely cheap to insert to and to find
the minimum item (O(1)), and the delete operation is O(log N),
amortized. I didn't implement a delete operation, for removing a
particular node, I only did delete-min, but it's basically the same
code. Using a pairing heap for this might be overkill, but if we have
that implementation anyway, the code in snapmgr.c to use it would be
very simple, so I see little reason not to. It might even be simpler
than your patch, because you wouldn't need to have the heuristics on
whether to attempt updating the xmin; it would be cheap enough to always
try it.
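
Concretely, the snapmgr.c side of that idea reduces to a comparator that sorts older xmins toward the heap root, plus an O(1) peek at that root when resetting. The patch that follows in message #19 does essentially the following (condensed here for orientation):

/*
 * Condensed from the patch attached below: registered snapshots sit in a
 * pairing heap keyed on xmin, with the comparison reversed so that the heap
 * root is the snapshot with the *smallest* xmin.
 */
static int
xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg)
{
    Snapshot    asnap = pairingheap_container(SnapshotData, ph_node, (pairingheap_node *) a);
    Snapshot    bsnap = pairingheap_container(SnapshotData, ph_node, (pairingheap_node *) b);

    if (TransactionIdPrecedes(asnap->xmin, bsnap->xmin))
        return 1;               /* older xmin sorts toward the root */
    else if (TransactionIdFollows(asnap->xmin, bsnap->xmin))
        return -1;
    else
        return 0;
}

static void
SnapshotResetXmin(void)
{
    Snapshot    minSnapshot;

    if (ActiveSnapshot != NULL)
        return;

    if (pairingheap_is_empty(&RegisteredSnapshots))
    {
        MyPgXact->xmin = InvalidTransactionId;
        return;
    }

    /* O(1): the heap root is the oldest remaining registered snapshot */
    minSnapshot = pairingheap_container(SnapshotData, ph_node,
                                        pairingheap_first(&RegisteredSnapshots));

    if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin))
        MyPgXact->xmin = minSnapshot->xmin;
}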

- Heikki


#18Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#17)
Re: advance local xmin more aggressively

On Wed, Dec 10, 2014 at 12:57 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Clever. Could we use that method in ResourceOwnerReleaseInternal and
ResourceOwnerDelete, too? Might be best to have a
ResourceOwnerWalk(resowner, callback) function for walking all resource
owners in a tree, instead of one for walking the snapshots in them.

Sure. It would be a little more complicated there since you want to
stop when you get back to the starting point, but not too bad. But is
that solving any actual problem?

2. Instead of traversing the tree until we find an xmin equal to the
one we're currently advertising, the code now traverses the entire
tree each time it runs. However, it also keeps a record of how many
times the oldest xmin occurred in the tree, which is decremented each
time we unregister a snapshot with that xmin; the traversal doesn't
run again until that count reaches 0. That fixed the performance
regression on your test case.

With a million subtransactions:

master 34.464s 33.742s 34.317s
advance-xmin 34.516s 34.069s 34.196s

Well, you can still get the slowness back by running other stuff in the
background. I admit that it's a very obscure case, probably fine in
practice. I would still feel better if snapmgr.c did its own bookkeeping,
from an aesthetic point of view. In a heap, or even just a linked list, if
the performance characteristics of that are acceptable.

It occurs to me that the pairing heap I just posted in another thread
(/messages/by-id/54886BB8.9040000@vmware.com) would be
a good fit for this. It's extremely cheap to insert to and to find the
minimum item (O(1)), and the delete operation is O(log N), amortized. I
didn't implement a delete operation, for removing a particular node, I only
did delete-min, but it's basically the same code. Using a pairing heap for
this might be overkill, but if we have that implementation anyway, the code
in snapmgr.c to use it would be very simple, so I see little reason not to.
It might even be simpler than your patch, because you wouldn't need to have
the heuristics on whether to attempt updating the xmin; it would be cheap
enough to always try it.

Care to code it up?

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#19Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Robert Haas (#18)
1 attachment(s)
Re: advance local xmin more aggressively

On 12/10/2014 08:35 PM, Robert Haas wrote:

On Wed, Dec 10, 2014 at 12:57 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Clever. Could we use that method in ResourceOwnerReleaseInternal and
ResourceOwnerDelete, too? Might be best to have a
ResourceOwnerWalk(resowner, callback) function for walking all resource
owners in a tree, instead of one for walking the snapshots in them.

Sure. It would be a little more complicated there since you want to
stop when you get back to the starting point, but not too bad. But is
that solving any actual problem?

I thought that a transaction commit or abort in some special
circumstances might call ResourceOwnerReleaseInternal on the top level,
but I can't make it happen. The machinery in xact.c is too clever, and
always releases the resource owners from the bottom up. And I can't find
a way to create a deep resource owner tree in any other way. So I guess
it's fine as it is.

MemoryContextCheck and MemoryContextPrint also recurse, however.
MemoryContextCheck is only enabled with --enable-cassert, but
MemoryContextPrint is called when you run out of memory. That could turn
a plain "out of memory" error into a stack overrun, triggering a server
crash and restart.

It occurs to me that the pairing heap I just posted in another thread
(/messages/by-id/54886BB8.9040000@vmware.com) would be
a good fit for this. It's extremely cheap to insert to and to find the
minimum item (O(1)), and the delete operation is O(log N), amortized. I
didn't implement a delete operation, for removing a particular node, I only
did delete-min, but it's basically the same code. Using a pairing heap for
this might be overkill, but if we have that implementation anyway, the code
in snapmgr.c to use it would be very simple, so I see little reason not to.
It might even be simpler than your patch, because you wouldn't need to have
the heuristics on whether to attempt updating the xmin; it would be cheap
enough to always try it.

Care to code it up?

Here you are.

- Heikki

Attachments:

advance-xmin-more-with-heap.patch (text/x-diff)
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
index 327a1bc..b24ece6 100644
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/lib
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = ilist.o binaryheap.o stringinfo.o
+OBJS = ilist.o binaryheap.o pairingheap.o stringinfo.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/pairingheap.c b/src/backend/lib/pairingheap.c
new file mode 100644
index 0000000..06cd117
--- /dev/null
+++ b/src/backend/lib/pairingheap.c
@@ -0,0 +1,237 @@
+/*-------------------------------------------------------------------------
+ *
+ * pairingheap.c
+ *	  A Pairing Heap implementation
+ *
+ * Portions Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/lib/pairingheap.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "lib/pairingheap.h"
+
+static pairingheap_node * merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b)
+static pairingheap_node *merge_children(pairingheap *heap, pairingheap_node *children);
+
+/*
+ * pairingheap_allocate
+ *
+ * Returns a pointer to a newly-allocated heap, with the heap property
+ * defined by the given comparator function, which will be invoked with the
+ * additional argument specified by 'arg'.
+ */
+pairingheap *
+pairingheap_allocate(pairingheap_comparator compare, void *arg)
+{
+	pairingheap *heap;
+
+	heap = (pairingheap *) palloc(sizeof(pairingheap));
+	heap->ph_compare = compare;
+	heap->ph_arg = arg;
+
+	heap->ph_root = NULL;
+
+	return heap;
+}
+
+/*
+ * pairingheap_free
+ *
+ * Releases memory used by the given pairingheap.
+ *
+ * Note: The items in the heap are not released!
+ */
+void
+pairingheap_free(pairingheap *heap)
+{
+	pfree(heap);
+}
+
+
+/* A helper function to merge two subheaps into one. */
+static pairingheap_node *
+merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b)
+{
+	if (a == NULL)
+		return b;
+	if (b == NULL)
+		return a;
+
+	/* Put the larger of the items as a child of the smaller one */
+	if (heap->ph_compare(a, b, heap->ph_arg) < 0)
+	{
+		pairingheap_node *tmp;
+
+		tmp = a;
+		a = b;
+		b = tmp;
+	}
+
+	if (a->first_child)
+		a->first_child->prev_or_parent = b;
+	b->prev_or_parent = a;
+	b->next_sibling = a->first_child;
+	a->first_child = b;
+	return a;
+}
+
+/*
+ * pairingheap_add
+ *
+ * Adds the given datum to the heap in O(1) time.
+ */
+void
+pairingheap_add(pairingheap *heap, pairingheap_node *d)
+{
+	d->first_child = NULL;
+
+	/* Link the new item as a new tree */
+	heap->ph_root = merge(heap, heap->ph_root, d);
+}
+
+/*
+ * pairingheap_first
+ *
+ * Returns a pointer to the first (root, topmost) node in the heap
+ * without modifying the heap. The caller must ensure that this
+ * routine is not used on an empty heap. Always O(1).
+ */
+pairingheap_node *
+pairingheap_first(pairingheap *heap)
+{
+	Assert(!pairingheap_empty(heap));
+	return heap->ph_root;
+}
+
+/*
+ * pairingheap_remove_first
+ *
+ * Removes the first (root, topmost) node in the heap and returns a
+ * pointer to it after rebalancing the heap. The caller must ensure
+ * that this routine is not used on an empty heap. O(log n) amortized.
+ */
+pairingheap_node *
+pairingheap_remove_first(pairingheap *heap)
+{
+	pairingheap_node *result;
+	pairingheap_node *children;
+
+	Assert(!pairingheap_empty(heap));
+
+	/* Remove the smallest root. */
+	result = heap->ph_root;
+	children = result->first_child;
+
+	heap->ph_root = merge_children(heap, children);
+
+	return result;
+}
+
+/*
+ * Merge a list of subheaps into a single heap.
+ *
+ * This implements the basic two-pass merging strategy, first forming
+ * pairs from left to right, and then merging the pairs.
+ */
+static pairingheap_node *
+merge_children(pairingheap *heap, pairingheap_node *children)
+{
+	pairingheap_node *item, *next;
+	pairingheap_node *pairs;
+	pairingheap_node *newroot;
+
+	if (children == NULL || children->next_sibling == NULL)
+		return children;
+
+	/* Walk the remaining subheaps from left to right, merging in pairs */
+	next = children;
+	pairs = NULL;
+	for (;;)
+	{
+		item = next;
+		if (item == NULL)
+			break;
+		if (item->next_sibling == NULL)
+		{
+			/* last odd item at the end of list */
+			item->next_sibling = pairs;
+			pairs = item;
+			break;
+		}
+		else
+		{
+			next = item->next_sibling->next_sibling;
+
+			item = merge(heap, item, item->next_sibling);
+			item->next_sibling = pairs;
+			pairs = item;
+		}
+	}
+
+	/*
+	 * Form a single (sub)heap from the pairs.
+	 */
+	newroot = pairs;
+	next = pairs->next_sibling;
+	while (next)
+	{
+		item = next;
+		next = item->next_sibling;
+
+		newroot = merge(heap, newroot, item);
+	}
+
+	return newroot;
+}
+
+/*
+ * Remove 'item' from the heap. O(log n) amortized.
+ */
+void
+pairingheap_remove(pairingheap *heap, pairingheap_node *item)
+{
+	pairingheap_node *children;
+	pairingheap_node *replacement;
+	pairingheap_node *next_sibling;
+	pairingheap_node **prev_ptr;
+
+	if (item == heap->ph_root)
+	{
+		(void) pairingheap_remove_first(heap);
+		return;
+	}
+
+	children = item->first_child;
+	next_sibling = item->next_sibling;
+
+	if (item->prev_or_parent->first_child == item)
+		prev_ptr = &item->prev_or_parent->first_child;
+	else
+		prev_ptr = &item->prev_or_parent->next_sibling;
+	Assert(*prev_ptr == item);
+
+	/* Form a new heap of the children */
+	replacement = merge_children(heap, children);
+
+	if (replacement == NULL)
+	{
+		*prev_ptr = next_sibling;
+		if (next_sibling)
+			next_sibling->prev_or_parent = item->prev_or_parent;
+	}
+	else
+	{
+		replacement->prev_or_parent = item->prev_or_parent;
+		replacement->next_sibling = item->next_sibling;
+		*prev_ptr = replacement;
+		if (next_sibling)
+			next_sibling->prev_or_parent = replacement;
+	}
+}
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index d601efe..ebe9013 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -46,6 +46,7 @@
 
 #include "access/transam.h"
 #include "access/xact.h"
+#include "lib/pairingheap.h"
 #include "miscadmin.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -58,6 +59,12 @@
 #include "utils/syscache.h"
 #include "utils/tqual.h"
 
+/* Prototypes for local functions */
+static Snapshot CopySnapshot(Snapshot snapshot);
+static void FreeSnapshot(Snapshot snapshot);
+static void SnapshotResetXmin(void);
+static int xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg);
+
 
 /*
  * CurrentSnapshot points to the only snapshot taken in transaction-snapshot
@@ -122,14 +129,8 @@ typedef struct ActiveSnapshotElt
 /* Top of the stack of active snapshots */
 static ActiveSnapshotElt *ActiveSnapshot = NULL;
 
-/*
- * How many snapshots is resowner.c tracking for us?
- *
- * Note: for now, a simple counter is enough.  However, if we ever want to be
- * smarter about advancing our MyPgXact->xmin we will need to be more
- * sophisticated about this, perhaps keeping our own list of snapshots.
- */
-static int	RegisteredSnapshots = 0;
+/* Snapshots registered with resowners. Ordered in a heap by xmin. */
+static pairingheap RegisteredSnapshots = { &xmin_cmp, NULL, NULL };
 
 /* first GetTransactionSnapshot call in a transaction? */
 bool		FirstSnapshotSet = false;
@@ -151,11 +152,6 @@ static Snapshot FirstXactSnapshot = NULL;
 static List *exportedSnapshots = NIL;
 
 
-static Snapshot CopySnapshot(Snapshot snapshot);
-static void FreeSnapshot(Snapshot snapshot);
-static void SnapshotResetXmin(void);
-
-
 /*
  * GetTransactionSnapshot
  *		Get the appropriate snapshot for a new query in a transaction.
@@ -183,7 +179,7 @@ GetTransactionSnapshot(void)
 	/* First call in transaction? */
 	if (!FirstSnapshotSet)
 	{
-		Assert(RegisteredSnapshots == 0);
+		Assert(pairingheap_is_empty(&RegisteredSnapshots));
 		Assert(FirstXactSnapshot == NULL);
 
 		/*
@@ -205,7 +201,7 @@ GetTransactionSnapshot(void)
 			FirstXactSnapshot = CurrentSnapshot;
 			/* Mark it as "registered" in FirstXactSnapshot */
 			FirstXactSnapshot->regd_count++;
-			RegisteredSnapshots++;
+			pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
 		}
 		else
 			CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
@@ -350,7 +346,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
 	/* Caller should have checked this already */
 	Assert(!FirstSnapshotSet);
 
-	Assert(RegisteredSnapshots == 0);
+	Assert(pairingheap_is_empty(RegisteredSnapshots));
 	Assert(FirstXactSnapshot == NULL);
 	Assert(!HistoricSnapshotActive());
 
@@ -413,7 +409,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
 		FirstXactSnapshot = CurrentSnapshot;
 		/* Mark it as "registered" in FirstXactSnapshot */
 		FirstXactSnapshot->regd_count++;
-		RegisteredSnapshots++;
+		pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
 	}
 
 	FirstSnapshotSet = true;
@@ -639,7 +635,8 @@ RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner)
 	snap->regd_count++;
 	ResourceOwnerRememberSnapshot(owner, snap);
 
-	RegisteredSnapshots++;
+	if (snap->regd_count == 1)
+		pairingheap_add(&RegisteredSnapshots, &snap->ph_node);
 
 	return snap;
 }
@@ -671,11 +668,16 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
 		return;
 
 	Assert(snapshot->regd_count > 0);
-	Assert(RegisteredSnapshots > 0);
+	Assert(!pairingheap_is_empty(&RegisteredSnapshots));
 
 	ResourceOwnerForgetSnapshot(owner, snapshot);
-	RegisteredSnapshots--;
-	if (--snapshot->regd_count == 0 && snapshot->active_count == 0)
+
+	snapshot->regd_count--;
+
+	if (snapshot->regd_count == 0)
+		pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node);
+
+	if (snapshot->regd_count == 0 && snapshot->active_count == 0)
 	{
 		FreeSnapshot(snapshot);
 		SnapshotResetXmin();
@@ -683,17 +685,54 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
 }
 
 /*
+ * Comparison function for RegisteredSnapshots heap. Snapshots are ordered
+ * by xmin, so that the snapshot with smallest xmin is at the top.
+ */
+static int
+xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg)
+{
+	Snapshot asnap = pairingheap_container(SnapshotData, ph_node, (pairingheap_node *) a);
+	Snapshot bsnap = pairingheap_container(SnapshotData, ph_node, (pairingheap_node *) b);
+
+	if (TransactionIdPrecedes(asnap->xmin, bsnap->xmin))
+		return 1;
+	else if (TransactionIdFollows(asnap->xmin, bsnap->xmin))
+		return -1;
+	else
+		return 0;
+}
+
+/*
  * SnapshotResetXmin
  *
  * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid.
  * Note we can do this without locking because we assume that storing an Xid
  * is atomic.
+ *
+ * Even if there are some remaining snapshots, we may be able to advance our
+ * PGXACT->xmin to some degree.  This typically happens when a portal is
+ * dropped.  For efficiency, we only consider recomputing PGXACT->xmin when
+ * the active snapshot stack is empty.
  */
 static void
 SnapshotResetXmin(void)
 {
-	if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
+	Snapshot minSnapshot;
+
+	if (ActiveSnapshot != NULL)
+		return;
+
+	if (pairingheap_is_empty(&RegisteredSnapshots))
+	{
 		MyPgXact->xmin = InvalidTransactionId;
+		return;
+	}
+
+	minSnapshot = pairingheap_container(SnapshotData, ph_node,
+									pairingheap_first(&RegisteredSnapshots));
+
+	if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin))
+		MyPgXact->xmin = minSnapshot->xmin;
 }
 
 /*
@@ -770,7 +809,7 @@ AtEOXact_Snapshot(bool isCommit)
 	{
 		Assert(FirstXactSnapshot->regd_count > 0);
 		Assert(RegisteredSnapshots > 0);
-		RegisteredSnapshots--;
+		pairingheap_remove(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
 	}
 	FirstXactSnapshot = NULL;
 
@@ -782,6 +821,7 @@ AtEOXact_Snapshot(bool isCommit)
 		TransactionId myxid = GetTopTransactionId();
 		int			i;
 		char		buf[MAXPGPATH];
+		ListCell   *lc;
 
 		/*
 		 * Get rid of the files.  Unlink failure is only a WARNING because (1)
@@ -798,14 +838,13 @@ AtEOXact_Snapshot(bool isCommit)
 		/*
 		 * As with the FirstXactSnapshot, we needn't spend any effort on
 		 * cleaning up the per-snapshot data structures, but we do need to
-		 * adjust the RegisteredSnapshots count to prevent a warning below.
-		 *
-		 * Note: you might be thinking "why do we have the exportedSnapshots
-		 * list at all?  All we need is a counter!".  You're right, but we do
-		 * it this way in case we ever feel like improving xmin management.
+		 * unlink them from RegisteredSnapshots to prevent a warning below.
 		 */
-		Assert(RegisteredSnapshots >= list_length(exportedSnapshots));
-		RegisteredSnapshots -= list_length(exportedSnapshots);
+		foreach(lc, exportedSnapshots)
+		{
+			Snapshot snap = (Snapshot) lfirst(lc);
+			pairingheap_remove(&RegisteredSnapshots, &snap->ph_node);
+		}
 
 		exportedSnapshots = NIL;
 	}
@@ -815,9 +854,8 @@ AtEOXact_Snapshot(bool isCommit)
 	{
 		ActiveSnapshotElt *active;
 
-		if (RegisteredSnapshots != 0)
-			elog(WARNING, "%d registered snapshots seem to remain after cleanup",
-				 RegisteredSnapshots);
+		if (!pairingheap_is_empty(&RegisteredSnapshots))
+			elog(WARNING, "registered snapshots seem to remain after cleanup");
 
 		/* complain about unpopped active snapshots */
 		for (active = ActiveSnapshot; active != NULL; active = active->as_next)
@@ -829,7 +867,7 @@ AtEOXact_Snapshot(bool isCommit)
 	 * it'll go away with TopTransactionContext.
 	 */
 	ActiveSnapshot = NULL;
-	RegisteredSnapshots = 0;
+	pairingheap_reset(&RegisteredSnapshots);
 
 	CurrentSnapshot = NULL;
 	SecondarySnapshot = NULL;
@@ -900,8 +938,7 @@ ExportSnapshot(Snapshot snapshot)
 	 * Copy the snapshot into TopTransactionContext, add it to the
 	 * exportedSnapshots list, and mark it pseudo-registered.  We do this to
 	 * ensure that the snapshot's xmin is honored for the rest of the
-	 * transaction.  (Right now, because SnapshotResetXmin is so stupid, this
-	 * is overkill; but later we might make that routine smarter.)
+	 * transaction.
 	 */
 	snapshot = CopySnapshot(snapshot);
 
@@ -910,7 +947,7 @@ ExportSnapshot(Snapshot snapshot)
 	MemoryContextSwitchTo(oldcxt);
 
 	snapshot->regd_count++;
-	RegisteredSnapshots++;
+	pairingheap_add(&RegisteredSnapshots, &snapshot->ph_node);
 
 	/*
 	 * Fill buf with a text serialization of the snapshot, plus identification
@@ -1303,7 +1340,8 @@ DeleteAllExportedSnapshotFiles(void)
 bool
 ThereAreNoPriorRegisteredSnapshots(void)
 {
-	if (RegisteredSnapshots <= 1)
+	if (pairingheap_is_empty(&RegisteredSnapshots) ||
+		pairingheap_is_singular(&RegisteredSnapshots))
 		return true;
 
 	return false;
diff --git a/src/include/lib/pairingheap.h b/src/include/lib/pairingheap.h
new file mode 100644
index 0000000..e78196d
--- /dev/null
+++ b/src/include/lib/pairingheap.h
@@ -0,0 +1,67 @@
+/*
+ * pairingheap.h
+ *
+ * A Pairing Heap implementation
+ *
+ * Portions Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * src/include/lib/pairingheap.h
+ */
+
+#ifndef PAIRINGHEAP_H
+#define PAIRINGHEAP_H
+
+/*
+ * This represents an element stored in the heap. Embed this in a larger
+ * struct containing the actual data you're storing.
+ */
+typedef struct pairingheap_node
+{
+	struct pairingheap_node *first_child;
+	struct pairingheap_node *next_sibling;
+	struct pairingheap_node *prev_or_parent;
+} pairingheap_node;
+
+/*
+ * Return the containing struct of 'type' where 'membername' is the
+ * pairingheap_node pointed at by 'ptr'.
+ *
+ * This is used to convert a pairingheap_node * back to its containing struct.
+ */
+#define pairingheap_container(type, membername, ptr)								\
+	(AssertVariableIsOfTypeMacro(ptr, pairingheap_node *),					\
+	 AssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node),	\
+	 ((type *) ((char *) (ptr) - offsetof(type, membername))))
+
+/*
+ * For a max-heap, the comparator must return <0 iff a < b, 0 iff a == b,
+ * and >0 iff a > b.  For a min-heap, the conditions are reversed.
+ */
+typedef int (*pairingheap_comparator) (const pairingheap_node *a, const pairingheap_node *b, void *arg);
+
+/*
+ * A pairing heap.
+ */
+typedef struct pairingheap
+{
+	pairingheap_comparator ph_compare;	/* comparison function */
+	void	   *ph_arg;					/* opaque argument to ph_compare */
+	pairingheap_node *ph_root;			/* current root of the heap */
+} pairingheap;
+
+extern pairingheap *pairingheap_allocate(pairingheap_comparator compare,
+					void *arg);
+extern void pairingheap_free(pairingheap *heap);
+extern void pairingheap_add(pairingheap *heap, pairingheap_node *d);
+extern pairingheap_node *pairingheap_first(pairingheap *heap);
+extern pairingheap_node *pairingheap_remove_first(pairingheap *heap);
+extern void pairingheap_remove(pairingheap *heap, pairingheap_node *d);
+
+#define pairingheap_reset(h)			((h)->ph_root = NULL)
+
+#define pairingheap_is_empty(h)			((h)->ph_root == NULL)
+
+/* Returns true if the heap contains exactly one item */
+#define pairingheap_is_singular(h)		((h)->ph_root && (h)->ph_root->first_child == NULL)
+
+#endif   /* PAIRINGHEAP_H */
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 53e474f..8b78a3a 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -14,6 +14,7 @@
 #define SNAPSHOT_H
 
 #include "access/htup.h"
+#include "lib/pairingheap.h"
 #include "storage/buf.h"
 
 
@@ -91,7 +92,9 @@ typedef struct SnapshotData
 	 */
 	CommandId	curcid;			/* in my xact, CID < curcid are visible */
 	uint32		active_count;	/* refcount on ActiveSnapshot stack */
-	uint32		regd_count;		/* refcount on RegisteredSnapshotList */
+	uint32		regd_count;		/* refcount on RegisteredSnapshots */
+
+	pairingheap_node ph_node;	/* link in the RegisteredSnapshots heap */
 } SnapshotData;
 
 /*
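
As a usage note on the pairingheap API added by the patch above: callers embed a pairingheap_node in their own struct, supply a comparator, and map heap nodes back to the containing struct with pairingheap_container. A small, hypothetical example using only the functions declared in the patch (the task struct and its priorities are invented for this sketch):

#include "postgres.h"

#include "lib/pairingheap.h"

/* A hypothetical item type; the embedded pairingheap_node is the heap link. */
typedef struct task
{
    int         priority;       /* larger value = higher priority */
    pairingheap_node ph_node;
} task;

/* Max-heap on priority: return <0 iff a < b, per the comparator contract. */
static int
task_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg)
{
    task       *ta = pairingheap_container(task, ph_node, (pairingheap_node *) a);
    task       *tb = pairingheap_container(task, ph_node, (pairingheap_node *) b);

    if (ta->priority < tb->priority)
        return -1;
    if (ta->priority > tb->priority)
        return 1;
    return 0;
}

void
pairingheap_demo(void)
{
    pairingheap *heap = pairingheap_allocate(task_cmp, NULL);
    task       *a = palloc(sizeof(task));
    task       *b = palloc(sizeof(task));
    task       *top;

    a->priority = 1;
    b->priority = 2;

    pairingheap_add(heap, &a->ph_node);     /* O(1) */
    pairingheap_add(heap, &b->ph_node);     /* O(1) */

    /* O(1): the root is the highest-priority item, 'b' here */
    top = pairingheap_container(task, ph_node, pairingheap_first(heap));
    elog(LOG, "top priority is %d", top->priority);

    /* Remove an arbitrary member (O(log n) amortized), then clean up. */
    pairingheap_remove(heap, &a->ph_node);

    pairingheap_free(heap);     /* frees the heap itself, not the items */
    pfree(a);
    pfree(b);
}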
#20Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#19)
Re: advance local xmin more aggressively

On Wed, Dec 10, 2014 at 3:28 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Care to code it up?

Here you are.

That was quick.

You need to add a semicolon to the end of line 20 in pairingheap.c.

I wonder what the worst case for this is. I tried it on your
destruction test case from before and it doesn't seem to cost anything
material:

your patch: 0m38.714s 0m34.424s 0m35.191s
master: 0m35.260s 0m34.643s 0m34.945s
my patch: 0m34.728s 0m34.619s 0m35.015s

The first measurement with your patch is somewhat of an outlier, but
that was also the first measurement I took, which might have thrown
off the results. Basically, either your patch or my patch looks free
from here. I'm kind of suspicious about that: it doesn't seem like
the additional linked-list manipulation you're doing in
RegisterSnapshotOnOwner ought to cost something, but maybe that
function just isn't called enough for it to matter, at least on this
test case.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#21Jeff Janes
jeff.janes@gmail.com
In reply to: Robert Haas (#20)
Re: advance local xmin more aggressively

On Wed, Dec 10, 2014 at 3:46 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Wed, Dec 10, 2014 at 3:28 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Care to code it up?

Here you are.

That was quick.

You need to add a semicolon to the end of line 20 in pairingheap.c.

In addition to the semicolon, it doesn't build under cassert. There are
some pairingheap_empty that need to be pairingheap_is_empty, and snapmgr.c
needs an address of operator near line 355 and something is wrong
in snapmgr.c near line 811.

Cheers,

Jeff

#22Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Jeff Janes (#21)
1 attachment(s)
Re: advance local xmin more aggressively

On 12/16/2014 10:41 PM, Jeff Janes wrote:

On Wed, Dec 10, 2014 at 3:46 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Wed, Dec 10, 2014 at 3:28 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Care to code it up?

Here you are.

That was quick.

You need to add a semicolon to the end of line 20 in pairingheap.c.

In addition to the semicolon, it doesn't build under cassert. There are
some pairingheap_empty that need to be pairingheap_is_empty, and snapmgr.c
needs an address of operator near line 355 and something is wrong
in snapmgr.c near line 811.

Here's an updated version, rebased over the pairing heap code that I
just committed, and fixing those bugs.

- Heikki

Attachments:

advance-xmin-more-with-heap-2.patch (text/x-diff)
commit 4f37313a5b173c2952aebc91c41c744dcc3cf2df
Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date:   Mon Dec 22 12:22:39 2014 +0200

    Use pairing heap to keep registered snapshots in xmin-order.
    
    This allows us to advance the xmin in PGPROC more aggressively.

diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index d601efe..08d6d3d 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -46,6 +46,7 @@
 
 #include "access/transam.h"
 #include "access/xact.h"
+#include "lib/pairingheap.h"
 #include "miscadmin.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -58,6 +59,12 @@
 #include "utils/syscache.h"
 #include "utils/tqual.h"
 
+/* Prototypes for local functions */
+static Snapshot CopySnapshot(Snapshot snapshot);
+static void FreeSnapshot(Snapshot snapshot);
+static void SnapshotResetXmin(void);
+static int xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg);
+
 
 /*
  * CurrentSnapshot points to the only snapshot taken in transaction-snapshot
@@ -122,14 +129,8 @@ typedef struct ActiveSnapshotElt
 /* Top of the stack of active snapshots */
 static ActiveSnapshotElt *ActiveSnapshot = NULL;
 
-/*
- * How many snapshots is resowner.c tracking for us?
- *
- * Note: for now, a simple counter is enough.  However, if we ever want to be
- * smarter about advancing our MyPgXact->xmin we will need to be more
- * sophisticated about this, perhaps keeping our own list of snapshots.
- */
-static int	RegisteredSnapshots = 0;
+/* Snapshots registered with resowners. Ordered in a heap by xmin. */
+static pairingheap RegisteredSnapshots = { &xmin_cmp, NULL, NULL };
 
 /* first GetTransactionSnapshot call in a transaction? */
 bool		FirstSnapshotSet = false;
@@ -151,11 +152,6 @@ static Snapshot FirstXactSnapshot = NULL;
 static List *exportedSnapshots = NIL;
 
 
-static Snapshot CopySnapshot(Snapshot snapshot);
-static void FreeSnapshot(Snapshot snapshot);
-static void SnapshotResetXmin(void);
-
-
 /*
  * GetTransactionSnapshot
  *		Get the appropriate snapshot for a new query in a transaction.
@@ -183,7 +179,7 @@ GetTransactionSnapshot(void)
 	/* First call in transaction? */
 	if (!FirstSnapshotSet)
 	{
-		Assert(RegisteredSnapshots == 0);
+		Assert(pairingheap_is_empty(&RegisteredSnapshots));
 		Assert(FirstXactSnapshot == NULL);
 
 		/*
@@ -205,7 +201,7 @@ GetTransactionSnapshot(void)
 			FirstXactSnapshot = CurrentSnapshot;
 			/* Mark it as "registered" in FirstXactSnapshot */
 			FirstXactSnapshot->regd_count++;
-			RegisteredSnapshots++;
+			pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
 		}
 		else
 			CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
@@ -350,7 +346,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
 	/* Caller should have checked this already */
 	Assert(!FirstSnapshotSet);
 
-	Assert(RegisteredSnapshots == 0);
+	Assert(pairingheap_is_empty(&RegisteredSnapshots));
 	Assert(FirstXactSnapshot == NULL);
 	Assert(!HistoricSnapshotActive());
 
@@ -413,7 +409,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
 		FirstXactSnapshot = CurrentSnapshot;
 		/* Mark it as "registered" in FirstXactSnapshot */
 		FirstXactSnapshot->regd_count++;
-		RegisteredSnapshots++;
+		pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
 	}
 
 	FirstSnapshotSet = true;
@@ -639,7 +635,8 @@ RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner)
 	snap->regd_count++;
 	ResourceOwnerRememberSnapshot(owner, snap);
 
-	RegisteredSnapshots++;
+	if (snap->regd_count == 1)
+		pairingheap_add(&RegisteredSnapshots, &snap->ph_node);
 
 	return snap;
 }
@@ -671,11 +668,16 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
 		return;
 
 	Assert(snapshot->regd_count > 0);
-	Assert(RegisteredSnapshots > 0);
+	Assert(!pairingheap_is_empty(&RegisteredSnapshots));
 
 	ResourceOwnerForgetSnapshot(owner, snapshot);
-	RegisteredSnapshots--;
-	if (--snapshot->regd_count == 0 && snapshot->active_count == 0)
+
+	snapshot->regd_count--;
+
+	if (snapshot->regd_count == 0)
+		pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node);
+
+	if (snapshot->regd_count == 0 && snapshot->active_count == 0)
 	{
 		FreeSnapshot(snapshot);
 		SnapshotResetXmin();
@@ -683,17 +685,54 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
 }
 
 /*
+ * Comparison function for RegisteredSnapshots heap. Snapshots are ordered
+ * by xmin, so that the snapshot with smallest xmin is at the top.
+ */
+static int
+xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg)
+{
+	const SnapshotData *asnap = pairingheap_const_container(SnapshotData, ph_node, a);
+	const SnapshotData *bsnap = pairingheap_const_container(SnapshotData, ph_node, b);
+
+	if (TransactionIdPrecedes(asnap->xmin, bsnap->xmin))
+		return 1;
+	else if (TransactionIdFollows(asnap->xmin, bsnap->xmin))
+		return -1;
+	else
+		return 0;
+}
+
+/*
  * SnapshotResetXmin
  *
  * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid.
  * Note we can do this without locking because we assume that storing an Xid
  * is atomic.
+ *
+ * Even if there are some remaining snapshots, we may be able to advance our
+ * PGXACT->xmin to some degree.  This typically happens when a portal is
+ * dropped.  For efficiency, we only consider recomputing PGXACT->xmin when
+ * the active snapshot stack is empty.
  */
 static void
 SnapshotResetXmin(void)
 {
-	if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
+	Snapshot minSnapshot;
+
+	if (ActiveSnapshot != NULL)
+		return;
+
+	if (pairingheap_is_empty(&RegisteredSnapshots))
+	{
 		MyPgXact->xmin = InvalidTransactionId;
+		return;
+	}
+
+	minSnapshot = pairingheap_container(SnapshotData, ph_node,
+									pairingheap_first(&RegisteredSnapshots));
+
+	if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin))
+		MyPgXact->xmin = minSnapshot->xmin;
 }
 
 /*
@@ -769,8 +808,8 @@ AtEOXact_Snapshot(bool isCommit)
 	if (FirstXactSnapshot != NULL)
 	{
 		Assert(FirstXactSnapshot->regd_count > 0);
-		Assert(RegisteredSnapshots > 0);
-		RegisteredSnapshots--;
+		Assert(!pairingheap_is_empty(&RegisteredSnapshots));
+		pairingheap_remove(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
 	}
 	FirstXactSnapshot = NULL;
 
@@ -782,6 +821,7 @@ AtEOXact_Snapshot(bool isCommit)
 		TransactionId myxid = GetTopTransactionId();
 		int			i;
 		char		buf[MAXPGPATH];
+		ListCell   *lc;
 
 		/*
 		 * Get rid of the files.  Unlink failure is only a WARNING because (1)
@@ -798,14 +838,13 @@ AtEOXact_Snapshot(bool isCommit)
 		/*
 		 * As with the FirstXactSnapshot, we needn't spend any effort on
 		 * cleaning up the per-snapshot data structures, but we do need to
-		 * adjust the RegisteredSnapshots count to prevent a warning below.
-		 *
-		 * Note: you might be thinking "why do we have the exportedSnapshots
-		 * list at all?  All we need is a counter!".  You're right, but we do
-		 * it this way in case we ever feel like improving xmin management.
+		 * unlink them from RegisteredSnapshots to prevent a warning below.
 		 */
-		Assert(RegisteredSnapshots >= list_length(exportedSnapshots));
-		RegisteredSnapshots -= list_length(exportedSnapshots);
+		foreach(lc, exportedSnapshots)
+		{
+			Snapshot snap = (Snapshot) lfirst(lc);
+			pairingheap_remove(&RegisteredSnapshots, &snap->ph_node);
+		}
 
 		exportedSnapshots = NIL;
 	}
@@ -815,9 +854,8 @@ AtEOXact_Snapshot(bool isCommit)
 	{
 		ActiveSnapshotElt *active;
 
-		if (RegisteredSnapshots != 0)
-			elog(WARNING, "%d registered snapshots seem to remain after cleanup",
-				 RegisteredSnapshots);
+		if (!pairingheap_is_empty(&RegisteredSnapshots))
+			elog(WARNING, "registered snapshots seem to remain after cleanup");
 
 		/* complain about unpopped active snapshots */
 		for (active = ActiveSnapshot; active != NULL; active = active->as_next)
@@ -829,7 +867,7 @@ AtEOXact_Snapshot(bool isCommit)
 	 * it'll go away with TopTransactionContext.
 	 */
 	ActiveSnapshot = NULL;
-	RegisteredSnapshots = 0;
+	pairingheap_reset(&RegisteredSnapshots);
 
 	CurrentSnapshot = NULL;
 	SecondarySnapshot = NULL;
@@ -900,8 +938,7 @@ ExportSnapshot(Snapshot snapshot)
 	 * Copy the snapshot into TopTransactionContext, add it to the
 	 * exportedSnapshots list, and mark it pseudo-registered.  We do this to
 	 * ensure that the snapshot's xmin is honored for the rest of the
-	 * transaction.  (Right now, because SnapshotResetXmin is so stupid, this
-	 * is overkill; but later we might make that routine smarter.)
+	 * transaction.
 	 */
 	snapshot = CopySnapshot(snapshot);
 
@@ -910,7 +947,7 @@ ExportSnapshot(Snapshot snapshot)
 	MemoryContextSwitchTo(oldcxt);
 
 	snapshot->regd_count++;
-	RegisteredSnapshots++;
+	pairingheap_add(&RegisteredSnapshots, &snapshot->ph_node);
 
 	/*
 	 * Fill buf with a text serialization of the snapshot, plus identification
@@ -1303,7 +1340,8 @@ DeleteAllExportedSnapshotFiles(void)
 bool
 ThereAreNoPriorRegisteredSnapshots(void)
 {
-	if (RegisteredSnapshots <= 1)
+	if (pairingheap_is_empty(&RegisteredSnapshots) ||
+		pairingheap_is_singular(&RegisteredSnapshots))
 		return true;
 
 	return false;
diff --git a/src/include/lib/pairingheap.h b/src/include/lib/pairingheap.h
index e6cae29..a9e3676 100644
--- a/src/include/lib/pairingheap.h
+++ b/src/include/lib/pairingheap.h
@@ -30,6 +30,25 @@ typedef struct pairingheap_node
 } pairingheap_node;
 
 /*
+ * Return the containing struct of 'type' where 'membername' is the
+ * pairingheap_node pointed at by 'ptr'.
+ *
+ * This is used to convert a pairingheap_node * back to its containing struct.
+ */
+#define pairingheap_container(type, membername, ptr) \
+	(AssertVariableIsOfTypeMacro(ptr, pairingheap_node *), \
+	 AssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node),  \
+	 ((type *) ((char *) (ptr) - offsetof(type, membername))))
+
+/*
+ * Like pairingheap_container, but used when the pointer is 'const ptr'
+ */
+#define pairingheap_const_container(type, membername, ptr) \
+	(AssertVariableIsOfTypeMacro(ptr, const pairingheap_node *), \
+	 AssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node),  \
+	 ((const type *) ((const char *) (ptr) - offsetof(type, membername))))
+
+/*
  * For a max-heap, the comparator must return <0 iff a < b, 0 iff a == b,
  * and >0 iff a > b.  For a min-heap, the conditions are reversed.
  */
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 53e474f..8b78a3a 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -14,6 +14,7 @@
 #define SNAPSHOT_H
 
 #include "access/htup.h"
+#include "lib/pairingheap.h"
 #include "storage/buf.h"
 
 
@@ -91,7 +92,9 @@ typedef struct SnapshotData
 	 */
 	CommandId	curcid;			/* in my xact, CID < curcid are visible */
 	uint32		active_count;	/* refcount on ActiveSnapshot stack */
-	uint32		regd_count;		/* refcount on RegisteredSnapshotList */
+	uint32		regd_count;		/* refcount on RegisteredSnapshots */
+
+	pairingheap_node ph_node;	/* link in the RegisteredSnapshots heap */
 } SnapshotData;
 
 /*
#23Michael Paquier
michael.paquier@gmail.com
In reply to: Heikki Linnakangas (#22)
Re: advance local xmin more aggressively

On Mon, Dec 22, 2014 at 7:31 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Here's an updated version, rebased over the pairing heap code that I just
committed, and fixing those bugs.

So, are we reaching an outcome for the match happening here?
--
Michael


#24Robert Haas
robertmhaas@gmail.com
In reply to: Michael Paquier (#23)
Re: advance local xmin more aggressively

On Thu, Jan 15, 2015 at 3:08 AM, Michael Paquier
<michael.paquier@gmail.com> wrote:

On Mon, Dec 22, 2014 at 7:31 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Here's an updated version, rebased over the pairing heap code that I just
committed, and fixing those bugs.

So, are we reaching an outcome for the match happening here?

Well, I still like using the existing ResourceOwner pointers to find
the snapshots more than introducing a separate data structure just for
that. But I'm OK with Heikki committing his version and, if anybody
notices the new code becoming a hotspot, we can revisit the issue.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#25Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Robert Haas (#24)
Re: advance local xmin more aggressively

On 01/15/2015 09:57 PM, Robert Haas wrote:

On Thu, Jan 15, 2015 at 3:08 AM, Michael Paquier
<michael.paquier@gmail.com> wrote:

On Mon, Dec 22, 2014 at 7:31 PM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Here's an updated version, rebased over the pairing heap code that I just
committed, and fixing those bugs.

So, are we reaching an outcome for the match happening here?

Well, I still like using the existing ResourceOwner pointers to find
the snapshots more than introducing a separate data structure just for
that. But I'm OK with Heikki committing his version and, if anybody
notices the new code becoming a hotspot, we can revisit the issue.

Ok, committed.

- Heikki
