patch: fix race in SSI's CheckTargetForConflictsIn
While running some benchmarks to test SSI performance, I found a race
condition that's capable of causing a segfault. A patch is attached.
The bug is in CheckTargetForConflictsIn, which scans the list of SIREAD
locks on a lock target when it's modified. There's an optimization in
there where the writing transaction will remove a SIREAD lock that it
holds itself, because it's being replaced with a (stronger) write lock.
To do that, it needs to drop its shared lwlocks and reacquire them in
exclusive mode. The existing code deals with concurrent modifications
in that interval by redoing checks. However, it misses the case where
some other transaction removes all remaining locks on the target, and
proceeds to remove the lock target itself.
The attached patch fixes this by deferring the SIREAD lock removal
until the end of the function. At that point, there isn't any need to
worry about concurrent updates to the target's lock list. The resulting
code is also simpler.
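For illustration only (this is not part of the patch and not PostgreSQL code), here is a minimal standalone sketch of the control-flow shape the patch moves to, using a pthread rwlock and a plain linked list as stand-ins for the lwlocks and shared-memory lock queues: during the scan under the shared lock we only note our own entry, and the actual removal happens after the loop under the exclusive lock, re-checking that the entry still exists.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct Entry
{
    int           owner;        /* which "transaction" holds this lock */
    struct Entry *next;
} Entry;

static Entry *lock_list = NULL;     /* stand-in for a target's SIREAD lock list */
static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;

#define MY_ID 1

static void
check_target_for_conflicts(void)
{
    Entry *mine = NULL;

    pthread_rwlock_rdlock(&list_lock);     /* shared lock: scan only */
    for (Entry *e = lock_list; e != NULL; e = e->next)
    {
        if (e->owner == MY_ID)
            mine = e;                       /* defer the removal: just remember the entry */
        /* else: conflict checks against other owners would go here */
    }
    pthread_rwlock_unlock(&list_lock);

    if (mine != NULL)
    {
        pthread_rwlock_wrlock(&list_lock);  /* exclusive lock: now safe to modify */

        /*
         * Re-check before touching anything: while no lock was held, another
         * process could have removed the entry (or emptied the whole list).
         */
        Entry **p = &lock_list;
        while (*p != NULL && *p != mine)
            p = &(*p)->next;
        if (*p == mine)
        {
            *p = mine->next;
            free(mine);
        }
        pthread_rwlock_unlock(&list_lock);
    }
}

int
main(void)
{
    Entry *e = malloc(sizeof(Entry));
    e->owner = MY_ID;
    e->next = NULL;
    lock_list = e;

    check_target_for_conflicts();
    printf("own lock removed: %s\n", lock_list == NULL ? "yes" : "no");
    return 0;
}

Because the scan loop never drops and reacquires its locks partway through, there is no window in which another process can free the entry, or the structure containing it, out from under the scan.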
Dan
--
Dan R. K. Ports MIT CSAIL http://drkp.net/
Attachments:
ssi-fix-checktargetforconflictsin-race.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 3309d07..c274bca 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -3905,6 +3905,8 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
LWLockId partitionLock;
PREDICATELOCKTARGET *target;
PREDICATELOCK *predlock;
+ PREDICATELOCK *mypredlock = NULL;
+ PREDICATELOCKTAG mypredlocktag;
Assert(MySerializableXact != InvalidSerializableXact);
@@ -3950,139 +3952,21 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
if (sxact == MySerializableXact)
{
/*
- * If we're getting a write lock on the tuple and we're not in a
- * subtransaction, we don't need a predicate (SIREAD) lock. We
- * can't use this optimization within a subtransaction because the
- * subtransaction could be rolled back, and we would be left
- * without any lock at the top level.
+ * If we're getting a write lock on a tuple, we don't need
+ * a predicate (SIREAD) lock on the same tuple. We can
+ * safely remove our SIREAD lock, but we'll defer doing so
+ * until after the loop because that requires upgrading to
+ * an exclusive partition lock.
*
- * At this point our transaction already has an ExclusiveRowLock
- * on the relation, so we are OK to drop the predicate lock on the
- * tuple, if found, without fearing that another write against the
- * tuple will occur before the MVCC information makes it to the
- * buffer.
+ * We can't use this optimization within a subtransaction
+ * because the subtransaction could roll back, and we
+ * would be left without any lock at the top level.
*/
if (!IsSubTransaction()
&& GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
{
- uint32 predlockhashcode;
- PREDICATELOCKTARGET *rmtarget = NULL;
- PREDICATELOCK *rmpredlock;
- LOCALPREDICATELOCK *locallock,
- *rmlocallock;
-
- /*
- * This is a tuple on which we have a tuple predicate lock. We
- * only have shared LW locks now; release those, and get
- * exclusive locks only while we modify things.
- */
- LWLockRelease(SerializableXactHashLock);
- LWLockRelease(partitionLock);
- LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
- LWLockAcquire(partitionLock, LW_EXCLUSIVE);
- LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
-
- /*
- * Remove the predicate lock from shared memory, if it wasn't
- * removed while the locks were released. One way that could
- * happen is from autovacuum cleaning up an index.
- */
- predlockhashcode = PredicateLockHashCodeFromTargetHashCode
- (&(predlock->tag), targettaghash);
- rmpredlock = (PREDICATELOCK *)
- hash_search_with_hash_value(PredicateLockHash,
- &(predlock->tag),
- predlockhashcode,
- HASH_FIND, NULL);
- if (rmpredlock)
- {
- Assert(rmpredlock == predlock);
-
- SHMQueueDelete(predlocktargetlink);
- SHMQueueDelete(&(predlock->xactLink));
-
- rmpredlock = (PREDICATELOCK *)
- hash_search_with_hash_value(PredicateLockHash,
- &(predlock->tag),
- predlockhashcode,
- HASH_REMOVE, NULL);
- Assert(rmpredlock == predlock);
-
- RemoveTargetIfNoLongerUsed(target, targettaghash);
-
- LWLockRelease(SerializableXactHashLock);
- LWLockRelease(partitionLock);
- LWLockRelease(SerializablePredicateLockListLock);
-
- locallock = (LOCALPREDICATELOCK *)
- hash_search_with_hash_value(LocalPredicateLockHash,
- targettag, targettaghash,
- HASH_FIND, NULL);
-
- /*
- * Remove entry in local lock table if it exists and has
- * no children. It's OK if it doesn't exist; that means
- * the lock was transferred to a new target by a different
- * backend.
- */
- if (locallock != NULL)
- {
- locallock->held = false;
-
- if (locallock->childLocks == 0)
- {
- rmlocallock = (LOCALPREDICATELOCK *)
- hash_search_with_hash_value(LocalPredicateLockHash,
- targettag, targettaghash,
- HASH_REMOVE, NULL);
- Assert(rmlocallock == locallock);
- }
- }
-
- DecrementParentLocks(targettag);
-
- /*
- * If we've cleaned up the last of the predicate locks for
- * the target, bail out before re-acquiring the locks.
- */
- if (rmtarget)
- return;
-
- /*
- * The list has been altered. Start over at the front.
- */
- LWLockAcquire(partitionLock, LW_SHARED);
- nextpredlock = (PREDICATELOCK *)
- SHMQueueNext(&(target->predicateLocks),
- &(target->predicateLocks),
- offsetof(PREDICATELOCK, targetLink));
-
- LWLockAcquire(SerializableXactHashLock, LW_SHARED);
- }
- else
- {
- /*
- * The predicate lock was cleared while we were attempting
- * to upgrade our lightweight locks. Revert to the shared
- * locks.
- */
- LWLockRelease(SerializableXactHashLock);
- LWLockRelease(partitionLock);
- LWLockRelease(SerializablePredicateLockListLock);
- LWLockAcquire(partitionLock, LW_SHARED);
-
- /*
- * The list may have been altered by another process while
- * we weren't holding the partition lock. Start over at
- * the front.
- */
- nextpredlock = (PREDICATELOCK *)
- SHMQueueNext(&(target->predicateLocks),
- &(target->predicateLocks),
- offsetof(PREDICATELOCK, targetLink));
-
- LWLockAcquire(SerializableXactHashLock, LW_SHARED);
- }
+ mypredlock = predlock;
+ mypredlocktag = predlock->tag;
}
}
else if (!SxactIsRolledBack(sxact)
@@ -4116,6 +4000,88 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
}
LWLockRelease(SerializableXactHashLock);
LWLockRelease(partitionLock);
+
+ /*
+ * If we found one of our own SIREAD locks to remove, remove it
+ * now.
+ *
+ * At this point our transaction already has an ExclusiveRowLock
+ * on the relation, so we are OK to drop the predicate lock on the
+ * tuple, if found, without fearing that another write against the
+ * tuple will occur before the MVCC information makes it to the
+ * buffer.
+ */
+ if (mypredlock != NULL)
+ {
+ uint32 predlockhashcode;
+ PREDICATELOCK *rmpredlock;
+ LOCALPREDICATELOCK *locallock,
+ *rmlocallock;
+
+ LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the predicate lock from shared memory, if it wasn't
+ * removed while the locks were released. One way that could
+ * happen is from autovacuum cleaning up an index.
+ */
+ predlockhashcode = PredicateLockHashCodeFromTargetHashCode
+ (&mypredlocktag, targettaghash);
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_FIND, NULL);
+ if (rmpredlock)
+ {
+ Assert(rmpredlock == mypredlock);
+
+ SHMQueueDelete(&(mypredlock->targetLink));
+ SHMQueueDelete(&(mypredlock->xactLink));
+
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == mypredlock);
+
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+
+ locallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_FIND, NULL);
+
+ /*
+ * Remove entry in local lock table if it exists and has
+ * no children. It's OK if it doesn't exist; that means
+ * the lock was transferred to a new target by a different
+ * backend.
+ */
+ if (locallock != NULL)
+ {
+ locallock->held = false;
+
+ if (locallock->childLocks == 0)
+ {
+ rmlocallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmlocallock == locallock);
+ }
+ }
+
+ DecrementParentLocks(targettag);
+ }
+ }
}
/*
Dan Ports <drkp@csail.mit.edu> wrote:
While running some benchmarks to test SSI performance, I found a
race condition that's capable of causing a segfault. A patch is
attached.
I just want to confirm that this addresses a real (although very
narrow) race condition. It results from code used to implement a
valuable optimization described in Cahill's thesis: don't get or
keep an SIREAD lock on a row which has a write lock. The write lock
is stronger and will cause a write/write conflict with any
overlapping transactions which would care about a read/write
conflict. The pattern of reading a row and then updating or
deleting it is so common that this optimization does a lot to avoid
promotion of predicate locks to coarser granularity, and thereby
helps avoid false positives.
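To make that pattern concrete, here is a hedged sketch at the application level using libpq; the table, column, and connection string are made up for illustration. Under SERIALIZABLE, the SELECT takes an SIREAD (predicate) lock on the row it reads, and the UPDATE then takes a write lock on the same row, which is what lets SSI drop the SIREAD lock as described above.

#include <stdio.h>
#include <libpq-fe.h>

static void
run(PGconn *conn, const char *sql)
{
    PGresult *res = PQexec(conn, sql);

    if (PQresultStatus(res) != PGRES_COMMAND_OK &&
        PQresultStatus(res) != PGRES_TUPLES_OK)
        fprintf(stderr, "%s failed: %s", sql, PQerrorMessage(conn));
    PQclear(res);
}

int
main(void)
{
    PGconn *conn = PQconnectdb("dbname=test");   /* hypothetical connection string */

    if (PQstatus(conn) != CONNECTION_OK)
    {
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
        return 1;
    }

    run(conn, "BEGIN ISOLATION LEVEL SERIALIZABLE");
    /* The SELECT takes an SIREAD lock on the row it reads... */
    run(conn, "SELECT balance FROM accounts WHERE id = 1");
    /* ...and the UPDATE's write lock on the same row supersedes it. */
    run(conn, "UPDATE accounts SET balance = balance - 10 WHERE id = 1");
    run(conn, "COMMIT");

    PQfinish(conn);
    return 0;
}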
While the optimization is valuable, the code used to implement it
was pretty horrid. (And I was the one who wrote it.) It has already
been the cause of several other fixes since the main patch went in.
What Dan has done here is move the optimization out of the middle of
the loop which is doing the conflict detection, and in doing so has
reduced the number of lines of code needed, reduced the amount of
fiddling with LW locks, and all around made the code more robust and
more understandable.
I've reviewed the patch and it looks good to me. Dan has beaten up on
it with the same DBT-2 run that exposed the race condition, without
seeing a problem. Although a much smaller patch could address the
immediate problem, I strongly feel that Dan has taken the right
approach by refactoring this bit to something fundamentally cleaner
and less fragile.
-Kevin
On Thu, May 5, 2011 at 1:43 PM, Kevin Grittner
<Kevin.Grittner@wicourts.gov> wrote:
I've reviewed the patch and it looks good to me.
Why does this HASH_FIND the applicable hash table entry and then
HASH_REMOVE it as a separate step, instead of just HASH_REMOVE-ing it
in one go?
Doesn't this fail to release the locks if rmpredlock == NULL?
I believe it's project style to test (rmpredlock != NULL) rather than
just (rmpredlock).
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Fri, May 06, 2011 at 09:35:39PM -0400, Robert Haas wrote:
Why does this HASH_FIND the applicable hash table entry and then
HASH_REMOVE it as a separate step, instead of just HASH_REMOVE-ing it
in one go?
For PredicateLockHash, we need to find the lock entry first so that we
can call SHMQueueDelete on its targetLink and xactLink fields.
For LocalPredicateLockHash, we check the resulting entry to decide
whether to remove it. Having looked at the code some more, though, it
turns out we always do remove it, because we only apply this
optimization to heap tuple locks, which are not parents of other
locks. So we can simplify this down to a single HASH_REMOVE.
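To illustrate the ordering constraint with a toy mock-up (this is not backend code; the structures and names are invented): the table owns the entry's memory and releases it on removal, while the entry is also linked onto intrusive lists, so it has to be looked up and unlinked while we still hold a valid pointer, and only then removed.

#include <stdio.h>
#include <stdlib.h>

#define TABLE_SIZE 16

typedef struct Node
{
    int          key;
    struct Node *list_prev;      /* intrusive list links, like targetLink/xactLink */
    struct Node *list_next;
} Node;

/* A trivial keyed table that owns its entries and frees them on removal. */
static Node *table[TABLE_SIZE];
static Node  list_head;          /* circular doubly linked list of all nodes */

static Node *
table_find(int key)
{
    return table[key % TABLE_SIZE];
}

static void
table_remove(int key)            /* like HASH_REMOVE: the entry's memory goes away */
{
    free(table[key % TABLE_SIZE]);
    table[key % TABLE_SIZE] = NULL;
}

static void
list_unlink(Node *n)             /* like SHMQueueDelete */
{
    n->list_prev->list_next = n->list_next;
    n->list_next->list_prev = n->list_prev;
}

int
main(void)
{
    list_head.list_prev = list_head.list_next = &list_head;

    Node *n = malloc(sizeof(Node));
    n->key = 7;
    table[n->key % TABLE_SIZE] = n;
    n->list_next = &list_head;
    n->list_prev = list_head.list_prev;
    list_head.list_prev->list_next = n;
    list_head.list_prev = n;

    /*
     * Removing from the table first would free the node while it is still
     * linked, leaving the list pointing at released memory.  So: find it,
     * unlink it from the intrusive list, and only then remove (free) it.
     */
    Node *found = table_find(7);
    if (found != NULL)
    {
        list_unlink(found);
        table_remove(found->key);
    }

    printf("list empty again: %s\n",
           list_head.list_next == &list_head ? "yes" : "no");
    return 0;
}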
Doesn't this fail to release the locks if rmpredlock == NULL?
Yikes. Indeed it does.
I believe it's project style to test (rmpredlock != NULL) rather than
just (rmpredlock).
That works for me (I prefer the != NULL myself). I believe I've seen
both elsewhere, though...
Will update the patch.
Dan
--
Dan R. K. Ports MIT CSAIL http://drkp.net/
On Fri, May 06, 2011 at 10:49:22PM -0400, Dan Ports wrote:
Will update the patch.
Updated patch (in response to Robert's comments) attached.
Dan
--
Dan R. K. Ports MIT CSAIL http://drkp.net/
Attachments:
ssi-fix-checktargetforconflictsin-race-v2.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 419e0d9..9d51f08 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -3846,6 +3846,8 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
LWLockId partitionLock;
PREDICATELOCKTARGET *target;
PREDICATELOCK *predlock;
+ PREDICATELOCK *mypredlock = NULL;
+ PREDICATELOCKTAG mypredlocktag;
Assert(MySerializableXact != InvalidSerializableXact);
@@ -3891,139 +3893,21 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
if (sxact == MySerializableXact)
{
/*
- * If we're getting a write lock on the tuple and we're not in a
- * subtransaction, we don't need a predicate (SIREAD) lock. We
- * can't use this optimization within a subtransaction because the
- * subtransaction could be rolled back, and we would be left
- * without any lock at the top level.
+ * If we're getting a write lock on a tuple, we don't need
+ * a predicate (SIREAD) lock on the same tuple. We can
+ * safely remove our SIREAD lock, but we'll defer doing so
+ * until after the loop because that requires upgrading to
+ * an exclusive partition lock.
*
- * At this point our transaction already has an ExclusiveRowLock
- * on the relation, so we are OK to drop the predicate lock on the
- * tuple, if found, without fearing that another write against the
- * tuple will occur before the MVCC information makes it to the
- * buffer.
+ * We can't use this optimization within a subtransaction
+ * because the subtransaction could roll back, and we
+ * would be left without any lock at the top level.
*/
if (!IsSubTransaction()
&& GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
{
- uint32 predlockhashcode;
- PREDICATELOCKTARGET *rmtarget = NULL;
- PREDICATELOCK *rmpredlock;
- LOCALPREDICATELOCK *locallock,
- *rmlocallock;
-
- /*
- * This is a tuple on which we have a tuple predicate lock. We
- * only have shared LW locks now; release those, and get
- * exclusive locks only while we modify things.
- */
- LWLockRelease(SerializableXactHashLock);
- LWLockRelease(partitionLock);
- LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
- LWLockAcquire(partitionLock, LW_EXCLUSIVE);
- LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
-
- /*
- * Remove the predicate lock from shared memory, if it wasn't
- * removed while the locks were released. One way that could
- * happen is from autovacuum cleaning up an index.
- */
- predlockhashcode = PredicateLockHashCodeFromTargetHashCode
- (&(predlock->tag), targettaghash);
- rmpredlock = (PREDICATELOCK *)
- hash_search_with_hash_value(PredicateLockHash,
- &(predlock->tag),
- predlockhashcode,
- HASH_FIND, NULL);
- if (rmpredlock)
- {
- Assert(rmpredlock == predlock);
-
- SHMQueueDelete(predlocktargetlink);
- SHMQueueDelete(&(predlock->xactLink));
-
- rmpredlock = (PREDICATELOCK *)
- hash_search_with_hash_value(PredicateLockHash,
- &(predlock->tag),
- predlockhashcode,
- HASH_REMOVE, NULL);
- Assert(rmpredlock == predlock);
-
- RemoveTargetIfNoLongerUsed(target, targettaghash);
-
- LWLockRelease(SerializableXactHashLock);
- LWLockRelease(partitionLock);
- LWLockRelease(SerializablePredicateLockListLock);
-
- locallock = (LOCALPREDICATELOCK *)
- hash_search_with_hash_value(LocalPredicateLockHash,
- targettag, targettaghash,
- HASH_FIND, NULL);
-
- /*
- * Remove entry in local lock table if it exists and has
- * no children. It's OK if it doesn't exist; that means
- * the lock was transferred to a new target by a different
- * backend.
- */
- if (locallock != NULL)
- {
- locallock->held = false;
-
- if (locallock->childLocks == 0)
- {
- rmlocallock = (LOCALPREDICATELOCK *)
- hash_search_with_hash_value(LocalPredicateLockHash,
- targettag, targettaghash,
- HASH_REMOVE, NULL);
- Assert(rmlocallock == locallock);
- }
- }
-
- DecrementParentLocks(targettag);
-
- /*
- * If we've cleaned up the last of the predicate locks for
- * the target, bail out before re-acquiring the locks.
- */
- if (rmtarget)
- return;
-
- /*
- * The list has been altered. Start over at the front.
- */
- LWLockAcquire(partitionLock, LW_SHARED);
- nextpredlock = (PREDICATELOCK *)
- SHMQueueNext(&(target->predicateLocks),
- &(target->predicateLocks),
- offsetof(PREDICATELOCK, targetLink));
-
- LWLockAcquire(SerializableXactHashLock, LW_SHARED);
- }
- else
- {
- /*
- * The predicate lock was cleared while we were attempting
- * to upgrade our lightweight locks. Revert to the shared
- * locks.
- */
- LWLockRelease(SerializableXactHashLock);
- LWLockRelease(partitionLock);
- LWLockRelease(SerializablePredicateLockListLock);
- LWLockAcquire(partitionLock, LW_SHARED);
-
- /*
- * The list may have been altered by another process while
- * we weren't holding the partition lock. Start over at
- * the front.
- */
- nextpredlock = (PREDICATELOCK *)
- SHMQueueNext(&(target->predicateLocks),
- &(target->predicateLocks),
- offsetof(PREDICATELOCK, targetLink));
-
- LWLockAcquire(SerializableXactHashLock, LW_SHARED);
- }
+ mypredlock = predlock;
+ mypredlocktag = predlock->tag;
}
}
else if (!SxactIsRolledBack(sxact)
@@ -4057,6 +3941,73 @@ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
}
LWLockRelease(SerializableXactHashLock);
LWLockRelease(partitionLock);
+
+ /*
+ * If we found one of our own SIREAD locks to remove, remove it
+ * now.
+ *
+ * At this point our transaction already has an ExclusiveRowLock
+ * on the relation, so we are OK to drop the predicate lock on the
+ * tuple, if found, without fearing that another write against the
+ * tuple will occur before the MVCC information makes it to the
+ * buffer.
+ */
+ if (mypredlock != NULL)
+ {
+ uint32 predlockhashcode;
+ PREDICATELOCK *rmpredlock;
+
+ LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the predicate lock from shared memory, if it wasn't
+ * removed while the locks were released. One way that could
+ * happen is from autovacuum cleaning up an index.
+ */
+ predlockhashcode = PredicateLockHashCodeFromTargetHashCode
+ (&mypredlocktag, targettaghash);
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_FIND, NULL);
+ if (rmpredlock != NULL)
+ {
+ Assert(rmpredlock == mypredlock);
+
+ SHMQueueDelete(&(mypredlock->targetLink));
+ SHMQueueDelete(&(mypredlock->xactLink));
+
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == mypredlock);
+
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+
+ if (rmpredlock != NULL)
+ {
+ /*
+ * Remove entry in local lock table if it exists. It's OK
+ * if it doesn't exist; that means the lock was
+ * transferred to a new target by a different backend.
+ */
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_REMOVE, NULL);
+
+ DecrementParentLocks(targettag);
+ }
+ }
}
/*
On Sun, May 8, 2011 at 8:31 PM, Dan Ports <drkp@csail.mit.edu> wrote:
On Fri, May 06, 2011 at 10:49:22PM -0400, Dan Ports wrote:
Will update the patch.
Updated patch (in response to Robert's comments) attached.
Committed.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company